Updating models

Przemek Strzelczyk 2019-07-08 22:51:28 +02:00
parent f89dcca19d
commit 0663b67c1a
283 changed files with 112904 additions and 133470 deletions

.gitignore vendored Normal file

@ -0,0 +1,3 @@
repos.cfg
repos_init.cfg
nvtool*


@ -1,4 +1,3 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/


@ -1,5 +1,7 @@
# -----------------------------------------------------------------------
# Copyright 2017-2018 The Apache Software Foundation
#
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information


@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:19.03-py3
+FROM nvcr.io/nvidia/pytorch:19.05-py3
# Set working directory
WORKDIR /mlperf


@ -1,31 +0,0 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32, 64, 128],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 191.25867003414876
},
"4": {
"images_per_second": 340.9537905548054
},
"8": {
"images_per_second": 517.2612062140391
},
"16": {
"images_per_second": 711.5516679788083
},
"32": {
"images_per_second": 812.9203401838566
},
"64": {
"images_per_second": 951.7432815456556
},
"128": {
"images_per_second": 876.1868813828711
}
}
}
}


@ -1,31 +0,0 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32, 64, 128],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 174.58768325581374
},
"4": {
"images_per_second": 254.24180710755593
},
"8": {
"images_per_second": 308.95847419165545
},
"16": {
"images_per_second": 419.60746029488445
},
"32": {
"images_per_second": 453.81433823995565
},
"64": {
"images_per_second": 592.6385687558369
},
"128": {
"images_per_second": 603.8453409148115
}
}
}
}


@ -1,59 +0,0 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32, 64],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 40.71944999694824
},
"4": {
"images_per_second": 68.22257804870605
},
"8": {
"images_per_second": 121.42024612426758
},
"16": {
"images_per_second": 159.56442260742188
},
"32": {
"images_per_second": 185.69010543823242
}
},
"4": {
"2": {
"images_per_second": 40.75998783111572
},
"4": {
"images_per_second": 75.58991050720215
},
"8": {
"images_per_second": 142.64888381958008
},
"16": {
"images_per_second": 256.07005310058594
},
"32": {
"images_per_second": 300.8989944458008
}
},
"8": {
"2": {
"images_per_second": 61.28578186035156
},
"4": {
"images_per_second": 119.46021270751953
},
"8": {
"images_per_second": 231.7295379638672
},
"16": {
"images_per_second": 430.5494079589844
},
"32": {
"images_per_second": 454.2975769042969
}
}
}
}


@ -1,59 +0,0 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 48.635780334472656
},
"4": {
"images_per_second": 66.06407419840494
},
"8": {
"images_per_second": 83.91736857096353
},
"16": {
"images_per_second": 102.67040761311848
},
"32": {
"images_per_second": 110.02347819010416
}
},
"4": {
"2": {
"images_per_second": 41.199180603027344
},
"4": {
"images_per_second": 79.85076141357422
},
"8": {
"images_per_second": 145.39981587727863
},
"16": {
"images_per_second": 247.95855712890625
},
"32": {
"images_per_second": 341.29132080078125
}
},
"8": {
"2": {
"images_per_second": 63.07561111450195
},
"4": {
"images_per_second": 123.25757344563802
},
"8": {
"images_per_second": 237.3413340250651
},
"16": {
"images_per_second": 376.59598795572913
},
"32": {
"images_per_second": 507.9451497395833
}
}
}
}


@ -1,34 +0,0 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 470.099200788709
},
"2" : {
"images_per_second" : 163.117099093173
},
"32" : {
"images_per_second" : 520.538879400471
},
"4" : {
"images_per_second" : 296.604178917743
},
"8" : {
"images_per_second" : 412.522394180558
}
}
},
"ngpus" : [
1
]
}


@ -1,34 +0,0 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 280.570005994299
},
"2" : {
"images_per_second" : 147.914221468741
},
"32" : {
"images_per_second" : 302.430594818483
},
"4" : {
"images_per_second" : 201.622430560779
},
"8" : {
"images_per_second" : 228.159516872363
}
}
},
"ngpus" : [
1
]
}


@ -1,52 +0,0 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 192.623916625977
},
"2" : {
"images_per_second" : 48.7488899230957
},
"32" : {
"images_per_second" : 204.250648498535
},
"4" : {
"images_per_second" : 95.4697418212891
},
"8" : {
"images_per_second" : 164.66495513916
}
},
"4" : {
"16" : {
"images_per_second" : 701.366027832031
},
"2" : {
"images_per_second" : 154.449935913086
},
"32" : {
"images_per_second" : 771.171325683594
},
"4" : {
"images_per_second" : 300.332641601562
},
"8" : {
"images_per_second" : 550.924163818359
}
}
},
"ngpus" : [
1,
4
]
}


@ -1,45 +0,0 @@
{
"bs" : [
2,
4,
8,
16
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 121.772495269775
},
"2" : {
"images_per_second" : 56.0
},
"4" : {
"images_per_second" : 90.5315437316895
},
"8" : {
"images_per_second" : 103.113033294678
}
},
"4" : {
"16" : {
"images_per_second" : 472.226806640625
},
"2" : {
"images_per_second" : 184.061141967773
},
"4" : {
"images_per_second" : 324.639801025391
},
"8" : {
"images_per_second" : 391.055908203125
}
}
},
"ngpus" : [
1,
4
]
}


@ -1,34 +0,0 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 478.225033
},
"2" : {
"images_per_second" : 148.5965123
},
"32" : {
"images_per_second" : 531.1827376
},
"4" : {
"images_per_second" : 283.3305197
},
"8" : {
"images_per_second" : 418.7012914
}
}
},
"ngpus" : [
1
]
}


@ -1,34 +0,0 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 280.4733254
},
"2" : {
"images_per_second" : 143.8231571
},
"32" : {
"images_per_second" : 305.4504603
},
"4" : {
"images_per_second" : 202.6915644
},
"8" : {
"images_per_second" : 230.262872
}
}
},
"ngpus" : [
1
]
}


@ -1,81 +0,0 @@
import argparse
import subprocess
from qa.qa_utils import compare_benchmarks, load_json, save_json, OKBLUE, ENDC, FAIL
# parsing
def parse_testscript_args():
parser = argparse.ArgumentParser(description='PyTorch Benchmark Tests')
parser.add_argument('--bs', default=[1], type=int, nargs='+')
parser.add_argument('--ngpus', default=[1], type=int, nargs='+')
parser.add_argument('--benchmark-mode', default='training', choices=['training', 'inference'],
help='benchmark training or inference', required=True)
parser.add_argument('--bench-iterations', type=int, default=20, metavar='N',
help='Run N iterations while benchmarking (ignored when training and validation)')
parser.add_argument('--bench-warmup', type=int, default=10, metavar='N',
help='Number of warmup iterations for benchmarking')
parser.add_argument('--fp16', action='store_true', help='Run model in mixed precision.')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers')
parser.add_argument('--data', type=str, metavar='<PATH>', required=True,
help='path to the dataset')
parser.add_argument('--results-file', default='experiment_raport.json', type=str,
help='file in which to store the JSON experiment report')
parser.add_argument('--benchmark-file', type=str, metavar='FILE', required=True,
help='path to the file with baselines')
return parser.parse_args()
# job command
command_template = 'python3 {launcher} qa/qa_perf_main.py --bs {bs} --ebs {bs} ' \
'--benchmark-mode {mode} --benchmark-warmup {bw} --benchmark-iterations {bi} {fp16} ' \
'--backbone resnet50 --seed 1 --data {data} --results-file {results_file} --benchmark-file {benchmark_file}'
if __name__ == '__main__':
args = parse_testscript_args()
fp16 = '--fp16' if args.fp16 else ''
# create results json file
# todo: maybe some template json file?
results = {'ngpus': args.ngpus,
'bs': args.bs,
'metric_keys': ['images_per_second'],
'metrics': {}}
for gpu in args.ngpus:
results['metrics'][str(gpu)] = {}
for bs in args.bs:
results['metrics'][str(gpu)][str(bs)] = {'images_per_second': None}
save_json(args.results_file, results)
# run qa_perf_main.py tests one by one
for gpu in args.ngpus:
launcher = '' if gpu == 1 else '-m torch.distributed.launch --nproc_per_node={}'.format(gpu)
for bs in args.bs:
print('#' * 80)
command = command_template.format(launcher=launcher, bs=bs, workers=args.workers, mode=args.benchmark_mode,
bw=args.bench_warmup, bi=args.bench_iterations, fp16=fp16,
data=args.data, results_file=args.results_file,
benchmark_file=args.benchmark_file)
print('Running "{}"'.format(command))
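# Launch the benchmark as a subprocess; qa_perf_main.py writes its measurement back into the shared results file.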
process = subprocess.Popen(command, shell=True)
output, error = process.communicate()
if error is not None:
print(FAIL + 'Program exited with status {}. Data has not been collected'.format(error) + ENDC)
# elif results['metrics'][str(gpu)][str(bs)]['images_per_second'] is None:
# print(WARNING + 'Program did not end successfully. Data has not been collected.' + ENDC)
else:
print(OKBLUE + 'Program ended successfully. Data has been collected.' + ENDC)
results_data = load_json(args.results_file)
benchmark_data = load_json(args.benchmark_file)
exit_code = compare_benchmarks(results_data, benchmark_data, args, 0.16 if args.benchmark_mode == 'inference' else 0.1)
print(exit_code)
exit(exit_code)


@ -1 +0,0 @@
{"metric_keys": ["train.loss", "val.acc"], "metrics": {"train.loss": [8.812795396454991, 5.914838795058071, 6, 5.092440919584583, 4.887887316499735, 4.744666463422983, 4.694560192557922, 4.567333741479565, 4.492525351620137, 6, 4.408311570055099, 4.334232046614567, 6, 4.263646488106407, 4.2514614595596445, 4.2171871953656055, 4.206751160226014, 4.1795772798196715, 4.156515416099515, 6, 4.108870625495911, 4.0985876759066855, 4.075221928967139, 4.080158276849438, 6, 4.033980131669857, 4.037739227952915, 6, 3.99941903534935, 6, 3.9875937877263565, 3.971811039999583, 3.980771179282509, 3.953947089124455, 3.9305202960968018, 3.9366443781873546, 3.9252991879350754, 3.8827156307395367, 3.9388060424005102, 3.88922161618695, 3.8874285418914396, 6, 3.8936942113018453, 3.537499847891029, 3.4058184228089177, 6, 6, 3.3219671837627627, 3.295458280363458, 3.262115957955606, 6, 6, 6, 3.2190717260910433, 3.213117691627236, 3.1739242191397987, 3.1791626058811704, 3.2088054501854177, 3.1719801842385507, 3.187761370792139, 3.1809213312432236, 3.1823803410259397, 3.1752594631311677, 3.1709555600928425, 3.1823559530957817], "val.acc": [0.025120322205631106, 0.06065902615325462, 0.08224594352985645, 0.09868630608427395, 0.11402055039858493, 0.11779455253460233, 0.1232203941357061, 0.13708232144631768, 0.13614397127135028, 0.13289094380937685, 0.14004009449749777, 0.1369843423424096, 0.13877603069457692, 0.15418866425831707, 0.1500001994042602, 0.1542573219664272, 0.14771151227315413, 0.15896497766306272, 0.1600724682809656, 0.15881491661088476, 0.16213217020726906, 0.16466781280171408, 0.15738430149539484, 0.16634155547369375, 0.1623110334880526, 0.16394517553182106, 0.1494171026560053, 0.16762167601953265, 0.16063595691096758, 0.16982898253523193, 0.17321918229909394, 0.17242960413896102, 0.1625123530546557, 0.18330429802960516, 0.16333127233412115, 0.17973452067250242, 0.16699022570278652, 0.17183956548028687, 0.17168756775917593, 0.17547718325478198, 0.1750019046551496, 0.18416070771679066, 0.1711460087987496, 0.231325087097653, 0.23716038401167305, 0.23886896590018106, 0.2403412383214709, 0.24380227870861898, 0.24383605475007317, 0.2449733300818802, 0.24508423152154857, 0.24252172333110344, 0.24566254540226004, 0.24661345705692578, 0.25123807624083877, 0.25184439401895475, 0.2519010236397111, 0.25191664071239706, 0.2522156441636805, 0.25215053241008767, 0.2525434296889651, 0.2524917808636186, 0.2527410425201369, 0.2534121449798447, 0.25279479287831214]}, "bs": [64], "model": "", "ngpus": [8]}


@ -1 +0,0 @@
{"metric_keys": ["train.loss", "val.acc"], "metrics": {"train.loss": [9.887425426832973, 6.30290542835752, 5.566619733535567, 5.192713968618468, 4.943981836976963, 4.777146058311629, 4.682364774062644, 4.566371860462505, 4.479279315107254, 5, 4.398730874582149, 4.31779890601812, 4.293896813580043, 4.250142149529603, 4.219812418175577, 4.21572122303159, 4.187492328960302, 4.147948342119242, 4.134799897931028, 4.131298205737984, 4.071315974647822, 4.074750597299968, 4.0595350983882055, 4.042616275720722, 4.029284068070124, 4.02082926113012, 3.9983501902834298, 4.00984974094874, 3.9730074155799167, 5, 3.9646901324326294, 3.952598022061144, 3.944574903713043, 3.9182081201711596, 3.9252539055836775, 3.907297405092997, 3.8867245969813986, 3.87151758639573, 3.8793927009449254, 3.8687505586699107, 3.8750464156204956, 5, 3.8645522469516402, 3.504709825765618, 3.3920036476251862, 3.318732707260998, 5, 3.295415750237011, 3.2602547589347872, 5, 5, 5, 5, 3.199645553613854, 3.1623374312205086, 5, 3.147109237820821, 3.158245995575684, 3.1465386938319977, 3.1480963979746055, 3.151234711101482, 3.146022343739672, 3.1410668343956294, 3.142435818259893, 3.123337645718104], "val.acc": [0.01106397969239677, 0.04958324872172423, 0.07470961174804201, 0.08412781056028416, 0.1052591997157941, 0.11592629309116805, 0.1275672396324061, 0.12472585915140484, 0.13138377072048255, 0.1262696666605193, 0.13354663690485083, 0.14424123617821044, 0.14059169419863984, 0.14768715602101368, 0.15450788443085858, 0.14792122925940135, 0.1508861356435794, 0.157419558440425, 0.15279118544884585, 0.16075469826863828, 0.14747077091644412, 0.16340857637480236, 0.14427366437395484, 0.15709914018423293, 0.16324391683493303, 0.16440443232887508, 0.16479726175439752, 0.17508843799046686, 0.16142292492169025, 0.1643848499786872, 0.16912610131976924, 0.16376330941842296, 0.16894551721633602, 0.17771765128166106, 0.1749561896689298, 0.1695538322677119, 0.16778561571905298, 0.16380194923909086, 0.16994188486879763, 0.1716953661397215, 0.17755697810460197, 0.17187995479426885, 0.1742018462295355, 0.23426649845846764, 0.23613136034024038, 0.24175797706337981, 0.2425279583355936, 0.24352550398110506, 0.24411115979837528, 0.24656561042490024, 0.24383524308920906, 0.24686666489675338, 0.24814559219197632, 0.24840393696219026, 0.251965847689631, 0.25254138256097747, 0.2523565615073023, 0.2529904738785998, 0.253555154014026, 0.2530651493203877, 0.25358174010109197, 0.2537683728256746, 0.2539384684886946, 0.2540280117408162, 0.2534652864501853]}, "bs": [32], "model": "", "ngpus": [8]}


@ -1,20 +0,0 @@
{
"metrics" : {
"val.acc" : [
0.0100971670737651
],
"train.loss" : [
9.85026645043801
]
},
"ngpus" : [
8
],
"metric_keys" : [
"train.loss",
"val.acc"
],
"bs" : [
64
]
}


@ -1,20 +0,0 @@
{
"bs" : [
32
],
"metrics" : {
"train.loss" : [
8.79916159380589
],
"val.acc" : [
0.0238952010105531
]
},
"metric_keys" : [
"train.loss",
"val.acc"
],
"ngpus" : [
8
]
}


@ -1,73 +0,0 @@
# core imports
import os
import numpy as np
# pytorch imports
import torch
import torch.utils.data.distributed
# Apex imports
try:
from apex.parallel.LARC import LARC
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
except ImportError:
raise ImportError("Please install APEX from https://github.com/nvidia/apex")
# project imports
from src.train import train_loop
from main import train, make_parser
from src.logger import Logger
from qa.qa_utils import load_json, create_json_file, compare_acc, save_json
RESULT = None
def add_benchmark_args(parser):
parser.add_argument('--benchmark-mode', type=str, default='epoch-accuracy',
choices=['full-accuracy', 'epoch-accuracy'], required=True)
parser.add_argument('--benchmark-file', type=str, default=None, metavar='FILE',
help='path to the file with baselines', required=True)
return parser
def main(args):
if args.local_rank == 0:
os.makedirs('./models', exist_ok=True)
if args.seed is not None:
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed)
np.random.seed(seed=args.seed)
torch.backends.cudnn.benchmark = True
if args.benchmark_mode == 'epoch-accuracy':
args.epochs = 1
train_loop_func = train_loop
logger = Logger('Accuracy test', print_freq=10)
args.evaluation = list(range(90))
train(train_loop_func, logger, args)
exit_code = 0
if args.local_rank == 0:
train_loss_results, val_acc_results, train_time_results = logger.print_results()
print(train_time_results)
print(train_loss_results)
print(val_acc_results)
measured_results = create_json_file(val_acc_results, train_loss_results, ngpus=8, bs=args.batch_size)
save_json('/results/results.json', measured_results)
print(measured_results)
benchmark_results = load_json(args.benchmark_file)
exit_code = compare_acc(measured_results, benchmark_results, args)
exit(exit_code)
if __name__ == "__main__":
parser = make_parser()
parser = add_benchmark_args(parser)
args = parser.parse_args()
print(args)
main(args)


@ -1,199 +0,0 @@
# core imports
import os
import numpy as np
import json
from pprint import pprint
import time
# pytorch imports
import torch
import torch.utils.data.distributed
from torch.autograd import Variable
# Apex imports
try:
from apex.parallel.LARC import LARC
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
except ImportError:
raise ImportError("Please install APEX from https://github.com/nvidia/apex")
# project imports
from main import train, make_parser
from src.logger import BenchLogger
# from src.train import benchmark_inference_loop, benchmark_train_loop
from SSD import _C as C
RESULT = None
def add_benchmark_args(parser):
parser.add_argument('--benchmark-mode', type=str, choices=['training', 'inference'],
default='inference', required=True)
parser.add_argument('--results-file', default='experiment_raport.json', type=str,
help='file in which to store the JSON experiment report')
parser.add_argument('--benchmark-file', type=str, default=None, metavar='FILE',
help='path to the file with baselines')
return parser
def benchmark_train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
start_time = None
# tensor for results
result = torch.zeros((1,)).cuda()
for i, data in enumerate(loop(train_dataloader)):
if i >= args.benchmark_warmup:
start_time = time.time()
img = data[0][0][0]
bbox = data[0][1][0]
label = data[0][2][0]
label = label.type(torch.cuda.LongTensor)
bbox_offsets = data[0][3][0]
# handle random flipping outside of DALI for now
bbox_offsets = bbox_offsets.cuda()
img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False)
if not args.no_cuda:
img = img.cuda()
bbox = bbox.cuda()
label = label.cuda()
bbox_offsets = bbox_offsets.cuda()
img.sub_(mean).div_(std)
N = img.shape[0]
if bbox_offsets[-1].item() == 0:
print("No labels in batch")
continue
bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5)
M = bbox.shape[0] // N
bbox = bbox.view(N, M, 4)
label = label.view(N, M)
ploc, plabel = model(img)
ploc, plabel = ploc.float(), plabel.float()
trans_bbox = bbox.transpose(1, 2).contiguous().cuda()
if not args.no_cuda:
label = label.cuda()
gloc = Variable(trans_bbox, requires_grad=False)
glabel = Variable(label, requires_grad=False)
loss = loss_func(ploc, plabel, gloc, glabel)
# loss scaling
if args.fp16:
if args.amp:
with optim.scale_loss(loss) as scale_loss:
scale_loss.backward()
else:
optim.backward(loss)
else:
loss.backward()
optim.step()
optim.zero_grad()
iteration += 1
# reduce all results from every gpu
if i >= args.benchmark_warmup + args.benchmark_iterations:
result.data[0] = logger.print_result()
if args.N_gpu > 1:
torch.distributed.reduce(result, 0)
if args.local_rank == 0:
global RESULT
RESULT = float(result.data[0])
return
if i >= args.benchmark_warmup:
logger.update(args.batch_size, time.time() - start_time)
def loop(dataloader):
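# Cycle over the dataloader endlessly so benchmarking is not limited by the epoch length.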
while True:
for data in dataloader:
yield data
def benchmark_inference_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
assert args.N_gpu == 1, 'Inference benchmark only on 1 gpu'
start_time = None
model.eval()
i=-1
dataloader = loop(val_dataloader)
while True:
i+=1
with torch.no_grad():
torch.cuda.synchronize()
if i >= args.benchmark_warmup:
start_time = time.time()
data = next(dataloader)
img = data[0]
if not args.no_cuda:
img = img.cuda()
if args.fp16:
img = img.half()
img.sub_(mean).div_(std)
img = Variable(img, requires_grad=False)
_ = model(img)
torch.cuda.synchronize()
if i >= args.benchmark_warmup + args.benchmark_iterations:
global RESULT
RESULT = logger.print_result()
return
if i >= args.benchmark_warmup:
logger.update(args.batch_size, time.time() - start_time)
def main(args):
if args.local_rank == 0:
os.makedirs('./models', exist_ok=True)
if args.seed is not None:
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed)
np.random.seed(seed=args.seed)
torch.backends.cudnn.benchmark = True
if args.benchmark_mode == 'training':
train_loop_func = benchmark_train_loop
logger = BenchLogger('Training benchmark')
else:
train_loop_func = benchmark_inference_loop
logger = BenchLogger('Inference benchmark')
args.epochs = 1
train(train_loop_func, logger, args)
if args.local_rank == 0:
global RESULT
with open(args.results_file) as f:
results = json.load(f)
results['metrics'][str(args.N_gpu)][str(args.batch_size)] = {'images_per_second': RESULT}
pprint(results)
with open(args.results_file, 'w') as f:
json.dump(results, f)
if __name__ == "__main__":
parser = make_parser()
parser = add_benchmark_args(parser)
args = parser.parse_args()
print(args)
main(args)


@ -1,115 +0,0 @@
import json
# terminal stdout colors
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
# load results and benchmark
def load_json(filepath):
with open(filepath) as f:
data = json.load(f)
return data
def save_json(filepath, data):
with open(filepath, 'w') as f:
json.dump(data, f)
# compare func
def compare(measured_value, true_value, pmargin=0.1):
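# Passes when measured_value is no more than pmargin (as a fraction) below true_value.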
assert 0 < pmargin < 1, 'Margin should be in range [0, 1]'
return (1 - pmargin) * true_value < measured_value
# compare 2 benchmark json files
def compare_benchmarks(results, benchmark, args, pmargin=0.1):
# sanity check
for metric in results['metric_keys']:
if metric not in benchmark['metric_keys']:
assert False, "You want to compare {} metric which doesn't appear in benchmark file".format(metric)
assert len(args.bs) <= len(benchmark['bs']), 'len(args.bs) <= len(benchmark["bs"]) ({} <= {})'.format(len(args.bs), len(benchmark['bs']))
assert len(args.bs) == len(results['bs']), 'len(args.bs) == len(results["bs"]) ({} == {})'.format(len(args.bs), len(results['bs']))
for bs in results['bs']:
if bs not in benchmark['bs']:
assert False, "You want to compare batch size = {} which doesn't appear in benchmark file".format(bs)
assert len(args.ngpus) <= len(benchmark['ngpus']), 'len(args.ngpus) <= len(benchmark["ngpus"]) ({} <= {})'.format(len(args.ngpus), len(benchmark['ngpus']))
assert len(args.ngpus) == len(results['ngpus']), 'len(args.ngpus) == len(results["ngpus"]) ({} == {})'.format(len(args.ngpus), len(results['ngpus']))
for gpu in results['ngpus']:
if gpu not in benchmark['ngpus']:
assert False, "You want to compare {} gpus results which don't appear in benchmark file".format(gpu)
# compare measured numbers with benchmark
exit = 0
for metric in results['metric_keys']:
for gpu in results['ngpus']:
for bs in results['bs']:
measured_metric = results['metrics'][str(gpu)][str(bs)][metric]
ground_truth_metric = benchmark['metrics'][str(gpu)][str(bs)][metric]
ok = compare(measured_metric, ground_truth_metric, pmargin)
if ok:
print(OKGREEN + 'BENCHMARK PASSED: metric={} gpu={} bs={}'.format(metric, gpu, bs) + ENDC)
else:
print(FAIL + 'BENCHMARK NOT PASSED: metric={} gpu={} bs={}'.format(metric, gpu, bs) + ENDC)
exit = 1
return exit
# compare 2 benchmark json files
def compare_acc(results, benchmark, args):
# sanity check
for metric in results['metric_keys']:
if metric not in benchmark['metric_keys']:
assert False, "You want to compare {} metric which doesn't appear in benchmark file".format(metric)
for bs in results['bs']:
if bs not in benchmark['bs']:
assert False, "You want to compare batch size = {} which doesn't appear in benchmark file".format(bs)
for gpu in results['ngpus']:
if gpu not in benchmark['ngpus']:
assert False, "You want to compare {} gpus results which don't appear in benchmark file".format(gpu)
# compare measured numbers with benchmark
for i, (result, ground_truth) in enumerate(zip(results['metrics']['val.acc'], benchmark['metrics']['val.acc'])):
if i > 43: # before first decay accuracy tends to vary more than 15% at ~30th epoch
if ground_truth * 0.9 > result:
print(FAIL + 'ACCURACY TEST NOT PASSED' + ENDC)
return 1
# compare measured numbers with benchmark
for i, (result, ground_truth) in enumerate(zip(results['metrics']['train.loss'], benchmark['metrics']['train.loss'])):
if i > 43:
if ground_truth * 1.1 < result:
print(FAIL + 'LOSS TEST NOT PASSED' + ENDC)
return 1
print(OKGREEN + 'ACCURACY TEST PASSED' + ENDC)
return 0
def create_json_file(val_acc_results, train_loss_results, ngpus=8, bs=32):
results = {"ngpus": [ngpus],
"bs": [bs],
"metric_keys": ["train.loss", "val.acc"],
"metrics": {
"train.loss": [],
"val.acc": []
}
}
for i, ((epoch1, acc), (epoch2, loss)) in enumerate(zip(val_acc_results, train_loss_results)):
assert i == epoch1 == epoch2
results['metrics']['train.loss'].append(loss)
results['metrics']['val.acc'].append(acc)
return results


@ -1,4 +0,0 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 64 --fp16 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode epoch-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_19.01_fp16_1epoch_run_acc_baseline.json --data $1


@ -1,4 +0,0 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 64 --fp16 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode full-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_18.08_fp16_full_run_acc_baseline.json --data $1


@ -1,4 +0,0 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 32 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode epoch-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_19.01_fp32_1epoch_run_acc_baseline.json --data $1


@ -1,4 +0,0 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 32 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode full-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_18.08_fp32_full_run_acc_baseline.json --data $1


@ -1,3 +0,0 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode inference --ngpus 1 --bs 2 4 8 16 32 --fp16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.05_inference_fp16.json --data $1


@ -1,3 +0,0 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode inference --ngpus 1 --bs 2 4 8 16 32 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.05_inference_fp32.json --data $1


@ -1,3 +0,0 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode training --ngpus 1 4 --bs 2 4 8 16 32 --fp16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_training_fp16.json --data $1


@ -1,3 +0,0 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode training --ngpus 1 4 --bs 2 4 8 16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_training_fp32.json --data $1


@ -35,9 +35,9 @@ class COCOPipeline(Pipeline):
super(COCOPipeline, self).__init__(batch_size=batch_size, device_id=device_id,
num_threads=num_threads, seed = seed)
-try:
+if torch.distributed.is_initialized():
shard_id = torch.distributed.get_rank()
-except RuntimeError:
+else:
shard_id = 0
self.input = ops.COCOReader(file_root = file_root, annotations_file = annotations_file,


@ -0,0 +1,3 @@
data/
vocab/
results/

PyTorch/LanguageModeling/BERT/.gitignore vendored Normal file

@ -0,0 +1,129 @@
# Initially taken from Github's Python gitignore file
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
#Data
data/*/*/
data/*/*.zip
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# vscode
.vscode
# TF code
tensorflow_code
# Models
models


@ -0,0 +1,27 @@
ARG FROM_IMAGE_NAME=gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.05-py3-devel
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
#WORKDIR /opt
#RUN cd pytorch/apex \
# && git fetch origin pull/182/head:norm_fix \
# && git checkout norm_fix \
# && python setup.py develop --cuda_ext --cpp_ext
WORKDIR /opt
RUN cd pytorch/apex ; \
pip uninstall apex; \
pip uninstall apex; \
git checkout master; \
git pull; \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
WORKDIR /workspace
RUN git clone https://github.com/attardi/wikiextractor.git
RUN git clone https://github.com/soskek/bookcorpus.git
WORKDIR /workspace/bert
COPY . .
RUN pip install tqdm boto3 requests six ipdb h5py html2text nltk progressbar


@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@ -0,0 +1,554 @@
# BERT For PyTorch
This repository provides scripts and recipes to pretrain BERT from a dataset of choice and achieve state-of-the-art accuracy on relevant fine-tuning tasks. It is tested and maintained by NVIDIA.
## Table Of Contents:
* [The model](#the-model)
* [Default configuration](#default-configuration)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick start guide](#quick-start-guide)
* [Details](#details)
* [Command line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Training process](#training-process)
* [Pre-training](#pre-training)
* [Fine tuning](#fine-tuning)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Inference process](#inference-process)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training stability test](#training-stability-test)
* [Training performance results](#training-performance-results)
* [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
* [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g)
* [NVIDIA DGX-2 (16x V100 32G)](#nvidia-dgx-2-16x-v100-32g)
* [Inference performance results](#inference-performance-results)
* [NVIDIA DGX-1 16G (1x V100 16G)](#nvidia-dgx-1-16g-1x-v100-16g)
* [NVIDIA DGX-1 32G (1x V100 32G)](#nvidia-dgx-1-32g-1x-v100-32g)
* [NVIDIA DGX-2 32G (1x V100 32G)](#nvidia-dgx-2-32g-1x-v100-32g)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## The model
BERT, or Bidirectional Encoder Representations from Transformers, is a new method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) paper. NVIDIA's BERT 19.04 is an optimized version of [Google's official implementation](https://github.com/google-research/bert), leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy.
The repository also contains scripts to interactively launch data download, training, benchmarking and inference routines in a Docker container for both pretraining and fine tuning for Question Answering. The major differences between the official implementation of the paper and our version of BERT are as follows:
- [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) previously required two steps:
  1. Porting the model to use the FP16 data type where appropriate.
  2. Manually adding loss scaling to preserve small gradient values.
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP) library from [APEX](https://github.com/NVIDIA/apex), which casts variables to half-precision upon retrieval, while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be easily applied by using the scale_loss() method provided by AMP. The scaling value to be used can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.
For an in-depth walk-through on AMP, check out the sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage tensor core performance. A minimal illustrative sketch of this pattern is included below, after this list.
- Scripts to download dataset for
- Pretraining - [Wikipedia](https://dumps.wikimedia.org/), [BookCorpus](http://yknzhu.wixsite.com/mbweb)
- Fine-tuning - [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) (Stanford Question Answering Dataset), pretrained weights from Google
- Custom fused CUDA kernels for faster computations
- Multi-GPU/Multi-Node support using [APEX DDP](https://github.com/NVIDIA/apex#2-distributed-training)
These techniques and optimizations improve model performance and reduce training time, allowing you to perform various NLP tasks with no additional effort.
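As a rough illustration of the AMP pattern described above, here is a minimal training-step sketch showing AMP initialization and loss scaling. The toy model, dummy data, and the `O1` opt level are assumptions chosen for illustration only, not the settings used by this repository's scripts:
```python
import torch
from apex import amp

# Toy model, optimizer and data purely for illustration; a real network is wrapped the same way.
model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
loss_fn = torch.nn.MSELoss()

# Wrap once at startup; "O1" (mixed precision with dynamic loss scaling) is an assumed
# opt level here, not necessarily the one used by the scripts in this repository.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

for _ in range(10):
    inputs = torch.randn(4, 10).cuda()
    targets = torch.randn(4, 10).cuda()
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    # scale_loss() scales the loss before backward() so small FP16 gradients survive,
    # then the gradients are unscaled before the optimizer step.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
```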
Other publicly available implementations of BERT include:
1. [Hugging Face](https://github.com/huggingface/pytorch-pretrained-BERT)
2. [codertimo](https://github.com/codertimo/BERT-pytorch)
This model trains with mixed precision tensor cores on Volta, therefore researchers can get results much faster than training without tensor cores. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Default configuration
BERT's model architecture is a multi-layer bidirectional Transformer encoder. Based on the model size, we have the following two default configurations of BERT.
| **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feedforward filter size** | **Max sequence length** | **Parameters** |
|:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|
|BERTBASE |12 encoder| 768| 12|4 x 768|512|110M|
|BERTLARGE|24 encoder|1024| 16|4 x 1024|512|330M|
## Setup
The following section lists the requirements needed to start training the BERT model.
### Requirements
This repository contains a `Dockerfile` which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [PyTorch 19.04-py3](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) NGC container
- [NVIDIA Volta based GPU](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
- [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
## Quick start guide
To pretrain or fine tune your model for Question Answering using mixed precision with tensor cores or using FP32, perform the following steps using the default parameters of the BERT model.
### 1. Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/LanguageModeling/BERT
```
### 2. Build the BERT PyTorch NGC container.
```bash
bash scripts/docker/build.sh
```
### 3. Download and preprocess the dataset.
This repository provides scripts to download, verify and extract various datasets:
SQuAD and SWAG for fine-tuning, as well as Wikipedia and BookCorpus for pretraining. If you only want to do fine-tuning, you can also download the pretrained weights.
To download, verify, and extract required datasets:
```bash
bash scripts/data_download.sh
```
The script launches a Docker container with the current directory mounted and downloads the datasets to the `data/` folder on the host.
Datasets can also be mixed before being used for training or inference; see the Mixing datasets section below.
### 4. Start an interactive session in the NGC container to run training/inference.
After you build the container image and download the data, you can start an interactive CLI session as follows:
```bash
bash scripts/docker/launch.sh
```
The `launch.sh` script assumes that the datasets are in the following locations by default after downloading data.
- SQuAD v1.1 - `data/squad/v1.1`
- BERT - `data/pretrained_models_google/uncased_L-24_H-1024_A-16`
- Wikipedia - `data/wikipedia_corpus/hdf5_shards`
- BookCorpus - `data/bookcorpus/hdf5_shards`
### 5. Start pre-training.
BERT is designed to pre-train deep bidirectional language representations. The following scripts replicate pretraining on Wikipedia + BookCorpus from the [paper](https://arxiv.org/pdf/1810.04805.pdf). These scripts are general and can be used for pretraining language representations on any corpus of choice.
From within the container, you can use the following script to run pre-training.
```bash
bash scripts/run_pretraining.sh <train_batch_size_per_gpu> <learning_rate> <precision> <num_gpus> <warmup_proportion> <train_steps> <save_checkpoint_steps> <create_logfile>
```
<!-- For FP16 training with XLA using a DGX-1 V100 32G, run:
```bash
bash scripts/run_pretraining.sh 14 8 5e-5 fp16_xla 8 5000 2285000 5000 true
```
For FP32 training without XLA using a DGX-1 V100 32G, run:
```bash
bash scripts/run_pretraining.sh 6 6 2e-5 fp32 8 2000 5333333 5000 true
``` -->
### 6. Start fine tuning.
The above pretrained BERT representations can be fine-tuned with just one additional output layer for a state-of-the-art Question Answering system. From within the container, you can use the following script to run fine-tuning for SQuAD.
```bash
bash scripts/run_squad.sh <batch_size_per_gpu> <learning_rate_per_gpu> <precision> <num_gpus> <checkpoint> <epochs>
```
For FP32 training using a DGX-1 V100 32G, run:
```bash
bash scripts/run_squad.sh 5 5e-6 fp32 8 /bert/bert_model.ckpt 2
```
### 7. Start validation/evaluation.
The `run_squad_inference.sh` script runs inference on a checkpoint fine-tuned for SQuAD and evaluates the quality of the predictions on the basis of exact match and F1 score.
```bash
bash scripts/run_squad_inference.sh <init_checkpoint> <batch_size> <precision>
```
For FP32 inference without XLA using a DGX-1 V100 32G, run:
```bash
bash scripts/run_squad_inference.sh /results/model.ckpt 8 fp32
```
## Details
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Command line options
To see the full list of available options and their descriptions, use the -h or --help command line option, for example:
```bash
python run_pretraining.py --help
python run_squad.py --help
```
Aside from options to set hyperparameters, the relevant options to control the behaviour of the `run_pretraining.py` script are:
```bash
--[no]amp: Whether to enable AMP ops.(default: 'false')
--[no]amp_fastmath: Whether to enable AMP fastmath ops.(default: 'false')
--bert_config_file: The config json file corresponding to the pre-trained BERT model. This specifies the model architecture.
--[no]do_eval: Whether to run evaluation on the dev set.(default: 'false')
--[no]do_train: Whether to run training.(default: 'false')
--eval_batch_size: Total batch size for eval.(default: '8')(an integer)
--[no]fastmath: Whether to enable loss scaler for fastmath ops.(default: 'false')
--[no]horovod: Whether to use Horovod for multi-gpu runs(default: 'false')
--init_checkpoint: Initial checkpoint (usually from a pre-trained BERT model).
--input_file: Input TF example files (can be a glob or comma separated).
--iterations_per_loop: How many steps to make in each estimator call.(default: '1000')
```
Aside from options to set hyperparameters, some relevant options to control the behaviour of the run_squad.py script are:
```bash
--bert_config_file: The config json file corresponding to the pre-trained BERT model. This specifies the model architecture.
--[no]do_predict: Whether to run evaluation on the dev set. (default: 'false')
--[no]do_train: Whether to run training. (default: 'false')
--learning_rate: The initial learning rate for Adam.(default: '5e-06')(a number)
--max_answer_length: The maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.(default: '30')(an integer)
--max_query_length: The maximum number of tokens for the question. Questions longer than this will be truncated to this length.(default: '64')(an integer)
--max_seq_length: The maximum total input sequence length after WordPiece tokenization. Sequences longer than this will be truncated, and sequences shorter than this will be padded.(default: '384')(an integer)
--predict_batch_size: Total batch size for predictions.(default: '8')(an integer)
--train_batch_size: Total batch size for training.(default: '8')(an integer)
--[no]use_fp16: Whether to use fp32 or fp16 arithmetic on GPU.(default: 'false')
--[no]use_xla: Whether to enable XLA JIT compilation.(default: 'false')
--[no]verbose_logging: If true, all of the warnings related to data processing will be printed. A number of warnings are expected for a normal SQuAD evaluation.(default: 'false')
--[no]version_2_with_negative: If true, the SQuAD examples contain some that do not have an answer.(default: 'false')
```
### Getting the data
For pre-training BERT, we use the concatenation of Wikipedia (2500M words) and BookCorpus (800M words). For Wikipedia, we extract only the text passages from [here](ftp://ftpmirror.your.org/pub/wikimedia/dumps/enwiki/20190301/enwiki-20190301-pages-articles-multistream.xml.bz2) and ignore headers, lists and tables. It is structured as a document-level corpus rather than a shuffled sentence-level corpus because it is critical to extract long contiguous sentences. The next step is to run `create_pretraining_data.py` with the document-level corpus as input, which generates input data and labels for the masked language modeling and next sentence prediction tasks. Pre-training can also be performed on any corpus of your choice. The collection of data generation scripts is intended to be modular to allow modifications for additional preprocessing steps or to use additional data.
#### Mixing datasets
The repository provides tools to mix datasets for both training and finetuning.
In case of training there are two options:
a) inter sequence-pair mixing (after pretraining data is created)
In the `data/` directory, `merge_datasets_after_creation.sh` is a tool to mix data from multiple source corpora. To perform this mixing, the source corpora need to be already in the format of pretraining data, i.e. .hdf5 files. To call the script, use:
```bash
cd data
bash merge_datasets_after_creation.sh <destination_folder> <input_directories> <num_shards>
```
For example, to merge the BookCorpus and Wikipedia corpora provided with this repository and create 1024 new shards containing the mixed training instances, first make sure that `data/bookcorpus/hdf5_shards/` and `data/wikipedia_corpus/hdf5_shards/` exist and are filled with .hdf5 files, then run:
```
cd data
bash merge_datasets_after_creation.sh inter_instance_merged_wiki+books bookcorpus/hdf5_shards/,wikipedia_corpus/hdf5_shards/ 1024
```
b) intra sequence-pair mixing (before pretraining data is created)
In the `data/` directory, `merge_datasets_from_start.sh` is a tool to mix data from multiple source corpora. To perform this mixing, the source corpora must each be condensed into a single file that contains the entire corpus text, with each line within the file corresponding to a document in the corpus. The script is then called as follows:
```bash
cd data
bash merge_datasets_from_start.sh DESTINATION_FOLDER CORPUS_1 CORPUS_2 CORPUS_3 ...
```
For example, to merge the bookcorpus and Wikipedia corpora provided with this repository, first make sure that `data/bookcorpus/intermediate_files/bookcorpus.txt` and `data/wikipedia_corpus/intermediate_files/wikipedia.txt` exist, then run:
```bash
cd data
bash merge_datasets_from_start.sh intra_instance_merged_wiki+books bookcorpus/intermediate_files/bookcorpus.txt wikipedia_corpus/intermediate_files/wikipedia.txt
```
Note that `merge_datasets_from_start.sh` has a few dependencies, so it may be preferable to modify `data_download_helper.sh` to call the merging script and run `data_download.sh` so that the mixing process is done in a container.
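A minimal sketch of that containerized workflow, assuming `data_download_helper.sh` has already been edited to call the merging script as described above:
```bash
cd data
# Runs the download and preprocessing pipeline in a container; the (modified)
# data_download_helper.sh is invoked as part of this flow.
bash data_download.sh
```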
#### Fine Tuning datasets
We can use a pre-trained BERT model for other fine-tuning tasks such as Question Answering. We use SQuAD for this task. SQuAD v1.1 has 100,000+ question-answer pairs on 500+ articles. SQuAD v2.0 combines v1.1 with an additional 50,000 new unanswerable questions, so a model must not only answer questions but also determine when no answer is available.
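The SQuAD data itself is publicly available; as a minimal sketch (the destination directory below is arbitrary), the v1.1 training and dev sets can be fetched directly:
```bash
mkdir -p squad/v1.1
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O squad/v1.1/train-v1.1.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O squad/v1.1/dev-v1.1.json
```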
### Training process
The training process consists of two steps: pre-training and fine tuning.
#### Pre-training
Pre-training is performed using the `run_pretraining.py` script along with parameters defined in the `scripts/run_pretraining.sh`.
The `run_pretraining.sh` script runs a job on a single node that trains the BERT-large model from scratch using the Wikipedia and Book corpus datasets as training data. By default, the training script:
- Runs on 8 GPUs with training batch size of 14 and evaluation batch size of 8 per GPU.
- Has FP16 precision enabled.
- Runs for 1144000 steps with 10000 warm-up steps.
- Saves a checkpoint every 5000 iterations (keeps only the latest checkpoint) and at the end of training. All checkpoints, evaluation results and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
- Creates the log file containing all the output.
- Evaluates the model at the end of training. To skip evaluation, modify `--do_eval` to `False`.
These parameters will train Wikipedia + BooksCorpus to reasonable accuracy on a DGX-1 with 32GB V100 cards. If you want to match Google's best results from the BERT paper, you should either train for twice as many steps (2,288,000 steps) on a DGX-1, or train on 16 GPUs on a DGX-2. The DGX-2, having 16 GPUs, can fit a global batch size twice as large as a DGX-1 (224 vs. 112), and can therefore finish in half as many steps.
For example:
```bash
run_pretraining.sh <training_batch_size> <eval_batch_size> <learning_rate> <precision> <num_gpus> <warmup_steps> <training_steps> <save_checkpoint_steps> <create_logfile>
```
Where:
- <training_batch_size> is the per-GPU batch size used for training. The maximum batch size varies with <precision>; larger batch sizes run more efficiently but require more memory.
- <eval_batch_size> is the per-GPU batch size used for evaluation after training.
- <learning_rate> is the initial learning rate. The default rate of 1e-4 is good for a global batch size of 256.
- <precision> is the type of math used by the model and can be either fp32, fp16, fp16_xla, fastmath, amp_fm, amp_fm_xla, amp, or amp_xla. The options mean:
  - fp32: 32-bit IEEE single precision floats.
  - fp16: Hand-coded mixed precision 16- and 32-bit floats.
  - fp16_xla: Hand-coded mixed precision floats, JIT compiled with XLA.
  - fastmath: Matmuls done by tensor cores in mixed precision, the rest done in FP32.
  - amp_fm: An alternative FastMath implementation that works by manipulating TensorFlow's compute graph.
  - amp_fm_xla: The amp_fm flag plus XLA JIT compilation.
  - amp: Automatic rewrite of the TensorFlow compute graph to take advantage of 16-bit arithmetic whenever that is safe.
  - amp_xla: The amp flag plus XLA JIT compilation.
- <num_gpus> Number of GPUs to use for training. Must be equal to or smaller than the number of GPUs attached to your node.
- <warmup_steps> Number of warm-up steps at the start of training.
- <training_steps> Total number of training steps.
- <save_checkpoint_steps> Controls how often checkpoints are saved. Default is 5000 steps.
- <create_logfile> Flag indicating whether output should be written to a logfile (acceptable values are true or false; true indicates output should be saved to a logfile).
For example:
```bash
bert_tf/scripts/run_pretraining.sh 14 8 1e-4 fp16_xla 16 10000 1144000 5000 true
```
Trains BERT-large from scratch on a single DGX-2 using FP16 arithmetic. This will take around 156 hours / 6.5 days. Checkpoints are written out every 5000 steps and all printouts are saved to a logfile.
#### Fine tuning
Fine tuning is performed using the `run_squad.py` script along with parameters defined in `scripts/run_squad.sh`.
The `run_squad.sh` script trains a model and performs evaluation on the SQuAD v1.1 dataset. By default, the training script:
- Uses 8 GPUs with a batch size of 10 per GPU.
- Has FP16 precision enabled.
- Has XLA enabled.
- Runs for 2 epochs.
- Saves a checkpoint every 1000 iterations (keeps only the latest checkpoint) and at the end of training. All checkpoints, evaluation results and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
- Evaluation is done at the end of training. To skip evaluation, modify `--do_predict` to `False`.
This script outputs checkpoints to the `/results` directory, by default, inside the container. Mount point of `/results` can be changed in the `scripts/docker/launch.sh` file. The training log contains information about:
- Loss for the final step
- Training and evaluation performance
- F1 and exact match score on the Dev Set of SQuAD after evaluation.
The summary after training is printed in the following format:
```bash
I0312 23:10:45.137036 140287431493376 run_squad.py:1332] 0 Total Training Time = 3007.00 Training Time W/O start up overhead = 2855.92 Sentences processed = 175176
I0312 23:10:45.137243 140287431493376 run_squad.py:1333] 0 Training Performance = 61.3378 sentences/sec
I0312 23:14:00.550846 140287431493376 run_squad.py:1396] 0 Total Inference Time = 145.46 Inference Time W/O start up overhead = 131.86 Sentences processed = 10840
I0312 23:14:00.550973 140287431493376 run_squad.py:1397] 0 Inference Performance = 82.2095 sentences/sec
{"exact_match": 83.69914853358561, "f1": 90.8477003317459}
```
Multi-gpu training is enabled with the Horovod TensorFlow module. The following example runs training on 8 GPUs:
```bash
mpirun -np 8 -H localhost:8 \
    --allow-run-as-root -bind-to none -map-by slot \
    -x NCCL_DEBUG=INFO \
    -x LD_LIBRARY_PATH \
    -x PATH -mca pml ob1 -mca btl ^openib \
    python run_squad.py --horovod
```
### Enabling mixed precision
[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.
This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code. AMP enables mixed precision training on Volta and Turing GPUs automatically. The TensorFlow framework code makes all necessary model changes internally.
In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing `tf.contrib` loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling.
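As a minimal sketch (assuming the NGC TensorFlow container, where TF-AMP is controlled through environment variables), mixed precision can be enabled without any code changes:
```bash
# Enable the automatic mixed precision graph rewrite together with automatic loss scaling
export TF_ENABLE_AUTO_MIXED_PRECISION=1

# Alternatively, apply only the graph rewrite and keep an existing loss scaling manager,
# as described above (variable name as exposed in the NGC container):
# export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=1
```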
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
### Inference process
Inference on a fine-tuned Question Answering system is performed using the `run_squad.py` script along with parameters defined in `scripts/run_squad_inference.sh`. Inference is currently supported on a single GPU.
The `run_squad_inference.sh` script runs inference with a fine-tuned model and performs evaluation on the SQuAD v1.1 dataset. By default, the inferencing script:
- Has FP16 precision enabled
- Is XLA enabled
- Evaluates the latest checkpoint present in `/results` with a batch size of 8
This script outputs the predictions file to `/results/predictions.json` and computes the F1 and exact match scores using SQuAD's `evaluate-v1.1.py`. The mount point of `/results` can be changed in the `scripts/docker/launch.sh` file.
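If needed, the same scores can be reproduced manually with the official SQuAD evaluation script (the dev set path below is a placeholder):
```bash
python evaluate-v1.1.py /path/to/dev-v1.1.json /results/predictions.json
```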
The output log contains information about:
- Evaluation performance
- F1 and exact match score on the Dev Set of SQuAD after evaluation.
The summary after inference is printed in the following format:
```bash
I0312 23:14:00.550846 140287431493376 run_squad.py:1396] 0 Total Inference Time = 145.46 Inference Time W/O start up overhead = 131.86 Sentences processed = 10840
I0312 23:14:00.550973 140287431493376 run_squad.py:1397] 0 Inference Performance = 82.2095 sentences/sec
{"exact_match": 83.69914853358561, "f1": 90.8477003317459}
```
## Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
Benchmarking can be performed for both training and inference. Both scripts run the BERT model for fine tuning. You can choose whether benchmarking is performed in FP16 or FP32 by passing it as an argument to the benchmarking scripts.
Both of these benchmarking scripts enable you to run a number of epochs and extract performance numbers.
### Training performance benchmark
Training benchmarking can be performed by running the script:
```bash
scripts/finetune_train_benchmark.sh squad <fp16/fp32> <use_xla> <num_gpu> <batch_size/gpu> <lr>
```
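For example, a benchmark run with the placeholders filled in (the values are illustrative only, not recommended settings) might look like:
```bash
bash scripts/finetune_train_benchmark.sh squad fp16 true 8 4 5e-6
```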
### Inference performance benchmark
Inference benchmarking can be performed by running the script:
```bash
scripts/finetune_inference_benchmark.sh squad <fp16/fp32> <use_xla> <batch_size> <path-to-checkpoint>
```
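Similarly, an illustrative invocation (the checkpoint path is a placeholder) is:
```bash
bash scripts/finetune_inference_benchmark.sh squad fp16 true 8 /results/model.ckpt
```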
## Results
The following sections provide details on how we achieved our performance and accuracy in training and inference for Question Answering fine tuning.
### Training accuracy results
Our results were obtained by running the `run_squad.py` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs.
| **Number of GPUs** | **Batch size per GPU** | **Training time with FP16 (Hrs)** | **Training time with FP32 (Hrs)** |
|:---:|:---:|:----:|:----:|
| 8 | 4 |||
#### Training stability test
The following tables compare `F1` scores across 5 different training runs with different seeds, for both FP16 and FP32 respectively. The runs showcase consistent convergence on all 5 seeds with very little deviation.
| **FP16, 8x GPUs** | **seed #1** | **seed #2** | **seed #3** | **seed #4** | **seed #5** | **mean** | **std** |
|:-----------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|
|F1 ||
|Exact match||
| **FP32, 8x GPUs** | **seed #1** | **seed #2** | **seed #3** | **seed #4** | **seed #5** | **mean** | **std** |
|:-----------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|
|F1 | |
|Exact match| |
### Training performance results
Our results were obtained with mixed precision, running per-GPU batch sizes of up to 3 on a 16GB V100 and up to 10 on a 32GB V100.
#### NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the `scripts/run_pretraining.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|:---:|:---:|:------:|:-----:|:----:|:----:|:----:|
| 1 | 2 | 5.48 |18.97|3.46 |1.0 |1.0 |
| 4 | 2 |19.6|60.6|3.09|3.57 |3.2|
| 8 | 2 |39.21 |121.21|3.09|7.15|6.38|
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|:---:|:---:|:-----:|:-----:|:---:|:---:|:----:|
| 1 | 4 | - |19.46| - | - |1.0 |
| 4 | 4 | - |75.67| - | - |3.88|
| 8 | 4 | - |151.35| - | - |7.77 |
Note: The respective values for FP32 runs that use a batch size of 4 are not available due to out-of-memory errors. A batch size of 4 is only available when using FP16.
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by running the `scripts/run_pretraining.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|---|---|-----|-----|----|----|----|
| 1 | 7 | 7.56|24.29|3.21|1.0 |1.0 |
| 4 | 7 |28.84|86.24|2.99|3.81|3.55|
| 8 | 7 |57.68|172.48|2.99|7.62|7.10|
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|---|---|-----|-------|---|---|----|
| 1 | 14| - | 26.04 | - | - |1.0 |
| 4 | 14| - | 99.68| - | - |3.87|
| 8 | 14| - |199.35 | - | - |7.65 |
Note: The respective values for FP32 runs that use a batch size of 14 are not available due to out-of-memory errors. A batch size of 14 is only available when using FP16.
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by running the `scripts/run_pretraining.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|---|---|------|------|----|-----|----|
| 1| 7 | 8.47| 26.04|3.07| 1.0 |1.0 |
| 4| 7 | 32.2 | 92.68|2.87| 3.8|3.80|
| 8| 7 | 63.84|183.68|2.87| 7.53|7.05|
| 16| 7 |126.56|365.12|2.87|14.94|14.02|
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|---|---|---|------|---|---|----|
| 1| 14| - | 28.28| - | - |1.0 |
| 4| 14| - | 103.6| - | - |3.66|
| 8| 14| - |208.32| - | - |7.36|
| 16| 14| - |416.64| - | - |14.73|
Note: The respective values for FP32 runs that use a batch size of 14 are not available due to out-of-memory errors. A batch size of 14 is only available when using FP16.
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
### Inference performance results
#### NVIDIA DGX-1 16G (1x V100 16G)
Our results were obtained by running the `scripts/run_squad_inference.sh` inference script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 1x V100 16G GPU. Performance numbers (in sentences per second) were averaged over the entire inference run.
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
|---|---|-----|------|----|
| 1 | 8 ||
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-1 32G (1x V100 32G)
Our results were obtained by running the `scripts/run_squad_inference.sh` inference script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 1x V100 32G GPU. Performance numbers (in sentences per second) were averaged over the entire inference run.
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
|---|---|-----|------|----|
| 1 | 8 ||
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-2 32G (1x V100 32G)
Our results were obtained by running the `scripts/run_squad_inference.sh` inference script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-2 with 1x V100 32G GPU. Performance numbers (in sentences per second) were averaged over the entire inference run.
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
|---|---|-----|------|----|
| 1 | 8 ||
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
## Changelog
March 2019
- Initial release
## Known issues
There are no known issues with this model.

View File

@ -0,0 +1,13 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"max_position_embeddings": 512,
"num_attention_heads": 16,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 30522
}

View File

@ -0,0 +1,472 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import os
import random
from io import open
import h5py
import numpy as np
from tqdm import tqdm, trange
from tokenization import BertTokenizer
import tokenization as tokenization
import random
import collections
class TrainingInstance(object):
"""A single training instance (sentence pair)."""
def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
is_random_next):
self.tokens = tokens
self.segment_ids = segment_ids
self.is_random_next = is_random_next
self.masked_lm_positions = masked_lm_positions
self.masked_lm_labels = masked_lm_labels
def __str__(self):
s = ""
s += "tokens: %s\n" % (" ".join(
[tokenization.printable_text(x) for x in self.tokens]))
s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
s += "is_random_next: %s\n" % self.is_random_next
s += "masked_lm_positions: %s\n" % (" ".join(
[str(x) for x in self.masked_lm_positions]))
s += "masked_lm_labels: %s\n" % (" ".join(
[tokenization.printable_text(x) for x in self.masked_lm_labels]))
s += "\n"
return s
def __repr__(self):
return self.__str__()
def write_instance_to_example_file(instances, tokenizer, max_seq_length,
max_predictions_per_seq, output_file):
"""Create TF example files from `TrainingInstance`s."""
total_written = 0
features = collections.OrderedDict()
num_instances = len(instances)
features["input_ids"] = np.zeros([num_instances, max_seq_length], dtype="int32")
features["input_mask"] = np.zeros([num_instances, max_seq_length], dtype="int32")
features["segment_ids"] = np.zeros([num_instances, max_seq_length], dtype="int32")
features["masked_lm_positions"] = np.zeros([num_instances, max_predictions_per_seq], dtype="int32")
features["masked_lm_ids"] = np.zeros([num_instances, max_predictions_per_seq], dtype="int32")
features["next_sentence_labels"] = np.zeros(num_instances, dtype="int32")
for inst_index, instance in enumerate(tqdm(instances)):
input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
input_mask = [1] * len(input_ids)
segment_ids = list(instance.segment_ids)
assert len(input_ids) <= max_seq_length
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
masked_lm_positions = list(instance.masked_lm_positions)
masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
masked_lm_weights = [1.0] * len(masked_lm_ids)
while len(masked_lm_positions) < max_predictions_per_seq:
masked_lm_positions.append(0)
masked_lm_ids.append(0)
masked_lm_weights.append(0.0)
next_sentence_label = 1 if instance.is_random_next else 0
features["input_ids"][inst_index] = input_ids
features["input_mask"][inst_index] = input_mask
features["segment_ids"][inst_index] = segment_ids
features["masked_lm_positions"][inst_index] = masked_lm_positions
features["masked_lm_ids"][inst_index] = masked_lm_ids
features["next_sentence_labels"][inst_index] = next_sentence_label
total_written += 1
# if inst_index < 20:
# tf.logging.info("*** Example ***")
# tf.logging.info("tokens: %s" % " ".join(
# [tokenization.printable_text(x) for x in instance.tokens]))
# for feature_name in features.keys():
# feature = features[feature_name]
# values = []
# if feature.int64_list.value:
# values = feature.int64_list.value
# elif feature.float_list.value:
# values = feature.float_list.value
# tf.logging.info(
# "%s: %s" % (feature_name, " ".join([str(x) for x in values])))
print("saving data")
f= h5py.File(output_file, 'w')
f.create_dataset("input_ids", data=features["input_ids"], dtype='i4', compression='gzip')
f.create_dataset("input_mask", data=features["input_mask"], dtype='i1', compression='gzip')
f.create_dataset("segment_ids", data=features["segment_ids"], dtype='i1', compression='gzip')
f.create_dataset("masked_lm_positions", data=features["masked_lm_positions"], dtype='i4', compression='gzip')
f.create_dataset("masked_lm_ids", data=features["masked_lm_ids"], dtype='i4', compression='gzip')
f.create_dataset("next_sentence_labels", data=features["next_sentence_labels"], dtype='i1', compression='gzip')
f.flush()
f.close()
def create_training_instances(input_files, tokenizer, max_seq_length,
dupe_factor, short_seq_prob, masked_lm_prob,
max_predictions_per_seq, rng):
"""Create `TrainingInstance`s from raw text."""
all_documents = [[]]
# Input file format:
# (1) One sentence per line. These should ideally be actual sentences, not
# entire paragraphs or arbitrary spans of text. (Because we use the
# sentence boundaries for the "next sentence prediction" task).
# (2) Blank lines between documents. Document boundaries are needed so
# that the "next sentence prediction" task doesn't span between documents.
for input_file in input_files:
print("creating instance from {}".format(input_file))
with open(input_file, "r") as reader:
while True:
line = tokenization.convert_to_unicode(reader.readline())
if not line:
break
line = line.strip()
# Empty lines are used as document delimiters
if not line:
all_documents.append([])
tokens = tokenizer.tokenize(line)
if tokens:
all_documents[-1].append(tokens)
# Remove empty documents
all_documents = [x for x in all_documents if x]
rng.shuffle(all_documents)
vocab_words = list(tokenizer.vocab.keys())
instances = []
for _ in range(dupe_factor):
for document_index in range(len(all_documents)):
instances.extend(
create_instances_from_document(
all_documents, document_index, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
rng.shuffle(instances)
return instances
def create_instances_from_document(
all_documents, document_index, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
"""Creates `TrainingInstance`s for a single document."""
document = all_documents[document_index]
# Account for [CLS], [SEP], [SEP]
max_num_tokens = max_seq_length - 3
# We *usually* want to fill up the entire sequence since we are padding
# to `max_seq_length` anyways, so short sequences are generally wasted
# computation. However, we *sometimes*
# (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
# sequences to minimize the mismatch between pre-training and fine-tuning.
# The `target_seq_length` is just a rough target however, whereas
# `max_seq_length` is a hard limit.
target_seq_length = max_num_tokens
if rng.random() < short_seq_prob:
target_seq_length = rng.randint(2, max_num_tokens)
# We DON'T just concatenate all of the tokens from a document into a long
# sequence and choose an arbitrary split point because this would make the
# next sentence prediction task too easy. Instead, we split the input into
# segments "A" and "B" based on the actual "sentences" provided by the user
# input.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = rng.randint(1, len(current_chunk) - 1)
tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])
tokens_b = []
# Random next
is_random_next = False
if len(current_chunk) == 1 or rng.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
# This should rarely go for more than one iteration for large
# corpora. However, just to be careful, we try to make sure that
# the random document is not the same as the document
# we're processing.
for _ in range(10):
random_document_index = rng.randint(0, len(all_documents) - 1)
if random_document_index != document_index:
break
random_document = all_documents[random_document_index]
random_start = rng.randint(0, len(random_document) - 1)
for j in range(random_start, len(random_document)):
tokens_b.extend(random_document[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
(tokens, masked_lm_positions,
masked_lm_labels) = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
instance = TrainingInstance(
tokens=tokens,
segment_ids=segment_ids,
is_random_next=is_random_next,
masked_lm_positions=masked_lm_positions,
masked_lm_labels=masked_lm_labels)
instances.append(instance)
current_chunk = []
current_length = 0
i += 1
return instances
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
["index", "label"])
def create_masked_lm_predictions(tokens, masked_lm_prob,
max_predictions_per_seq, vocab_words, rng):
"""Creates the predictions for the masked LM objective."""
cand_indexes = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
cand_indexes.append(i)
rng.shuffle(cand_indexes)
output_tokens = list(tokens)
num_to_predict = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
masked_lms = []
covered_indexes = set()
for index in cand_indexes:
if len(masked_lms) >= num_to_predict:
break
if index in covered_indexes:
continue
covered_indexes.add(index)
masked_token = None
# 80% of the time, replace with [MASK]
if rng.random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep original
if rng.random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
else:
masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
output_tokens[index] = masked_token
masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
masked_lms = sorted(masked_lms, key=lambda x: x.index)
masked_lm_positions = []
masked_lm_labels = []
for p in masked_lms:
masked_lm_positions.append(p.index)
masked_lm_labels.append(p.label)
return (output_tokens, masked_lm_positions, masked_lm_labels)
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
"""Truncates a pair of sequences to a maximum sequence length."""
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1
# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if rng.random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--vocab_file",
default=None,
type=str,
required=True,
help="The vocabulary the BERT model will train on.")
parser.add_argument("--input_file",
default=None,
type=str,
required=True,
help="The input train corpus. can be directory with .txt files or a path to a single file")
parser.add_argument("--output_file",
default=None,
type=str,
required=True,
help="The output file where the model checkpoints will be written.")
## Other parameters
# str
parser.add_argument("--bert_model", default="bert-large-uncased", type=str, required=False,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
#int
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--dupe_factor",
default=10,
type=int,
help="Number of times to duplicate the input data (with different masks).")
parser.add_argument("--max_predictions_per_seq",
default=20,
type=int,
help="Maximum number of masked LM predictions per sequence.")
# floats
parser.add_argument("--masked_lm_prob",
default=0.15,
type=float,
help="Masked LM probability.")
parser.add_argument("--short_seq_prob",
default=0.1,
type=float,
help="Probability to create a sequence shorter than maximum sequence length")
parser.add_argument("--do_lower_case",
action='store_true',
default=True,
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument('--random_seed',
type=int,
default=12345,
help="random seed for initialization")
args = parser.parse_args()
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
input_files = []
if os.path.isfile(args.input_file):
input_files.append(args.input_file)
elif os.path.isdir(args.input_file):
input_files = [os.path.join(args.input_file, f) for f in os.listdir(args.input_file) if (os.path.isfile(os.path.join(args.input_file, f)) and f.endswith('.txt') )]
else:
raise ValueError("{} is not a valid path".format(args.input_file))
rng = random.Random(args.random_seed)
instances = create_training_instances(
input_files, tokenizer, args.max_seq_length, args.dupe_factor,
args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq,
rng)
output_file = args.output_file
write_instance_to_example_file(instances, tokenizer, args.max_seq_length,
args.max_predictions_per_seq, output_file)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,30 @@
Steps to reproduce datasets from web
1) Build the container
* docker build -t bert_prep .
2) Run the container interactively
* nvidia-docker run -it --ipc=host bert_prep
* Optional: Mount data volumes
* -v yourpath:/workspace/bert/data/wikipedia_corpus/download
* -v yourpath:/workspace/bert/data/wikipedia_corpus/extracted_articles
* -v yourpath:/workspace/bert/data/wikipedia_corpus/raw_data
* -v yourpath:/workspace/bert/data/wikipedia_corpus/intermediate_files
* -v yourpath:/workspace/bert/data/wikipedia_corpus/final_text_file_single
* -v yourpath:/workspace/bert/data/wikipedia_corpus/final_text_files_sharded
* -v yourpath:/workspace/bert/data/wikipedia_corpus/final_tfrecords_sharded
* -v yourpath:/workspace/bert/data/bookcorpus/download
* -v yourpath:/workspace/bert/data/bookcorpus/final_text_file_single
* -v yourpath:/workspace/bert/data/bookcorpus/final_text_files_sharded
* -v yourpath:/workspace/bert/data/bookcorpus/final_tfrecords_sharded
* Optional: Select visible GPUs
* -e CUDA_VISIBLE_DEVICES=0
** Inside of the container starting here**
3) Download pretrained weights (they contain vocab files for preprocessing)
* cd data/pretrained_models_google && python3 download_models.py
4) "One-click" Wikipedia data download and prep (provides tfrecords)
* Set your configuration in data/wikipedia_corpus/config.sh
* cd /data/wikipedia_corpus && ./run_preprocessing.sh
5) "One-click" BookCorpus data download and prep (provided tfrecords)
* Set your configuration in data/wikipedia_corpus/config.sh
* cd /data/bookcorpus && ./run_preprocessing.sh

View File

@ -0,0 +1,23 @@
# NVIDIA
import glob
import os
import argparse
parser = argparse.ArgumentParser(description='Cleaning and merge downloaded bookcorpus files')
parser.add_argument('download_path', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
download_path = args.download_path
output_file = args.output_file
with open(output_file, "w") as ofile:
for filename in glob.glob('{}/*.txt'.format(download_path), recursive=True):
with open(filename, mode='r', encoding="utf-8-sig") as file:
for line in file:
if line.strip() != "":
ofile.write(line.strip() + " ")
ofile.write("\n\n")

View File

@ -0,0 +1,9 @@
#! /bin/bash
# Download books
mkdir -p ./download
python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out ./download --trash-bad-count
# Clean and prep (one book per line)
python3 ./clean_and_merge_text.py ./download bookcorpus.txt

View File

@ -0,0 +1,38 @@
#!/bin/bash
# Note: There are several directories created to make it clear what has been performed at each stage of preprocessing. The intermediate files may be useful if you want to further clean/prepare/augment the data for your own applications.
# NLTK was chosen as the default over spaCy simply due to speed of sentence segmentation on the large files.
MERGED_DIR=$1
args="${*:2}"
source utils/config.sh
mkdir -p ${MERGED_DIR}
corpus_file=${MERGED_DIR}/corpus.txt
## Shuffle the full corpus texts
if [ ! -z $3 ]
then
echo "Merging $args"
cat $args | sed "/^$/d" | shuf > $corpus_file
else
corpus_file=$2
fi
# Split articles into one-sentence-per-line format for use with BERT scripts
echo "Applying sentence segmentation to get one sentence per line"
mkdir -p ${MERGED_DIR}/final_text_file_single
python3 utils/sentence_segmentation_nltk.py $corpus_file ${MERGED_DIR}/final_text_file_single/corpus.segmented.nltk.txt
## Shard finalized text so that it has a chance of fitting in memory when creating pretraining data into hdf5 (choose appropriate number of shards for distributed training)
echo "Shard text files - size is approximate to prevent splitting an article across shards"
mkdir -p ${MERGED_DIR}/final_text_files_sharded
python3 utils/shard_text_input_file.py ${MERGED_DIR}/final_text_file_single/corpus.segmented.nltk.txt ${MERGED_DIR}/final_text_files_sharded/corpus.segmented.part.
# Convert sharded text files into hdf5 that are ready for BERT pretraining
echo "Creating hdf5 for each text shard"
mkdir -p ${MERGED_DIR}/hdf5_shards
export TARGET_DIR=${MERGED_DIR}
. utils/preprocessing_xargs_wrapper.sh ${N_PROCS_PREPROCESS}

View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
echo "Downloading MRPC data"
wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
python download_glue_data.py --data_dir . --tasks MRPC

View File

@ -0,0 +1,29 @@
#!/bin/bash
MERGED_DIR=$1 # e.g wikipedia+bookcorpus
INPUTFILES=$2 # directories with hdf5 files separated by comma
NUM_SHARDS=$3
source utils/config.sh
META_DIR=$MERGED_DIR/meta
mkdir -p ${MERGED_DIR}
mkdir -p ${META_DIR}
echo "create mixed dataset ids"
echo "python utils/create_mixed_dataset_ids.py --input_files=${INPUTFILES} --num_output_shards=${NUM_SHARDS} --output_dir=${META_DIR} --random_seed=${SEED}"
python utils/create_mixed_dataset_ids.py --input_files=${INPUTFILES} --num_output_shards=${NUM_SHARDS} --output_dir=${META_DIR} --random_seed=${SEED}
echo "Creating hdf5 for each text shard"
mkdir -p ${MERGED_DIR}/hdf5_shards
echo "create mixed datasets with hdf5 files"
echo "python utils/create_mixed_dataset.py --input_files=${INPUTFILES} --output_dir=${MERGED_DIR}/hdf5_shards --lookup=${META_DIR}/lookup_table.pkl --indices_dir=${META_DIR} --index_range=0-${NUM_SHARDS} --random_seed=${SEED}"
python utils/create_mixed_dataset.py --input_files=${INPUTFILES} --output_dir=${MERGED_DIR}/hdf5_shards --lookup=${META_DIR}/lookup_table.pkl --indices_dir=${META_DIR} --index_range=0-$((NUM_SHARDS-1)) --random_seed=${SEED}
rm -rf ${META_DIR}

View File

@ -0,0 +1,60 @@
#!/usr/bin/env bash
echo "Downloading dataset for squad..."
# Download SQuAD
v1="v1.1"
mkdir $v1
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $v1/train-v1.1.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $v1/dev-v1.1.json
wget https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/ -O $v1/evaluate-v1.1.py
EXP_TRAIN_v1='981b29407e0affa3b1b156f72073b945 -'
EXP_DEV_v1='3e85deb501d4e538b6bc56f786231552 -'
EXP_EVAL_v1='afb04912d18ff20696f7f88eed49bea9 -'
CALC_TRAIN_v1=`cat ${v1}/train-v1.1.json |md5sum`
CALC_DEV_v1=`cat ${v1}/dev-v1.1.json |md5sum`
CALC_EVAL_v1=`cat ${v1}/evaluate-v1.1.py |md5sum`
v2="v2.0"
mkdir $v2
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O $v2/train-v2.0.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O $v2/dev-v2.0.json
wget https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -O $v2/evaluate-v2.0.py
EXP_TRAIN_v2='62108c273c268d70893182d5cf8df740 -'
EXP_DEV_v2='246adae8b7002f8679c027697b0b7cf8 -'
EXP_EVAL_v2='ff23213bed5516ea4a6d9edb6cd7d627 -'
CALC_TRAIN_v2=`cat ${v2}/train-v2.0.json |md5sum`
CALC_DEV_v2=`cat ${v2}/dev-v2.0.json |md5sum`
CALC_EVAL_v2=`cat ${v2}/evaluate-v2.0.py |md5sum`
echo "Squad data download done!"
echo "Verifying Dataset...."
if [ "$EXP_TRAIN_v1" != "$CALC_TRAIN_v1" ]; then
echo "train-v1.1.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_DEV_v1" != "$CALC_DEV_v1" ]; then
echo "dev-v1.1.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_EVAL_v1" != "$CALC_EVAL_v1" ]; then
echo "evaluate-v1.1.py is corrupted! md5sum doesn't match"
fi
if [ "$EXP_TRAIN_v2" != "$CALC_TRAIN_v2" ]; then
echo "train-v2.0.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_DEV_v2" != "$CALC_DEV_v2" ]; then
echo "dev-v2.0.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_EVAL_v2" != "$CALC_EVAL_v2" ]; then
echo "evaluate-v2.0.py is corrupted! md5sum doesn't match"
fi
echo "Complete!"

View File

@ -0,0 +1,24 @@
#! /bin/bash
set -e
USE_BERT_LARGE=true
MAX_SEQUENCE_LENGTH=512
MAX_PREDICTIONS_PER_SEQUENCE=80
MASKED_LM_PROB=0.15
SEED=12345
DUPE_FACTOR=5
DO_LOWER_CASE="True"
N_LINES_PER_SHARD_APPROX=396000 # Default=396000 creates 256 shards
N_PROCS_PREPROCESS=4 # Adjust this based on memory requirements and available number of cores
BERT_BASE_DIR="/workspace/bert/vocab/uncased_L-12_H-768_A-12"
BERT_LARGE_DIR="/workspace/bert/vocab/uncased_L-24_H-1024_A-16"
if [ "$USE_BERT_LARGE" = true ] ; then
VOCAB_FILE="${BERT_LARGE_DIR}/vocab.txt"
else
VOCAB_FILE="${BERT_BASE_DIR}/vocab.txt"
fi

View File

@ -0,0 +1,160 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import os
import random
from io import open
import h5py
import numpy as np
from tqdm import tqdm, trange
import random
import collections
import math
import multiprocessing as mp
"""
mixing hdf5 shards with each other
"""
def shard_files(output_files, l_instance_ids, lookuptable, files):
l_input_ids = []
l_input_masks = []
l_segment_ids = []
l_masked_lm_positions = []
l_masked_lm_ids = []
l_next_sentence_labels = []
seq_len = 0
pred_len = 0
with h5py.File(files[0], 'r') as f:
seq_len = f['input_ids'].shape[1]
pred_len = f['masked_lm_positions'].shape[1]
assert(seq_len > 0 and pred_len > 0)
for i, output_file in enumerate(output_files):
output_length = len(l_instance_ids[i])
print("preparing to write {} instances to {}".format(output_length, output_file))
input_ids = np.ones([output_length, seq_len], dtype=np.int32)
input_masks = np.ones([output_length, seq_len], dtype=np.int8)
segment_ids = np.ones([output_length, seq_len], dtype=np.int8)
masked_lm_positions = np.ones([output_length, pred_len], dtype=np.int32)
masked_lm_ids= np.ones([output_length, pred_len], dtype=np.int32)
next_sentence_labels = np.ones(output_length, dtype=np.int8)
l_input_ids.append(input_ids)
l_input_masks.append(input_masks)
l_segment_ids.append(segment_ids)
l_masked_lm_positions.append(masked_lm_positions)
l_masked_lm_ids.append(masked_lm_ids)
l_next_sentence_labels.append(next_sentence_labels)
for did, f in enumerate(tqdm(files)):
h5_f = h5py.File(f, 'r')
f_input_ids = h5_f['input_ids'][:]
f_input_masks = h5_f['input_mask'][:]
f_segment_ids = h5_f['segment_ids'][:]
f_masked_lm_positions = h5_f['masked_lm_positions'][:]
f_masked_lm_ids = h5_f['masked_lm_ids'][:]
f_next_sentence_labels = h5_f['next_sentence_labels'][:]
h5_f.close()
for out_i, out_file in enumerate(output_files):
instance_ids = l_instance_ids[out_i]
for l, idx in enumerate(instance_ids):
doc_id, line_id = lookuptable[idx]
if doc_id == did:
l_input_ids[out_i][l] = f_input_ids[line_id]
l_input_masks[out_i][l] = f_input_masks[line_id]
l_segment_ids[out_i][l] = f_segment_ids[line_id]
l_masked_lm_positions[out_i][l] = f_masked_lm_positions[line_id]
l_masked_lm_ids[out_i][l] = f_masked_lm_ids[line_id]
l_next_sentence_labels[out_i][l] = f_next_sentence_labels[line_id]
for out_i, out_file in enumerate(output_files):
output_length = len(l_input_ids[out_i])
print("writing {} instances to {}".format(output_length, out_file))
with h5py.File(out_file, 'w') as f:
f.create_dataset("input_ids", data=l_input_ids[out_i], dtype='i4', compression='gzip')
f.create_dataset("input_mask", data=l_input_masks[out_i], dtype='i1', compression='gzip')
f.create_dataset("segment_ids", data=l_segment_ids[out_i], dtype='i1', compression='gzip')
f.create_dataset("masked_lm_positions", data=l_masked_lm_positions[out_i], dtype='i4', compression='gzip')
f.create_dataset("masked_lm_ids", data=l_masked_lm_ids[out_i], dtype='i4', compression='gzip')
f.create_dataset("next_sentence_labels", data=l_next_sentence_labels[out_i], dtype='i1', compression='gzip')
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_files",
default=None,
type=str,
required=True,
help="comma seperated list of file paths, each path can be either file or directory of files")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="directory for output shards")
parser.add_argument("--lookup",
default=None,
type=str,
required=True,
help="path to lookup table")
parser.add_argument("--indices_dir",
default=None,
type=str,
required=True,
help="path to shuffled instance indices")
parser.add_argument("--index_range",
default=None,
type=str,
required=True,
help="index range of output files to be written out, e.g specify '0-100' for writing out 0.hdf5 , ..., 100.hdf5")
parser.add_argument('--random_seed',
type=int,
default=12345,
help="random seed for initialization")
args = parser.parse_args()
rng = random.Random(args.random_seed)
np.random.seed(args.random_seed)
input_paths = args.input_files.strip().split(',')
input_paths = [f for f in input_paths if f]
input_files = []
for path in input_paths:
if os.path.isfile(path):
assert (path.endswith('.hdf5')), "file must be hdf5 file"
input_files.append(path)
else:
assert os.path.isdir(path)
hdf5_files = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith('.hdf5')]
input_files.extend(hdf5_files)
input_files.sort()
assert(os.path.isdir(args.output_dir))
print("loading indices file")
start_idx, end_idx= int(args.index_range.split('-')[0]), int(args.index_range.split('-')[1])
index_files = []
instance_ids = []
for i in range(start_idx, end_idx + 1):
index_files.append(os.path.join(args.indices_dir, "indices_" + str(i) + ".npy"))
instance_ids.append( np.load(index_files[-1]))
output_files = [os.path.join(args.output_dir, indices_file.split('.')[0].split('_')[-1] + ".hdf5") for indices_file in index_files]
print("output_files", output_files)
print("loading lookup table")
lookup_table = np.load(args.lookup)
shard_files(output_files, instance_ids, lookup_table, input_files)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,134 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import os
import random
from io import open
import h5py
import numpy as np
from tqdm import tqdm, trange
import random
import collections
import math
from tqdm import tqdm
import multiprocessing as mp
import pickle
import json
"""
mixing hdf5 shards with each other
"""
def load_and_prepare(input_files, num_shards):
seq_len = None
pred_len = None
input_lengths = []
for input_file in input_files:
with h5py.File(input_file, 'r') as f:
input_lengths.append(len(f['input_ids']))
if seq_len is None:
seq_len = f['input_ids'].shape[1]
pred_len = f['masked_lm_ids'].shape[1]
assert (isinstance(seq_len, int) and isinstance(pred_len, int))
total_instances = sum(input_lengths)
n_inst_per_file = math.ceil(total_instances * 1.0 / num_shards)
permutation = np.random.permutation(total_instances)
instance_indices = []
for i in range(0, num_shards):
start_pos = i * n_inst_per_file
end_pos = min((i+1) * n_inst_per_file, total_instances)
instance_indices.append(permutation[start_pos:end_pos])
return seq_len, pred_len, input_lengths, instance_indices
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_files",
default=None,
type=str,
required=True,
help="comma seperated list of file paths, each path can be either file or directory of hdf5 files")
parser.add_argument("--num_output_shards",
default=None,
type=int,
required=True,
help="number of shards to be created. shards will be created as even as possible.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="directory for meta files")
parser.add_argument('--random_seed',
type=int,
default=12345,
help="random seed for initialization")
args = parser.parse_args()
rng = random.Random(args.random_seed)
np.random.seed(args.random_seed)
input_paths = args.input_files.strip().split(',')
input_paths = [f for f in input_paths if f]
input_files = []
for path in input_paths:
if os.path.isfile(path):
assert (path.endswith('.hdf5')), "file must be hdf5 file"
input_files.append(path)
else:
assert os.path.isdir(path)
hdf5_files = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith('.hdf5')]
input_files.extend(hdf5_files)
input_files.sort()
assert(os.path.isdir(args.output_dir))
print("load and prepare")
seq_len, pred_len, input_lengths, output_inst_indices = load_and_prepare(input_files, args.num_output_shards)
print("preparing lookup table")
total_num_instances = sum(input_lengths)
out_2_in = dict()
length_so_far = 0
for i, l in enumerate(input_lengths):
for j in range(l):
out_2_in[length_so_far + j] = (i, j)
length_so_far += input_lengths[i]
output_files = [os.path.join(args.output_dir, "indices_" + str(i) + ".npy") for i in range(args.num_output_shards)]
print("save data")
with open(os.path.join(args.output_dir, 'lookup_table.pkl'), 'wb') as f:
pickle.dump(out_2_in, f)
for i, out_file in enumerate(output_files):
np.save(out_file, output_inst_indices[i])
meta = {'seq_len': seq_len, 'pred_len':pred_len}
with open(os.path.join(args.output_dir, 'meta_data.pkl'), 'wb') as f:
pickle.dump(meta, f)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,23 @@
#! /bin/bash
SHARD_INDEX=${1}
INPUT_FILE="${TARGET_DIR}/final_text_files_sharded/corpus.segmented.part.${SHARD_INDEX}.txt"
source /workspace/bert/data/utils/config.sh
OUTPUT_DIR=${TARGET_DIR}/hdf5_shards
mkdir -p ${OUTPUT_DIR}
OUTPUT_FILE="${OUTPUT_DIR}/${SHARD_INDEX}.hdf5"
python /workspace/bert/create_pretraining_data.py \
--input_file=${INPUT_FILE} \
--output_file=${OUTPUT_FILE} \
--vocab_file=${VOCAB_FILE} \
--do_lower_case \
--max_seq_length=${MAX_SEQUENCE_LENGTH} \
--max_predictions_per_seq=${MAX_PREDICTIONS_PER_SEQUENCE} \
--masked_lm_prob=${MASKED_LM_PROB} \
--random_seed=${SEED} \
--dupe_factor=${DUPE_FACTOR}

View File

@ -0,0 +1,15 @@
#! /bin/bash
source /workspace/bert/data/utils/config.sh
SHARD_COUNT=0
rm -rf ${TARGET_DIR}/xarg_list.txt
touch ${TARGET_DIR}/xarg_list.txt
for file in ${TARGET_DIR}/final_text_files_sharded/*; do
echo ${SHARD_COUNT} >> ${TARGET_DIR}/xarg_list.txt
SHARD_COUNT=$((SHARD_COUNT+1))
done
xargs -n 1 --max-procs=${N_PROCS_PREPROCESS} --arg-file=${TARGET_DIR}/xarg_list.txt /workspace/bert/data/utils/preprocessing.sh
rm ${TARGET_DIR}/xarg_list.txt

View File

@ -0,0 +1,28 @@
# NVIDIA
import argparse
import nltk
import os
nltk.download('punkt')
parser = argparse.ArgumentParser(description='Sentence Segmentation')
parser.add_argument('input_file', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
input_file = args.input_file
output_file = args.output_file
doc_seperator = "\n"
with open(input_file) as ifile:
with open(output_file, "w") as ofile:
for line in ifile:
if line != "\n":
sent_list = nltk.tokenize.sent_tokenize(line)
for sent in sent_list:
ofile.write(sent + "\n")
ofile.write(doc_seperator)

View File

@ -0,0 +1,47 @@
# NVIDIA
import os
import argparse
parser = argparse.ArgumentParser(description='Dataset sharding')
parser.add_argument('input_file', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
input_file = args.input_file
output_file = args.output_file
doc_seperator = "\n"
line_buffer = []
shard_size = 396000 # Approximate, will split at next article break
line_counter = 0
shard_index = 0
ifile_lines = 0
with open(input_file) as ifile:
for line in ifile:
ifile_lines += 1
print("Input file contains", ifile_lines, "lines.")
iline_counter = 1
with open(input_file) as ifile:
for line in ifile:
if line_counter < shard_size and iline_counter < ifile_lines:
line_buffer.append(line)
line_counter += 1
iline_counter += 1
elif line_counter >= shard_size and line != "\n" and iline_counter < ifile_lines:
line_buffer.append(line)
line_counter += 1
iline_counter += 1
else:
with open(output_file + str(shard_index) + ".txt", "w") as ofile:
for oline in line_buffer:
ofile.write(oline)
line_buffer = []
line_counter = 0
shard_index += 1

View File

@ -0,0 +1,30 @@
#! /bin/bash
WIKI_DUMP="ftp://ftpmirror.your.org/pub/wikimedia/dumps/enwiki/20190301/enwiki-20190301-pages-articles-multistream.xml.bz2"
N_PROCS_PREPROCESS=4 # Adjust this based on memory requirements and available number of cores
# Download Wikipedia dump file
mkdir -p ./download
# Not using --noclobber since it emits an error if exists (incompatible with bash 'set -e')
echo "Downloading Wikidump"
if [ ! -f ./download/wikidump.xml.bz2 ]; then
wget -O ./download/wikidump.xml.bz2 ${WIKI_DUMP}
fi
# Extract dump
echo "Extracting Wikidump"
mkdir -p ./raw_data
if [ ! -f ./raw_data/wikidump.xml ]; then
pv ./download/wikidump.xml.bz2 | bunzip2 -kdc > ./raw_data/wikidump.xml
fi
# Wikiextractor.py - Creates lots of folders/files in "doc format"
echo "Running Wikiextractor"
mkdir -p ./extracted_articles
/workspace/wikiextractor/WikiExtractor.py ./raw_data/wikidump.xml -b 1000M --processes ${N_PROCS_PREPROCESS} -o ./extracted_articles
# Remove XML Tags and extraneous titles (since they are not sentences)
# Also clean to remove lines between paragraphs within article and use space-separated articles
echo "Cleaning and formatting files (one article per line)"
python3 ./remove_tags_and_clean.py ./extracted_articles ./wikipedia_corpus.txt

View File

@ -0,0 +1,39 @@
# NVIDIA
import glob
import os
import argparse
parser = argparse.ArgumentParser(description='Cleaning and merge downloaded bookcorpus files')
parser.add_argument('extracted_articles_path', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
extracted_articles_path = args.extracted_articles_path
output_file = args.output_file
with open(output_file, "w") as ofile:
for dirname in glob.glob('{}/*/'.format(extracted_articles_path), recursive=False):
for filename in glob.glob(dirname + 'wiki_*', recursive=True):
print(filename)
article_lines = []
article_open = False
with open(filename, "r") as file:
for line in file:
if "<doc id=" in line:
article_open = True
elif "</doc>" in line:
article_open = False
for oline in article_lines[1:]:
if oline != "\n":
ofile.write(oline.rstrip() + " ")
ofile.write("\n\n")
article_lines = []
else:
if article_open:
article_lines.append(line)

View File

@ -0,0 +1,297 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from a PyTorch BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import logging
import json
import re
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tokenization import BertTokenizer
from modeling import BertModel
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class InputExample(object):
def __init__(self, unique_id, text_a, text_b):
self.unique_id = unique_id
self.text_a = text_a
self.text_b = text_b
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
self.unique_id = unique_id
self.tokens = tokens
self.input_ids = input_ids
self.input_mask = input_mask
self.input_type_ids = input_type_ids
def convert_examples_to_features(examples, seq_length, tokenizer):
"""Loads a data file into a list of `InputBatch`s."""
features = []
for (ex_index, example) in enumerate(examples):
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > seq_length - 2:
tokens_a = tokens_a[0:(seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
input_type_ids = []
tokens.append("[CLS]")
input_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
input_type_ids.append(0)
tokens.append("[SEP]")
input_type_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
input_type_ids.append(1)
tokens.append("[SEP]")
input_type_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < seq_length:
input_ids.append(0)
input_mask.append(0)
input_type_ids.append(0)
assert len(input_ids) == seq_length
assert len(input_mask) == seq_length
assert len(input_type_ids) == seq_length
if ex_index < 5:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (example.unique_id))
logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
"input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
features.append(
InputFeatures(
unique_id=example.unique_id,
tokens=tokens,
input_ids=input_ids,
input_mask=input_mask,
input_type_ids=input_type_ids))
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def read_examples(input_file):
"""Read a list of `InputExample`s from an input file."""
examples = []
unique_id = 0
with open(input_file, "r", encoding='utf-8') as reader:
while True:
line = reader.readline()
if not line:
break
line = line.strip()
text_a = None
text_b = None
m = re.match(r"^(.*) \|\|\| (.*)$", line)
if m is None:
text_a = line
else:
text_a = m.group(1)
text_b = m.group(2)
examples.append(
InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
unique_id += 1
return examples
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_file", default=None, type=str, required=True)
parser.add_argument("--output_file", default=None, type=str, required=True)
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
## Other parameters
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
parser.add_argument("--max_seq_length", default=128, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
"than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
parser.add_argument("--local_rank",
type=int,
default=-1,
help = "local_rank for distributed training on gpus")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))
layer_indexes = [int(x) for x in args.layers.split(",")]
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
examples = read_examples(args.input_file)
features = convert_examples_to_features(
examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)
unique_id_to_feature = {}
for feature in features:
unique_id_to_feature[feature.unique_id] = feature
model = BertModel.from_pretrained(args.bert_model)
model.to(device)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
output_device=args.local_rank)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
if args.local_rank == -1:
eval_sampler = SequentialSampler(eval_data)
else:
eval_sampler = DistributedSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
model.eval()
with open(args.output_file, "w", encoding='utf-8') as writer:
for input_ids, input_mask, example_indices in eval_dataloader:
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
for b, example_index in enumerate(example_indices):
feature = features[example_index.item()]
unique_id = int(feature.unique_id)
# feature = unique_id_to_feature[unique_id]
output_json = collections.OrderedDict()
output_json["linex_index"] = unique_id
all_out_features = []
for (i, token) in enumerate(feature.tokens):
all_layers = []
for (j, layer_index) in enumerate(layer_indexes):
layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
layer_output = layer_output[b]
layers = collections.OrderedDict()
layers["index"] = layer_index
layers["values"] = [
round(x.item(), 6) for x in layer_output[i]
]
all_layers.append(layers)
out_features = collections.OrderedDict()
out_features["token"] = token
out_features["layers"] = all_layers
all_out_features.append(out_features)
output_json["features"] = all_out_features
writer.write(json.dumps(output_json) + "\n")
if __name__ == "__main__":
main()
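The script writes one JSON object per input line: a "linex_index" plus a "features" list holding, for every token, the requested encoder layers and their values. A small sketch of reading those records back (the features.jsonl name is illustrative; use whatever was passed as --output_file):

import json
import numpy as np

with open('features.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        tokens = [feat['token'] for feat in record['features']]
        # With the default --layers=-1,-2,-3,-4, layers[0] is the final encoder layer.
        last_layer = np.array([feat['layers'][0]['values'] for feat in record['features']])
        print(record['linex_index'], len(tokens), last_layer.shape)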

View File

@ -0,0 +1,249 @@
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)
import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open
import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
try:
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
Path.home() / '.pytorch_pretrained_bert'))
except AttributeError:
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
"""
url_bytes = url.encode('utf-8')
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode('utf-8')
etag_hash = sha256(etag_bytes)
filename += '.' + etag_hash.hexdigest()
return filename
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`.
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise EnvironmentError("file {} not found".format(cache_path))
meta_path = cache_path + '.json'
if not os.path.exists(meta_path):
raise EnvironmentError("file {} not found".format(meta_path))
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata['url']
etag = metadata['etag']
return url, etag
def cached_path(url_or_filename, cache_dir=None):
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
parsed = urlparse(url_or_filename)
if parsed.scheme in ('http', 'https', 's3'):
# URL, so get it from the cache (downloading if necessary)
return get_from_cache(url_or_filename, cache_dir)
elif os.path.exists(url_or_filename):
# File, and it exists.
return url_or_filename
elif parsed.scheme == '':
# File, but it doesn't exist.
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
def split_s3_path(url):
"""Split a full s3 path into the bucket name and path."""
parsed = urlparse(url)
if not parsed.netloc or not parsed.path:
raise ValueError("bad s3 path {}".format(url))
bucket_name = parsed.netloc
s3_path = parsed.path
# Remove '/' at beginning of path.
if s3_path.startswith("/"):
s3_path = s3_path[1:]
return bucket_name, s3_path
def s3_request(func):
"""
Wrapper function for s3 requests in order to create more helpful error
messages.
"""
@wraps(func)
def wrapper(url, *args, **kwargs):
try:
return func(url, *args, **kwargs)
except ClientError as exc:
if int(exc.response["Error"]["Code"]) == 404:
raise EnvironmentError("file {} not found".format(url))
else:
raise
return wrapper
@s3_request
def s3_etag(url):
"""Check ETag on S3 object."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_object = s3_resource.Object(bucket_name, s3_path)
return s3_object.e_tag
@s3_request
def s3_get(url, temp_file):
"""Pull a file directly from S3."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
def http_get(url, temp_file):
req = requests.get(url, stream=True)
content_length = req.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def get_from_cache(url, cache_dir=None):
"""
Given a URL, look for the corresponding dataset in the local cache.
If it's not there, download it. Then return the path to the cached file.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
# Get eTag to add to filename, if it exists.
if url.startswith("s3://"):
etag = s3_etag(url)
else:
response = requests.head(url, allow_redirects=True)
if response.status_code != 200:
raise IOError("HEAD request failed for url {} with status code {}"
.format(url, response.status_code))
etag = response.headers.get("ETag")
filename = url_to_filename(url, etag)
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with tempfile.NamedTemporaryFile() as temp_file:
logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
# GET file object
if url.startswith("s3://"):
s3_get(url, temp_file)
else:
http_get(url, temp_file)
# we are copying the file before closing it, so flush to avoid truncation
temp_file.flush()
# shutil.copyfileobj() starts at the current position, so go to the start
temp_file.seek(0)
logger.info("copying %s to cache at %s", temp_file.name, cache_path)
with open(cache_path, 'wb') as cache_file:
shutil.copyfileobj(temp_file, cache_file)
logger.info("creating metadata file for %s", cache_path)
meta = {'url': url, 'etag': etag}
meta_path = cache_path + '.json'
with open(meta_path, 'w', encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
logger.info("removing temp file %s", temp_file.name)
return cache_path
def read_set_from_file(filename):
'''
Extract a de-duped collection (set) of text from a file.
Expected file format is one item per line.
'''
collection = set()
with open(filename, 'r', encoding='utf-8') as file_:
for line in file_:
collection.add(line.rstrip())
return collection
def get_file_extension(path, dot=True, lower=True):
ext = os.path.splitext(path)[1]
ext = ext if dot else ext[1:]
return ext.lower() if lower else ext
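A minimal sketch of the caching helpers above in use (the URL is purely illustrative and would need to point at a real file):

from file_utils import cached_path, url_to_filename

# A URL is downloaded once into the PYTORCH_PRETRAINED_BERT_CACHE directory;
# repeated calls return the same cached path.
local_path = cached_path('https://example.com/vocab.txt')

# An existing local path is validated and returned unchanged.
local_path = cached_path(local_path)

# Cache filenames are the sha256 of the URL, plus the sha256 of the ETag when available.
print(url_to_filename('https://example.com/vocab.txt', etag='abc123'))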

View File

@ -0,0 +1,205 @@
import types
import importlib
import math
import torch
def warmup_cosine(x, warmup=0.002):
if x < warmup:
return x/warmup
return 0.5 * (1.0 + math.cos(math.pi * x))
def warmup_constant(x, warmup=0.002):
if x < warmup:
return x/warmup
return 1.0
def warmup_linear(x, warmup=0.002):
if x < warmup:
return x/warmup
return 1.0 - x
SCHEDULES = {
'warmup_cosine':warmup_cosine,
'warmup_constant':warmup_constant,
'warmup_linear':warmup_linear,
}
class FusedAdamBert(torch.optim.Optimizer):
"""Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
It has been proposed in `Adam: A Method for Stochastic Optimization`_.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in FusedAdam!
eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
adds eps to the bias-corrected second moment estimate before
evaluating square root instead of adding it to the square root of
second moment estimate as in the original paper. (default: False)
.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
# def __init__(self, params,
# lr=1e-3, bias_correction = True,
# betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False,
# weight_decay=0., max_grad_norm=0., amsgrad=False):
def __init__(self, params, lr=1e-3, warmup=-1, t_total=-1, bias_correction=False, betas=(0.9, 0.999), schedule='warmup_linear',
eps=1e-6, eps_inside_sqrt = False, weight_decay=0., max_grad_norm=1.0, amsgrad=False):
global fused_adam_cuda
fused_adam_cuda = importlib.import_module("fused_adam_cuda")
if amsgrad:
raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
super(FusedAdamBert, self).__init__(params, defaults)
print("LOCAL FUSED ADAM")
self.eps_mode = 0 if eps_inside_sqrt else 1
self.schedule = schedule
self.t_total = t_total
self.warmup = warmup
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
return [0]
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
lr.append(lr_scheduled)
print("LR {}".format(lr_scheduled))
return lr
def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
grads (list of tensors, optional): weight gradient to use for the
optimizer update. If gradients have type torch.half, parameters
are expected to be in type torch.float. (default: None)
output params (list of tensors, optional): A reduced precision copy
of the updated weights written out in addition to the regular
updated weights. Have to be of same type as gradients. (default: None)
scale (float, optional): factor to divide gradient tensor values
by before applying to weights. (default: 1)
"""
loss = None
if closure is not None:
loss = closure()
if grads is None:
grads_group = [None]*len(self.param_groups)
# backward compatibility
# assuming a list/generator of parameter means single group
elif isinstance(grads, types.GeneratorType):
grads_group = [grads]
elif type(grads[0])!=list:
grads_group = [grads]
else:
grads_group = grads
if output_params is None:
output_params_group = [None]*len(self.param_groups)
elif isinstance(output_params, types.GeneratorType):
output_params_group = [output_params]
elif type(output_params[0])!=list:
output_params_group = [output_params]
else:
output_params_group = output_params
if grad_norms is None:
grad_norms = [None]*len(self.param_groups)
#Compute global norm
global_norm = 0.0
for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group,
output_params_group, grad_norms):
global_norm = (global_norm ** 2 + grad_norm ** 2) ** 0.5
for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group, output_params_group, grad_norms):
if grads_this_group is None:
grads_this_group = [None]*len(group['params'])
if output_params_this_group is None:
output_params_this_group = [None]*len(group['params'])
# compute combined scale factor for this group
combined_scale = scale
if group['max_grad_norm'] > 0:
# norm is in fact norm*scale
clip = ((global_norm / scale) + 1e-6) / group['max_grad_norm']
if clip > 1:
combined_scale = clip * scale
bias_correction = 1 if group['bias_correction'] else 0
for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group):
#note: p.grad should not ever be set for correct operation of mixed precision optimizer that sometimes sends None gradients
if p.grad is None and grad is None:
continue
if grad is None:
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
out_p = torch.tensor([], dtype = torch.float) if output_param is None else output_param
#Changes sharath
schedule_fct = SCHEDULES[self.schedule]
#schedule_fct(state['step']/self.t_total, self.warmup)
#step_lr = group['lr'] * schedule_fct(state['step']/self.t_total, self.warmup)
#step_lr = group['lr'] * scale#schedule_fct(state['step']/self.t_total, self.warmup)# schedule_fct(state['step']/group['t_total'], group['warmup'])
#print(scale, step_lr)
#print(group['lr'])
fused_adam_cuda.adam(p.data,
out_p,
exp_avg,
exp_avg_sq,
grad,
group['lr'], #step_lr,#group['lr'],
beta1,
beta2,
group['eps'],
combined_scale,
state['step'],
self.eps_mode,
bias_correction,
group['weight_decay'])
return loss
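The schedule functions at the top of this file map training progress x in [0, 1] to a learning-rate multiplier: a linear ramp from 0 to 1 over the warmup fraction, then (for warmup_linear) a linear decay toward 0. A standalone sketch of the same arithmetic, using the default warmup of 0.002:

# Same formula as the warmup_linear schedule defined above.
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return 1.0 - x

for x in (0.0, 0.001, 0.002, 0.5, 1.0):
    print(x, warmup_linear(x))  # 0.0, 0.5, 0.998, 0.5, 0.0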

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,218 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for BERT model."""
import math
import torch
from torch.optim import Optimizer
from torch.optim.optimizer import required
from torch.nn.utils import clip_grad_norm_
#from fused_adam_local import FusedAdam
from apex.optimizers import FusedAdam
def warmup_cosine(x, warmup=0.002):
if x < warmup:
return x/warmup
return 0.5 * (1.0 + math.cos(math.pi * x))
def warmup_constant(x, warmup=0.002):
if x < warmup:
return x/warmup
return 1.0
def warmup_linear(x, warmup=0.002):
if x < warmup:
return x/warmup
# return (1.0 - x)
return max((x - 1.) / (warmup - 1.), 0.)
SCHEDULES = {
'warmup_cosine':warmup_cosine,
'warmup_constant':warmup_constant,
'warmup_linear':warmup_linear,
}
class BertAdam(Optimizer):
"""Implements BERT version of Adam algorithm with weight decay fix.
Params:
lr: learning rate
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
t_total: total number of training steps for the learning
rate schedule, -1 means constant learning rate. Default: -1
schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
b1: Adam's beta1. Default: 0.9
b2: Adam's beta2. Default: 0.999
e: Adam's epsilon. Default: 1e-6
weight_decay: Weight decay. Default: 0.01
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
"""
def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
max_grad_norm=1.0):
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0.0 <= warmup < 1.0 and not warmup == -1:
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
if not e >= 0.0:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
b1=b1, b2=b2, e=e, weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
super(BertAdam, self).__init__(params, defaults)
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
return [0]
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
lr.append(lr_scheduled)
return lr
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['next_m'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['next_v'] = torch.zeros_like(p.data)
next_m, next_v = state['next_m'], state['next_v']
beta1, beta2 = group['b1'], group['b2']
# Add grad clipping
if group['max_grad_norm'] > 0:
clip_grad_norm_(p, group['max_grad_norm'])
# Decay the first and second moment running average coefficient
# In-place operations to update the averages at the same time
next_m.mul_(beta1).add_(1 - beta1, grad)
next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
update = next_m / (next_v.sqrt() + group['e'])
# Just adding the square of the weights to the loss function is *not*
# the correct way of using L2 regularization/weight decay with Adam,
# since that will interact with the m and v parameters in strange ways.
#
# Instead we want to decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if group['weight_decay'] > 0.0:
update += group['weight_decay'] * p.data
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
update_with_lr = lr_scheduled * update
p.data.add_(-update_with_lr)
state['step'] += 1
# step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
# No bias correction
# bias_correction1 = 1 - beta1 ** state['step']
# bias_correction2 = 1 - beta2 ** state['step']
return loss
# =======================================================================
class BertAdam_FP16(FusedAdam):
"""Implements BERT version of Adam algorithm with weight decay fix.
Params:
lr: learning rate
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
t_total: total number of training steps for the learning
rate schedule, -1 means constant learning rate. Default: -1
schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
b1: Adam's beta1. Default: 0.9
b2: Adam's beta2. Default: 0.999
e: Adam's epsilon. Default: 1e-6
weight_decay: Weight decay. Default: 0.01
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
"""
def __init__(self, params, lr, warmup=-1, t_total=-1, bias_correction=False, schedule='warmup_linear',
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
max_grad_norm=1.0):
if not lr >= 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0.0 <= warmup < 1.0 and not warmup == -1:
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
if not e >= 0.0:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
# defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
# b1=b1, b2=b2, e=e, weight_decay=weight_decay,
# max_grad_norm=max_grad_norm)
super(BertAdam_FP16, self).__init__(params, lr=lr, bias_correction=bias_correction, betas=(b1, b2), eps=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm)#defaults)
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
print("returning", state)
return [0]
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
lr.append(lr_scheduled)
print("LR {}".format(lr_scheduled))
return lr
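A sketch of how BertAdam is typically constructed downstream: biases and LayerNorm parameters are excluded from weight decay, mirroring the fine-tuning script later in this commit (the tiny module below is a placeholder for a real BERT model):

import torch
from optimization import BertAdam

class TinyModel(torch.nn.Module):
    # Stand-in for a BERT model; the attribute name mirrors BERT's LayerNorm naming.
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(4, 4)
        self.LayerNorm = torch.nn.LayerNorm(4)

model = TinyModel()
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=5e-5,       # initial learning rate
                     warmup=0.1,    # linear warmup over the first 10% of steps
                     t_total=1000)  # total number of optimizer steps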

View File

@ -0,0 +1,13 @@
# progress bars in model download and training scripts
tqdm
# Accessing files from S3 directly.
boto3
# Used for downloading models over HTTP
requests
six
ipdb
#Data processing
h5py
html2text
nltk
progressbar

View File

@ -0,0 +1,649 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import, division, print_function
import argparse
import csv
import logging
import os
import random
import sys
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from tokenization import BertTokenizer
from optimization import BertAdam, warmup_linear
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class InputExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self, guid, text_a, text_b=None, label=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""
def get_train_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
return lines
class MrpcProcessor(DataProcessor):
"""Processor for the MRPC data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = line[3]
text_b = line[4]
label = line[0]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class MnliProcessor(DataProcessor):
"""Processor for the MultiNLI data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
"dev_matched")
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[8]
text_b = line[9]
label = line[-1]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class ColaProcessor(DataProcessor):
"""Processor for the CoLA data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
guid = "%s-%s" % (set_type, i)
text_a = line[3]
label = line[1]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
return examples
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
"""Loads a data file into a list of `InputBatch`s."""
label_map = {label : i for i, label in enumerate(label_list)}
features = []
for (ex_index, example) in enumerate(examples):
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[:(max_seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
segment_ids = [0] * len(tokens)
if tokens_b:
tokens += tokens_b + ["[SEP]"]
segment_ids += [1] * (len(tokens_b) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
label_id = label_map[example.label]
if ex_index < 5:
logger.info("*** Example ***")
logger.info("guid: %s" % (example.guid))
logger.info("tokens: %s" % " ".join(
[str(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
"segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
logger.info("label: %s (id = %d)" % (example.label, label_id))
features.append(
InputFeatures(input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id))
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--task_name",
default=None,
type=str,
required=True,
help="The name of the task to train.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints will be written.")
parser.add_argument("--init_checkpoint",
default=None,
type=str,
required=True,
help="The checkpoint file from pretraining")
## Other parameters
parser.add_argument("--cache_dir",
default="",
type=str,
help="Where do you want to store the pre-trained models downloaded from s3")
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--do_train",
action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval",
action='store_true',
help="Whether to run eval on the dev set.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Set this flag if you are using an uncased model.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
default=8,
type=int,
help="Total batch size for eval.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1.0, type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion",
default=0.1,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--fp16',
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
args = parser.parse_args()
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
processors = {
"cola": ColaProcessor,
"mnli": MnliProcessor,
"mrpc": MrpcProcessor,
}
num_labels_task = {
"cola": 2,
"mnli": 3,
"mrpc": 2,
}
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train and not args.do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
print("WARNING: Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
task_name = args.task_name.lower()
if task_name not in processors:
raise ValueError("Task not found: %s" % (task_name))
processor = processors[task_name]()
num_labels = num_labels_task[task_name]
label_list = processor.get_labels()
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_optimization_steps = None
if args.do_train:
train_examples = processor.get_train_examples(args.data_dir)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
model = BertForSequenceClassification.from_pretrained(args.bert_model,
cache_dir=cache_dir,
num_labels = num_labels)
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
if args.fp16:
model.half()
model.to(device)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
nb_tr_steps = 0
tr_loss = 0
if args.do_train:
train_features = convert_examples_to_features(
train_examples, label_list, args.max_seq_length, tokenizer)
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_examples))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
model.train()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
tr_loss = 0
nb_tr_examples, nb_tr_steps = 0, 0
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
if args.max_steps > 0 and global_step > args.max_steps:
break
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, label_ids = batch
loss = model(input_ids, segment_ids, input_mask, label_ids)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used that handles this automatically
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
if args.do_train:
# Save a trained model and the associated configuration
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())
# Load a trained model and config that you have fine-tuned
config = BertConfig(output_config_file)
model = BertForSequenceClassification(config, num_labels=num_labels)
model.load_state_dict(torch.load(output_model_file))
else:
model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
model.to(device)
if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
eval_examples = processor.get_dev_examples(args.data_dir)
eval_features = convert_examples_to_features(
eval_examples, label_list, args.max_seq_length, tokenizer)
logger.info("***** Running evaluation *****")
logger.info(" Num examples = %d", len(eval_examples))
logger.info(" Batch size = %d", args.eval_batch_size)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
label_ids = label_ids.to(device)
with torch.no_grad():
tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
logits = model(input_ids, segment_ids, input_mask)
logits = logits.detach().cpu().numpy()
label_ids = label_ids.to('cpu').numpy()
tmp_eval_accuracy = accuracy(logits, label_ids)
eval_loss += tmp_eval_loss.mean().item()
eval_accuracy += tmp_eval_accuracy
nb_eval_examples += input_ids.size(0)
nb_eval_steps += 1
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples
loss = tr_loss/nb_tr_steps if args.do_train else None
result = {'eval_loss': eval_loss,
'eval_accuracy': eval_accuracy,
'global_step': global_step,
'loss': loss}
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
if __name__ == "__main__":
main()
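The truncation heuristic used when converting sequence pairs always trims the longer of the two token lists, one token at a time, so short sequences keep as much signal as possible. A self-contained sketch of that behaviour (the function body is copied from _truncate_seq_pair above; the token lists are toy data):

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

a = ['is', 'this', 'jack', '##son', '##ville', '?']
b = ['no', 'it', 'is', 'not', '.']
_truncate_seq_pair(a, b, max_length=8)
print(a)  # ['is', 'this', 'jack', '##son']
print(b)  # ['no', 'it', 'is', 'not']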

View File

@ -0,0 +1,417 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
#==================
import csv
import os
import logging
import argparse
import random
import h5py
from tqdm import tqdm, trange
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from torch.utils.data.distributed import DistributedSampler
import math
from apex import amp
from tokenization import BertTokenizer
from modeling import BertForPreTraining, BertConfig
from optimization import BertAdam, BertAdam_FP16
# from fused_adam_local import FusedAdamBert
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from apex.optimizers import FusedAdam #, FP16_Optimizer
#from apex.optimizers import FusedAdam
from apex.parallel import DistributedDataParallel as DDP
from schedulers import LinearWarmUpScheduler
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class pretraining_dataset(Dataset):
def __init__(self, input_file, max_pred_length):
self.input_file = input_file
self.max_pred_length = max_pred_length
f = h5py.File(input_file, "r")
self.input_ids = np.asarray(f["input_ids"][:]).astype(np.int64)  # [num_instances x max_seq_length]
self.input_masks = np.asarray(f["input_mask"][:]).astype(np.int64)  # [num_instances x max_seq_length]
self.segment_ids = np.asarray(f["segment_ids"][:]).astype(np.int64)  # [num_instances x max_seq_length]
self.masked_lm_positions = np.asarray(f["masked_lm_positions"][:]).astype(np.int64)  # [num_instances x max_pred_length]
self.masked_lm_ids = np.asarray(f["masked_lm_ids"][:]).astype(np.int64)  # [num_instances x max_pred_length]
self.next_sentence_labels = np.asarray(f["next_sentence_labels"][:]).astype(np.int64)  # [num_instances]
f.close()
def __len__(self):
'Denotes the total number of samples'
return len(self.input_ids)
def __getitem__(self, index):
input_ids = torch.from_numpy(self.input_ids[index])  # [max_seq_length]
input_mask = torch.from_numpy(self.input_masks[index])  # [max_seq_length]
segment_ids = torch.from_numpy(self.segment_ids[index])  # [max_seq_length]
masked_lm_positions = torch.from_numpy(self.masked_lm_positions[index])  # [max_pred_length]
masked_lm_ids = torch.from_numpy(self.masked_lm_ids[index])  # [max_pred_length]
next_sentence_labels = torch.from_numpy(np.asarray(self.next_sentence_labels[index]))  # [1]
masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
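# Positions that were not masked keep the label -1, which the masked-LM loss treats as an ignore index.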
index = self.max_pred_length
# store number of masked tokens in index
if len((masked_lm_positions == 0).nonzero()) != 0:
index = (masked_lm_positions == 0).nonzero()[0].item()
masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels]
def main():
print("IN NEW MAIN XD\n")
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain .hdf5 files for the task.")
parser.add_argument("--config_file",
default=None,
type=str,
required=True,
help="The BERT model config")
parser.add_argument("--bert_model", default="bert-large-uncased", type=str,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
## Other parameters
parser.add_argument("--max_seq_length",
default=512,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--max_predictions_per_seq",
default=80,
type=int,
help="The maximum total of masked tokens in input sequence")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps",
default=1000,
type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion",
default=0.01,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumualte before performing a backward/update pass.")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type=float, default=0.0,
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
parser.add_argument('--log_freq',
type=float, default=10.0,
help='frequency of logging loss.')
parser.add_argument('--checkpoint_activations',
default=False,
action='store_true',
help="Whether to use gradient checkpointing")
parser.add_argument("--resume_from_checkpoint",
default=False,
action='store_true',
help="Whether to resume training from checkpoint.")
parser.add_argument('--resume_step',
type=int,
default=-1,
help="Step to resume training from.")
parser.add_argument('--num_steps_per_checkpoint',
type=int,
default=2000,
help="Number of update steps until a model checkpoint is saved to disk.")
args = parser.parse_args()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
assert(torch.cuda.is_available())
if args.local_rank == -1:
device = torch.device("cuda")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl', init_method='env://')
logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
if args.train_batch_size % args.gradient_accumulation_steps != 0:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format(
args.gradient_accumulation_steps, args.train_batch_size))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (os.listdir(args.output_dir) and os.listdir(args.output_dir)!=['logfile.txt']):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not args.resume_from_checkpoint:
os.makedirs(args.output_dir, exist_ok=True)
# Prepare model
config = BertConfig.from_json_file(args.config_file)
model = BertForPreTraining(config)
if not args.resume_from_checkpoint:
global_step = 0
else:
if args.resume_step == -1:
model_names = [f for f in os.listdir(args.output_dir) if f.endswith(".pt")]
args.resume_step = max([int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names])
global_step = args.resume_step
checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu")
model.load_state_dict(checkpoint['model'], strict=False)
print("resume step from ", args.resume_step)
model.to(device)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
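# apply weight decay to all parameters except biases and LayerNorm weights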
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
#warmup=args.warmup_proportion,
#t_total=args.max_steps,
bias_correction=False,
weight_decay=0.01,
max_grad_norm=1.0)
if args.loss_scale == 0:
# optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale="dynamic")
else:
# optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale)
scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=args.max_steps)
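# in the fp16/amp path the warmup/decay schedule is applied manually via
# scheduler.step() in the training loop; in the fp32 path BertAdam handles it internally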
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=args.max_steps)
if args.resume_from_checkpoint:
optimizer.load_state_dict(checkpoint['optimizer']) # , strict=False)
if args.local_rank != -1:
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f))]
files.sort()
num_files = len(files)
logger.info("***** Running training *****")
# logger.info(" Num examples = %d", len(train_data))
logger.info(" Batch size = %d", args.train_batch_size)
print(" LR = ", args.learning_rate)
model.train()
print("Training. . .")
most_recent_ckpts_paths = []
print("Training. . .")
tr_loss = 0.0 # total added training loss
average_loss = 0.0 # averaged loss every args.log_freq steps
epoch = 0
training_steps = 0
while True:
if not args.resume_from_checkpoint:
random.shuffle(files)
f_start_id = 0
else:
f_start_id = checkpoint['files'][0]
files = checkpoint['files'][1:]
args.resume_from_checkpoint = False
for f_id in range(f_start_id, len(files)):
data_file = files[f_id]
logger.info("file no %s file %s" %(f_id, data_file))
train_data = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * n_gpu, num_workers=4, pin_memory=True)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=4, pin_memory=True)
for step, batch in enumerate(tqdm(train_dataloader, desc="File Iteration")):
training_steps += 1
batch = [t.to(device) for t in batch]
input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels, checkpoint_activations=args.checkpoint_activations)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
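# scaling here makes the accumulated gradient the average over the micro-batches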
if args.fp16:
# optimizer.backward(loss)
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
tr_loss += loss.detach()  # keep a tensor for the final all_reduce without retaining the autograd graph
average_loss += loss.item()
if training_steps % args.gradient_accumulation_steps == 0:
if args.fp16:
scheduler.step()
optimizer.step()
optimizer.zero_grad()
global_step += 1
if training_steps == 1 * args.gradient_accumulation_steps:
logger.info("Step:{} Average Loss = {} Step Loss = {} LR {}".format(global_step, average_loss,
loss.item(), optimizer.param_groups[0]['lr']))
if training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
logger.info("Step:{} Average Loss = {} Step Loss = {} LR {}".format(global_step, average_loss / args.log_freq,
loss.item(), optimizer.param_groups[0]['lr']))
average_loss = 0
if global_step >= args.max_steps or training_steps == 1 * args.gradient_accumulation_steps or training_steps % (args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0:
if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)):
# Save a trained model
logger.info("** ** * Saving fine - tuned model ** ** * ")
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
torch.save({'model' : model_to_save.state_dict(),
'optimizer' : optimizer.state_dict(),
'files' : [f_id] + files }, output_save_file)
most_recent_ckpts_paths.append(output_save_file)
if len(most_recent_ckpts_paths) > 3:
ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
os.remove(ckpt_to_be_removed)
if global_step >= args.max_steps:
tr_loss = tr_loss * args.gradient_accumulation_steps / training_steps
if (torch.distributed.is_initialized()):
tr_loss /= torch.distributed.get_world_size()
torch.distributed.all_reduce(tr_loss)
logger.info("Total Steps:{} Final Loss = {}".format(training_steps, tr_loss.item()))
return
del train_dataloader
del train_sampler
del train_data
#for obj in gc.get_objects():
# if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
# del obj
torch.cuda.empty_cache()
epoch += 1
if __name__ == "__main__":
main()

View File

@ -0,0 +1,300 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
#==================
import csv
import os
import logging
import argparse
import random
import h5py
from tqdm import tqdm, trange
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from torch.utils.data.distributed import DistributedSampler
import math
import time
from tokenization import BertTokenizer
from modeling import BertForPreTraining, BertConfig
# from fused_adam_local import FusedAdamBert
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from apex.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class pretraining_dataset(Dataset):
def __init__(self, input_file, max_pred_length):
self.input_file = input_file
self.max_pred_length = max_pred_length
f = h5py.File(input_file, "r")
self.input_ids = np.asarray(f["input_ids"][:]).astype(np.int64)#[num_instances x max_seq_length])
self.input_masks = np.asarray(f["input_mask"][:]).astype(np.int64) #[num_instances x max_seq_length]
self.segment_ids = np.asarray(f["segment_ids"][:]).astype(np.int64) #[num_instances x max_seq_length]
self.masked_lm_positions = np.asarray(f["masked_lm_positions"][:]).astype(np.int64) #[num_instances x max_pred_length]
self.masked_lm_ids= np.asarray(f["masked_lm_ids"][:]).astype(np.int64) #[num_instances x max_pred_length]
self.next_sentence_labels = np.asarray(f["next_sentence_labels"][:]).astype(np.int64) # [num_instances]
f.close()
def __len__(self):
'Denotes the total number of samples'
return len(self.input_ids)
def __getitem__(self, index):
input_ids= torch.from_numpy(self.input_ids[index]) # [max_seq_length]
input_mask = torch.from_numpy(self.input_masks[index]) #[max_seq_length]
segment_ids = torch.from_numpy(self.segment_ids[index])# [max_seq_length]
masked_lm_positions = torch.from_numpy(self.masked_lm_positions[index]) #[max_pred_length]
masked_lm_ids = torch.from_numpy(self.masked_lm_ids[index]) #[max_pred_length]
next_sentence_labels = torch.from_numpy(np.asarray(self.next_sentence_labels[index])) #[1]
masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
index = self.max_pred_length
# store number of masked tokens in index
if len((masked_lm_positions == 0).nonzero()) != 0:
index = (masked_lm_positions == 0).nonzero()[0].item()
masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels]
def main():
print("IN NEW MAIN XD\n")
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain .hdf5 files for the task.")
parser.add_argument("--config_file",
default="bert_config.json",
type=str,
required=False,
help="The BERT model config")
parser.add_argument("--ckpt_dir",
default=None,
type=str,
required=True,
help="The ckpt directory, e.g. /results")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--eval', dest='do_eval', action='store_true')
group.add_argument('--prediction', dest='do_eval', action='store_false')
## Other parameters
parser.add_argument("--bert_model", default="bert-large-uncased", type=str, required=False,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--max_seq_length",
default=512,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--max_predictions_per_seq",
default=80,
type=int,
help="The maximum total of masked tokens in input sequence")
parser.add_argument("--ckpt_step",
default=-1,
type=int,
required=False,
help="The model checkpoint iteration, e.g. 1000")
parser.add_argument("--eval_batch_size",
default=8,
type=int,
help="Total batch size for training.")
parser.add_argument("--max_steps",
default=-1,
type=int,
help="Total number of eval steps to perform, otherwise use full dataset")
parser.add_argument("--no_cuda",
default=False,
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl', init_method='env://')
n_gpu = torch.cuda.device_count()
if n_gpu > 1:
assert(args.local_rank != -1) # only use torch.distributed for multi-gpu
logger.info("device %s n_gpu %d distributed inference %r", device, n_gpu, bool(args.local_rank != -1))
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
# Prepare model
config = BertConfig.from_json_file(args.config_file)
model = BertForPreTraining(config)
if args.ckpt_step == -1:
#retrieve latest model
model_names = [f for f in os.listdir(args.ckpt_dir) if f.endswith(".model")]
args.ckpt_step = max([int(x.split('.model')[0].split('_')[1].strip()) for x in model_names])
print("load model saved at iteraton", args.ckpt_step)
model_file = os.path.join(args.ckpt_dir, "ckpt_" + str(args.ckpt_step) + ".model")
state_dict = torch.load(model_file, map_location="cpu")
model.load_state_dict(state_dict, strict=False)
if args.fp16:
model.half() # all parameters and buffers are converted to half precision
model.to(device)
multi_gpu_training = args.local_rank != -1 and torch.distributed.is_initialized()
if multi_gpu_training:
model = DDP(model)
files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f))]
files.sort()
logger.info("***** Running evaluation *****")
logger.info(" Batch size = %d", args.eval_batch_size)
model.eval()
print("Evaluation. . .")
nb_instances = 0
max_steps = args.max_steps if args.max_steps > 0 else np.inf
global_step = 0
with torch.no_grad():
if args.do_eval:
final_loss = 0.0 #
for data_file in files:
logger.info("file %s" %( data_file))
dataset = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
if not multi_gpu_training:
train_sampler = RandomSampler(dataset)
datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)
else:
train_sampler = DistributedSampler(dataset)
datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)
for step, batch in enumerate(tqdm(datasetloader, desc="Iteration")):
if global_step > max_steps:
break
batch = [t.to(device) for t in batch]
input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels)
final_loss += loss
global_step += 1
torch.cuda.empty_cache()
if global_step > max_steps:
break
final_loss /= global_step
if multi_gpu_training:
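# divide by the world size and all-reduce (sum) so every rank ends up with the mean loss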
final_loss /= torch.distributed.get_world_size()
dist.all_reduce(final_loss)
if (not multi_gpu_training or (multi_gpu_training and torch.distributed.get_rank() == 0)):
logger.info("Finished: Final Loss = {}".format(final_loss))
else: # inference
# if multi_gpu_training:
# torch.distributed.barrier()
# start_t0 = time.time()
for data_file in files:
logger.info("file %s" %( data_file))
dataset = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
if not multi_gpu_training:
train_sampler = RandomSampler(dataset)
datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)
else:
train_sampler = DistributedSampler(dataset)
datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)
for step, batch in enumerate(tqdm(datasetloader, desc="Iteration")):
if global_step > max_steps:
break
batch = [t.to(device) for t in batch]
input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
lm_logits, nsp_logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=None, next_sentence_label=None)
nb_instances += input_ids.size(0)
global_step += 1
torch.cuda.empty_cache()
if global_step > max_steps:
break
# if multi_gpu_training:
# torch.distributed.barrier()
if (not multi_gpu_training or (multi_gpu_training and torch.distributed.get_rank() == 0)):
logger.info("Finished")
if __name__ == "__main__":
main()

File diff suppressed because it is too large

View File

@ -0,0 +1,561 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
import argparse
import csv
import logging
import os
import random
import sys
from io import open
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from modeling import BertForMultipleChoice, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from optimization import BertAdam, warmup_linear
from tokenization import BertTokenizer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class SwagExample(object):
"""A single training/test example for the SWAG dataset."""
def __init__(self,
swag_id,
context_sentence,
start_ending,
ending_0,
ending_1,
ending_2,
ending_3,
label = None):
self.swag_id = swag_id
self.context_sentence = context_sentence
self.start_ending = start_ending
self.endings = [
ending_0,
ending_1,
ending_2,
ending_3,
]
self.label = label
def __str__(self):
return self.__repr__()
def __repr__(self):
l = [
"swag_id: {}".format(self.swag_id),
"context_sentence: {}".format(self.context_sentence),
"start_ending: {}".format(self.start_ending),
"ending_0: {}".format(self.endings[0]),
"ending_1: {}".format(self.endings[1]),
"ending_2: {}".format(self.endings[2]),
"ending_3: {}".format(self.endings[3]),
]
if self.label is not None:
l.append("label: {}".format(self.label))
return ", ".join(l)
class InputFeatures(object):
def __init__(self,
example_id,
choices_features,
label
):
self.example_id = example_id
self.choices_features = [
{
'input_ids': input_ids,
'input_mask': input_mask,
'segment_ids': segment_ids
}
for _, input_ids, input_mask, segment_ids in choices_features
]
self.label = label
def read_swag_examples(input_file, is_training):
with open(input_file, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
if is_training and lines[0][-1] != 'label':
raise ValueError(
"For training, the input file must contain a label column."
)
examples = [
SwagExample(
swag_id = line[2],
context_sentence = line[4],
start_ending = line[5], # in the swag dataset, the
# common beginning of each
# choice is stored in "sent2".
ending_0 = line[7],
ending_1 = line[8],
ending_2 = line[9],
ending_3 = line[10],
label = int(line[11]) if is_training else None
) for line in lines[1:] # we skip the line with the column names
]
return examples
def convert_examples_to_features(examples, tokenizer, max_seq_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
# Swag is a multiple choice task. To perform this task using Bert,
# we will use the formatting proposed in "Improving Language
# Understanding by Generative Pre-Training" and suggested by
# @jacobdevlin-google in this issue
# https://github.com/google-research/bert/issues/38.
#
# Each choice will correspond to a sample on which we run the
# inference. For a given Swag example, we will create the 4
# following inputs:
# - [CLS] context [SEP] choice_1 [SEP]
# - [CLS] context [SEP] choice_2 [SEP]
# - [CLS] context [SEP] choice_3 [SEP]
# - [CLS] context [SEP] choice_4 [SEP]
# The model will output a single value for each input. To get the
# final decision of the model, we will run a softmax over these 4
# outputs.
features = []
for example_index, example in enumerate(examples):
context_tokens = tokenizer.tokenize(example.context_sentence)
start_ending_tokens = tokenizer.tokenize(example.start_ending)
choices_features = []
for ending_index, ending in enumerate(example.endings):
# We create a copy of the context tokens in order to be
# able to shrink it according to ending_tokens
context_tokens_choice = context_tokens[:]
ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
# Modifies `context_tokens_choice` and `ending_tokens` in
# place so that the total length is less than the
# specified length. Account for [CLS], [SEP], [SEP] with
# "- 3"
_truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
choices_features.append((tokens, input_ids, input_mask, segment_ids))
label = example.label
if example_index < 5:
logger.info("*** Example ***")
logger.info("swag_id: {}".format(example.swag_id))
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
logger.info("choice: {}".format(choice_idx))
logger.info("tokens: {}".format(' '.join(tokens)))
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
if is_training:
logger.info("label: {}".format(label))
features.append(
InputFeatures(
example_id = example.swag_id,
choices_features = choices_features,
label = label
)
)
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
def select_field(features, field):
return [
[
choice[field]
for choice in feature.choices_features
]
for feature in features
]
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .csv files (or other data files) for the task.")
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
parser.add_argument("--init_checkpoint",
default=None,
type=str,
required=True,
help="The checkpoint file from pretraining")
## Other parameters
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--do_train",
action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval",
action='store_true',
help="Whether to run eval on the dev set.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Set this flag if you are using an uncased model.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
default=8,
type=int,
help="Total batch size for eval.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1.0, type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion",
default=0.1,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--fp16',
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train and not args.do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
print("WARNING: Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_optimization_steps = None
if args.do_train:
train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
model = BertForMultipleChoice.from_pretrained(args.bert_model,
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
num_choices=4)
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
if args.fp16:
model.half()
model.to(device)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
# hack to remove the pooler, which is not used
# and would otherwise produce None grads that break apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
if args.do_train:
train_features = convert_examples_to_features(
train_examples, tokenizer, args.max_seq_length, True)
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_examples))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
model.train()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
tr_loss = 0
nb_tr_examples, nb_tr_steps = 0, 0
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
# Terminate early for benchmarking
if args.max_steps > 0 and global_step > args.max_steps:
break
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, label_ids = batch
loss = model(input_ids, segment_ids, input_mask, label_ids)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.fp16 and args.loss_scale != 1.0:
# rescale loss for fp16 training
# see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
loss = loss * args.loss_scale
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used that handles this automatically
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
if args.do_train:
# Save a trained model and the associated configuration
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())
# Load a trained model and config that you have fine-tuned
config = BertConfig(output_config_file)
model = BertForMultipleChoice(config, num_choices=4)
model.load_state_dict(torch.load(output_model_file))
else:
model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
model.to(device)
if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)
eval_features = convert_examples_to_features(
eval_examples, tokenizer, args.max_seq_length, True)
logger.info("***** Running evaluation *****")
logger.info(" Num examples = %d", len(eval_examples))
logger.info(" Batch size = %d", args.eval_batch_size)
all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
label_ids = label_ids.to(device)
with torch.no_grad():
tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
logits = model(input_ids, segment_ids, input_mask)
logits = logits.detach().cpu().numpy()
label_ids = label_ids.to('cpu').numpy()
tmp_eval_accuracy = accuracy(logits, label_ids)
eval_loss += tmp_eval_loss.mean().item()
eval_accuracy += tmp_eval_accuracy
nb_eval_examples += input_ids.size(0)
nb_eval_steps += 1
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples
result = {'eval_loss': eval_loss,
'eval_accuracy': eval_accuracy,
'global_step': global_step,
'loss': tr_loss/nb_tr_steps}
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
if __name__ == "__main__":
main()

View File

@ -0,0 +1,92 @@
import math
import torch
from torch.optim.optimizer import Optimizer
from apex.optimizers import FP16_Optimizer
from torch.optim.lr_scheduler import _LRScheduler
class LRScheduler(_LRScheduler):
def __init__(self, optimizer, last_epoch=-1):
# Check if using mixed precision training
self.mixed_training = False
base_optimizer = optimizer
if isinstance(optimizer, FP16_Optimizer):
self.mixed_training = True
self.fp16_optimizer = optimizer
base_optimizer = optimizer.optimizer
# Check that optimizer param is valid
elif not isinstance(optimizer, Optimizer):
raise TypeError('{} is not an Optimizer'.format(
type(optimizer).__name__))
super(LRScheduler, self).__init__(base_optimizer, last_epoch)
def step(self, epoch=None):
# Set the current training step
# ('epoch' is used to be consistent with _LRScheduler)
if self.mixed_training:
# The assumption is that the step will be constant
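# the wrapped optimizer's state tracks the number of updates actually applied,
# so the schedule is assumed to follow real optimizer steps (e.g. it does not
# advance when an fp16 overflow skips an update)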
state_dict = self.optimizer.state[self.optimizer.param_groups[0]['params'][0]]
if 'step' in state_dict:
self.last_epoch = state_dict['step'] + 1
else:
self.last_epoch = 1
else:
self.last_epoch = epoch if epoch is not None else self.last_epoch + 1
for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
param_group['lr'] = lr
class CosineWarmupScheduler(LRScheduler):
"""
Applies a linear warm up period to the learning rate, followed by cosine decay.
"""
def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
self.warmup = warmup
self.total_steps = total_steps
super(CosineWarmupScheduler, self).__init__(optimizer, last_epoch)
def get_lr(self):
progress = self.last_epoch / self.total_steps
if progress < self.warmup:
return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
else:
# half-cosine decay from base_lr toward 0 after the warmup phase (assumed intent)
return [base_lr * (0.5 * (1.0 + math.cos(math.pi * progress))) for base_lr in self.base_lrs]
class ConstantWarmupScheduler(LRScheduler):
"""
Applies a linear warm up period to the learning rate, followed by a constant learning rate.
"""
def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
self.warmup = warmup
self.total_steps = total_steps
super(ConstantWarmupScheduler, self).__init__(optimizer, last_epoch)
def get_lr(self):
progress = self.last_epoch / self.total_steps
if progress < self.warmup:
return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
else:
return self.base_lrs
class LinearWarmUpScheduler(LRScheduler):
"""
Applies a linear warm up period to the learning rate, followed by linear decay.
"""
def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
self.warmup = warmup
self.total_steps = total_steps
super(LinearWarmUpScheduler, self).__init__(optimizer, last_epoch)
def get_lr(self):
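# progress runs from 0 to 1 over total_steps: ramp the LR linearly up to base_lr
# during the warmup fraction, then decay it linearly back to 0 by the end of training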
progress = self.last_epoch / self.total_steps
if progress < self.warmup:
return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
else:
return [base_lr * max(( progress - 1.0)/(self.warmup - 1.0), 0.) for base_lr in self.base_lrs]
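# Minimal usage sketch (illustrative only; `model` and `compute_loss` are placeholders):
#   optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
#   scheduler = LinearWarmUpScheduler(optimizer, warmup=0.01, total_steps=10000)
#   for batch in loader:
#       loss = compute_loss(batch)
#       loss.backward()
#       scheduler.step()
#       optimizer.step()
#       optimizer.zero_grad()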

View File

@ -0,0 +1,38 @@
#!/usr/bin/env bash
DATA_DIR=${1:-/workspace/bert/data}
# Check running from repository root
if [ ! -d .git ]; then
echo "Not running from repository root! Exiting."
exit 1
fi
# Download vocab files from pretrained model
cd vocab && python3 download_models.py && rm *.zip && rm ./*/*.ckpt.*
# Download SQUAD
cd $DATA_DIR/squad && . squad_download.sh
# Download SWAG
git clone https://github.com/rowanz/swagaf.git $DATA_DIR/swag
# Download GLUE
cd $DATA_DIR/glue && . download_mrpc.sh
# WIKI Download
cd $DATA_DIR/wikipedia_corpus && . download_wikipedia.sh
# Bookcorpus Download
cd $DATA_DIR/bookcorpus && . download_bookcorpus.sh
cd $DATA_DIR
# Create HDF5 files for WIKI
bash create_datasets_from_start.sh wikipedia_corpus ./wikipedia_corpus/wikipedia_corpus.txt \
&& rm -r ./wikipedia_corpus/final_* \
# Create HDF5 files for Bookcorpus
bash create_datasets_from_start.sh bookcorpus ./bookcorpus/bookcorpus.txt \
&& rm -r ./bookcorpus/final_* \
# Create HDF5 files for inter sequence-pair mixed Wikipedia and Bookcorpus
bash merge_datasets_after_creation.sh merged_wiki+books wikipedia_corpus/hdf5_shards,bookcorpus/hdf5_shards 1024

View File

@ -0,0 +1,9 @@
#!/bin/bash
# Check running from repository root
if [ ! -d .git ]; then
echo "Not running from repository root! Exiting."
exit 1
fi
docker build . --rm -t bert

View File

@ -0,0 +1,23 @@
#!/bin/bash
# Check running from repository root
if [ ! -d .git ]; then
echo "Not running from repository root! Exiting."
exit 1
fi
DATA_DIR=${1:-"/mnt/dldata/bert"}
VOCAB_DIR=${2:-"/mnt/dldata/bert/vocab"}
CHECKPOINT_DIR=${3:-"/mnt/dldata/bert/pretrained_models_nvidia_pytorch"}
docker run -it --rm \
--runtime=nvidia \
-p 8888:8888 \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-v $DATA_DIR:/workspace/bert/data \
-v $CHECKPOINT_DIR:/workspace/checkpoints \
-v $VOCAB_DIR:/workspace/bert/vocab \
-v $PWD/results:/results \
bert bash

View File

@ -0,0 +1,184 @@
#!/bin/bash
#SBATCH -p mlperf # partition
#SBATCH -N 1 # number of nodes
#SBATCH -t 12:00:00 # wall time
#SBATCH -J image_classification # job name
#SBATCH --exclusive # exclusive node access
#SBATCH --mem=0 # all mem avail
#SBATCH --mail-type=FAIL # only send email on failure
#SBATCH --ntasks-per-node=8 # n tasks per machine (one task per gpu)
#SBATCH --threads-per-core=2 # HT is on
#SBATCH --cores-per-socket=20 # 20 cores on each socket
#SBATCH --overcommit
hostname
#DGXIBDEVICES=$(eval ls /dev/infiniband/ | tr " " "\n" | awk '{printf "--device=/dev/infiniband/%s ",$1}' | sed s'/.$//')
printf "DGXIBDEVICES=%s\n" "$DGXIBDEVICES"
printf "VOLS=%s\n" "$VOLS"
printf "EXTRA_PARAMS=%s\n" "$EXTRA_PARAMS"
cd $CODEDIR
VOLS+=" -v $CHKPTDIR/$SLURM_JOB_ID:/checkpoints"
mkdir -p $CHKPTDIR/$SLURM_JOB_ID
## DO NOT CHANGE ANYTHING BELOW -- DL params are in run_and_time.sh and config_<system>.sh files
DEBUG=1 # 1 = Print verbose messages for debugging
## Pre-warming the containers ##
hosts=( `scontrol show hostname |tr "\n" " "` )
pids=(); for hostn in ${hosts[@]}; do
timeout -k 600s 600s \
srun -N 1 -n 1 -w $hostn \
docker pull $CONT &
pids+=($!); rets+=($?);
done
wait "${pids[@]}"
success=0; for s in ${rets[@]}; do ((success+=s)); done ; if [ $success -ne 0 ]; then echo "ERR: Container pull failed"; exit $success ; fi
IBDEVICES=${IBDEVICES:-$DGXIBDEVICES}
## Check whether we are running in a slurm env
INSLURM=1
if [[ -z "$SLURM_JOB_ID" ]]; then
INSLURM=0
export SLURM_JOB_ID="${DATESTAMP}"
export SLURM_NNODES=1
fi
if [[ -z "SLURM_JOB_ID" || $SLURM_NNODES -eq 1 ]]; then
# don't need IB if not multi-node
export IBDEVICES=""
fi
# Create results directory
LOGFILE_BASE="${LOGDIR}/${DATESTAMP}"
mkdir -p $(dirname "${LOGFILE_BASE}")
export CONTNAME="${SLURM_JOB_ID}"
export DOCKEREXEC="nvidia-docker run --rm --net=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined $IBDEVICES"
CMD="python -np $((SLURM_NNODES*DGXNGPU)) -x EXTRA_PARAMS=\"${EXTRA_PARAMS}\" -x NCCL_LL_THRESHOLD=0 -x NCCL_DEBUG=INFO -x NCCL_NET_GDR_READ=1 -x NCCL_SOCKET_IFNAME=^docker0,bond0,lo $BIND ./run_pretraining.sh"
echo $CMD
mkdir -m 777 -p $LOGDIR
echo $CMD | tee -a $LOGDIR/$DATESTAMP.log
echo "slurm job id" $SLURM_JOB_ID &> $LOGDIR/$DATESTAMP.log
MASTER_IP=`getent hosts \`hostname\` | cut -d ' ' -f1`
SSH=''
SRUN=''
if [[ $INSLURM -eq 0 ]]; then
export hosts=( `hostname` )
else
export hosts=( `scontrol show hostname |tr "\n" " "` )
SSH='ssh -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $hostn'
SRUN='srun -N 1 -n 1 -w $hostn'
fi
unique_hosts=( $(echo "${hosts[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' ' ) )
export MASTER_HOST=${hosts[0]}
VARS="-e OMPI_MCA_mca_base_param_files=/dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf -e EXTRA_PARAMS -e GPUS -e BATCHSIZE -e CONT -e DGXSYSTEM=$DGXSYSTEM -e MASTER_HOST -e MASTER_IP -e SLURM_JOB_NUM_NODES -e SLURM_NNODES -e SLURM_NTASKS_PER_NODE -w /workspace/bert"
RUNSLEEPCMD=""
[[ "${PULL}" -eq "1" ]] && docker pull $CONT
## Setting up MPI
# MPI support files - in /dev/shm/mpi/<jobid>
# 1. Copy user keys to /dev/shm/mpi/<jobid>
# 2. Create mca_params.conf
# 3. Create sshentry.sh to support lauching into containers on worker nodes
# 4. Create mpi_hosts file
# 5. Copy standard ssh
if [[ $SLURM_NNODES -ne "1" ]]; then
# Make keys and copy
echo
[[ $DEBUG == 1 ]] && echo "Setting up ssh keys and config"
mkdir -p ${HOME}/.ssh/sbatch/${SLURM_JOB_ID}
ssh-keygen -t rsa -b 2048 -N "" -f "${HOME}/.ssh/sbatch/${SLURM_JOB_ID}/sshkey.rsa" -C "mxnet_${SLURM_JOB_ID}_" &>/dev/null
echo command=no-port-forwarding,no-agent-forwarding,no-X11-forwarding $(cat ${HOME}/.ssh/sbatch/${SLURM_JOB_ID}/sshkey.rsa.pub) >> ${HOME}/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys
[[ $DEBUG == 1 ]] && echo "Copy keys: srun -n $SLURM_JOB_NUM_NODES && cp -R ${HOME}/.ssh/sbatch/${SLURM_JOB_ID} /dev/shm/mpi && chmod 700 /dev/shm/mpi/${SLURM_JOB_ID}"
srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 bash -c "mkdir -p /dev/shm/mpi/${SLURM_JOB_ID}; cp -R ${HOME}/.ssh/sbatch/${SLURM_JOB_ID} /dev/shm/mpi; chmod 700 /dev/shm/mpi/${SLURM_JOB_ID}"
sleep 2 # Making copy
[[ $DEBUG == 1 ]] && ls /dev/shm
# Create mpi config file
srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 tee /dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf <<EOF
plm_rsh_agent = /usr/bin/ssh
plm_rsh_args = -i /dev/shm/mpi/${SLURM_JOB_ID}/sshkey.rsa -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -oLogLevel=ERROR -l ${USER}
orte_default_hostfile = /dev/shm/mpi/${SLURM_JOB_ID}/mpi_hosts
btl_openib_warn_default_gid_prefix = 0
mpi_warn_on_fork = 0
allow_run_as_root = 1
EOF
[[ $DEBUG == 1 ]] && echo "::mca_params.conf=" && cat /dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf
# Create ssh helper script that transfers an ssh into a compute node into the running container on that node
srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 tee /dev/shm/mpi/${SLURM_JOB_ID}/sshentry.sh <<EOF
#!/bin/bash
echo "::sshentry: entered \$(hostname)"
[[ -f $CONTNAME ]] && "::worker container not found error" && exit 1
echo "::sshentry: running \$SSH_ORIGINAL_COMMAND"
exec docker exec $CONTNAME /bin/bash -c "\$SSH_ORIGINAL_COMMAND"
EOF
[[ $DEBUG == 1 ]] && echo "::sshentry=" && cat /dev/shm/mpi/${SLURM_JOB_ID}/sshentry.sh
# Create mpi hostlist
for h in ${hosts[@]}; do
echo "$h slots=${SLURM_NTASKS_PER_NODE}" >> /dev/shm/mpi/${SLURM_JOB_ID}/mpi_hosts
done
[[ $DEBUG == 1 ]] && echo '::mpi-host file=' && cat /dev/shm/mpi/${SLURM_JOB_ID}/mpi_hosts
srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 bash -c "cp $(which ssh) /dev/shm/mpi/${SLURM_JOB_ID}/.; chmod 755 /dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf; chmod 755 /dev/shm/mpi/${SLURM_JOB_ID}/sshentry.sh"
# Check that ssh/mpi dir has correct number of files
[[ $(ls /dev/shm/mpi/${SLURM_JOB_ID} | wc -w) -lt 5 ]] && echo "ERR: /dev/shm/mpi/${SLURM_JOB_ID} doesn't exist or missing ssh/mpi files" && exit $?
fi
# Container launch
if [[ $INSLURM -eq 1 ]]; then
# Launch containers behind srun
[[ $DEBUG == 1 ]] && echo "" && echo ":Launch containers: srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 $DOCKEREXEC --name $CONTNAME $VOLS $VARS $CONT bash -c 'sleep infinity'"
srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 $DOCKEREXEC --name $CONTNAME $VOLS $VARS $CONT bash -c 'sleep infinity' & rv=$?
else
$DOCKEREXEC --name $CONTNAME $VOLS $VARS $CONT bash -c 'sleep infinity' & rv=$?
fi
[[ $rv -ne 0 ]] && echo "ERR: Launch sleep containers failed." && exit $rv
echo "sleep 60 while we pull our container, good golly!"
sleep 60
# Run benchmarks
echo "sleep again for 20"
sleep 20
export EXTRA_PARAMS
(
# Launching app
echo
echo "Launching user script on master node:"
hostn=$MASTER_HOST
$(eval echo $SSH) docker exec $VARS $CONTNAME $CMD ; rv=$?
[[ $rv -ne 0 ]] && echo "ERR: User script failed." && exit $rv
) |& tee ${LOGFILE_BASE}_$nrun.log
# Clean up (note: on SLURM we skip this, as the epilogue will take care of it)
if [[ $INSLURM -eq 0 ]]; then
docker rm -f $CONTNAME
fi

View File

@ -0,0 +1,63 @@
#!/bin/bash
MRPC_DIR=/workspace/bert/data/glue/MRPC
OUT_DIR=/results/MRPC
mkdir -p $OUT_DIR
echo "Container nvidia build = " $NVIDIA_BUILD_ID
init_checkpoint=${1}
mode=${2:-"train"}
max_steps=${3:-"-1.0"} # if < 0, has no effect
batch_size=${4:-"12"}
learning_rate=${5:-"5e-6"}
precision=${6:-"fp32"}
num_gpu=${7:-"8"}
epochs=${8:-"2"}
if [ "$mode" != "train" ] ; then
num_gpu=1
fi
use_fp16=""
if [ "$precision" = "fp16" ] ; then
echo "fp16 activated!"
use_fp16="--fp16"
fi
if [ "$num_gpu" = "1" ] ; then
mpi_command=""
else
mpi_command="torch.distributed.launch --nproc_per_node=$num_gpu"
fi
CMD="python -m $mpi_command run_glue.py "
CMD+="--task_name MRPC "
if [ "$mode" = "train" ] ; then
CMD+="--do_train "
CMD+="--train_batch_size=$batch_size "
else
CMD+="--do_eval "
CMD+="--eval_batch_size=$batch_size "
fi
CMD+="--do_lower_case "
CMD+="--data_dir $MRPC_DIR "
CMD+="--bert_model bert-large-uncased "
CMD+="--init_checkpoint $init_checkpoint "
CMD+="--max_seq_length 128 "
CMD+="--learning_rate $learning_rate "
CMD+="--num_train_epochs $epochs "
CMD+="--max_steps $max_steps "
CMD+="--output_dir $OUT_DIR "
CMD+="$use_fp16"
LOGFILE=$OUT_DIR/logfile
$CMD |& tee $LOGFILE
sed -r 's/ |(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`
echo "throughput: $throughput"

View File

@ -0,0 +1,152 @@
#!/bin/bash
echo "Container nvidia build = " $NVIDIA_BUILD_ID
DATASET=wikipedia_corpus # change this for other datasets
DATA_DIR=data/${DATASET}/hdf5_shards/
BERT_CONFIG=bert_config.json
RESULTS_DIR=/results
CHECKPOINTS_DIR=/results/checkpoints
mkdir -p $CHECKPOINTS_DIR
if [ ! -d "$DATA_DIR" ] ; then
echo "Warning! $DATA_DIR directory missing. Training cannot start"
fi
if [ ! -d "$RESULTS_DIR" ] ; then
echo "Error! $RESULTS_DIR directory missing."
exit -1
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
echo "Warning! $CHECKPOINTS_DIR directory missing."
echo "Checkpoints will be written to $RESULTS_DIR instead."
CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
echo "Error! BERT large configuration file not found at $BERT_CONFIG"
exit -1
fi
train_batch_size=${1:-14}
learning_rate=${2:-"0.4375e-4"}
precision=${3:-"fp16"}
num_gpus=${4:-8}
warmup_proportion=${5:-"0.01"}
train_steps=${6:-2285714}
save_checkpoint_steps=${7:-2000}
resume_training=${8:-"false"}
create_logfile=${9:-"true"}
checkpoint_activations=${10:-"false"}
seed=${11:-42}
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
CHECKPOINT_ACTIVATIONS=""
if [ "$checkpoint_activations" == "true" ] ; then
CHECKPOINT_ACTIVATIONS="--checkpoint_activations"
fi
CHECKPOINT=""
if [ "$resume_training" == "true" ] ; then
CHECKPOINT="--resume_from_checkpoint"
fi
echo $DATA_DIR
INPUT_DIR=$DATA_DIR
CMD=" /workspace/bert/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --do_train"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$train_steps"
CMD+=" --warmup_proportion=$warmup_proportion"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $CHECKPOINT_ACTIVATIONS"
CMD+=" $CHECKPOINT"
if [ "$num_gpus" -gt 1 ] ; then
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
CMD="python3 $CMD"
fi
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size \* $num_gpus)
printf -v TAG "pyt_bert_pretraining_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi
set -x
if [ -z "$LOGFILE" ] ; then
$CMD
else
(
$CMD
) |& tee $LOGFILE
fi
set +x
echo "finished pretraining, starting benchmarking"
target_loss=15
THROUGHPUT=10
THRESHOLD=0.9
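# parse the last tqdm "Iteration" line for the reported s/it timing and pull the
# latest average/final loss values logged by run_pretraining.py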
throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F's/it' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
echo "throughput: $throughput s/it"
echo "average loss: $loss"
echo "final loss: $final_loss"
ACCURACY_TEST_RESULT=$(awk 'BEGIN {print ('${loss}' <= '${target_loss}')}')
if [ $ACCURACY_TEST_RESULT == 1 ];
then
echo "&&&& ACCURACY TEST PASSED"
else
echo "&&&& ACCURACY TEST FAILED"
fi
PERFORMANCE_TEST_RESULT=$(awk 'BEGIN {print ('${throughput}' <= ('${THROUGHPUT}' * '${THRESHOLD}'))}')
if [ $PERFORMANCE_TEST_RESULT == 1 ];
then
echo "&&&& PERFORMANCE TEST PASSED"
else
echo "&&&& PERFORMANCE TEST FAILED"
fi
if [ $ACCURACY_TEST_RESULT == 1 -a $PERFORMANCE_TEST_RESULT == 1 ];
then
echo "&&&& PASSED"
exit 0
else
echo "&&&& FAILED"
exit 1
fi

View File

@ -0,0 +1,146 @@
#!/bin/bash
echo "Container nvidia build = " $NVIDIA_BUILD_ID
DATASET=wikipedia_corpus # change this for other datasets
DATA_DIR=data/${DATASET}/hdf5_shards/
BERT_CONFIG=bert_config.json
RESULTS_DIR=/results
CHECKPOINTS_DIR=/results/checkpoints
if [ ! -d "$DATA_DIR" ] ; then
echo "Warning! $DATA_DIR directory missing. Inference cannot start"
fi
if [ ! -d "$RESULTS_DIR" ] ; then
echo "Error! $RESULTS_DIR directory missing."
exit -1
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
echo "Warning! $CHECKPOINTS_DIR directory missing."
echo "Checkpoints will be loaded from $RESULTS_DIR instead."
CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
echo "Error! BERT large configuration file not found at $BERT_CONFIG"
exit -1
fi
eval_batch_size=${1:-14}
precision=${2:-"fp16"}
num_gpus=${3:-8}
inference_mode=${4:-"eval"}
model_checkpoint=${5:-"-1"}
inference_steps=${6:-"-1"}
create_logfile=${7:-"true"}
seed=${8:-42}
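# Example invocation (the script's file name and the values below are illustrative only;
# all eight positional arguments fall back to the defaults above when omitted):
#   bash scripts/run_pretraining_inference.sh 14 fp16 8 eval -1 -1 true 42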
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
MODE=""
if [ "$inference_mode" = "eval" ] ; then
MODE="--eval"
elif [ "$inference_mode" = "prediction" ] ; then
MODE="--prediction"
else
echo "Unknown <inference_mode> argument"
exit -2
fi
echo $DATA_DIR
CMD=" /workspace/bert/run_pretraining_inference.py"
CMD+=" --input_dir=$DATA_DIR"
CMD+=" --ckpt_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --eval_batch_size=$eval_batch_size"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$inference_steps"
CMD+=" --ckpt_step=$model_checkpoint"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $MODE"
if [ "$num_gpus" -gt 1 ] ; then
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
CMD="python3 $CMD"
fi
if [ "$create_logfile" = "true" ] ; then
export GBS=$((eval_batch_size * num_gpus))
printf -v TAG "pyt_bert_pretraining_inference_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi
set -x
if [ -z "$LOGFILE" ] ; then
$CMD
else
(
$CMD
) |& tee $LOGFILE
fi
set +x
target_loss=15
THROUGHPUT=1.0
THRESHOLD=0.9
throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
echo "throughput: $throughput it/s"
PERFORMANCE_TEST_RESULT=$(awk 'BEGIN {print ('${throughput}' >= \
('${THROUGHPUT}' * '${THRESHOLD}'))}')
if [ $PERFORMANCE_TEST_RESULT == 1 ];
then
echo "&&&& PERFORMANCE TEST PASSED"
else
echo "&&&& PERFORMANCE TEST FAILED"
fi
if [ "$inference_mode" = "eval" ] ; then
loss=`cat $LOGFILE | grep Finished | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
echo "final loss: $loss"
ACCURACY_TEST_RESULT=$(awk 'BEGIN {print ('${loss}' <= '${target_loss}')}')
if [ $ACCURACY_TEST_RESULT == 1 ];
then
echo "&&&& ACCURACY TEST PASSED"
else
echo "&&&& ACCURACY TEST FAILED"
fi
if [ $ACCURACY_TEST_RESULT == 1 -a $PERFORMANCE_TEST_RESULT == 1 ];
then
echo "&&&& PASSED"
exit 0
else
echo "&&&& FAILED"
exit 1
fi
fi

View File

@ -0,0 +1,88 @@
#!/usr/bin/env bash
#OUT_DIR=/results/SQuAD
echo "Container nvidia build = " $NVIDIA_BUILD_ID
init_checkpoint=${1:-"/workspace/checkpoints/bert_uncased.pt"}
epochs=${2:-"2.0"}
batch_size=${3:-"24"}
learning_rate=${4:-"3e-5"}
precision=${5:-"fp16"}
num_gpu=${6:-"8"}
seed=${7:-"42"}
squad_dir=${8:-"/workspace/bert/data/squad/v1.1"}
vocab_file=${9:-"/workspace/bert/vocab/vocab"}
OUT_DIR=${10:-"/results/SQuAD"}
mode=${11:-"train eval"}
CONFIG_FILE=${12:-"/workspace/bert/bert_config.json"}
max_steps=${13:-"-1"}
echo "out dir is $OUT_DIR"
mkdir -p $OUT_DIR
if [ ! -d "$OUT_DIR" ]; then
echo "ERROR: non existing $OUT_DIR"
exit 1
fi
use_fp16=""
if [ "$precision" = "fp16" ] ; then
echo "fp16 activated!"
use_fp16=" --fp16 "
fi
if [ "$num_gpu" = "1" ] ; then
export CUDA_VISIBLE_DEVICES=0
mpi_command=""
else
unset CUDA_VISIBLE_DEVICES
mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu"
fi
CMD="python $mpi_command run_squad.py "
CMD+="--init_checkpoint=$init_checkpoint "
if [ "$mode" = "train" ] ; then
CMD+="--do_train "
CMD+="--train_file=$squad_dir/train-v1.1.json "
CMD+="--train_batch_size=$batch_size "
elif [ "$mode" = "eval" ] ; then
CMD+="--do_predict "
CMD+="--predict_file=$squad_dir/dev-v1.1.json "
CMD+="--predict_batch_size=$batch_size "
else
CMD+=" --do_train "
CMD+=" --train_file=$squad_dir/train-v1.1.json "
CMD+=" --train_batch_size=$batch_size "
CMD+="--do_predict "
CMD+="--predict_file=$squad_dir/dev-v1.1.json "
CMD+="--predict_batch_size=$batch_size "
fi
CMD+=" --do_lower_case "
# CMD+=" --old "
# CMD+=" --loss_scale=128 "
CMD+=" --bert_model=bert-large-uncased "
CMD+=" --learning_rate=$learning_rate "
CMD+=" --seed=$seed "
CMD+=" --num_train_epochs=$epochs "
CMD+=" --max_seq_length=384 "
CMD+=" --doc_stride=128 "
CMD+=" --output_dir=$OUT_DIR "
CMD+=" --vocab_file=$vocab_file "
CMD+=" --config_file=$CONFIG_FILE "
CMD+=" --max_steps=$max_steps "
CMD+=" $use_fp16"
LOGFILE=$OUT_DIR/logfile.txt
echo "$CMD |& tee $LOGFILE"
time $CMD |& tee $LOGFILE
#sed -r 's/\r|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)' | head -1 | egrep -o '[0-9.]+'`
if [ "$mode" != "train" ]; then
python $squad_dir/evaluate-v1.1.py $squad_dir/dev-v1.1.json $OUT_DIR/predictions.json |& tee -a $LOGFILE
fi
echo "throughput: $throughput"

View File

@ -0,0 +1,62 @@
#!/bin/bash
SWAG_DIR=/workspace/bert/data/swag
OUT_DIR=/results/SWAG
mkdir -p $OUT_DIR
echo "Container nvidia build = " $NVIDIA_BUILD_ID
init_checkpoint=${1}
mode=${2:-"train"}
max_steps=${3:-"-1.0"} # if < 0, has no effect
batch_size=${4:-"12"}
learning_rate=${5:-"5e-6"}
precision=${6:-"fp32"}
num_gpu=${7:-"8"}
epochs=${8:-"2"}
if [ "$mode" != "train" ] ; then
num_gpu=1
fi
use_fp16=""
if [ "$precision" = "fp16" ] ; then
echo "fp16 activated!"
use_fp16="--fp16"
fi
if [ "$num_gpu" = "1" ] ; then
mpi_command=""
else
mpi_command="torch.distributed.launch --nproc_per_node=$num_gpu"
fi
CMD="python -m $mpi_command run_swag.py "
CMD+="--init_checkpoint=$init_checkpoint "
if [ "$mode" = "train" ] ; then
CMD+="--do_train "
CMD+="--train_batch_size=$batch_size "
else
CMD+="--do_eval "
CMD+="--eval_batch_size=$batch_size "
fi
CMD+="--do_lower_case "
CMD+="--data_dir $SWAG_DIR/data/ "
CMD+="--bert_model bert-large-uncased "
CMD+="--max_seq_length 128 "
CMD+="--learning_rate $learning_rate "
CMD+="--num_train_epochs $epochs "
CMD+="--max_steps $max_steps "
CMD+="--output_dir $OUT_DIR "
CMD+="$use_fp16"
LOGFILE=$OUT_DIR/logfile
$CMD |& tee $LOGFILE
sed -r 's/\r|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`
echo "throughput: $throughput"

View File

@ -0,0 +1,89 @@
#!/bin/bash
# purpose: for multinode training on slurm clusters
node_type=${1:-"dgx1"}
num_nodes=${2:-1}
partition=${3:-"default"}
wall_time=${4:-"12:00:00"}
job_name=${5:-"pyt_bert"}
root_dir=${6:-"$PWD"}
train_batch_size=${7:-4}
eval_batch_size=${8:-4}
train_steps=${9:-1000000}
warmup_proportion=${10:-0.01}
learning_rate=${11:-1e-4}
precision=${12:-"fp16"}
save_checkpoint_steps=${13:-5000}
results_dir=${14:-"$root_dir/results"}
checkpoints_dir=${15:-"$root_dir/checkpoints"}
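# Example invocation (the script's file name and the values below are illustrative only;
# results_dir and checkpoints_dir fall back to the defaults above when omitted):
#   bash scripts/run_pretraining_multinode.sh dgx2h 4 default 12:00:00 pyt_bert $PWD 4 4 1000000 0.01 1e-4 fp16 5000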
CONT=${CONT:-"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.02-py3-devel"}
BENCHMARK=${BENCHMARK:-"bert"}
BENCHMARK_NAME="bert"
if [ "$node_type" = "dgx1" ] ; then
echo "Running on dgx1 systems"
DGXSYSTEM="DGX1"
DGXNGPU=8
DGXSOCKETCORES=20
DGXNSOCKET=2
DGXHT=2
DGXIBDEVICES='--device=/dev/infiniband --device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/ucm0 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/uverbs0 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1 --device=/dev/infiniband/issm0 --device=/dev/infiniband/umad0'
elif [ "$node_type" = "dgx2h" ] ; then
echo "Running on dgx2h systems"
DGXSYSTEM="DGX2H"
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'
else
echo "Unknown <node_type>, must be either dgx1 or dgx2"
exit -1
fi
printf -v EXTRA_PARAMS "%d %d %e %s 1 %f %d %d false" $train_batch_size $eval_batch_size $learning_rate "$precision" $warmup_proportion $train_steps $save_checkpoint_steps
export ROOTDIR=$root_dir
export CODEDIR=${CODEDIR:-$root_dir}   # directory containing the BERT code and scripts/run.sub
export LOGDIR=${LOGDIR:-$results_dir}  # where the sbatch output log is written
export DATA_DIR=${DATA_DIR:-$CODEDIR/data/wikipedia_corpus/pyt_hdf5_shards}
VOLS="-v $ROOTDIR:/workspace/bert"
VOLS+=" -v $DATA_DIR:/workspace/bert/data/wikipedia_corpus/pyt_hdf5_shards"
# VOLS+=" -v $BOOKS_DIR:/workspace/bert/data/bookcorpus/final_tfrecord_sharded"
VOLS+=" -v $results_dir:/results"
VOLS+=" -v $checkpoints_dir:/checkpoints"
export VOLS
export CONT
export DGXSYSTEM
export DGXNGPU
export DGXIBDEVICES
export EXTRA_PARAMS
set -x
cd $CODEDIR
pwd
PART=""
if [ "$partition" != "default" ] ; then
printf -v PART "%s" "-p $partition"
fi
export GBS=$(expr $num_nodes \* $train_batch_size \* $DGXNGPU)
printf -v TAG "%s_%dn_%s_gbs%d" "$job_name" $num_nodes "$precision" $GBS
export DATESTAMP=`date +'%y%m%d%H%M%S'`
sbatch $PART \
-N $num_nodes \
-t $wall_time \
-J $job_name \
--exclusive \
--mem=0 \
--mail-type=FAIL \
--ntasks-per-node=$DGXNGPU \
--threads-per-core=$DGXHT \
--cores-per-socket=$DGXSOCKETCORES \
--output=$LOGDIR/$TAG.$DATESTAMP.log \
$CODEDIR/scripts/run.sub
set +x

View File

@ -0,0 +1,391 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import unicodedata
import six
from io import open
from file_utils import cached_path
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-uncased': 512,
'bert-large-uncased': 512,
'bert-base-cased': 512,
'bert-large-cased': 512,
'bert-base-multilingual-uncased': 512,
'bert-base-multilingual-cased': 512,
'bert-base-chinese': 512,
}
VOCAB_NAME = 'vocab.txt'
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r", encoding="utf-8") as reader:
while True:
token = reader.readline()
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class BertTokenizer(object):
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""
def __init__(self, vocab_file, do_lower_case=True, max_len=None,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
never_split=never_split)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(self.vocab[token])
if len(ids) > self.max_len:
raise ValueError(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this BERT model ({} > {}). Running this"
" sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids):
"""Converts a sequence of ids in wordpiece tokens using the vocab."""
tokens = []
for i in ids:
tokens.append(self.ids_to_tokens[i])
return tokens
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = pretrained_model_name_or_path
if os.path.isdir(vocab_file):
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
vocab_file))
return None
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
return tokenizer
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self,
do_lower_case=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in self.never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
if text in self.never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
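# Example usage (illustrative): the named vocabularies above are downloaded and cached on first use.
#   tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
#   tokens = tokenizer.tokenize("BERT tokenization is greedy longest-match-first.")
#   ids = tokenizer.convert_tokens_to_ids(tokens)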

View File

@ -0,0 +1,123 @@
# NVIDIA
import hashlib
import urllib.request
import zipfile
# Download urls
model_urls = {
'bert_base_uncased' : ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'),
'bert_large_uncased' : ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'),
'bert_base_cased' : ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'),
'bert_large_cased' : ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'),
'bert_base_multilingual_cased' : ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'),
'bert_large_multilingual_uncased' : ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'),
'bert_base_chinese' : ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip')
}
# SHA256sum verification for file download integrity (and checking for changes from the download source over time)
bert_base_uncased_sha = {
'bert_config.json' : '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc',
'bert_model.ckpt.data-00000-of-00001' : '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84',
'bert_model.ckpt.index' : '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b',
'bert_model.ckpt.meta' : 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e',
'vocab.txt' : '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}
bert_large_uncased_sha = {
'bert_config.json' : 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb',
'bert_model.ckpt.data-00000-of-00001' : 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1',
'bert_model.ckpt.index' : '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093',
'bert_model.ckpt.meta' : '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1',
'vocab.txt' : '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}
bert_base_cased_sha = {
'bert_config.json' : 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc',
'bert_model.ckpt.data-00000-of-00001' : '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea',
'bert_model.ckpt.index' : '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1',
'bert_model.ckpt.meta' : '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98',
'vocab.txt' : 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}
bert_large_cased_sha = {
'bert_config.json' : '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57',
'bert_model.ckpt.data-00000-of-00001' : '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0',
'bert_model.ckpt.index' : 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf',
'bert_model.ckpt.meta' : 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1',
'vocab.txt' : 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}
bert_base_multilingual_cased_sha = {
'bert_config.json' : 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0',
'bert_model.ckpt.data-00000-of-00001' : '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5',
'bert_model.ckpt.index' : '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37',
'bert_model.ckpt.meta' : '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa',
'vocab.txt' : 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c',
}
bert_large_multilingual_uncased_sha = {
'bert_config.json' : '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624',
'bert_model.ckpt.data-00000-of-00001' : '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429',
'bert_model.ckpt.index' : '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7',
'bert_model.ckpt.meta' : '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29',
'vocab.txt' : '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f',
}
bert_base_chinese_sha = {
'bert_config.json' : '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015',
'bert_model.ckpt.data-00000-of-00001' : '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba',
'bert_model.ckpt.index' : '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e',
'bert_model.ckpt.meta' : 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047',
'vocab.txt' : '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c',
}
# Relate SHA to urls for loop below
model_sha = {
'bert_base_uncased' : bert_base_uncased_sha,
'bert_large_uncased' : bert_large_uncased_sha,
'bert_base_cased' : bert_base_cased_sha,
'bert_large_cased' : bert_large_cased_sha,
'bert_base_multilingual_cased' : bert_base_multilingual_cased_sha,
'bert_large_multilingual_uncased' : bert_large_multilingual_uncased_sha,
'bert_base_chinese' : bert_base_chinese_sha
}
# Helper to get sha256sum of a file
def sha256sum(filename):
h = hashlib.sha256()
b = bytearray(128*1024)
mv = memoryview(b)
with open(filename, 'rb', buffering=0) as f:
for n in iter(lambda : f.readinto(mv), 0):
h.update(mv[:n])
return h.hexdigest()
# Iterate over urls: download, unzip, verify sha256sum
found_mismatch_sha = False
for model in model_urls:
url = model_urls[model][0]
file = model_urls[model][1]
print("Downloading", url)
response = urllib.request.urlopen(url)
with open(file, "wb") as handle:
handle.write(response.read())
print("Unzipping", file)
zip = zipfile.ZipFile(file, 'r')
zip.extractall()
zip.close()
sha_dict = model_sha[model]
for extracted_file in sha_dict:
sha = sha_dict[extracted_file]
if sha != sha256sum(file[:-4] + "/" + extracted_file):
found_mismatch_sha = True
print("SHA256sum does not match on file:", extracted_file, "from download url:", url)
else:
print(file[:-4] + "/" + extracted_file, "\t", "verified")
if not found_mismatch_sha:
print("All downloads pass sha256sum verification.")

File diff suppressed because it is too large

View File

@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvcr.io/nvidia/pytorch:18.12.1-py3
FROM nvcr.io/nvidia/pytorch:19.05-py3
RUN apt-get update && \
apt-get install -y unzip

View File

@ -1,6 +1,52 @@
# Neural Collaborative Filtering (NCF)
# Neural Collaborative Filtering (NCF) for PyTorch
This repository provides a script and recipe to train the Neural Collaborative Filtering (NCF)
model to achieve state of the art accuracy, and is tested and maintained by NVIDIA.
Table of Contents
=================
* [The model](#the-model)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Details](#details)
* [Scripts and sample code](#scripts-and-sample-code)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [ML-1m](#ml-1m)
* [Training process](#training-process)
* [Inference process](#inference-process)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g)
* [Training stability test](#training-stability-test)
* [Training performance results](#training-performance-results)
* [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
* [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g-1)
* [NVIDIA DGX-2 (16x V100 32G)](#nvidia-dgx-2-16x-v100-32g)
* [Inference performance results](#inference-performance-results)
* [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g-1)
* [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g-2)
* [NVIDIA DGX-2 (16x V100 32G)](#nvidia-dgx-2-16x-v100-32g-1)
* [Changelog](#changelog)
* [Known issues](#known-issues)
* [Scaling beyond 8 GPUs](#scaling-beyond-8-gpus)
* [Memory usage](#memory-usage)
## The model
The NCF model focuses on providing recommendations, also known as collaborative filtering, with implicit feedback. The training data for this model should contain binary information about whether a user interacted with a specific item.
NCF was first described by Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu and Tat-Seng Chua in the [Neural Collaborative Filtering paper](https://arxiv.org/abs/1708.05031).
@ -8,6 +54,23 @@ The implementation in this repository focuses on the NeuMF instantiation of the
We modified it to use dropout in the FullyConnected layers. This reduces overfitting and increases the final accuracy.
Training the other two instantiations of NCF (GMF and MLP) is not supported.
Contrary to the original paper, we benchmark the model on the larger [ML-20m dataset](https://grouplens.org/datasets/movielens/20m/)
instead of using the smaller [ML-1m](https://grouplens.org/datasets/movielens/1m/) dataset, as we think this is more representative of production-type environments.
However, using the ML-1m dataset is also supported.
This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. Multi-GPU training is also supported. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Model architecture
This model is based mainly on Embedding and FullyConnected layers. The control flow is divided into two branches:
* Multi Layer Perceptron (MLP) branch, which transforms the input through FullyConnected layers with ReLU activations and dropout.
* Matrix Factorization (MF) branch, which performs collaborative filtering factorization.
Each user and each item has two embedding vectors associated with it -- one for the MLP branch and the other for the MF branch.
The outputs from those branches are concatenated and fed to the final FullyConnected layer with sigmoid activation.
This can be interpreted as a probability of a user interacting with a given item.
<p align="center">
<img width="70%" src="./img/ncf_diagram.png" />
@ -16,252 +79,483 @@ Figure 1. The architecture of a Neural Collaborative Filtering model. Taken from
</p>
Contrary to the original paper, we benchmark the model on the larger [ml-20m dataset](https://grouplens.org/datasets/movielens/20m/)
instead of using the smaller [ml-1m](https://grouplens.org/datasets/movielens/1m/) dataset, as we think this is more representative of production-type environments.
However, using the ml-1m dataset is also supported.
### Default configuration
## Requirements
The following features were implemented in this model:
* Automatic Mixed Precision (AMP)
* Data-parallel multi-GPU training and evaluation
* Dropout
* Gradient accumulation
The easiest way to train the model is to use a Docker container. This would require:
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 18.12.1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) or newer
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
Frameworks Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
The following performance optimizations were implemented in this model:
* FusedAdam optimizer
* Approximate train negative sampling
* Caching all the positive training samples in the device memory
## Training using mixed precision with Tensor Cores
### Supported hardware
Before you can train using mixed precision with Tensor Cores, ensure that you have an
NVIDIA Volta based GPU. Other platforms may work, however, are not officially
supported.
### Software changes
For detailed information about how to train using mixed precision, see the [Mixed
Precision Training paper](https://arxiv.org/abs/1710.03740)
and [Training With Mixed Precision documentation](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html).
### Feature support matrix
The following features are supported by this model:
Another option for adding mixed-precision support is available from NVIDIA's
[APEX](https://github.com/NVIDIA/apex), a PyTorch extension, that contains utility libraries, such as AMP, which require minimal network code changes to leverage Tensor Core performance.
| **Feature** | **NCF PyTorch** |
|:---:|:--------:|
| Automatic Mixed Precision (AMP) | Yes |
| Multi-GPU training with Distributed Data Parallel (DDP) | Yes |
| Fused Adam | Yes |
This implementation of the NCF model uses a custom FP16 optimizer to implement mixed precision with static loss scaling.
The custom FP16 Optimizer was used to take advantage of the performance gains provided by the FusedOptimizer.
#### Features
* Automatic Mixed Precision - This implementation of NCF uses AMP to implement mixed precision training.
It allows us to use FP16 training with FP32 master weights by modifying just 3 lines of code.
* Multi-GPU training with Distributed Data Parallel - uses Apex's DDP to implement efficient multi-GPU training with NCCL.
* Fused Adam - We use the fused implementation of the Adam optimizer provided by the Apex package. It fuses several elementwise operations of the weight update into a single kernel for faster updates.
Since NCF is a relatively lightweight model with a large number of parameters, we've observed significant performance improvements from using FusedAdam; a minimal usage sketch follows this list.
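This sketch shows what plugging in Apex's `FusedAdam` can look like; the model here is only a stand-in (not this repository's NeuMF implementation), and the hyperparameter values are the defaults quoted later in this README.
```python
import torch
from apex.optimizers import FusedAdam

model = torch.nn.Linear(128, 1).cuda()  # stand-in for the NeuMF model
optimizer = FusedAdam(model.parameters(), lr=0.0045, betas=(0.25, 0.5), eps=1e-8)
```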
## Quick start guide
## Setup
The following section lists the requirements in order to start training the Neural Collaborative Filtering model.
### 1. Build and launch an NCF PyTorch Docker container
### Requirements
This repository contains a Dockerfile which extends the PyTorch NGC container and encapsulates some dependencies.
Aside from these dependencies, ensure you have the following components:
* NVIDIA Docker
* PyTorch 19.05-py3 NGC container
* NVIDIA Volta or Turing based GPU
After Docker is correctly set up, you can build the NCF image with:
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned NVIDIA Container Support Matrix.
### Quick Start Guide
1. Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/Recommendation/NCF
```
2. Build an NCF PyTorch Docker container.
After Docker is set up, you can build the NCF image with:
```bash
docker build . -t nvidia_ncf
```
After that the NVIDIA NCF container can be launched with:
3. Start an interactive session in the NGC container to run preprocessing/training and inference.
The NCF PyTorch container can be launched with:
```bash
mkdir data
docker run --runtime=nvidia -it --rm --ipc=host -v ${PWD}/data:/data nvidia_ncf bash
```
This will launch the container and mount the ./data directory as a volume to the /data directory inside the container.
Any datasets and experiment results (logs, checkpoints etc.) saved to /data will be accessible
in the './data' directory on the host.
This will launch the container and mount the `./data` directory as a volume to the `./data` directory inside the container.
Any datasets and experiment results (logs, checkpoints etc.) saved to `./data` will be accessible
in the `./data` directory on the host.
### 2. Data preparation
4. Download and preprocess the data.
Preprocessing consists of downloading the data, filtering out users that have less than 20 ratings (by default), sorting the data and dropping the duplicates.
The preprocessed train and test data is then saved in PyTorch binary format to be loaded just before training.
Note: Preprocessing requires PyTorch and should therefore be run inside the Docker container.
No data augmentation techniques are used.
To download and preprocess the ml-20m dataset you can run:
To download and preprocess the ML-20m dataset you can run:
```bash
./prepare_dataset.sh
```
Please note that this command will return immediately without downloading anything if the data is already present in the /data directory.
Note: This command will return immediately without downloading anything if the data is already present in the `./data` directory.
#### Other datasets
This will store the preprocessed training and evaluation data in the `./data` directory so that it can be later
used to train the model (by passing the appropriate `--data` argument to the `ncf.py` script).
This implementation is tuned for the ml-20m and ml-1m datasets.
Using other datasets might require tuning some hyperparameters (e.g., learning rate, beta1, beta2)
5. Start training.
If you'd like to use your custom dataset you can do it by adding support for it in the prepare_dataset.sh and download_dataset.sh scripts.
The required format of the data is a CSV file in which the first column contains the userID and the second column contains
the itemID.
The performance of the model depends on the dataset size.
Generally, the model should scale better for datasets containing more data points.
For a smaller dataset you might experience slower performance.
##### ml-1m
To download and preprocess the ml-1m dataset run:
```bash
./prepare_dataset.sh ml-1m
```
This will store the preprocessed training and evaluation data in the /data directory so that it can be later
used to train the model (by passing the appropriate --data argument to the ncf.py script).
### 3. Run the training
After the docker container is launched, the training with the [default hyperparameters](#5-hyperparameters) can be started with:
After the Docker container is launched, the training with the default hyperparameters can be started with:
```bash
./prepare_dataset.sh
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m
```
This will result in a checkpoint file being written to /data/checkpoints/model.pth.
This will result in a checkpoint file being written to `/data/checkpoints/model.pth`.
### 4. Test a trained model
6. Start validation/evaluation.
The trained model can be evaluated by passing the --mode test flag to the ncf.py script:
The trained model can be evaluated by passing the `--mode test` flag to the `ncf.py` script:
```bash
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m --mode test --checkpoint-path /data/checkpoints/model.pth
```
### 5. Hyperparameters and command line arguments
The default hyperparameters used are:
## Details
* learning rate: 0.0045
* beta1: 0.25
* beta2: 0.5
* training batch size: 1048576
* epsilon: 1e-8
* loss scale: 8192
* negatives sampled for training: 4
* use mixed precision training: Yes
* number of GPUs used: 8
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
The `ncf.py` script contains most of the training and validation logic. Data loading and preprocessing code is located in `dataloading.py`.
The model architecture is defined in `neumf.py`. Some initial data preprocessing is located in `convert.py`.
The logger directory contains simple bookkeeping utilities for storing training results.
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example:
`python ncf.py --help`
The following example output is printed when running the sample:
```
usage: ncf.py [-h] [--data DATA] [-e EPOCHS] [-b BATCH_SIZE]
[--valid_batch_size VALID_BATCH_SIZE] [-f FACTORS]
[--layers LAYERS [LAYERS ...]] [-n NEGATIVE_SAMPLES]
[-l LEARNING_RATE] [-k TOPK] [--seed SEED]
[--threshold THRESHOLD] [--valid_negative VALID_NEGATIVE]
[--beta1 BETA1] [--beta2 BETA2] [--eps EPS] [--dropout DROPOUT]
[--checkpoint_dir CHECKPOINT_DIR] [--mode {train,test}]
[--grads_accumulated GRADS_ACCUMULATED] [--opt_level {O0,O2}]
[--local_rank LOCAL_RANK]
Train a Neural Collaborative Filtering model:
optional arguments:
-h, --help show this help message and exit
--data DATA Path to test and training data files
-e EPOCHS, --epochs EPOCHS
Number of epochs for training
-b BATCH_SIZE, --batch_size BATCH_SIZE
Number of examples for each iteration
--valid_batch_size VALID_BATCH_SIZE
Number of examples in each validation chunk
-f FACTORS, --factors FACTORS
Number of predictive factors
--layers LAYERS [LAYERS ...]
Sizes of hidden layers for MLP
-n NEGATIVE_SAMPLES, --negative_samples NEGATIVE_SAMPLES
Number of negative examples per interaction
-l LEARNING_RATE, --learning_rate LEARNING_RATE
Learning rate for optimizer
-k TOPK, --topk TOPK Rank for test examples to be considered a hit
--seed SEED, -s SEED Manually set random seed for torch
--threshold THRESHOLD, -t THRESHOLD
Stop training early at threshold
--valid_negative VALID_NEGATIVE
Number of negative samples for each positive test
example
--beta1 BETA1, -b1 BETA1
Beta1 for Adam
--beta2 BETA2, -b2 BETA2
Beta2 for Adam
--eps EPS Epsilon for Adam
--dropout DROPOUT Dropout probability, if equal to 0 will not use
dropout at all
--checkpoint_dir CHECKPOINT_DIR
Path to the directory storing the checkpoint file
--mode {train,test} Passing "test" will only run a single evaluation,
otherwise full training will be performed
--grads_accumulated GRADS_ACCUMULATED
Number of gradients to accumulate before performing an
optimization step
--opt_level {O0,O2} Optimization level for Automatic Mixed Precision
--local_rank LOCAL_RANK
Necessary for multi-GPU training
All these parameters can be controlled by passing command line arguments to the ncf.py script.
To get a complete list of all command line arguments with descriptions and default values you can run:
```bash
python ncf.py --help
```
### Getting the data
## Training accuracy results
The NCF model was trained on the ML-20m dataset.
For each user, the interaction with the latest timestamp was included in the test set and the rest of the examples are used as the training data.
This repository contains the `./prepare_dataset.sh` script which will automatically download and preprocess the training and validation datasets.
By default, data will be downloaded to the `/data` directory. The preprocessed data will be placed in `/data/cache`.
#### Dataset guidelines
The required format of the data is a CSV file with three columns: `user_id`, `item_id` and `timestamp`. This CSV should contain only the positive examples, in other words,
the ones for which an interaction between a user and an item occurred. The negatives will be sampled during the training and validation.
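For illustration only, a tiny input file in this format could be created as follows (the file name and values are hypothetical):
```python
import pandas as pd

# Every row is one observed, i.e. positive, user-item interaction.
interactions = pd.DataFrame({
    "user_id":   [0, 0, 1],
    "item_id":   [12, 7, 12],
    "timestamp": [1112484027, 1112484580, 1094785740],
})
interactions.to_csv("interactions.csv", index=False)
```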
#### Multi-dataset
This implementation is tuned for the ML-20m and ML-1m datasets.
Using other datasets might require tuning some hyperparameters (for example, learning rate, beta1 and beta2).
If you'd like to use your custom dataset you can do it by adding support for it in the `prepare_dataset.sh` and `download_dataset.sh` scripts.
The performance of the model depends on the dataset size.
Generally, the model should scale better for datasets containing more data points.
For a smaller dataset you might experience slower performance.
#### ML-1m
To download, preprocess and train on the ML-1m dataset run:
```bash
./prepare_dataset.sh ml-1m
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-1m
```
### Training process
The name of the training script is `ncf.py`. Because of the multi-GPU support, it should always be run with the torch distributed launcher like this:
```bash
python -m torch.distributed.launch --nproc_per_node=<number_of_gpus> ncf.py --data <path_to_dataset> [other_parameters]
```
The main results of the training are the checkpoints stored by default in `/data/checkpoints/`. This location can be controlled
by the `--checkpoint_dir` command-line argument.
The validation metric is Hit Rate at 10 (HR@10) with 100 test negative samples. This means that for each positive sample in
the test set 100 negatives are sampled. All resulting 101 samples are then scored by the model. If the true positive sample is
among the 10 samples with highest scores we have a "hit" and the metric is equal to 1, otherwise it's equal to 0.
The HR@10 metric is the number of hits in the entire test set divided by the number of samples in the test set.
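As a rough sketch (not this repository's implementation), the per-sample hit computation behind HR@10 can be written as:
```python
import torch

def hit_at_10(scores: torch.Tensor, positive_index: int) -> float:
    """scores holds model outputs for 1 positive and 100 sampled negatives (101 values);
    the sample is a hit if the positive item is among the 10 highest-scoring candidates."""
    top10 = torch.topk(scores, k=10).indices
    return float(positive_index in top10)

# HR@10 over the test set is then the mean of these per-user hits.
```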
### Inference process
Inference can be launched with the same script used for training by passing the `--mode test` flag:
```bash
python -m torch.distributed.launch --nproc_per_node=<number_of_gpus> ncf.py --data <path_to_dataset> --mode test [other_parameters]
```
The script will then:
* Load the checkpoint from the directory specified by the `--checkpoint_dir` argument
* Run inference on the test dataset
* Compute and print the validation metric
## Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
### Enabling mixed precision
Using the Automatic Mixed Precision (AMP) package requires two modifications in the source code.
The first one is to initialize the model and the optimizer using the `amp.initialize` function:
```python
model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
keep_batchnorm_fp32=False, loss_scale='dynamic')
```
The second one is to use the AMP's loss scaling context manager:
```python
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
```
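Putting the two snippets together, a single training step might look like the following sketch; the data loader and loss function are placeholders, and `model` and `optimizer` are the objects returned by the `amp.initialize` call above.
```python
import torch

criterion = torch.nn.BCEWithLogitsLoss()       # placeholder loss for implicit feedback
for users, items, labels in train_loader:      # placeholder data loader
    optimizer.zero_grad()
    loss = criterion(model(users, items), labels)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
```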
## Benchmarking
### Training performance benchmark
NCF training on NVIDIA DGX systems is very fast; therefore, in order to measure training and validation throughput, you can simply run the full training job with:
```bash
./prepare_dataset.sh
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m --epochs 5
```
At the end of the script, a line reporting the best train throughput is printed.
### Inference performance benchmark
Validation throughput can be measured by running the full training job with:
```bash
./prepare_dataset.sh
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m --epochs 5
```
The best validation throughput is reported to the standard output.
## Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
### Training accuracy results
#### NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs.
The following table lists the best hit rate at 10 for DGX-1 with 8 V100 32G GPUs:
| **Number of GPUs** | **Full precision HR@10** | **Mixed precision HR@10** |
| **Number of GPUs** | **Single precision HR@10** | **Mixed precision HR@10** |
|:---:|:--------:|:-------:|
|1| 0.959015 |0.959485|
|4| 0.959389 |0.959274|
|8| 0.959015 |0.96|
|1| 0.95847 | 0.95845 |
|4| 0.95887 | 0.95841 |
|8| 0.95850 | 0.95885 |
Here's an example validation accuracy curve for mixed precision vs full precision on DGX-1 with 8 V100 32G GPUs:
Here's an example validation accuracy curve for mixed precision vs single precision on DGX-1 with 8 V100 32G GPUs:
![ValidationAccuracy](./img/dgx1v_32_curve.png)
To reproduce this result, start the NCF Docker container interactively and run:
```bash
./prepare_dataset.sh
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m
```
Training accuracy results on a DGX-1 with 8 V100 16G GPUs and on DGX-2 should be the same.
#### Training stability test
The histogram below shows the best HR@10 achieved
for 400 experiments using mixed precision and 400 experiments using single precision.
Mean HR@10 for mixed precision was equal to 0.95917 and for single precision it was equal to
0.95915.
Mean HR@10 for mixed precision was equal to 0.95868 and for single precision it was equal to
0.95867.
![hr_histogram](./img/hr_histogram.png)
## Training performance results
### Training performance results
This example is based on [our submission for the MLPerf v0.5 benchmark](https://github.com/mlperf/results/tree/master/v0.5.0/nvidia/submission/code/recommendation/pytorch). Please note that we've introduced some improvements to this version that make time-to-train not directly comparable between it and our MLPerf submission:
- This version uses a more efficient multi-GPU sharding algorithm
- We added dropout operations here to achieve better accuracy
- This version uses 100 negatives by default during the evaluation phase, as was done in the original NCF paper; the MLPerf version used 999
- We save the model checkpoints in this version. This might make the training a few seconds slower depending on the speed of your storage
### NVIDIA DGX-1 with 8 V100 16G GPUs
#### NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
The following table shows the best training throughput:
| **Number of GPUs (samples/sec)** | **Mixed precision (samples/sec)** | **Full precision (samples/sec)** | **Speedup** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 20,027,840 | 9,529,271 | 2.10 |
| 4 | 62,633,260| 32,719,700 | 1.91 |
| 8 | 99,332,230| 55,004,590 | 1.81 |
| **Number of GPUs** | **Batch size per GPU**| **Mixed precision throughput (samples/sec)** | **Single precision throughput (samples/sec)** | **Speed-up with mixed precision** | **Multi-GPU strong scaling with mixed precision** | **Multi-GPU strong scaling with FP32** |
|:---:|:--------:|:-----:|:-----------:|:-----:|:----:|:---|
| 1 |1048576| 20,459,365| 9,777,551 | 2.09 | 1 | 1 |
| 4 |262144 | 61,782,125| 32,583,924 | 1.90 | 3.02 |3.33|
| 8 |131072 | 98,464,084| 55,365,147 | 1.78 |4.81 |5.66|
The following table shows the average time to reach HR@10 of 0.9562 across 5 random seeds. The training time was measured excluding data downloading, preprocessing, validation data generation and library initialization times.
The following table shows mean time to reach HR@10 of 0.9562 across 5 random seeds. The training time was measured excluding data downloading, preprocessing and library initialization times.
| **Number of GPUs** | **Batch size per GPU** | **Mixed precision (seconds)** | **Single precision (seconds)** | **Speed-up with mixed precision** |
|:---:|:----:|:---------:|:-----------:|:-----:|
| 1 | 1048576| 67.03 | 142.31 | 2.12 |
| 4 | 262144| 23.92 | 47.57 | 1.99 |
| 8 | 131072| 18.82 | 31.48 | 1.67 |
| **Number of GPUs (samples/sec)** | **Mixed precision (seconds)** | **Full precision (seconds)** | **Speedup** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 78.73 | 153.90 | 1.95 |
| 4 | 25.80 | 49.41 | 1.92 |
| 8 | 20.42 | 32.68 | 1.60 |
### NVIDIA DGX-1 with 8 V100 32G GPUs
#### NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs.
The following table shows the best training throughput:
| **Number of GPUs (samples/sec)** | **Mixed precision (samples/sec)** | **Full precision (samples/sec)** | **Speedup** |
| **Number of GPUs** | **Batch size per GPU** | **Mixed precision throughput (samples/sec)** | **Single precision throughput (samples/sec)** | **Speed-up with mixed precision** | **Multi-GPU strong scaling with mixed precision** | **Multi-GPU strong scaling with FP32** |
|:---:|:----:|:---------:|:-----------:|:-----:|:---:|:---:|
| 1 | 1048576| 19,314,944 | 9,464,431 | 2.04 | 1 | 1 |
| 4 | 262144| 58,579,745 |31,577,085 | 1.86 | 3.03 | 3.34 |
| 8 | 131072| 92,964,306 | 53,972,811 | 1.72 | 4.81 | 5.70 |
The following table shows the average time to reach HR@10 of 0.9562 across 5 random seeds. The training time was measured excluding data downloading, preprocessing, validation data generation and library initialization times.
| **Number of GPUs** | **Mixed precision (seconds)** | **Single precision (seconds)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 70.49 | 146.68 | 2.08 |
| 4 | 24.61 | 49.01 | 1.99 |
| 8 | 19.72 | 32.25 | 1.64 |
#### NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G GPUs.
The following table shows the best training throughput:
| **Number of GPUs** | **Batch size per GPU** | **Mixed precision throughput (samples/sec)** | **Single precision throughput (samples/sec)** | **Speed-up with mixed precision** | **Multi-GPU strong scaling with mixed precision** | **Multi-GPU strong scaling with FP32** |
|:---:|:-----:|:-------:|:-----------:|:-----:|:---:|:---:|
| 1 | 1048576| 20,645,544 | 10,145,873 | 2.03 | 1 | 1 |
| 4 | 262144 | 63,608,950 | 34,758,369 | 1.83 | 3.08 | 3.43 |
| 8 | 131072| 98,887,103 | 57,251,418 | 1.73 | 4.79 | 5.64 |
| 16 | 65536| 128,976,394 | 82,932,545 | 1.56 | 6.25 | 8.17 |
The following table shows the average time to reach HR@10 of 0.9562 across 5 random seeds. The training time was measured excluding data downloading, preprocessing, validation data generation and library initialization times.
| **Number of GPUs** | **Mixed precision (seconds)** | **Single precision (seconds)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 65.99 |134.93 |2.04|
| 4 | 26.21 |41.12 |1.57|
| 8 | 21.96 |29.71 |1.35|
| 16| 22.15 |28.99 |1.31|
### Inference performance results
#### NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
The following table shows the best inference throughput:
| **Number of GPUs** | **Mixed precision (samples/sec)** | **Single precision (samples/sec)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 57,163,273 | 28,877,257 | 1.98 |
#### NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs.
The following table shows the best inference throughput:
| **Number of GPUs** | **Mixed precision (samples/sec)** | **Single precision (samples/sec)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 54,570,476 | 28,085,521 | 1.94 |
#### NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G GPUs.
The following table shows the best inference throughput:
| **Number of GPUs** | **Mixed precision (samples/sec)** | **Single precision (samples/sec)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 58,383,216 | 30,018,043 | 1.94 |
## Changelog
1. January 22, 2018
* Initial release
2. May 2019
* Lower memory consumption (down from about 18 GB to 10 GB for batch size 1M on a single NVIDIA Tesla V100), achieved by using an approximate method for generating negatives for training (see the sketch after this changelog).
* Automatic Mixed Precision (AMP) with dynamic loss scaling instead of a custom mixed-precision optimizer.
* Performance numbers for NVIDIA DGX-2.
* Data loading code cleanup.
* Default container updated to PyTorch 19.05-py3.
* Updated README.md.
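The approximate negative generation mentioned above mirrors `prepare_epoch_train_data` in the updated `dataloading.py`: negatives are drawn uniformly at random from the whole item range without being checked against the set of known positives. A minimal sketch (the helper name and shapes are illustrative, not the repository's exact code):
```python
import torch

def sample_negatives_approx(train_users, nb_items, negative_samples):
    # repeat every positive interaction's user id `negative_samples` times ...
    neg_users = train_users.repeat(negative_samples)
    # ... and pair each copy with a uniformly random item id in [0, nb_items);
    # a small fraction of these "negatives" may actually be positives, which is
    # the approximation that avoids building a large user x item exclusion mask
    neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_(0, nb_items)
    return neg_users, neg_items
```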
## Known issues
### Scaling beyond 8 GPUs
Neural Collaborative Filtering is a relatively lightweight model that trains quickly on the relatively small ML-20m dataset.
Because of that, the high ratio of communication to computation makes it difficult to
efficiently use more than 8 GPUs. Typically, this is not an issue because when using 8
GPUs with FP16 precision, the training is sufficiently fast. However, if you'd like to
scale the training to 16 GPUs and beyond, you might try modifying the model so that
the communication-computation ratio facilitates better scaling. This could be done, for example,
by finding hyperparameters that enable using a larger batch size or by reducing the
number of trainable parameters.
### Memory usage
Training on a single GPU with less than 16 GB of memory or switching off FP16 mode might result in out-of-memory errors. To reduce memory usage, you can use a smaller batch size.
However, since we're using the Adam optimizer, this might require changing hyperparameters such as the learning rate, beta1, and beta2.
To circumvent this, you can use gradient accumulation to combine multiple gradients computed from smaller batches into a single weight update.
This keeps the “effective” batch size the same as the original and enables using the default hyperparameters with much lower memory usage:
```bash
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m --grads_accumulated 2 --batch_size 524288
```
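For reference, a minimal sketch of what gradient accumulation amounts to inside the training loop (generic PyTorch with assumed names, not the exact `ncf.py` code): gradients from several smaller sub-batches are summed in `.grad` before a single optimizer step, so the effective batch size, and therefore the tuned Adam hyperparameters, stay unchanged.
```python
import torch

def step_with_accumulation(model, optimizer, criterion, sub_batches):
    # `sub_batches` is a list of (users, items, labels) chunks that together
    # add up to the original (effective) batch size
    optimizer.zero_grad()
    for users, items, labels in sub_batches:
        loss = criterion(model(users, items), labels.view(-1, 1)).mean()
        loss.backward()        # gradients accumulate across sub-batches
    optimizer.step()           # one weight update for the whole effective batch
```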
In the default settings, the additional memory beyond 16 GB may not be fully utilized.
This is because we set the default batch size for the ML-20m dataset to 1M,
which is too small to completely fill up multiple 32 GB GPUs.
1M is the batch size for which we experienced the best convergence on the ML-20m dataset.
However, on other datasets, even better performance may be possible with hyperparameters that work well for larger batches and leverage the additional GPU memory.

View File

@ -0,0 +1,158 @@
# Copyright (c) 2018, deepakn94, codyaustun, robieta. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# -----------------------------------------------------------------------
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import torch
import tqdm
class _TestNegSampler:
def __init__(self, train_ratings, nb_neg):
self.nb_neg = nb_neg
self.nb_users = int(train_ratings[:, 0].max()) + 1
self.nb_items = int(train_ratings[:, 1].max()) + 1
# compute unique ids so a hash set can be built quickly for fast lookup
ids = (train_ratings[:, 0] * self.nb_items) + train_ratings[:, 1]
self.set = set(ids)
def generate(self, batch_size=128*1024):
users = torch.arange(0, self.nb_users).reshape([1, -1]).repeat([self.nb_neg, 1]).transpose(0, 1).reshape(-1)
items = [-1] * len(users)
random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
print('Generating validation negatives...')
for idx, u in enumerate(tqdm.tqdm(users.tolist())):
if not random_items:
random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
j = random_items.pop()
while u * self.nb_items + j in self.set:
if not random_items:
random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
j = random_items.pop()
items[idx] = j
items = torch.LongTensor(items)
return items
def create_test_data(train_ratings, test_ratings, args):
test_users = test_ratings[:,0]
test_pos = test_ratings[:,1].reshape(-1,1)
begin = time.time()
sampler = _TestNegSampler(train_ratings.cpu().numpy(), args.valid_negative)
test_negs = sampler.generate().cuda()
end = time.time()
print('Generating validation negatives took: ', end - begin)
del train_ratings
# create items with real sample at last position
test_users = test_users.reshape(-1,1).repeat(1, 1 + args.valid_negative)
test_items = torch.cat((test_negs.reshape(-1, args.valid_negative), test_pos), dim=1)
del test_ratings, test_negs
# generate dup mask and real indices to match the reference implementation's handling of duplicates exactly
# here we need a sort that is stable (keeps the order of duplicates)
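# torch.sort alone is not guaranteed to be stable, so the lines below add a
# fractional tie-breaker (original index / row length) to each sorted value and
# sort that sum, which recovers a stable ordering of duplicate items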
sorted_items, indices = torch.sort(test_items) # [1,1,1,2], [3,1,0,2]
sum_item_indices = sorted_items.float()+indices.float()/len(indices[0]) #[1.75,1.25,1.0,2.5]
indices_order = torch.sort(sum_item_indices)[1] #[2,1,0,3]
stable_indices = torch.gather(indices, 1, indices_order) #[0,1,3,2]
# produce -1 mask
dup_mask = (sorted_items[:,0:-1] == sorted_items[:,1:])
dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask),dim=1)
dup_mask = torch.gather(dup_mask,1,stable_indices.sort()[1])
# produce real sample indices to later check in topk
sorted_items, indices = (test_items != test_pos).sort()
sum_item_indices = sorted_items.float()+indices.float()/len(indices[0])
indices_order = torch.sort(sum_item_indices)[1]
stable_indices = torch.gather(indices, 1, indices_order)
real_indices = stable_indices[:,0]
if args.distributed:
test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank]
test_users = test_users.view(-1).split(args.valid_batch_size)
test_items = test_items.view(-1).split(args.valid_batch_size)
return test_users, test_items, dup_mask, real_indices
def prepare_epoch_train_data(train_ratings, nb_items, args):
# create label
train_label = torch.ones_like(train_ratings[:,0], dtype=torch.float32)
neg_label = torch.zeros_like(train_label, dtype=torch.float32)
neg_label = neg_label.repeat(args.negative_samples)
train_label = torch.cat((train_label,neg_label))
del neg_label
train_users = train_ratings[:,0]
train_items = train_ratings[:,1]
train_users_per_worker = len(train_label) / args.world_size
train_users_begin = int(train_users_per_worker * args.local_rank)
train_users_end = int(train_users_per_worker * (args.local_rank + 1))
# prepare data for epoch
neg_users = train_users.repeat(args.negative_samples)
neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_(0, nb_items)
epoch_users = torch.cat((train_users, neg_users))
epoch_items = torch.cat((train_items, neg_items))
del neg_users, neg_items
# shuffle prepared data and split into batches
epoch_indices = torch.randperm(train_users_end - train_users_begin, device='cuda:{}'.format(args.local_rank))
epoch_indices += train_users_begin
epoch_users = epoch_users[epoch_indices]
epoch_items = epoch_items[epoch_indices]
epoch_label = train_label[epoch_indices]
if args.distributed:
local_batch = args.batch_size // args.world_size
else:
local_batch = args.batch_size
epoch_users = epoch_users.split(local_batch)
epoch_items = epoch_items.split(local_batch)
epoch_label = epoch_label.split(local_batch)
# the last batch will almost certainly be smaller, drop it
epoch_users = epoch_users[:-1]
epoch_items = epoch_items[:-1]
epoch_label = epoch_label[:-1]
return epoch_users, epoch_items, epoch_label

View File

@ -3,16 +3,19 @@ RAW_DATADIR=$2
function download_20m {
echo "Download ml-20m"
cd ${RAW_DATADIR}
curl -O http://files.grouplens.org/datasets/movielens/ml-20m.zip
mv ml-20m.zip ${RAW_DATADIR}
cd -
}
function download_1m {
echo "Downloading ml-1m"
cd ${RAW_DATADIR}
curl -O http://files.grouplens.org/datasets/movielens/ml-1m.zip
mv ml-1m.zip ${RAW_DATADIR}
cd -
}
if [[ ${DATASET_NAME} == "ml-1m" ]]
then
download_1m

Binary file not shown.


Binary file not shown.


View File

@ -60,12 +60,11 @@ def collect_by_scope(loglines):
# gather eval_accuracy
eval_accuracy_dup = [l.value for l in loglines if l.tag == tags.EVAL_ACCURACY]
eval_accuracy = list({l['value']:l for l in eval_accuracy_dup})
eval_accuracy = [l['value'] for l in eval_accuracy_dup]
epoch_stats['eval_accuracy'] = eval_accuracy
# gather it_per_sec
eval_it_per_sec = [l.value for l in loglines if l.tag == tags.PERF_IT_PER_SEC]
#eval_it_per_sec = list({l['value']:l for l in eval_it_per_sec_dup})
epoch_stats['it_per_sec'] = eval_it_per_sec

View File

@ -35,23 +35,21 @@ import os
import sys
import math
import time
from datetime import datetime
from collections import OrderedDict
from argparse import ArgumentParser
import torch
import torch.nn as nn
import utils
import dataloading
from neumf import NeuMF
from logger.logger import LOGGER, timed_block, timed_function
from logger import tags
from logger.autologging import log_hardware, log_args
from fp_optimizers import Fp16Optimizer
from apex.parallel import DistributedDataParallel as DDP
from apex import amp
LOGGER.model = 'ncf'
@ -60,30 +58,28 @@ def parse_args():
" Filtering model")
parser.add_argument('--data', type=str,
help='Path to test and training data files')
parser.add_argument('-e', '--epochs', type=int, default=40,
parser.add_argument('-e', '--epochs', type=int, default=30,
help='Number of epochs for training')
parser.add_argument('-b', '--batch-size', type=int, default=1048576,
parser.add_argument('-b', '--batch_size', type=int, default=2**20,
help='Number of examples for each iteration')
parser.add_argument('--valid-batch-size', type=int, default=2**20,
parser.add_argument('--valid_batch_size', type=int, default=2**20,
help='Number of examples in each validation chunk')
parser.add_argument('-f', '--factors', type=int, default=64,
help='Number of predictive factors')
parser.add_argument('--layers', nargs='+', type=int,
default=[256, 256, 128, 64],
help='Sizes of hidden layers for MLP')
parser.add_argument('-n', '--negative-samples', type=int, default=4,
parser.add_argument('-n', '--negative_samples', type=int, default=4,
help='Number of negative examples per interaction')
parser.add_argument('-l', '--learning-rate', type=float, default=0.0045,
parser.add_argument('-l', '--learning_rate', type=float, default=0.0045,
help='Learning rate for optimizer')
parser.add_argument('-k', '--topk', type=int, default=10,
help='Rank for test examples to be considered a hit')
parser.add_argument('--seed', '-s', type=int, default=0,
parser.add_argument('--seed', '-s', type=int, default=1,
help='Manually set random seed for torch')
parser.add_argument('--threshold', '-t', type=float, default=1.0,
help='Stop training early at threshold')
parser.add_argument('--no-fp16', action='store_false', dest='fp16',
help='Do not use fp16')
parser.add_argument('--valid-negative', type=int, default=100,
parser.add_argument('--valid_negative', type=int, default=100,
help='Number of negative samples for each positive test example')
parser.add_argument('--beta1', '-b1', type=float, default=0.25,
help='Beta1 for Adam')
@ -93,14 +89,15 @@ def parse_args():
help='Epsilon for Adam')
parser.add_argument('--dropout', type=float, default=0.5,
help='Dropout probability, if equal to 0 will not use dropout at all')
parser.add_argument('--loss-scale', default=8192, type=int,
help='Loss scale to use for mixed precision training')
parser.add_argument('--checkpoint-dir', default='/data/checkpoints/', type=str,
parser.add_argument('--checkpoint_dir', default='/data/checkpoints/', type=str,
help='Path to the directory storing the checkpoint file')
parser.add_argument('--mode', choices=['train', 'test'], default='train', type=str,
help='Passing "test" will only run a single evaluation, otherwise full training will be performed')
parser.add_argument('--grads_accumulated', default=1, type=int,
help='Number of gradients to accumulate before performing an optimization step')
parser.add_argument('--opt_level', default='O2', type=str,
help='Optimization level for Automatic Mixed Precision',
choices=['O0', 'O2'])
parser.add_argument('--local_rank', default=0, type=int, help='Necessary for multi-GPU training')
return parser.parse_args()
@ -133,12 +130,8 @@ def init_distributed(local_rank=0):
return distributed, int(os.environ['WORLD_SIZE'])
def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user, output=None,
def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user,
epoch=None, distributed=False):
start = datetime.now()
log_2 = math.log(2)
model.eval()
with torch.no_grad():
@ -146,80 +139,36 @@ def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user
for u,n in zip(x,y):
p.append(model(u, n, sigmoid=True).detach())
del x
del y
temp = torch.cat(p).view(-1,samples_per_user)
del p
del x, y, p
# set duplicate results for the same item to -1 before topk
temp[dup_mask] = -1
out = torch.topk(temp,K)[1]
# topk in pytorch is stable(if not sort)
# key(item):value(predicetion) pairs are ordered as original key(item) order
# key(item):value(prediction) pairs are ordered as original key(item) order
# so we need the first position of real item(stored in real_indices) to check if it is in topk
ifzero = (out == real_indices.view(-1,1))
hits = ifzero.sum()
ndcg = (log_2 / (torch.nonzero(ifzero)[:,1].view(-1).to(torch.float)+2).log_()).sum()
ndcg = (math.log(2) / (torch.nonzero(ifzero)[:,1].view(-1).to(torch.float)+2).log_()).sum()
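# for a single relevant item, DCG@K reduces to 1 / log2(rank + 2) with 0-based ranks,
# i.e. log(2) / log(rank + 2), which is exactly what the expression above sums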
LOGGER.log(key=tags.EVAL_SIZE, value={"epoch": epoch, "value": num_user * samples_per_user})
LOGGER.log(key=tags.EVAL_HP_NUM_USERS, value=num_user)
LOGGER.log(key=tags.EVAL_HP_NUM_NEG, value=samples_per_user - 1)
end = datetime.now()
if distributed:
torch.distributed.all_reduce(hits, op=torch.distributed.reduce_op.SUM)
torch.distributed.all_reduce(ndcg, op=torch.distributed.reduce_op.SUM)
hits = hits.item()
ndcg = ndcg.item()
if output is not None:
result = OrderedDict()
result['timestamp'] = datetime.now()
result['duration'] = end - start
result['epoch'] = epoch
result['K'] = K
result['hit_rate'] = hits/num_user
result['NDCG'] = ndcg/num_user
utils.save_result(result, output)
hr = hits.item() / num_user
ndcg = ndcg.item() / num_user
model.train()
return hits/num_user, ndcg/num_user
def generate_neg(users, true_mat, item_range, num_neg, sort=False):
# assuming 1-d tensor input
# for each user in 'users', generate 'num_neg' negative samples in [0, item_range)
# also make sure negative sample is not in true sample set with mask
# true_mat store a mask matrix where true_mat(user, item) = 0 for true sample
# return (neg_user, neg_item)
# list to append iterations of result
neg_u = []
neg_i = []
neg_users = users.repeat(num_neg)
while len(neg_users) > 0: # generate then filter loop
neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_(0, item_range)
neg_mask = true_mat[neg_users, neg_items]
neg_u.append(neg_users.masked_select(neg_mask))
neg_i.append(neg_items.masked_select(neg_mask))
neg_users = neg_users.masked_select(1-neg_mask)
neg_users = torch.cat(neg_u)
neg_items = torch.cat(neg_i)
if sort == False:
return neg_users, neg_items
sorted_users, sort_indices = torch.sort(neg_users)
return sorted_users, neg_items[sort_indices]
return hr, ndcg
def main():
log_hardware()
args = parse_args()
args.distributed, args.world_size = init_distributed(args.local_rank)
log_args(args)
@ -229,90 +178,35 @@ def main():
if args.seed is not None:
torch.manual_seed(args.seed)
# Save configuration to file
print("Saving results to {}".format(args.checkpoint_dir))
if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
os.makedirs(args.checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
# more like load trigger timer now
LOGGER.log(key=tags.PREPROC_HP_NUM_EVAL, value=args.valid_negative)
# The default of np.random.choice is replace=True, so does pytorch random_()
LOGGER.log(key=tags.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
LOGGER.log(key=tags.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)
LOGGER.log(key=tags.INPUT_STEP_EVAL_NEG_GEN)
# sync worker before timing.
# sync workers before timing
if args.distributed:
torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
torch.cuda.synchronize()
LOGGER.log(key=tags.RUN_START)
run_start_time = time.time()
# load not-converted data, just a separate one for test
train_ratings = torch.load(args.data+'/train_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
test_ratings = torch.load(args.data+'/test_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
# get input data
# get dims
nb_maxs = torch.max(train_ratings, 0)[0]
nb_users = nb_maxs[0].item()+1
nb_items = nb_maxs[1].item()+1
train_users = train_ratings[:,0]
train_items = train_ratings[:,1]
del nb_maxs, train_ratings
LOGGER.log(key=tags.INPUT_SIZE, value=len(train_users))
# produce things not change between epoch
# mask for filtering duplicates with real sample
# note: test data is removed before create mask, same as reference
mat = torch.cuda.ByteTensor(nb_users, nb_items).fill_(1)
mat[train_users, train_items] = 0
# create label
train_label = torch.ones_like(train_users, dtype=torch.float32)
neg_label = torch.zeros_like(train_label, dtype=torch.float32)
neg_label = neg_label.repeat(args.negative_samples)
train_label = torch.cat((train_label,neg_label))
del neg_label
if args.fp16:
train_label = train_label.half()
nb_users = nb_maxs[0].item() + 1
nb_items = nb_maxs[1].item() + 1
LOGGER.log(key=tags.INPUT_SIZE, value=len(train_ratings))
# produce validation negative sample on GPU
all_test_users = test_ratings.shape[0]
test_users = test_ratings[:,0]
test_pos = test_ratings[:,1].reshape(-1,1)
test_negs = generate_neg(test_users, mat, nb_items, args.valid_negative, True)[1]
# create items with real sample at last position
test_users = test_users.reshape(-1,1).repeat(1,1+args.valid_negative)
test_items = torch.cat((test_negs.reshape(-1,args.valid_negative), test_pos), dim=1)
del test_ratings, test_negs
# generate dup mask and real indice for exact same behavior on duplication compare to reference
# here we need a sort that is stable(keep order of duplicates)
# this is a version works on integer
sorted_items, indices = torch.sort(test_items) # [1,1,1,2], [3,1,0,2]
sum_item_indices = sorted_items.float()+indices.float()/len(indices[0]) #[1.75,1.25,1.0,2.5]
indices_order = torch.sort(sum_item_indices)[1] #[2,1,0,3]
stable_indices = torch.gather(indices, 1, indices_order) #[0,1,3,2]
# produce -1 mask
dup_mask = (sorted_items[:,0:-1] == sorted_items[:,1:])
dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask),dim=1)
dup_mask = torch.gather(dup_mask,1,stable_indices.sort()[1])
# produce real sample indices to later check in topk
sorted_items, indices = (test_items != test_pos).sort()
sum_item_indices = sorted_items.float()+indices.float()/len(indices[0])
indices_order = torch.sort(sum_item_indices)[1]
stable_indices = torch.gather(indices, 1, indices_order)
real_indices = stable_indices[:,0]
del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos
if args.distributed:
test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank]
test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(train_ratings, test_ratings, args)
# make pytorch memory behavior more consistent later
torch.cuda.empty_cache()
@ -320,36 +214,33 @@ def main():
LOGGER.log(key=tags.INPUT_BATCH_SIZE, value=args.batch_size)
LOGGER.log(key=tags.INPUT_ORDER) # we shuffled later with randperm
print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d'
% (time.time()-run_start_time, nb_users, nb_items, len(train_users),
nb_users))
# Create model
model = NeuMF(nb_users, nb_items,
mf_dim=args.factors, mf_reg=0.,
mf_dim=args.factors,
mlp_layer_sizes=args.layers,
mlp_layer_regs=[0. for i in args.layers],
dropout=args.dropout)
if args.fp16:
model = model.half()
optimizer = FusedAdam(model.parameters(), lr=args.learning_rate,
betas=(args.beta1, args.beta2), eps=args.eps, eps_inside_sqrt=False)
criterion = nn.BCEWithLogitsLoss(reduction='none') # use torch.mean() with dim later to avoid copy to host
# Move model and loss to GPU
model = model.cuda()
criterion = criterion.cuda()
if args.opt_level == "O2":
model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
keep_batchnorm_fp32=False, loss_scale='dynamic')
if args.distributed:
model = DDP(model)
local_batch = args.batch_size // args.world_size
traced_criterion = torch.jit.trace(criterion.forward,
(torch.rand(local_batch,1),torch.rand(local_batch,1)))
print(model)
print("{} parameters".format(utils.count_parameters(model)))
# Save model text description
with open(os.path.join(args.checkpoint_dir, 'model.txt'), 'w') as file:
file.write(str(model))
# Add optimizer and loss to graph
if args.fp16:
fp_optimizer = Fp16Optimizer(model, args.loss_scale)
params = fp_optimizer.fp32_params
else:
params = model.parameters()
optimizer = FusedAdam(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps, eps_inside_sqrt=False)
criterion = nn.BCEWithLogitsLoss(reduction='none') # use torch.mean() with dim later to avoid copy to host
LOGGER.log(key=tags.OPT_LR, value=args.learning_rate)
LOGGER.log(key=tags.OPT_NAME, value="Adam")
LOGGER.log(key=tags.OPT_HP_ADAM_BETA1, value=args.beta1)
@ -357,53 +248,22 @@ def main():
LOGGER.log(key=tags.OPT_HP_ADAM_EPSILON, value=args.eps)
LOGGER.log(key=tags.MODEL_HP_LOSS_FN, value=tags.VALUE_BCE)
# Move model and loss to GPU
model = model.cuda()
criterion = criterion.cuda()
if args.distributed:
model = DDP(model)
local_batch = args.batch_size // int(os.environ['WORLD_SIZE'])
else:
local_batch = args.batch_size
traced_criterion = torch.jit.trace(criterion.forward, (torch.rand(local_batch,1),torch.rand(local_batch,1)))
train_users_per_worker = len(train_label) / int(os.environ['WORLD_SIZE'])
train_users_begin = int(train_users_per_worker * args.local_rank)
train_users_end = int(train_users_per_worker * (args.local_rank + 1))
# Create files for tracking training
valid_results_file = os.path.join(args.checkpoint_dir, 'valid_results.csv')
# Calculate initial Hit Ratio and NDCG
test_x = test_users.view(-1).split(args.valid_batch_size)
test_y = test_items.view(-1).split(args.valid_batch_size)
if args.mode == 'test':
state_dict = torch.load(checkpoint_path)
model.load_state_dict(state_dict)
begin = time.time()
LOGGER.log(key=tags.EVAL_START, value=-1)
hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk, samples_per_user=test_items.size(1),
num_user=all_test_users, distributed=args.distributed)
val_time = time.time() - begin
print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, valid_time: {val_time:.4f}'
.format(K=args.topk, hit_rate=hr, ndcg=ndcg, val_time=val_time))
LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": -1, "value": hr})
LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
LOGGER.log(key=tags.EVAL_STOP, value=-1)
if args.mode == 'test':
hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
samples_per_user=args.valid_negative + 1,
num_user=all_test_users, distributed=args.distributed)
print('HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'
.format(K=args.topk, hit_rate=hr, ndcg=ndcg))
return
success = False
max_hr = 0
LOGGER.log(key=tags.TRAIN_LOOP)
train_throughputs = []
eval_throughputs = []
train_throughputs, eval_throughputs = [], []
LOGGER.log(key=tags.TRAIN_LOOP)
for epoch in range(args.epochs):
LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)
@ -412,68 +272,43 @@ def main():
begin = time.time()
# prepare data for epoch
neg_users, neg_items = generate_neg(train_users, mat, nb_items, args.negative_samples)
epoch_users = torch.cat((train_users,neg_users))
epoch_items = torch.cat((train_items,neg_items))
del neg_users, neg_items
# shuffle prepared data and split into batches
epoch_indices = torch.randperm(train_users_end - train_users_begin, device='cuda:{}'.format(args.local_rank))
epoch_indices += train_users_begin
epoch_users = epoch_users[epoch_indices]
epoch_items = epoch_items[epoch_indices]
epoch_label = train_label[epoch_indices]
epoch_users_list = epoch_users.split(local_batch)
epoch_items_list = epoch_items.split(local_batch)
epoch_label_list = epoch_label.split(local_batch)
# only print progress bar on rank 0
num_batches = len(epoch_users_list)
# handle extremely rare case where last batch size < number of worker
if len(epoch_users) % args.batch_size < args.world_size:
print("epoch_size % batch_size < number of worker!")
exit(1)
epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(train_ratings, nb_items, args)
num_batches = len(epoch_users)
for i in range(num_batches // args.grads_accumulated):
for j in range(args.grads_accumulated):
batch_idx = (args.grads_accumulated * i) + j
user = epoch_users_list[batch_idx]
item = epoch_items_list[batch_idx]
label = epoch_label_list[batch_idx].view(-1,1)
user = epoch_users[batch_idx]
item = epoch_items[batch_idx]
label = epoch_label[batch_idx].view(-1,1)
outputs = model(user, item)
loss = traced_criterion(outputs, label).float()
loss = torch.mean(loss.view(-1), 0)
if args.fp16:
fp_optimizer.backward(loss)
if args.opt_level == "O2":
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
if args.fp16:
fp_optimizer.step(optimizer)
else:
optimizer.step()
optimizer.step()
for p in model.parameters():
p.grad = None
p.grad = None
del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label
del epoch_users, epoch_items, epoch_label
train_time = time.time() - begin
begin = time.time()
epoch_samples = len(train_users) * (args.negative_samples + 1)
epoch_samples = len(train_ratings) * (args.negative_samples + 1)
train_throughput = epoch_samples / train_time
train_throughputs.append(train_throughput)
LOGGER.log(key='train_throughput', value=train_throughput)
LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
LOGGER.log(key=tags.EVAL_START, value=epoch)
hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk, samples_per_user=test_items.size(1),
num_user=all_test_users, output=valid_results_file, epoch=epoch, distributed=args.distributed)
hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
samples_per_user=args.valid_negative + 1,
num_user=all_test_users, epoch=epoch, distributed=args.distributed)
val_time = time.time() - begin
print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
@ -486,7 +321,7 @@ def main():
LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
LOGGER.log(key=tags.EVAL_STOP, value=epoch)
eval_size = all_test_users * test_items.size(1)
eval_size = all_test_users * (args.valid_negative + 1)
eval_throughput = eval_size / val_time
eval_throughputs.append(eval_throughput)
LOGGER.log(key='eval_throughput', value=eval_throughput)

View File

@ -34,8 +34,8 @@ import torch.nn as nn
import sys
from os.path import abspath, join, dirname
# enabling modules discovery from global entrypoint
sys.path.append(abspath(dirname(__file__)+'/'))
# enabling modules discovery from the global entrypoint
sys.path.append(abspath(dirname(__file__) + '/'))
from logger.logger import LOGGER
from logger import tags
@ -44,12 +44,8 @@ LOGGER.model = 'ncf'
class NeuMF(nn.Module):
def __init__(self, nb_users, nb_items,
mf_dim, mf_reg,
mlp_layer_sizes, mlp_layer_regs,
dropout=0):
mf_dim, mlp_layer_sizes, dropout=0):
if len(mlp_layer_sizes) != len(mlp_layer_regs):
raise RuntimeError('u dummy, layer_sizes != layer_regs!')
if mlp_layer_sizes[0] % 2 != 0:
raise RuntimeError('u dummy, mlp_layer_sizes[0] % 2 != 0')
super(NeuMF, self).__init__()

View File

@ -31,10 +31,11 @@
#!/bin/bash
set -e
set -x
DATASET_NAME=${1:-'ml-20m'}
RAW_DATADIR='/data'
CACHED_DATADIR='/data/cache/'${DATASET_NAME}
RAW_DATADIR=${2:-'/data'}
CACHED_DATADIR="${RAW_DATADIR}/cache/${DATASET_NAME}"
# you can add another option to this case in order to support other datasets
case ${DATASET_NAME} in
@ -51,9 +52,17 @@ case ${DATASET_NAME} in
exit 1
esac
mkdir -p ${RAW_DATADIR}
mkdir -p ${CACHED_DATADIR}
rm -f log
if [ ! -d ${RAW_DATADIR} ]; then
mkdir -p ${RAW_DATADIR}
fi
if [ ! -d ${CACHED_DATADIR} ]; then
mkdir -p ${CACHED_DATADIR}
fi
if [ -f log ]; then
rm -f log
fi
if [ ! -f ${ZIP_PATH} ]; then
echo 'Dataset not found, downloading...'
@ -76,6 +85,6 @@ else
fi
echo "Dataset $DATASET_NAME successfully prepared at: $CACHED_DATADIR\n"
echo 'You can now run the training with: python -m torch.distributed.launch --nproc_per_node=<number_of_GPUs> ncf.py --data /data/cache/ml-20m'
echo "You can now run the training with: python -m torch.distributed.launch --nproc_per_node=<number_of_GPUs> ncf.py --data ${CACHED_DATADIR}"

View File

@ -1 +1,2 @@
pandas
tqdm

View File

@ -10,6 +10,12 @@ import torch.distributed as dist
from maskrcnn_benchmark.utils.comm import get_world_size
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
try:
from apex import amp
use_amp = True
except ImportError:
print('Install APEX (apex.amp) to enable mixed-precision training')
use_amp = False
def reduce_loss_dict(loss_dict):
"""
@ -80,7 +86,7 @@ def do_train(
# Note: If mixed precision is not used, this ends up doing nothing
# Otherwise apply loss scaling for mixed-precision recipe
if use_amp:
with optimizer.scale_loss(losses) as scaled_losses:
with amp.scale_loss(losses, optimizer) as scaled_losses:
scaled_losses.backward()
else:
losses.backward()

View File

@ -2,9 +2,14 @@
import os
import sys
from torch.utils.model_zoo import _download_url_to_file
from torch.utils.model_zoo import urlparse
from torch.utils.model_zoo import HASH_REGEX
try:
from torch.utils.model_zoo import _download_url_to_file
from torch.utils.model_zoo import urlparse
from torch.utils.model_zoo import HASH_REGEX
except ImportError:
from torch.hub import _download_url_to_file
from torch.hub import urlparse
from torch.hub import HASH_REGEX
from maskrcnn_benchmark.utils.comm import is_main_process
from maskrcnn_benchmark.utils.comm import synchronize

View File

@ -97,14 +97,9 @@ def train(cfg, local_rank, distributed):
if use_amp:
# Initialize mixed-precision training
use_mixed_precision = cfg.DTYPE == "float16"
amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)
# wrap the optimizer for mixed precision
if cfg.SOLVER.ACCUMULATE_GRAD:
# also specify number of steps to accumulate over
optimizer = amp_handle.wrap_optimizer(optimizer, num_loss=cfg.SOLVER.ACCUMULATE_STEPS)
else:
optimizer = amp_handle.wrap_optimizer(optimizer)
amp_opt_level = 'O1' if use_mixed_precision else 'O0'
model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)
if distributed:
if use_apex_ddp:

View File

@ -1,5 +1,10 @@
FROM nvcr.io/nvidia/pytorch:18.12.1-py3
FROM nvcr.io/nvidia/pytorch:19.03-py3
ADD . /workspace/tacotron2
WORKDIR /workspace/tacotron2
RUN pip install -r requirements.txt
RUN cd /workspace; \
git clone https://github.com/NVIDIA/apex.git; \
cd /workspace/apex; \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
WORKDIR /workspace/tacotron2

View File

@ -1,79 +1,157 @@
# Tacotron 2 And WaveGlow v1.5 For PyTorch
This repository provides a script and recipe to train Tacotron 2 and WaveGlow
v1.5 models to achieve state of the art accuracy, and is tested and maintained by
NVIDIA.
## Table Of Contents
* [The model](#the-model)
    * [Model architecture](#model-architecture)
    * [Default configuration](#default-configuration)
    * [Feature support matrix](#feature-support-matrix)
        * [Features](#features)
* [Setup](#setup)
    * [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Details](#details)
    * [Scripts and sample code](#scripts-and-sample-code)
    * [Parameters](#parameters)
        * [Shared parameters](#shared-parameters)
        * [Shared audio/STFT parameters](#shared-audiostft-parameters)
        * [Tacotron 2 parameters](#tacotron-2-parameters)
        * [WaveGlow parameters](#waveglow-parameters)
    * [Command-line options](#command-line-options)
    * [Getting the data](#getting-the-data)
        * [Dataset guidelines](#dataset-guidelines)
        * [Multi-dataset](#multi-dataset)
    * [Training process](#training-process)
    * [Inference process](#inference-process)
* [Mixed precision training](#mixed-precision-training)
    * [Enabling mixed precision](#enabling-mixed-precision)
* [Benchmarking](#benchmarking)
    * [Training performance benchmark](#training-performance-benchmark)
    * [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
    * [Training accuracy results](#training-accuracy-results)
        * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
    * [Training performance results](#training-performance-results)
        * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
        * [Expected training time](#expected-training-time)
    * [Inference performance results](#inference-performance-results)
        * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## The model
This text-to-speech (TTS) system is a combination of two neural network models:
* a modified Tacotron 2 model from the [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)
paper and
* a flow-based neural network model from the [WaveGlow: A Flow-based Generative Network for Speech Synthesis](https://arxiv.org/abs/1811.00002) paper.
The Tacotron 2 and WaveGlow models form a text-to-speech system that enables
users to synthesize natural sounding speech from raw transcripts without
any additional information such as patterns and/or rhythms of speech.
Our implementation of the Tacotron 2 model differs from the model described in the
paper. Our implementation uses Dropout instead of Zoneout to regularize the
LSTM layers. Also, the original text-to-speech system proposed in the paper
uses the [WaveNet](https://arxiv.org/abs/1609.03499) model to synthesize
waveforms. In our implementation, we use the WaveGlow model for this purpose.
Both models are based on implementations of NVIDIA GitHub repositories
[Tacotron 2](https://github.com/NVIDIA/tacotron2) and
[WaveGlow](https://github.com/NVIDIA/waveglow), and are trained on a publicly
available [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/).
The Tacotron 2 and WaveGlow models enable you to efficiently synthesize high
quality speech from text.
Both models are trained with mixed precision using Tensor Cores on NVIDIA
Volta and Turing GPUs. Therefore, researchers can get results 1.5x faster for Tacotron 2
and 2.2x faster for WaveGlow than training without Tensor Cores, while
experiencing the benefits of mixed precision training. The models are tested
against each NGC monthly container release to ensure consistent accuracy and
performance over time.
### Model architecture
The Tacotron 2 model is a recurrent sequence-to-sequence model with attention that
predicts mel-spectrograms from text. The encoder (blue blocks in the figure
below) transforms the whole text into a fixed-size hidden feature
representation. This feature representation is then consumed by the
autoregressive decoder (orange blocks) that produces one spectrogram frame at
a time. In our implementation, the autoregressive WaveNet (green block) is
replaced by the flow-based generative WaveGlow.
![](./img/tacotron2_arch.png "Tacotron 2 architecture")
Figure 1. Architecture of the Tacotron 2 model. Taken from the
[Tacotron 2](https://arxiv.org/abs/1712.05884) paper.
The WaveGlow model is a flow-based generative model that generates audio
samples from a Gaussian distribution using mel-spectrogram conditioning (Figure
2). During training, the model learns to transform the dataset distribution
into a spherical Gaussian distribution through a series of flows. One step of a
flow consists of an invertible convolution, followed by a modified WaveNet
architecture that serves as an affine coupling layer. During inference, the
network is inverted and audio samples are generated from the Gaussian
distribution.
![](./img/waveglow_arch.png "WaveGlow architecture")
Figure 2. Architecture of the WaveGlow model. Taken from the
[WaveGlow](https://arxiv.org/abs/1811.00002) paper.
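To make the flow step concrete, below is a minimal, illustrative PyTorch sketch of the pattern described above (an invertible 1x1 convolution followed by an affine coupling layer conditioned on the mel-spectrogram). All class and parameter names are assumptions for illustration; the repository's WaveGlow implementation is more elaborate (weight-normalized WaveNet blocks, early outputs, and full log-determinant bookkeeping).
```python
import torch
import torch.nn as nn

class FlowStep(nn.Module):
    """One WaveGlow-style flow step: invertible 1x1 conv + affine coupling (sketch only)."""
    def __init__(self, n_channels, n_mel_channels, hidden=256):
        super().__init__()
        # channel-mixing 1x1 convolution; a real implementation initializes this with an
        # orthogonal (hence invertible) matrix and tracks its log-determinant
        self.mix = nn.Conv1d(n_channels, n_channels, kernel_size=1, bias=False)
        # small network standing in for the modified WaveNet of the paper; it sees the
        # untransformed half of the channels plus the mel conditioning
        self.transform = nn.Sequential(
            nn.Conv1d(n_channels // 2 + n_mel_channels, hidden, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(hidden, n_channels, kernel_size=3, padding=1),
        )

    def forward(self, x, mel):
        x = self.mix(x)
        x_a, x_b = x.chunk(2, dim=1)                       # split channels in half
        log_s, t = self.transform(torch.cat([x_a, mel], dim=1)).chunk(2, dim=1)
        x_b = x_b * torch.exp(log_s) + t                   # affine-transform the second half only
        return torch.cat([x_a, x_b], dim=1), log_s         # log_s feeds the flow's likelihood term

# quick shape check with dummy data
step = FlowStep(n_channels=8, n_mel_channels=80)
audio = torch.randn(2, 8, 100)   # (batch, grouped audio samples as channels, time)
mel = torch.randn(2, 80, 100)    # mel conditioning upsampled to the same time resolution
z, log_s = step(audio, mel)
```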
### Default configuration
Both models support multi-GPU and mixed precision training with dynamic loss
scaling (see Apex code
[here](https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/loss_scaler.py)),
as well as mixed precision inference. To speed up Tacotron 2 training,
reference mel-spectrograms are generated during a preprocessing step and read
directly from disk during training, instead of being generated during training.
The following features were implemented in this model:
* data-parallel multi-GPU training
* dynamic loss scaling with backoff for Tensor Cores (mixed precision) training.
### Feature support matrix
The following features are supported by this model.
| Feature | Tacotron 2 | WaveGlow |
|:-------|---------:|-----------:|
|[AMP](https://nvidia.github.io/apex/amp.html) | Yes | Yes |
|[Apex DistributedDataParallel](https://nvidia.github.io/apex/parallel.html) | Yes | Yes |
#### Features
AMP - a tool that enables Tensor Core-accelerated training. Please refer to section [Enabling mixed precision](#enabling-mixed-precision) for more details.
Apex DistributedDataParallel - a module wrapper that enables easy multiprocess distributed data parallel training, similar to `torch.nn.parallel.DistributedDataParallel`. `DistributedDataParallel` is optimized for use with NCCL. It achieves high performance by overlapping communication with computation during backward() and bucketing smaller gradient transfers to reduce the total number of transfers required.
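As a minimal illustration only (a generic linear layer standing in for Tacotron 2 / WaveGlow, and assuming the script was launched with `torch.distributed.launch` so that an NCCL process group can be initialized), enabling both features typically looks like this:
```python
import torch
import torch.nn as nn
from apex import amp
from apex.parallel import DistributedDataParallel as DDP

# relies on the environment variables set by `python -m torch.distributed.launch ...`
torch.distributed.init_process_group(backend="nccl")

model = nn.Linear(80, 80).cuda()          # stand-in for the real model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# AMP inserts automatic FP16/FP32 casts; dynamic loss scaling guards against underflow
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", loss_scale="dynamic")
model = DDP(model)                        # Apex DDP overlaps gradient all-reduce with backward()

x = torch.randn(16, 80, device="cuda")
loss = model(x).pow(2).mean()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
```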
## Setup
The following section lists the requirements in order to start training the
Tacotron 2 and WaveGlow models.
### Requirements
This repository contains Dockerfile which extends the PyTorch NGC container
and encapsulates some dependencies. Aside from these dependencies, ensure you
have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.04-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
or newer
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
@ -84,35 +162,49 @@ Documentation:
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
For those unable to use the PyTorch NGC container, to set up the required
environment or create your own container, see the versioned
[NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html).
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or using FP32,
perform the following steps using the default parameters of the Tacotron 2
and WaveGlow model on the [LJ Speech](https://keithito.com/LJ-Speech-Dataset/)
dataset.
1. Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2
```
2. Download and preprocess the dataset.
Use the `./scripts/prepare_dataset.sh` download script to automatically
download and preprocess the training, validation and test datasets. To run
this script, issue:
```bash
bash scripts/prepare_dataset.sh
```
To preprocess the datasets for Tacotron 2 training, use the
`./scripts/prepare_mels.sh` script:
```bash
bash scripts/prepare_mels.sh
```
Data is downloaded to the `./LJSpeech-1.1` directory (on the host). The
`./LJSpeech-1.1` directory is mounted to the `/workspace/tacotron2/LJSpeech-1.1`
location in the NGC container. The preprocessed mel-spectrograms are stored in the
`./LJSpeech-1.1/mels` directory.
3. Build the Tacotron 2 and WaveGlow PyTorch NGC container.
```bash
bash scripts/docker/build.sh
```
4. Start an interactive session in the NGC container to run training/inference.
After you build the container image, you can start an interactive CLI session with:
```bash
bash scripts/docker/interactive.sh
@ -121,210 +213,260 @@ bash scripts/docker/interactive.sh
The `interactive.sh` script requires that the location of the dataset is specified.
For example, `LJSpeech-1.1`.
5. Start training.
To start Tacotron 2 training, run:
```bash
bash scripts/train_tacotron2.sh
```
To start WaveGlow training, run:
```bash
bash scripts/train_waveglow.sh
```
6. Start validation/evaluation.
Ensure your loss values are comparable to those listed in the table in the
[Results](#results) section. For both models, the loss values are stored in the
`./output/nvlog.json` log file.
After you have trained the Tacotron 2 model for 1500 epochs and the
WaveGlow model for 800 epochs, you should get audio results similar to the
samples in the `./audio` folder. For details about generating audio, see the
[Inference process](#inference-process) section below.
The training scripts automatically run the validation after each training
epoch. The results from the validation are printed to the standard output
(`stdout`) and saved to the log files.
7. Start inference.
After you have trained the Tacotron 2 and WaveGlow models, you can perform
inference using the respective checkpoints that are passed as `--tacotron2`
and `--waveglow` arguments.
To run inference issue:
```bash
python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i text.txt --amp-run
```
The speech is generated from a text file that is passed with `-i` argument. To run
inference in mixed precision, use the `--amp-run` flag. The output audio will
be stored in the path specified by the `-o` argument.
## Details
The following sections provide greater details of the dataset, running training
and inference, and the training results.
### Training process
The Tacotron 2 and WaveGlow models are trained separately and independently.
Both models obtain mel spectrograms from short time Fourier transform (STFT)
during training. These mel spectrograms are used for loss computation in case
of Tacotron 2 and as conditioning input to the network in case of WaveGlow.
The training loss is averaged over an entire training epoch, whereas the
validation loss is averaged over the validation dataset. Performance is
reported in total input tokens per second for the Tacotron 2 model, and
in total output samples per second for the WaveGlow model. Both measures are
recorded as `train_iter_items/sec` (after each iteration) and `train_epoch_items/sec`
(averaged over epoch) in the output log. The result is averaged over an
entire training epoch and summed over all GPUs that were included in the training.
Even though the training script uses all available GPUs, you can change
this behavior by setting the `CUDA_VISIBLE_DEVICES` variable in your
environment or by setting the `NV_GPU` variable at the Docker container launch
([see section "GPU isolation"](https://github.com/NVIDIA/nvidia-docker/wiki/nvidia-docker#gpu-isolation)).
### Scripts and sample code
The sample code for Tacotron 2 and WaveGlow has scripts specific to a
particular model, located in directories `./tacotron2` and `./waveglow`, as well as scripts common to both
models, located in the `./common` directory. The model-specific scripts are as follows:
* `<model_name>/model.py` - the model architecture, definition of forward and
inference functions
* `<model_name>/arg_parser.py` - argument parser for parameters specific to a
given model
* `<model_name>/data_function.py` - data loading functions
* `<model_name>/loss_function.py` - loss function for the model
The common scripts contain layer definitions common to both models
(`common/layers.py`), some utility scripts (`common/utils.py`) and scripts
for audio processing (`common/audio_processing.py` and `common/stft.py`). In
the root directory `./` of this repository, the `./run.py` script is used for
training while inference can be executed with the `./inference.py` script. The
scripts `./models.py`, `./data_functions.py` and `./loss_functions.py` call
the respective scripts in the `<model_name>` directory, depending on what
model is trained using the `run.py` script.
### Parameters
In this section, we list the most important hyperparameters and command-line arguments,
together with their default values that are used to train Tacotron 2 and
WaveGlow models.
#### Shared parameters
* `--epochs` - number of epochs (Tacotron 2: 1500, WaveGlow: 1000)
* `--learning-rate` - learning rate (Tacotron 2: 1e-3, WaveGlow: 1e-4)
* `--batch-size` - batch size (Tacotron 2 FP16/FP32: 80/48, WaveGlow FP16/FP32: 8/4)
* `--amp-run` - use mixed precision training
#### Shared audio/STFT parameters
* `--sampling-rate` - sampling rate in Hz of input and output audio (22050)
* `--filter-length` - length of the STFT filter, i.e., the FFT size in samples (1024)
* `--hop-length` - hop length for FFT, i.e., sample stride between consecutive FFTs (256)
* `--win-length` - window size for FFT (1024)
* `--mel-fmin` - lowest frequency in Hz (0.0)
* `--mel-fmax` - highest frequency in Hz (8000.0)
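With these defaults, consecutive STFT frames are 256 / 22050 ≈ 11.6 ms apart, so roughly 86 mel spectrogram frames are produced per second of audio, and each frame is computed over a window of 1024 samples (about 46 ms).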
#### Tacotron 2 parameters
* `--anneal-steps` - epochs at which to anneal the learning rate (500 1000 1500)
* `--anneal-factor` - factor by which to anneal the learning rate (FP16/FP32: 0.3/0.1)
#### WaveGlow parameters
* `--segment-length` - segment length of input audio processed by the neural network (8000)
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example:
```bash
python train.py --help
```
### Getting the data
The Tacotron 2 and WaveGlow models were trained on the LJSpeech-1.1 dataset.
This repository contains the `./scripts/prepare_dataset.sh` script which will automatically download and extract the whole dataset. By default, data will be extracted to the `./LJSpeech-1.1` directory. The dataset directory contains a `README` file, a `wavs` directory with all audio samples, and a file `metadata.csv` that contains audio file names and the corresponding transcripts.
#### Dataset guidelines
The LJSpeech dataset has 13,100 clips that amount to about 24 hours of speech. Since the original dataset keeps all transcripts in the `metadata.csv` file, this repository provides file lists in the `./filelists` directory that define the training and validation subsets: `ljs_audio_text_train_filelist.txt` lists the clips used for training and `ljs_audio_text_val_filelist.txt` lists the clips used for validation.
#### Multi-dataset
To use a dataset different from the default LJSpeech dataset:
1. Prepare a directory with all audio files and pass it to the `--dataset-path` command-line option.
2. Add two text files containing file lists: one for the training subset (`--training-files`) and one for the validation subset (`--validation-files`).
The structure of the filelists should be as follows:
```bash
<audio file path>|<transcript>
```
The `<audio file path>` is relative to the path provided by the `--dataset-path` option.
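For example, assuming the default LJSpeech layout with a `wavs` subdirectory inside the dataset path, entries could look like the following (the file names and transcripts are purely illustrative):
```bash
wavs/LJ001-0001.wav|A first example transcript.
wavs/LJ001-0002.wav|A second example transcript.
```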
### Training process
The Tacotron 2 and WaveGlow models are trained separately and independently.
Both models obtain mel spectrograms from short-time Fourier transform (STFT)
during training. These mel spectrograms are used for loss computation in the case
of Tacotron 2 and as conditioning input to the network in the case of WaveGlow.
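To illustrate this preprocessing step, the sketch below computes a mel spectrogram with the shared audio/STFT defaults listed above (22050 Hz sampling rate, 1024-point filter, hop length 256, window length 1024, 0 to 8000 Hz). It uses `librosa` and an assumed mel-band count of 80, so it only approximates the repository's own STFT code in `common/stft.py` and `common/audio_processing.py`:
```python
import librosa
import numpy as np

# Load a clip at the sampling rate used by both models (22050 Hz);
# the file path is illustrative.
audio, sr = librosa.load("LJSpeech-1.1/wavs/LJ001-0001.wav", sr=22050)

# Mirror the shared audio/STFT defaults listed above; n_mels=80 is an
# assumption and not one of the documented defaults.
mel = librosa.feature.melspectrogram(
    y=audio,
    sr=sr,
    n_fft=1024,        # --filter-length
    hop_length=256,    # --hop-length
    win_length=1024,   # --win-length
    fmin=0.0,          # --mel-fmin
    fmax=8000.0,       # --mel-fmax
    n_mels=80,
)

# Log-compress the mel spectrogram, as is common for TTS conditioning features.
log_mel = np.log(np.clip(mel, 1e-5, None))
print(log_mel.shape)  # (n_mels, number of frames)
```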
The training loss is averaged over an entire training epoch, whereas the
validation loss is averaged over the validation dataset. Performance is
reported in total input tokens per second for the Tacotron 2 model and
in total output samples per second for the WaveGlow model. Both measures are
recorded as `train_iter_items/sec` (after each iteration) and
`train_epoch_items/sec` (averaged over epoch) in the output log file `./output/nvlog.json`. The result is
averaged over an entire training epoch and summed over all GPUs that were
included in the training.
Even though the training script uses all available GPUs, you can change
this behavior by setting the `CUDA_VISIBLE_DEVICES` variable in your
environment or by setting the `NV_GPU` variable at the Docker container launch
([see section "GPU isolation"](https://github.com/NVIDIA/nvidia-docker/wiki/nvidia-docker#gpu-isolation)).
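For example (the device indices, placeholder options and image name below are illustrative), you could restrict training to the first two GPUs either from inside the container or at launch time:
```bash
# Use only GPUs 0 and 1 for an already-running training session.
CUDA_VISIBLE_DEVICES=0,1 python -m multiproc train.py <training options>

# Or expose only GPUs 0 and 1 to the container when launching it.
NV_GPU=0,1 nvidia-docker run --rm -it <image-name> bash
```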
### Inference process
You can run inference using the `./inference.py` script. This script takes
text as input and runs Tacotron 2 and then WaveGlow inference to produce an
audio file. It requires pre-trained checkpoints from Tacotron 2 and WaveGlow
models and input text as a text file, with one phrase per line.
To run inference, issue:
```bash
python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i text.txt --amp-run
```
Here, `Tacotron2_checkpoint` and `WaveGlow_checkpoint` are pre-trained
checkpoints for the respective models, and `text.txt` contains input phrases.
Audio will be saved in the output folder.
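For reference, the input file is plain text with one phrase per line; a minimal, purely illustrative `text.txt` could contain:
```
The quick brown fox jumps over the lazy dog.
Speech synthesis with Tacotron 2 and WaveGlow sounds quite natural.
```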
You can find all the available options by calling `python inference.py --help`.
## Mixed precision training
*Mixed precision* is the combined use of different numerical precisions in a
computational method. [Mixed precision](https://arxiv.org/abs/1710.03740)
training offers significant computational speedup by performing operations in
half-precision format, while storing minimal information in single-precision
to retain as much information as possible in critical parts of the network.
Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores)
in the Volta and Turing architectures, significant training speedups are
experienced by switching to mixed precision -- up to 3x overall speedup on
the most arithmetically intense model architectures. Using mixed precision
training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
The ability to train deep learning networks with lower precision was
introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
For information about:
* How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740)
paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
documentation.
* Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/)
blog.
* How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp)
from the TensorFlow User Guide.
* APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
### Enabling mixed precision
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision
(AMP) library from [APEX](https://github.com/NVIDIA/apex) that casts variables
to half-precision upon retrieval, while storing variables in single-precision
format. Furthermore, to preserve small gradient magnitudes in backpropagation,
a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling)
step must be included when applying gradients. In PyTorch, loss scaling can be
easily applied by using the `scale_loss()` method provided by AMP. The scaling value
to be used can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.
By default, the `train_tacotron2.sh` and `train_waveglow.sh` scripts will
launch mixed precision training with Tensor Cores. You can change this
behaviour by removing the `--amp-run` flag from the `train.py` script.
For an in-depth walkthrough on AMP, check out the sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started).
[APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains
utility libraries, such as AMP, which require minimal network code changes to
leverage Tensor Core performance.
To enable mixed precision, the following steps were performed in the Tacotron 2 and
WaveGlow models:
* Import AMP from APEX:
```bash
from apex import amp
amp.lists.functional_overrides.FP32_FUNCS.remove('softmax')
amp.lists.functional_overrides.FP16_FUNCS.append('softmax')
```
* Initialize AMP:
```bash
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
```
* If running on multi-GPU, wrap the model with `DistributedDataParallel`:
```bash
from apex.parallel import DistributedDataParallel as DDP
model = DDP(model)
```
* Scale loss before backpropagation (assuming the loss is stored in a variable called `losses`):
* Default backpropagate for FP32:
```bash
losses.backward()
```
* Scale loss and backpropagate with AMP:
```bash
with amp.scale_loss(losses, optimizer) as scaled_losses:
    scaled_losses.backward()
```
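Putting the pieces above together, the following is a minimal single-GPU training-loop sketch. The model, optimizer, data and loss are stand-in placeholders rather than the classes used in `train.py`, and the `opt_level` and `loss_scale` values are illustrative:
```python
import torch
from apex import amp

# Placeholder model, optimizer and loss; the real training code lives in train.py.
model = torch.nn.Linear(80, 80).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.MSELoss()

# Patch the model and optimizer for mixed precision; loss_scale may be
# "dynamic" or a fixed value.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", loss_scale="dynamic")

for step in range(10):
    inputs = torch.randn(16, 80, device="cuda")
    targets = torch.randn(16, 80, device="cuda")

    optimizer.zero_grad()
    losses = criterion(model(inputs), targets)

    # Scale the loss so that small gradients survive the FP16 backward pass.
    with amp.scale_loss(losses, optimizer) as scaled_losses:
        scaled_losses.backward()

    optimizer.step()
```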
## Benchmarking
The following section shows how to run benchmarks measuring the model
performance in training and inference mode.
### Training performance benchmark
To benchmark the training performance on a specific batch size, run:
**Tacotron 2**
* FP16
```bash
python train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_2500_filelist.txt --dataset-path <dataset-path> --amp-run
```
* For multiple GPUs
* FP16
```bash
python -m multiproc train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_2500_filelist.txt --dataset-path <dataset-path> --amp-run
```
**WaveGlow**
* FP16
```bash
python train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path> --amp-run
```
* For multiple GPUs
* FP16
```bash
python -m multiproc train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path> --amp-run
```
Each of these scripts runs for 10 epochs and for each epoch measures the
average number of items per second. The performance results can be read from
the `nvlog.json` files produced by the commands.
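As a convenience, a small script along the lines below can pull the epoch throughput out of such a log. It assumes `nvlog.json` stores one JSON record per line with the metric name appearing as a key somewhere inside each record; this is an assumption about the log layout, so adjust the key and the parsing to the file you actually get:
```python
import json

# Metric name as reported in the output log (see the Training process section).
METRIC = "train_epoch_items/sec"

def find_metric(obj, key):
    """Recursively yield numeric values stored under `key` in nested dicts/lists."""
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key and isinstance(v, (int, float)):
                yield v
            else:
                yield from find_metric(v, key)
    elif isinstance(obj, list):
        for item in obj:
            yield from find_metric(item, key)

values = []
with open("output/nvlog.json") as log:
    for line in log:
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except ValueError:
            continue  # skip any non-JSON header or footer lines
        values.extend(find_metric(record, METRIC))

if values:
    print(f"{METRIC}: last={values[-1]:.2f}, mean={sum(values) / len(values):.2f}")
```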
### Inference performance benchmark
To benchmark the inference performance on a batch size=1, run:
* For FP32
```bash
python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i text.txt --log-file=output/nvlog_fp32.json
```
* For FP16
```bash
python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i text.txt --amp-run --log-file=output/nvlog_fp16.json
```
The log files contain performance numbers for the Tacotron 2 model
(number of input tokens per second, reported as `tacotron2_items_per_sec`)
and for WaveGlow (number of output samples per second, reported as
`waveglow_items_per_sec`).
## Results
The following sections provide details on how we achieved our performance
and accuracy in training and inference.
### Training accuracy results
Our results were obtained by running the `./platform/train_{tacotron2,waveglow}_{FP16,FP32}_DGX1_16GB_8GPU.sh`
training script in the PyTorch-19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
All of the results were produced using the `train.py` script as described in the
[Training process](#training-process) section of this document.
| Loss (Model/Epoch) | 1 | 250 | 500 | 750 | 1000 |
| :----------------: | ------: | ------: | ------: | ------: | ------: |
| Tacotron 2 FP16 | 13.0732 | 0.5736 | 0.4408 | 0.3923 | 0.3735 |
| Tacotron 2 FP32 | 8.5776 | 0.4807 | 0.3875 | 0.3421 | 0.3308 |
| WaveGlow FP16 | -2.2054 | -5.7602 | -5.901 | -5.9706 | -6.0258 |
| WaveGlow FP32 | -3.0327 | -5.858 | -6.0056 | -6.0613 | -6.1087 |
Tacotron 2 FP16 loss - batch size 80 (mean and std over 16 runs)
![](./img/tacotron2_amp_loss.png "Tacotron 2 FP16 loss")
Tacotron 2 FP32 loss - batch size 48 (mean and std over 16 runs)
![](./img/tacotron2_fp32_loss.png "Tacotron 2 FP32 loss")
WaveGlow FP16 loss - batch size 8 (mean and std over 16 runs)
![](./img/waveglow_fp16_loss.png "WaveGlow FP16 loss")
WaveGlow FP32 loss - batch size 4 (mean and std over 16 runs)
![](./img/waveglow_fp32_loss.png "WaveGlow FP32 loss")
### Training performance results
Our results were obtained by running the `./platform/train_{tacotron2,waveglow}_{FP16,FP32}_DGX1_16GB_8GPU.sh`
training script in the PyTorch-19.05-py3 NGC container on NVIDIA DGX-1 with
8x V100 16G GPUs. Performance numbers (in input tokens per second for
Tacotron 2 and output samples per second for WaveGlow) were averaged over
an entire training epoch.
This table shows the results for Tacotron 2:
|Number of GPUs|Batch size per GPU|Mixed precision tokens/sec|FP32 tokens/sec|Speed-up with mixed precision|Multi-GPU weak scaling with mixed precision|Multi-GPU weak scaling with FP32|
|---:|---:|---:|---:|---:|---:|---:|
|1|128@FP16, 64@FP32 | 3,746 | 2,087 | 1.79 | 1.00 | 1.00 |
|4|128@FP16, 64@FP32 | 13,264 | 8,052 | 1.65 | 3.54 | 3.86 |
|8|128@FP16, 64@FP32 | 25,056 | 15,863 | 1.58 | 6.69 | 7.60 |
The following table shows the results for WaveGlow:
|Number of GPUs|Batch size per GPU|Mixed precision samples/sec|FP32 samples/sec|Speed-up with mixed precision|Multi-GPU weak scaling with mixed precision|Multi-GPU weak scaling with FP32|
|---:|---:|---:|---:|---:|---:|---:|
|1| 10@FP16, 4@FP32 | 79248.87426 | 35695.56774 | 2.22 | 1.00 | 1.00 |
|4| 10@FP16, 4@FP32 | 275310.0262 | 126497.6265 | 2.18 | 3.47 | 3.54 |
|8| 10@FP16, 4@FP32 | 576709.4935 | 255155.1798 | 2.26 | 7.28 | 7.15 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Expected training time
The following table shows the expected training time for convergence for Tacotron 2 (1500 epochs):
|Number of GPUs|Batch size per GPU|Time to train with mixed precision (Hrs)|Time to train with FP32 (Hrs)|Speed-up with mixed precision|
|---:|---:|---:|---:|---:|
|1| 128@FP16, 64@FP32 | 137.33 | 227.66 | 1.66 |
|4| 128@FP16, 64@FP32 | 40.68 | 63.99 | 1.57 |
|8| 128@FP16, 64@FP32 | 20.74 | 32.47 | 1.57 |
The following table shows the expected training time for convergence for WaveGlow (1000 epochs):
|Number of GPUs|Batch size per GPU|Time to train with mixed precision (Hrs)|Time to train with FP32 (Hrs)|Speed-up with mixed precision|
|---:|---:|---:|---:|---:|
|1| 10@FP16, 4@FP32 | 358.00 | 793.97 | 2.22 |
|4| 10@FP16, 4@FP32 | 103.10 | 223.59 | 2.17 |
|8| 10@FP16, 4@FP32 | 50.40 | 109.45 | 2.17 |
### Inference performance results
Our results were obtained by running the `./inference.py` inference script in the
PyTorch-19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
Performance numbers (in input tokens per second for Tacotron 2 and output
samples per second for WaveGlow) were averaged over 16 runs.
The following table shows the inference performance results for Tacotron 2.
Results are measured in the number of input tokens per second.
|Number of GPUs|Mixed precision tokens/sec|FP32 tokens/sec|Speed-up with mixed precision|
|---:|---:|---:|---:|
|1|168|173|0.97|
The following table shows the inference performance results for WaveGlow.
Results are measured in the number of output audio samples per second.<sup>1</sup>
|Number of GPUs|Mixed precision samples/sec|FP32 samples/sec|Speed-up with mixed precision|
|---:|---:|---:|---:|
|1|583318|553380|1.05|
<sup>1</sup>With sampling rate equal to 22050, one second of audio is generated from 22050 samples.
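For example, at the measured mixed precision rate of 583,318 samples per second, WaveGlow produces roughly 26 seconds of audio per second of GPU time (583,318 / 22,050 ≈ 26.5).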
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
## Changelog
March 2019
* Initial release
June 2019
* AMP support
* Data preprocessing for Tacotron 2 training
* Fixed dropouts on LSTMCells
## Known issues
There are no known issues in this release.