Updating models

commit 0663b67c1a (parent f89dcca19d)

.gitignore (vendored, new file, 3 lines)
@ -0,0 +1,3 @@
repos.cfg
repos_init.cfg
nvtool*
@ -1,4 +1,3 @@
 Apache License
 Version 2.0, January 2004
 http://www.apache.org/licenses/
MxNet/Classification/RN50v1.5/__init__.py (new empty file)
@ -1,5 +1,7 @@
+# -----------------------------------------------------------------------
 # Copyright 2017-2018 The Apache Software Foundation
 #
+#
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements. See the NOTICE file
 # distributed with this work for additional information
PyTorch/Classification/RN50v1.5/img/.gitkeep (new empty file)
@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:19.03-py3
+FROM nvcr.io/nvidia/pytorch:19.05-py3
 
 # Set working directory
 WORKDIR /mlperf
@ -1,31 +0,0 @@
{
    "model": "",
    "ngpus": [1, 4, 8],
    "bs": [2, 4, 8, 16, 32, 64, 128],
    "metric_keys": ["images_per_second"],
    "metrics": {
        "1": {
            "2": {
                "images_per_second": 191.25867003414876
            },
            "4": {
                "images_per_second": 340.9537905548054
            },
            "8": {
                "images_per_second": 517.2612062140391
            },
            "16": {
                "images_per_second": 711.5516679788083
            },
            "32": {
                "images_per_second": 812.9203401838566
            },
            "64": {
                "images_per_second": 951.7432815456556
            },
            "128": {
                "images_per_second": 876.1868813828711
            }
        }
    }
}
@ -1,31 +0,0 @@
{
    "model": "",
    "ngpus": [1, 4, 8],
    "bs": [2, 4, 8, 16, 32, 64, 128],
    "metric_keys": ["images_per_second"],
    "metrics": {
        "1": {
            "2": {
                "images_per_second": 174.58768325581374
            },
            "4": {
                "images_per_second": 254.24180710755593
            },
            "8": {
                "images_per_second": 308.95847419165545
            },
            "16": {
                "images_per_second": 419.60746029488445
            },
            "32": {
                "images_per_second": 453.81433823995565
            },
            "64": {
                "images_per_second": 592.6385687558369
            },
            "128": {
                "images_per_second": 603.8453409148115
            }
        }
    }
}
@ -1,59 +0,0 @@
{
    "model": "",
    "ngpus": [1, 4, 8],
    "bs": [2, 4, 8, 16, 32, 64],
    "metric_keys": ["images_per_second"],
    "metrics": {
        "1": {
            "2": {
                "images_per_second": 40.71944999694824
            },
            "4": {
                "images_per_second": 68.22257804870605
            },
            "8": {
                "images_per_second": 121.42024612426758
            },
            "16": {
                "images_per_second": 159.56442260742188
            },
            "32": {
                "images_per_second": 185.69010543823242
            }
        },
        "4": {
            "2": {
                "images_per_second": 40.75998783111572
            },
            "4": {
                "images_per_second": 75.58991050720215
            },
            "8": {
                "images_per_second": 142.64888381958008
            },
            "16": {
                "images_per_second": 256.07005310058594
            },
            "32": {
                "images_per_second": 300.8989944458008
            }
        },
        "8": {
            "2": {
                "images_per_second": 61.28578186035156
            },
            "4": {
                "images_per_second": 119.46021270751953
            },
            "8": {
                "images_per_second": 231.7295379638672
            },
            "16": {
                "images_per_second": 430.5494079589844
            },
            "32": {
                "images_per_second": 454.2975769042969
            }
        }
    }
}
@ -1,59 +0,0 @@
{
    "model": "",
    "ngpus": [1, 4, 8],
    "bs": [2, 4, 8, 16, 32],
    "metric_keys": ["images_per_second"],
    "metrics": {
        "1": {
            "2": {
                "images_per_second": 48.635780334472656
            },
            "4": {
                "images_per_second": 66.06407419840494
            },
            "8": {
                "images_per_second": 83.91736857096353
            },
            "16": {
                "images_per_second": 102.67040761311848
            },
            "32": {
                "images_per_second": 110.02347819010416
            }
        },
        "4": {
            "2": {
                "images_per_second": 41.199180603027344
            },
            "4": {
                "images_per_second": 79.85076141357422
            },
            "8": {
                "images_per_second": 145.39981587727863
            },
            "16": {
                "images_per_second": 247.95855712890625
            },
            "32": {
                "images_per_second": 341.29132080078125
            }
        },
        "8": {
            "2": {
                "images_per_second": 63.07561111450195
            },
            "4": {
                "images_per_second": 123.25757344563802
            },
            "8": {
                "images_per_second": 237.3413340250651
            },
            "16": {
                "images_per_second": 376.59598795572913
            },
            "32": {
                "images_per_second": 507.9451497395833
            }
        }
    }
}
@ -1,34 +0,0 @@
{
   "bs" : [
      2,
      4,
      8,
      16,
      32
   ],
   "metric_keys" : [
      "images_per_second"
   ],
   "metrics" : {
      "1" : {
         "16" : {
            "images_per_second" : 470.099200788709
         },
         "2" : {
            "images_per_second" : 163.117099093173
         },
         "32" : {
            "images_per_second" : 520.538879400471
         },
         "4" : {
            "images_per_second" : 296.604178917743
         },
         "8" : {
            "images_per_second" : 412.522394180558
         }
      }
   },
   "ngpus" : [
      1
   ]
}
@ -1,34 +0,0 @@
{
   "bs" : [
      2,
      4,
      8,
      16,
      32
   ],
   "metric_keys" : [
      "images_per_second"
   ],
   "metrics" : {
      "1" : {
         "16" : {
            "images_per_second" : 280.570005994299
         },
         "2" : {
            "images_per_second" : 147.914221468741
         },
         "32" : {
            "images_per_second" : 302.430594818483
         },
         "4" : {
            "images_per_second" : 201.622430560779
         },
         "8" : {
            "images_per_second" : 228.159516872363
         }
      }
   },
   "ngpus" : [
      1
   ]
}
@ -1,52 +0,0 @@
{
   "bs" : [
      2,
      4,
      8,
      16,
      32
   ],
   "metric_keys" : [
      "images_per_second"
   ],
   "metrics" : {
      "1" : {
         "16" : {
            "images_per_second" : 192.623916625977
         },
         "2" : {
            "images_per_second" : 48.7488899230957
         },
         "32" : {
            "images_per_second" : 204.250648498535
         },
         "4" : {
            "images_per_second" : 95.4697418212891
         },
         "8" : {
            "images_per_second" : 164.66495513916
         }
      },
      "4" : {
         "16" : {
            "images_per_second" : 701.366027832031
         },
         "2" : {
            "images_per_second" : 154.449935913086
         },
         "32" : {
            "images_per_second" : 771.171325683594
         },
         "4" : {
            "images_per_second" : 300.332641601562
         },
         "8" : {
            "images_per_second" : 550.924163818359
         }
      }
   },
   "ngpus" : [
      1,
      4
   ]
}
@ -1,45 +0,0 @@
{
   "bs" : [
      2,
      4,
      8,
      16
   ],
   "metric_keys" : [
      "images_per_second"
   ],
   "metrics" : {
      "1" : {
         "16" : {
            "images_per_second" : 121.772495269775
         },
         "2" : {
            "images_per_second" : 56.0
         },
         "4" : {
            "images_per_second" : 90.5315437316895
         },
         "8" : {
            "images_per_second" : 103.113033294678
         }
      },
      "4" : {
         "16" : {
            "images_per_second" : 472.226806640625
         },
         "2" : {
            "images_per_second" : 184.061141967773
         },
         "4" : {
            "images_per_second" : 324.639801025391
         },
         "8" : {
            "images_per_second" : 391.055908203125
         }
      }
   },
   "ngpus" : [
      1,
      4
   ]
}
@ -1,34 +0,0 @@
{
   "bs" : [
      2,
      4,
      8,
      16,
      32
   ],
   "metric_keys" : [
      "images_per_second"
   ],
   "metrics" : {
      "1" : {
         "16" : {
            "images_per_second" : 478.225033
         },
         "2" : {
            "images_per_second" : 148.5965123
         },
         "32" : {
            "images_per_second" : 531.1827376
         },
         "4" : {
            "images_per_second" : 283.3305197
         },
         "8" : {
            "images_per_second" : 418.7012914
         }
      }
   },
   "ngpus" : [
      1
   ]
}
@ -1,34 +0,0 @@
{
   "bs" : [
      2,
      4,
      8,
      16,
      32
   ],
   "metric_keys" : [
      "images_per_second"
   ],
   "metrics" : {
      "1" : {
         "16" : {
            "images_per_second" : 280.4733254
         },
         "2" : {
            "images_per_second" : 143.8231571
         },
         "32" : {
            "images_per_second" : 305.4504603
         },
         "4" : {
            "images_per_second" : 202.6915644
         },
         "8" : {
            "images_per_second" : 230.262872
         }
      }
   },
   "ngpus" : [
      1
   ]
}
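All of the removed baseline files above share one layout: throughput numbers live under "metrics", keyed first by GPU count and then by batch size, both stored as strings. A minimal lookup sketch (the helper name and path argument are ours, for illustration only):

```python
import json

def baseline_throughput(path: str, ngpus: int, bs: int) -> float:
    # Baselines key the nested dicts by str(ngpus) and str(bs),
    # e.g. data["metrics"]["1"]["32"]["images_per_second"].
    with open(path) as f:
        data = json.load(f)
    return data["metrics"][str(ngpus)][str(bs)]["images_per_second"]
```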
@ -1,81 +0,0 @@
import argparse
import subprocess

from qa.qa_utils import compare_benchmarks, load_json, save_json, OKBLUE, ENDC, FAIL


# parsing
def parse_testscript_args():
    parser = argparse.ArgumentParser(description='PyTorch Benchmark Tests')
    parser.add_argument('--bs', default=[1], type=int, nargs='+')
    parser.add_argument('--ngpus', default=[1], type=int, nargs='+')
    parser.add_argument('--benchmark-mode', default='training', choices=['training', 'inference'],
                        help='benchmark training or inference', required=True)
    parser.add_argument('--bench-iterations', type=int, default=20, metavar='N',
                        help='Run N iterations while benchmarking (ignored when training and validation)')
    parser.add_argument('--bench-warmup', type=int, default=10, metavar='N',
                        help='Number of warmup iterations for benchmarking')
    parser.add_argument('--fp16', action='store_true', help='Run model in mixed precision.')
    parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                        help='number of data loading workers')
    parser.add_argument('--data', type=str, metavar='<PATH>', required=True,
                        help='path to the dataset')
    parser.add_argument('--results-file', default='experiment_raport.json', type=str,
                        help='file in which to store JSON experiment raport')
    parser.add_argument('--benchmark-file', type=str, metavar='FILE', required=True,
                        help='path to the file with baselines')
    return parser.parse_args()


# job command
command_template = 'python3 {launcher} qa/qa_perf_main.py --bs {bs} --ebs {bs} ' \
                   '--benchmark-mode {mode} --benchmark-warmup {bw} --benchmark-iterations {bi} {fp16} ' \
                   '--backbone resnet50 --seed 1 --data {data} --results-file {results_file} --benchmark-file {benchmark_file}'

if __name__ == '__main__':
    args = parse_testscript_args()

    fp16 = '--fp16' if args.fp16 else ''

    # create results json file
    # todo: maybe some template json file?
    results = {'ngpus': args.ngpus,
               'bs': args.bs,
               'metric_keys': ['images_per_second'],
               'metrics': {}}

    for gpu in args.ngpus:
        results['metrics'][str(gpu)] = {}
        for bs in args.bs:
            results['metrics'][str(gpu)][str(bs)] = {'images_per_second': None}

    save_json(args.results_file, results)

    # run qa_perf_main.py tests one by one
    for gpu in args.ngpus:
        launcher = '' if gpu == 1 else '-m torch.distributed.launch --nproc_per_node={}'.format(gpu)
        for bs in args.bs:
            print('#' * 80)
            command = command_template.format(launcher=launcher, bs=bs, workers=args.workers, mode=args.benchmark_mode,
                                              bw=args.bench_warmup, bi=args.bench_iterations, fp16=fp16,
                                              data=args.data, results_file=args.results_file,
                                              benchmark_file=args.benchmark_file)

            print('Running "{}"'.format(command))

            process = subprocess.Popen(command, shell=True)
            output, error = process.communicate()

            if error is not None:
                print(FAIL + 'Program exited with status {}. Data has not been collected'.format(error) + ENDC)
            # elif results['metrics'][str(gpu)][str(bs)]['images_per_second'] is None:
            #     print(WARNING + 'Program did not end sucessfully. Data has not been collected.' + ENDC)
            else:
                print(OKBLUE + 'Program ended sucessfully. Data has been collected.' + ENDC)

    results_data = load_json(args.results_file)
    benchmark_data = load_json(args.benchmark_file)
    exit_code = compare_benchmarks(results_data, benchmark_data, args, 0.16 if args.benchmark_mode == 'inference' else 0.1)
    print(exit_code)
    exit(exit_code)
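One caveat worth noting in the deleted runner above: with shell=True and no stdout/stderr pipes, Popen.communicate() returns (None, None), so the "error is not None" branch can never fire; the benchmark's exit status lives in the process return code. A stricter check could look like this sketch (ours, not part of the deleted script):

```python
import subprocess

def run_checked(command: str) -> int:
    # subprocess.run waits for completion and exposes the exit status directly.
    completed = subprocess.run(command, shell=True)
    return completed.returncode  # non-zero means the benchmark run failed
```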
@ -1 +0,0 @@
{"metric_keys": ["train.loss", "val.acc"], "metrics": {"train.loss": [8.812795396454991, 5.914838795058071, 6, 5.092440919584583, 4.887887316499735, 4.744666463422983, 4.694560192557922, 4.567333741479565, 4.492525351620137, 6, 4.408311570055099, 4.334232046614567, 6, 4.263646488106407, 4.2514614595596445, 4.2171871953656055, 4.206751160226014, 4.1795772798196715, 4.156515416099515, 6, 4.108870625495911, 4.0985876759066855, 4.075221928967139, 4.080158276849438, 6, 4.033980131669857, 4.037739227952915, 6, 3.99941903534935, 6, 3.9875937877263565, 3.971811039999583, 3.980771179282509, 3.953947089124455, 3.9305202960968018, 3.9366443781873546, 3.9252991879350754, 3.8827156307395367, 3.9388060424005102, 3.88922161618695, 3.8874285418914396, 6, 3.8936942113018453, 3.537499847891029, 3.4058184228089177, 6, 6, 3.3219671837627627, 3.295458280363458, 3.262115957955606, 6, 6, 6, 3.2190717260910433, 3.213117691627236, 3.1739242191397987, 3.1791626058811704, 3.2088054501854177, 3.1719801842385507, 3.187761370792139, 3.1809213312432236, 3.1823803410259397, 3.1752594631311677, 3.1709555600928425, 3.1823559530957817], "val.acc": [0.025120322205631106, 0.06065902615325462, 0.08224594352985645, 0.09868630608427395, 0.11402055039858493, 0.11779455253460233, 0.1232203941357061, 0.13708232144631768, 0.13614397127135028, 0.13289094380937685, 0.14004009449749777, 0.1369843423424096, 0.13877603069457692, 0.15418866425831707, 0.1500001994042602, 0.1542573219664272, 0.14771151227315413, 0.15896497766306272, 0.1600724682809656, 0.15881491661088476, 0.16213217020726906, 0.16466781280171408, 0.15738430149539484, 0.16634155547369375, 0.1623110334880526, 0.16394517553182106, 0.1494171026560053, 0.16762167601953265, 0.16063595691096758, 0.16982898253523193, 0.17321918229909394, 0.17242960413896102, 0.1625123530546557, 0.18330429802960516, 0.16333127233412115, 0.17973452067250242, 0.16699022570278652, 0.17183956548028687, 0.17168756775917593, 0.17547718325478198, 0.1750019046551496, 0.18416070771679066, 0.1711460087987496, 0.231325087097653, 0.23716038401167305, 0.23886896590018106, 0.2403412383214709, 0.24380227870861898, 0.24383605475007317, 0.2449733300818802, 0.24508423152154857, 0.24252172333110344, 0.24566254540226004, 0.24661345705692578, 0.25123807624083877, 0.25184439401895475, 0.2519010236397111, 0.25191664071239706, 0.2522156441636805, 0.25215053241008767, 0.2525434296889651, 0.2524917808636186, 0.2527410425201369, 0.2534121449798447, 0.25279479287831214]}, "bs": [64], "model": "", "ngpus": [8]}
@ -1 +0,0 @@
{"metric_keys": ["train.loss", "val.acc"], "metrics": {"train.loss": [9.887425426832973, 6.30290542835752, 5.566619733535567, 5.192713968618468, 4.943981836976963, 4.777146058311629, 4.682364774062644, 4.566371860462505, 4.479279315107254, 5, 4.398730874582149, 4.31779890601812, 4.293896813580043, 4.250142149529603, 4.219812418175577, 4.21572122303159, 4.187492328960302, 4.147948342119242, 4.134799897931028, 4.131298205737984, 4.071315974647822, 4.074750597299968, 4.0595350983882055, 4.042616275720722, 4.029284068070124, 4.02082926113012, 3.9983501902834298, 4.00984974094874, 3.9730074155799167, 5, 3.9646901324326294, 3.952598022061144, 3.944574903713043, 3.9182081201711596, 3.9252539055836775, 3.907297405092997, 3.8867245969813986, 3.87151758639573, 3.8793927009449254, 3.8687505586699107, 3.8750464156204956, 5, 3.8645522469516402, 3.504709825765618, 3.3920036476251862, 3.318732707260998, 5, 3.295415750237011, 3.2602547589347872, 5, 5, 5, 5, 3.199645553613854, 3.1623374312205086, 5, 3.147109237820821, 3.158245995575684, 3.1465386938319977, 3.1480963979746055, 3.151234711101482, 3.146022343739672, 3.1410668343956294, 3.142435818259893, 3.123337645718104], "val.acc": [0.01106397969239677, 0.04958324872172423, 0.07470961174804201, 0.08412781056028416, 0.1052591997157941, 0.11592629309116805, 0.1275672396324061, 0.12472585915140484, 0.13138377072048255, 0.1262696666605193, 0.13354663690485083, 0.14424123617821044, 0.14059169419863984, 0.14768715602101368, 0.15450788443085858, 0.14792122925940135, 0.1508861356435794, 0.157419558440425, 0.15279118544884585, 0.16075469826863828, 0.14747077091644412, 0.16340857637480236, 0.14427366437395484, 0.15709914018423293, 0.16324391683493303, 0.16440443232887508, 0.16479726175439752, 0.17508843799046686, 0.16142292492169025, 0.1643848499786872, 0.16912610131976924, 0.16376330941842296, 0.16894551721633602, 0.17771765128166106, 0.1749561896689298, 0.1695538322677119, 0.16778561571905298, 0.16380194923909086, 0.16994188486879763, 0.1716953661397215, 0.17755697810460197, 0.17187995479426885, 0.1742018462295355, 0.23426649845846764, 0.23613136034024038, 0.24175797706337981, 0.2425279583355936, 0.24352550398110506, 0.24411115979837528, 0.24656561042490024, 0.24383524308920906, 0.24686666489675338, 0.24814559219197632, 0.24840393696219026, 0.251965847689631, 0.25254138256097747, 0.2523565615073023, 0.2529904738785998, 0.253555154014026, 0.2530651493203877, 0.25358174010109197, 0.2537683728256746, 0.2539384684886946, 0.2540280117408162, 0.2534652864501853]}, "bs": [32], "model": "", "ngpus": [8]}
@ -1,20 +0,0 @@
{
   "metrics" : {
      "val.acc" : [
         0.0100971670737651
      ],
      "train.loss" : [
         9.85026645043801
      ]
   },
   "ngpus" : [
      8
   ],
   "metric_keys" : [
      "train.loss",
      "val.acc"
   ],
   "bs" : [
      64
   ]
}
@ -1,20 +0,0 @@
{
   "bs" : [
      32
   ],
   "metrics" : {
      "train.loss" : [
         8.79916159380589
      ],
      "val.acc" : [
         0.0238952010105531
      ]
   },
   "metric_keys" : [
      "train.loss",
      "val.acc"
   ],
   "ngpus" : [
      8
   ]
}
@ -1,73 +0,0 @@
# core imports
import os
import numpy as np

# pytorch imports
import torch
import torch.utils.data.distributed

# Apex imports
try:
    from apex.parallel.LARC import LARC
    from apex.parallel import DistributedDataParallel as DDP
    from apex.fp16_utils import *
except ImportError:
    raise ImportError("Please install APEX from https://github.com/nvidia/apex")

# project imports
from src.train import train_loop
from main import train, make_parser
from src.logger import Logger
from qa.qa_utils import load_json, create_json_file, compare_acc, save_json

RESULT = None


def add_benchmark_args(parser):
    parser.add_argument('--benchmark-mode', type=str, default='epoch-accuracy',
                        choices=['full-accuracy', 'epoch-accuracy'], required=True)
    parser.add_argument('--benchmark-file', type=str, default=None, metavar='FILE',
                        help='path to the file with baselines', required=True)
    return parser


def main(args):
    if args.local_rank == 0:
        os.makedirs('./models', exist_ok=True)

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    torch.backends.cudnn.benchmark = True

    if args.benchmark_mode == 'epoch-accuracy':
        args.epochs = 1

    train_loop_func = train_loop
    logger = Logger('Accuracy test', print_freq=10)

    args.evaluation = list(range(90))
    train(train_loop_func, logger, args)

    exit_code = 0
    if args.local_rank == 0:
        train_loss_results, val_acc_results, train_time_results = logger.print_results()
        print(train_time_results)
        print(train_loss_results)
        print(val_acc_results)
        measured_results = create_json_file(val_acc_results, train_loss_results, ngpus=8, bs=args.batch_size)
        save_json('/results/results.json', measured_results)
        print(measured_results)
        benchmark_results = load_json(args.benchmark_file)
        exit_code = compare_acc(measured_results, benchmark_results, args)
    exit(exit_code)


if __name__ == "__main__":
    parser = make_parser()
    parser = add_benchmark_args(parser)
    args = parser.parse_args()
    print(args)
    main(args)
@ -1,199 +0,0 @@
# core imports
import os
import numpy as np
import json
from pprint import pprint
import time

# pytorch imports
import torch
import torch.utils.data.distributed
from torch.autograd import Variable

# Apex imports
try:
    from apex.parallel.LARC import LARC
    from apex.parallel import DistributedDataParallel as DDP
    from apex.fp16_utils import *
except ImportError:
    raise ImportError("Please install APEX from https://github.com/nvidia/apex")

# project imports
from main import train, make_parser
from src.logger import BenchLogger
# from src.train import benchmark_inference_loop, benchmark_train_loop

from SSD import _C as C

RESULT = None


def add_benchmark_args(parser):
    parser.add_argument('--benchmark-mode', type=str, choices=['training', 'inference'],
                        default='inference', required=True)
    parser.add_argument('--results-file', default='experiment_raport.json', type=str,
                        help='file in which to store JSON experiment raport')
    parser.add_argument('--benchmark-file', type=str, default=None, metavar='FILE',
                        help='path to the file with baselines')
    return parser


def benchmark_train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
    start_time = None
    # tensor for results
    result = torch.zeros((1,)).cuda()
    for i, data in enumerate(loop(train_dataloader)):
        if i >= args.benchmark_warmup:
            start_time = time.time()

        img = data[0][0][0]
        bbox = data[0][1][0]
        label = data[0][2][0]
        label = label.type(torch.cuda.LongTensor)
        bbox_offsets = data[0][3][0]
        # handle random flipping outside of DALI for now
        bbox_offsets = bbox_offsets.cuda()
        img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False)

        if not args.no_cuda:
            img = img.cuda()
            bbox = bbox.cuda()
            label = label.cuda()
            bbox_offsets = bbox_offsets.cuda()
        img.sub_(mean).div_(std)

        N = img.shape[0]
        if bbox_offsets[-1].item() == 0:
            print("No labels in batch")
            continue
        bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5)

        M = bbox.shape[0] // N
        bbox = bbox.view(N, M, 4)
        label = label.view(N, M)

        ploc, plabel = model(img)
        ploc, plabel = ploc.float(), plabel.float()

        trans_bbox = bbox.transpose(1, 2).contiguous().cuda()

        if not args.no_cuda:
            label = label.cuda()
        gloc = Variable(trans_bbox, requires_grad=False)
        glabel = Variable(label, requires_grad=False)

        loss = loss_func(ploc, plabel, gloc, glabel)

        # loss scaling
        if args.fp16:
            if args.amp:
                with optim.scale_loss(loss) as scale_loss:
                    scale_loss.backward()
            else:
                optim.backward(loss)
        else:
            loss.backward()

        optim.step()
        optim.zero_grad()
        iteration += 1

        # reduce all results from every gpu
        if i >= args.benchmark_warmup + args.benchmark_iterations:
            result.data[0] = logger.print_result()
            if args.N_gpu > 1:
                torch.distributed.reduce(result, 0)
            if args.local_rank == 0:
                global RESULT
                RESULT = float(result.data[0])
            return

        if i >= args.benchmark_warmup:
            logger.update(args.batch_size, time.time() - start_time)


def loop(dataloader):
    while True:
        for data in dataloader:
            yield data


def benchmark_inference_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
    assert args.N_gpu == 1, 'Inference benchmark only on 1 gpu'
    start_time = None
    model.eval()
    i = -1
    dataloader = loop(val_dataloader)
    while True:
        i += 1
        with torch.no_grad():
            torch.cuda.synchronize()
            if i >= args.benchmark_warmup:
                start_time = time.time()
            data = next(dataloader)

            img = data[0]

            if not args.no_cuda:
                img = img.cuda()

            if args.fp16:
                img = img.half()

            img.sub_(mean).div_(std)
            img = Variable(img, requires_grad=False)
            _ = model(img)
            torch.cuda.synchronize()

            if i >= args.benchmark_warmup + args.benchmark_iterations:
                global RESULT
                RESULT = logger.print_result()
                return

            if i >= args.benchmark_warmup:
                logger.update(args.batch_size, time.time() - start_time)


def main(args):
    if args.local_rank == 0:
        os.makedirs('./models', exist_ok=True)

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    torch.backends.cudnn.benchmark = True

    if args.benchmark_mode == 'training':
        train_loop_func = benchmark_train_loop
        logger = BenchLogger('Training benchmark')
    else:
        train_loop_func = benchmark_inference_loop
        logger = BenchLogger('Inference benchmark')

    args.epochs = 1

    train(train_loop_func, logger, args)

    if args.local_rank == 0:
        global RESULT
        with open(args.results_file) as f:
            results = json.load(f)
        results['metrics'][str(args.N_gpu)][str(args.batch_size)] = {'images_per_second': RESULT}
        pprint(results)

        with open(args.results_file, 'w') as f:
            json.dump(results, f)


if __name__ == "__main__":
    parser = make_parser()
    parser = add_benchmark_args(parser)
    args = parser.parse_args()
    print(args)
    main(args)
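Both deleted loops above follow the same warmup-then-measure pattern: iterations below benchmark_warmup are discarded, the next benchmark_iterations are timed, and throughput is batch size over mean step time. Distilled into a standalone sketch (the function names are ours):

```python
import time

def measure_throughput(step, warmup: int, iters: int, batch_size: int) -> float:
    # Discard the first `warmup` steps, then time the next `iters` steps.
    elapsed = []
    for i in range(warmup + iters):
        start = time.time()
        step()  # one training or inference step
        if i >= warmup:
            elapsed.append(time.time() - start)
    return batch_size * len(elapsed) / sum(elapsed)  # images per second
```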
@ -1,115 +0,0 @@
import json

# terminal stdout colors
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'


# load results and benchmark
def load_json(filepath):
    with open(filepath) as f:
        data = json.load(f)
    return data


def save_json(filepath, data):
    with open(filepath, 'w') as f:
        json.dump(data, f)


# compare func
def compare(measured_value, true_value, pmargin=0.1):
    assert 0 < pmargin < 1, 'Margin should be in range (0, 1)'
    return (1 - pmargin) * true_value < measured_value


# compare 2 benchmark json files
def compare_benchmarks(results, benchmark, args, pmargin=0.1):
    # sanity check
    for metric in results['metric_keys']:
        if metric not in benchmark['metric_keys']:
            assert False, "You want to compare {} metric which doesn't appear in benchmark file".format(metric)

    assert len(args.bs) <= len(benchmark['bs']), 'len(args.bs) <= len(benchmark["bs"]) ({} <= {})'.format(
        len(args.bs), len(benchmark['bs']))
    assert len(args.bs) == len(results['bs']), 'len(args.bs) == len(results["bs"]) ({} == {})'.format(
        len(args.bs), len(results['bs']))
    for bs in results['bs']:
        if bs not in benchmark['bs']:
            assert False, "You want to compare batch size = {} which doesn't appear in benchmark file".format(bs)

    assert len(args.ngpus) <= len(benchmark['ngpus']), 'len(args.ngpus) <= len(benchmark["ngpus"]) ({} <= {})'.format(
        len(args.ngpus), len(benchmark['ngpus']))
    assert len(args.ngpus) == len(results['ngpus']), 'len(args.ngpus) == len(results["ngpus"]) ({} == {})'.format(
        len(args.ngpus), len(results['ngpus']))
    for gpu in results['ngpus']:
        if gpu not in benchmark['ngpus']:
            assert False, "You want to compare {} gpus results which don't appear in benchmark file".format(gpu)

    # compare measured numbers with benchmark
    exit = 0
    for metric in results['metric_keys']:
        for gpu in results['ngpus']:
            for bs in results['bs']:
                measured_metric = results['metrics'][str(gpu)][str(bs)][metric]
                ground_truth_metric = benchmark['metrics'][str(gpu)][str(bs)][metric]
                ok = compare(measured_metric, ground_truth_metric, pmargin)
                if ok:
                    print(OKGREEN + 'BENCHMARK PASSED: metric={} gpu={} bs={}'.format(metric, gpu, bs) + ENDC)
                else:
                    print(FAIL + 'BENCHMARK NOT PASSED: metric={} gpu={} bs={}'.format(metric, gpu, bs) + ENDC)
                    exit = 1
    return exit


# compare measured accuracy/loss curves with a benchmark json file
def compare_acc(results, benchmark, args):
    # sanity check
    for metric in results['metric_keys']:
        if metric not in benchmark['metric_keys']:
            assert False, "You want to compare {} metric which doesn't appear in benchmark file".format(metric)

    for bs in results['bs']:
        if bs not in benchmark['bs']:
            assert False, "You want to compare batch size = {} which doesn't appear in benchmark file".format(bs)

    for gpu in results['ngpus']:
        if gpu not in benchmark['ngpus']:
            assert False, "You want to compare {} gpus results which don't appear in benchmark file".format(gpu)

    # compare measured accuracy curve with benchmark
    for i, (result, ground_truth) in enumerate(zip(results['metrics']['val.acc'], benchmark['metrics']['val.acc'])):
        if i > 43:  # before the first decay, accuracy tends to vary by more than 15% at ~the 30th epoch
            if ground_truth * 0.9 > result:
                print(FAIL + 'ACCURACY TEST NOT PASSED' + ENDC)
                return 1

    # compare measured loss curve with benchmark
    for i, (result, ground_truth) in enumerate(zip(results['metrics']['train.loss'], benchmark['metrics']['train.loss'])):
        if i > 43:
            if ground_truth * 1.1 < result:
                print(FAIL + 'LOSS TEST NOT PASSED' + ENDC)
                return 1

    print(OKGREEN + 'ACCURACY TEST PASSED' + ENDC)
    return 0


def create_json_file(val_acc_results, train_loss_results, ngpus=8, bs=32):
    results = {"ngpus": [ngpus],
               "bs": [bs],
               "metric_keys": ["train.loss", "val.acc"],
               "metrics": {
                   "train.loss": [],
                   "val.acc": []
               }}

    for i, ((epoch1, acc), (epoch2, loss)) in enumerate(zip(val_acc_results, train_loss_results)):
        assert i == epoch1 == epoch2
        results['metrics']['train.loss'].append(loss)
        results['metrics']['val.acc'].append(acc)

    return results
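Note that compare is one-sided: a run only fails when it is more than pmargin slower than the baseline, while faster-than-baseline results always pass. A quick numeric check (the values are ours, for illustration):

```python
# 10% margin against a 500 img/s baseline: the pass threshold is 450 img/s.
assert (1 - 0.1) * 500.0 < 480.0        # 480 img/s passes
assert not ((1 - 0.1) * 500.0 < 430.0)  # 430 img/s fails
```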
@ -1,4 +0,0 @@
#!/bin/bash

python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 64 --fp16 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode epoch-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_19.01_fp16_1epoch_run_acc_baseline.json --data $1
@ -1,4 +0,0 @@
#!/bin/bash

python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 64 --fp16 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode full-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_18.08_fp16_full_run_acc_baseline.json --data $1
@ -1,4 +0,0 @@
#!/bin/bash

python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 32 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode epoch-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_19.01_fp32_1epoch_run_acc_baseline.json --data $1
@ -1,4 +0,0 @@
#!/bin/bash

python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 32 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode full-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_18.08_fp32_full_run_acc_baseline.json --data $1
@ -1,3 +0,0 @@
#!/bin/bash

python ./qa/benchmark_performance.py --benchmark-mode inference --ngpus 1 --bs 2 4 8 16 32 --fp16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.05_inference_fp16.json --data $1
@ -1,3 +0,0 @@
#!/bin/bash

python ./qa/benchmark_performance.py --benchmark-mode inference --ngpus 1 --bs 2 4 8 16 32 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.05_inference_fp32.json --data $1
@ -1,3 +0,0 @@
#!/bin/bash

python ./qa/benchmark_performance.py --benchmark-mode training --ngpus 1 4 --bs 2 4 8 16 32 --fp16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_training_fp16.json --data $1
@ -1,3 +0,0 @@
#!/bin/bash

python ./qa/benchmark_performance.py --benchmark-mode training --ngpus 1 4 --bs 2 4 8 16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_training_fp32.json --data $1
@ -35,9 +35,9 @@ class COCOPipeline(Pipeline):
         super(COCOPipeline, self).__init__(batch_size=batch_size, device_id=device_id,
                                            num_threads=num_threads, seed = seed)
 
-        try:
+        if torch.distributed.is_initialized():
             shard_id = torch.distributed.get_rank()
-        except RuntimeError:
+        else:
             shard_id = 0
 
         self.input = ops.COCOReader(file_root = file_root, annotations_file = annotations_file,
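The hunk above swaps exception-driven control flow for an explicit capability check: torch.distributed.get_rank() raises when no process group has been initialized, so the pipeline now asks is_initialized() first. The same guard as a standalone sketch (the helper name is ours, not part of the commit):

```python
import torch.distributed as dist

def get_shard_id() -> int:
    # Use the process rank as the data shard id in distributed runs,
    # and shard 0 for single-process runs.
    return dist.get_rank() if dist.is_initialized() else 0
```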
PyTorch/LanguageModeling/BERT/.dockerignore (new file, 3 lines)
@ -0,0 +1,3 @@
data/
vocab/
results/
PyTorch/LanguageModeling/BERT/.gitignore (vendored, new file, 129 lines)
@ -0,0 +1,129 @@
# Initially taken from Github's Python gitignore file

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

#Data
data/*/*/
data/*/*.zip

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# vscode
.vscode

# TF code
tensorflow_code

# Models
models
PyTorch/LanguageModeling/BERT/Dockerfile (new file, 27 lines)
@ -0,0 +1,27 @@
ARG FROM_IMAGE_NAME=gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.05-py3-devel
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract


#WORKDIR /opt
#RUN cd pytorch/apex \
# && git fetch origin pull/182/head:norm_fix \
# && git checkout norm_fix \
# && python setup.py develop --cuda_ext --cpp_ext


WORKDIR /opt
RUN cd pytorch/apex ; \
    pip uninstall apex; \
    pip uninstall apex; \
    git checkout master; \
    git pull; \
    pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .

WORKDIR /workspace
RUN git clone https://github.com/attardi/wikiextractor.git
RUN git clone https://github.com/soskek/bookcorpus.git

WORKDIR /workspace/bert
COPY . .
RUN pip install tqdm boto3 requests six ipdb h5py html2text nltk progressbar
PyTorch/LanguageModeling/BERT/LICENSE (new file, 202 lines)
@ -0,0 +1,202 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
   [Standard Apache License, Version 2.0 text: definitions, copyright and
   patent license grants, redistribution conditions, submission of
   contributions, trademark, warranty, and liability clauses, plus the
   appendix boilerplate notice ending with "limitations under the License."]
554
PyTorch/LanguageModeling/BERT/README.md
Normal file
@ -0,0 +1,554 @@
# BERT For PyTorch

This repository provides scripts and recipes to pretrain BERT on a dataset of your choice and achieve state-of-the-art accuracy on relevant fine-tuning tasks. This is tested and maintained by NVIDIA.

## Table Of Contents:
* [The model](#the-model)
  * [Default configuration](#default-configuration)
* [Setup](#setup)
  * [Requirements](#requirements)
* [Quick start guide](#quick-start-guide)
* [Details](#details)
  * [Command line options](#command-line-options)
  * [Getting the data](#getting-the-data)
  * [Training process](#training-process)
    * [Pre-training](#pre-training)
    * [Fine tuning](#fine-tuning)
  * [Enabling mixed precision](#enabling-mixed-precision)
  * [Inference process](#inference-process)
* [Benchmarking](#benchmarking)
  * [Training performance benchmark](#training-performance-benchmark)
  * [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
  * [Training accuracy results](#training-accuracy-results)
    * [Training stability test](#training-stability-test)
  * [Training performance results](#training-performance-results)
    * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
    * [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g)
    * [NVIDIA DGX-2 (16x V100 32G)](#nvidia-dgx-2-16x-v100-32g)
  * [Inference performance results](#inference-performance-results)
    * [NVIDIA DGX-1 16G (1x V100 16G)](#nvidia-dgx-1-16g-1x-v100-16g)
    * [NVIDIA DGX-1 32G (1x V100 32G)](#nvidia-dgx-1-32g-1x-v100-32g)
    * [NVIDIA DGX-2 32G (1x V100 32G)](#nvidia-dgx-2-32g-1x-v100-32g)
* [Changelog](#changelog)
* [Known issues](#known-issues)

## The model

BERT, or Bidirectional Encoder Representations from Transformers, is a new method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) paper. NVIDIA's BERT 19.04 is an optimized version of [Google's official implementation](https://github.com/google-research/bert), leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy.

The repository also contains scripts to interactively launch data download, training, benchmarking and inference routines in a Docker container for both pretraining and fine tuning for Question Answering. The major differences between the official implementation of the paper and our version of BERT are as follows:

- [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) previously required two steps:
  1. Porting the model to use the FP16 data type where appropriate.
  2. Manually adding loss scaling to preserve small gradient values.

Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP) library from [APEX](https://github.com/NVIDIA/apex), which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be applied easily by using the `scale_loss()` method provided by AMP. The scaling value can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.

For an in-depth walkthrough on AMP, check out sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage tensor core performance.
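
As a concrete illustration, here is a minimal sketch of AMP with dynamic loss scaling on a toy model. The model, optimizer, and `O2` opt level are illustrative assumptions, not the exact settings used by the training scripts in this repository:

```python
import torch
from apex import amp

# Toy model and optimizer; stand-ins for the real BERT model and its optimizer.
model = torch.nn.Linear(768, 2).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-6)

# Wrap model and optimizer for mixed precision with dynamic loss scaling.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic")

inputs = torch.randn(8, 768, device="cuda")
loss = model(inputs).float().pow(2).mean()

optimizer.zero_grad()
# scale_loss() multiplies the loss before backward so small gradients survive
# FP16; gradients are unscaled again before optimizer.step().
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
```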

- Scripts to download the datasets for:
  - Pre-training - [Wikipedia](https://dumps.wikimedia.org/), [BookCorpus](http://yknzhu.wixsite.com/mbweb)
  - Fine tuning - [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) (Stanford Question Answering Dataset), plus pretrained weights from Google
- Custom fused CUDA kernels for faster computations
- Multi-GPU/Multi-node support using [APEX DDP](https://github.com/NVIDIA/apex#2-distributed-training)

These techniques and optimizations improve model performance and reduce training time, allowing you to perform various NLP tasks with no additional effort.

Other publicly available implementations of BERT include:
1. [Hugging Face](https://github.com/huggingface/pytorch-pretrained-BERT)
2. [codertimo](https://github.com/codertimo/BERT-pytorch)

This model trains with mixed precision tensor cores on Volta, so researchers can get results much faster than training without tensor cores. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.

### Default configuration

BERT's model architecture is a multi-layer bidirectional Transformer encoder. Based on the model size, we have the following two default configurations of BERT.

| **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feedforward filter size** | **Max sequence length** | **Parameters** |
|:---------:|:-----------------:|:--------------------:|:-------------------:|:---------------------------:|:-----------------------:|:--------------:|
| BERT-Base | 12 encoder layers | 768 | 12 | 4 x 768 | 512 | 110M |
| BERT-Large | 24 encoder layers | 1024 | 16 | 4 x 1024 | 512 | 330M |
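
As a rough sanity check on the parameter counts above, the following back-of-the-envelope estimate counts only the token embeddings, attention projections, and feedforward weights (ignoring biases, LayerNorm, and the position/type embeddings), which is enough to land near the quoted totals:

```python
def approx_bert_params(layers, hidden, ff_mult=4, vocab=30522):
    embeddings = vocab * hidden                    # token embedding matrix
    attention = 4 * hidden * hidden                # Q, K, V, and output projections
    feedforward = 2 * hidden * (ff_mult * hidden)  # two FFN weight matrices
    return embeddings + layers * (attention + feedforward)

print(f"BERT-Base  ~{approx_bert_params(12, 768) / 1e6:.0f}M parameters")   # ~108M
print(f"BERT-Large ~{approx_bert_params(24, 1024) / 1e6:.0f}M parameters")  # ~333M
```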
## Setup

The following section lists the requirements in order to start training the BERT model.

### Requirements

This repository contains a `Dockerfile` which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [PyTorch 19.04-py3](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) NGC container
- [NVIDIA Volta based GPU](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)

For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
- [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
## Quick start guide

To pretrain or fine tune your model for Question Answering using mixed precision with tensor cores or using FP32, perform the following steps using the default parameters of the BERT model.

### 1. Clone the repository.

```bash
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/LanguageModeling/BERT
```

### 2. Build the BERT PyTorch NGC container.

```bash
bash scripts/docker/build.sh
```

### 3. Download and preprocess the dataset.

This repository provides scripts to download, verify and extract various datasets: SQuAD and SWAG for fine-tuning, as well as Wikipedia and BookCorpus for pretraining. If you only want to do fine-tuning, you can also download the pretrained weights.

To download, verify, and extract the required datasets:

```bash
bash scripts/data_download.sh
```

The script launches a Docker container with the current directory mounted and downloads the datasets to the `data/` folder on the host.

Datasets can also be mixed before being used for training or inference; see [Mixing datasets](#mixing-datasets) for the available options.
### 4. Start an interactive session in the NGC container to run training/inference.

After you build the container image and download the data, you can start an interactive CLI session as follows:

```bash
bash scripts/docker/launch.sh
```

The `launch.sh` script assumes that the datasets are in the following locations by default after downloading the data:
- SQuAD v1.1 - `data/squad/v1.1`
- BERT - `data/pretrained_models_google/uncased_L-24_H-1024_A-16`
- Wikipedia - `data/wikipedia_corpus/hdf5_shards`
- BookCorpus - `data/bookcorpus/hdf5_shards`
### 5. Start pre-training.

BERT is designed to pre-train deep bidirectional language representations. The following scripts replicate pretraining on Wikipedia + BookCorpus from the [paper](https://arxiv.org/pdf/1810.04805.pdf). These scripts are general and can be used for pretraining language representations on any corpus of choice.

From within the container, you can use the following script to run pre-training:

```bash
bash scripts/run_pretraining.sh <train_batch_size_per_gpu> <eval_batch_size> <learning_rate> <precision> <num_gpus> <warmup_steps> <train_steps> <save_checkpoint_steps> <create_logfile>
```

<!-- For FP16 training with XLA using a DGX-1 V100 32G, run:
```bash
bash scripts/run_pretraining.sh 14 8 5e-5 fp16_xla 8 5000 2285000 5000 true
```

For FP32 training without XLA using a DGX-1 V100 32G, run:
```bash
bash scripts/run_pretraining.sh 6 6 2e-5 fp32 8 2000 5333333 5000 true
``` -->
### 6. Start fine tuning.

The pretrained BERT representations can be fine tuned with just one additional output layer for a state-of-the-art Question Answering system. From within the container, you can use the following script to run fine-tuning for SQuAD:

```bash
bash scripts/run_squad.sh <batch_size_per_gpu> <learning_rate_per_gpu> <precision> <num_gpus> <checkpoint> <epochs>
```

For FP32 training using a DGX-1 V100 32G, run:

```bash
bash scripts/run_squad.sh 5 5e-6 fp32 8 /bert/bert_model.ckpt 2
```

### 7. Start validation/evaluation.

The `run_squad_inference.sh` script runs inference on a checkpoint fine tuned for SQuAD and evaluates the quality of predictions in terms of exact match and F1 score:

```bash
bash scripts/run_squad_inference.sh <init_checkpoint> <batch_size> <precision>
```

For FP32 inference without XLA using a DGX-1 V100 32G, run:

```bash
bash scripts/run_squad_inference.sh /results/model.ckpt 8 fp32
```
## Details

The following sections provide greater details of the dataset, running training and inference, and the training results.

### Command line options

To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example:

```bash
python run_pretraining.py --help
python run_squad.py --help
```

Aside from options to set hyperparameters, the relevant options to control the behaviour of the `run_pretraining.py` script are:

```bash
--[no]amp: Whether to enable AMP ops. (default: 'false')
--[no]amp_fastmath: Whether to enable AMP fastmath ops. (default: 'false')
--bert_config_file: The config json file corresponding to the pre-trained BERT model. This specifies the model architecture.
--[no]do_eval: Whether to run evaluation on the dev set. (default: 'false')
--[no]do_train: Whether to run training. (default: 'false')
--eval_batch_size: Total batch size for eval. (default: '8') (an integer)
--[no]fastmath: Whether to enable the loss scaler for fastmath ops. (default: 'false')
--[no]horovod: Whether to use Horovod for multi-GPU runs. (default: 'false')
--init_checkpoint: Initial checkpoint (usually from a pre-trained BERT model).
--input_file: Input TF example files (can be a glob or comma separated).
--iterations_per_loop: How many steps to make in each estimator call. (default: '1000')
```

Aside from options to set hyperparameters, some relevant options to control the behaviour of the `run_squad.py` script are:

```bash
--bert_config_file: The config json file corresponding to the pre-trained BERT model. This specifies the model architecture.
--[no]do_predict: Whether to run evaluation on the dev set. (default: 'false')
--[no]do_train: Whether to run training. (default: 'false')
--learning_rate: The initial learning rate for Adam. (default: '5e-06') (a number)
--max_answer_length: The maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another. (default: '30') (an integer)
--max_query_length: The maximum number of tokens for the question. Questions longer than this will be truncated to this length. (default: '64') (an integer)
--max_seq_length: The maximum total input sequence length after WordPiece tokenization. Sequences longer than this will be truncated, and sequences shorter than this will be padded. (default: '384') (an integer)
--predict_batch_size: Total batch size for predictions. (default: '8') (an integer)
--train_batch_size: Total batch size for training. (default: '8') (an integer)
--[no]use_fp16: Whether to use FP32 or FP16 arithmetic on the GPU. (default: 'false')
--[no]use_xla: Whether to enable XLA JIT compilation. (default: 'false')
--[no]verbose_logging: If true, all of the warnings related to data processing will be printed. A number of warnings are expected for a normal SQuAD evaluation. (default: 'false')
--[no]version_2_with_negative: If true, the SQuAD examples contain some that do not have an answer. (default: 'false')
```
### Getting the data

For pre-training BERT, we use the concatenation of Wikipedia (2500M words) and BookCorpus (800M words). For Wikipedia, we extract only the text passages from [here](ftp://ftpmirror.your.org/pub/wikimedia/dumps/enwiki/20190301/enwiki-20190301-pages-articles-multistream.xml.bz2) and ignore headers, lists, and tables. It is structured as a document-level corpus rather than a shuffled sentence-level corpus because it is critical to extract long contiguous sentences. The next step is to run `create_pretraining_data.py` with the document-level corpus as input, which generates input data and labels for the masked language modeling and next sentence prediction tasks. Pre-training can also be performed on any corpus of your choice. The collection of data generation scripts is intended to be modular, to allow modifications for additional preprocessing steps or to use additional data.
#### Mixing datasets

The repository provides tools to mix datasets for both training and fine-tuning. In the case of training there are two options:

a) inter sequence-pair mixing (after pretraining data is created)

In the `data/` directory, `merge_datasets_after_creation.sh` is a tool to mix data from multiple source corpora. To perform this mixing, the source corpora need to already be in the format of pretraining data, i.e. .hdf5 files. To call the script, use:

```bash
cd data
bash merge_datasets_after_creation.sh <destination_folder> <input_directories> <num_shards>
```

For example, to merge the BookCorpus and Wikipedia corpora provided with this repository and create 1024 new shards containing the mixed training instances, first make sure that `data/bookcorpus/hdf5_shards/` and `data/wikipedia_corpus/hdf5_shards/` exist and are filled with .hdf5 files, then run:

```bash
cd data
bash merge_datasets_after_creation.sh inter_instance_merged_wiki+books bookcorpus/hdf5_shards/,wikipedia_corpus/hdf5_shards/ 1024
```
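
Conceptually, inter sequence-pair mixing pools already-created training instances from several corpora and reshards them. The following Python sketch illustrates that idea, assuming the shards use the HDF5 datasets written by `create_pretraining_data.py` (included later in this commit); it is an illustration, not the actual implementation of `merge_datasets_after_creation.sh`:

```python
import glob
import h5py
import numpy as np

# Keys written by create_pretraining_data.py for each training instance.
KEYS = ["input_ids", "input_mask", "segment_ids",
        "masked_lm_positions", "masked_lm_ids", "next_sentence_labels"]

def merge_shards(input_dirs, dest_prefix, num_shards, seed=12345):
    # Pool instances from every source shard.
    pooled = {k: [] for k in KEYS}
    for d in input_dirs:
        for path in sorted(glob.glob(d + "/*.hdf5")):
            with h5py.File(path, "r") as f:
                for k in KEYS:
                    pooled[k].append(f[k][:])
    data = {k: np.concatenate(v) for k, v in pooled.items()}

    # Shuffle all instances with a single permutation so rows stay aligned,
    # then write them back out as num_shards new shards.
    perm = np.random.RandomState(seed).permutation(len(data["input_ids"]))
    for i, idx in enumerate(np.array_split(perm, num_shards)):
        with h5py.File(f"{dest_prefix}_{i:04d}.hdf5", "w") as f:
            for k in KEYS:
                f.create_dataset(k, data=data[k][idx], compression="gzip")
```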

b) intra sequence-pair mixing (before pretraining data is created)

In the `data/` directory, `merge_datasets_from_start.sh` is a tool to mix data from multiple source corpora. To perform this mixing, the source corpora must each be condensed into a single file that contains the entire corpus text, with each line within the file corresponding to a document in the corpus. The script is then called as follows:

```bash
cd data
bash merge_datasets_from_start.sh DESTINATION_FOLDER CORPUS_1 CORPUS_2 CORPUS_3 ...
```

For example, to merge the BookCorpus and Wikipedia corpora provided with this repository, first make sure that `data/bookcorpus/intermediate_files/bookcorpus.txt` and `data/wikipedia_corpus/intermediate_files/wikipedia.txt` exist, then run:

```bash
cd data
bash merge_datasets_from_start.sh intra_instance_merged_wiki+books bookcorpus/intermediate_files/bookcorpus.txt wikipedia_corpus/intermediate_files/wikipedia.txt
```

Note that `merge_datasets_from_start.sh` has a few dependencies, so it may be preferable to modify `data_download_helper.sh` to call the merging script and run `data_download.sh`, so that the mixing process is done in a container.
#### Fine tuning datasets

We can use a pre-trained BERT model for other fine tuning tasks like Question Answering. We use SQuAD for this task. SQuAD v1.1 has 100,000+ question-answer pairs on 500+ articles. SQuAD v2.0 combines v1.1 with an additional 50,000 new unanswerable questions, so a model must not only answer questions when possible but also determine when no answer is supported.
### Training process

The training process consists of two steps: pre-training and fine tuning.

#### Pre-training

Pre-training is performed using the `run_pretraining.py` script along with parameters defined in `scripts/run_pretraining.sh`.

The `run_pretraining.sh` script runs a job on a single node that trains the BERT-large model from scratch using the Wikipedia and BookCorpus datasets as training data. By default, the training script:
- Runs on 8 GPUs with a training batch size of 14 and an evaluation batch size of 8 per GPU.
- Has FP16 precision enabled.
- Runs for 1,144,000 steps with 10,000 warm-up steps.
- Saves a checkpoint every 5000 iterations (keeps only the latest checkpoint) and at the end of training. All checkpoints, evaluation results and training logs are saved to the `/results` directory (in the container, which can be mounted to a local directory).
- Creates a log file containing all the output.
- Evaluates the model at the end of training. To skip evaluation, modify `--do_eval` to `False`.

These parameters will train Wikipedia + BookCorpus to reasonable accuracy on a DGX-1 with 32G V100 cards. If you want to match Google's best results from the BERT paper, you should either train for twice as many steps (2,288,000 steps) on a DGX-1, or train on 16 GPUs on a DGX-2. With its 16 GPUs, a DGX-2 fits a global batch size twice as large as a DGX-1 (224 vs. 112), so it can finish in half as many steps, as the sketch below makes explicit.
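
A quick check of that arithmetic, using the default per-GPU batch size of 14 from the list above:

```python
per_gpu_batch = 14

dgx1_global_batch = 8 * per_gpu_batch    # 112 sequences per step
dgx2_global_batch = 16 * per_gpu_batch   # 224 sequences per step

# Matching Google's best results requires the same total number of training
# sequences, so doubling the global batch halves the required steps.
dgx1_steps = 2288000
dgx2_steps = dgx1_steps * dgx1_global_batch // dgx2_global_batch  # 1144000

print(dgx1_global_batch, dgx2_global_batch, dgx2_steps)
```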

The general usage is:

```bash
bash scripts/run_pretraining.sh <training_batch_size> <eval_batch_size> <learning_rate> <precision> <num_gpus> <warmup_steps> <training_steps> <save_checkpoint_steps> <create_logfile>
```
Where:
- `<training_batch_size>` is the per-GPU batch size used for training. The feasible batch size varies with `<precision>`; larger batch sizes run more efficiently, but require more memory.
- `<eval_batch_size>` is the per-GPU batch size used for evaluation after training.
- `<learning_rate>` is the base learning rate; the default of 1e-4 is good for a global batch size of 256.
- `<precision>` is the type of math used in your model; it can be one of fp32, fp16, fp16_xla, fastmath, amp_fm, amp_fm_xla, amp, or amp_xla. The options mean:
  - fp32: 32-bit IEEE single precision floats.
  - fp16: Hand-coded mixed precision 16- and 32-bit floats.
  - fp16_xla: Hand-coded mixed precision floats, JIT compiled with XLA.
  - fastmath: Matmuls done by tensor cores in mixed precision, the rest done in FP32.
  - amp_fm: Alternative FastMath implementation that works by manipulating TensorFlow's compute graph.
  - amp_fm_xla: The amp_fm flag plus XLA JIT compilation.
  - amp: Automatic rewrite of the TensorFlow compute graph to take advantage of 16-bit arithmetic whenever that is safe.
  - amp_xla: The amp flag plus XLA JIT compilation.
- `<num_gpus>` is the number of GPUs to use for training. It must be equal to or smaller than the number of GPUs attached to your node.
- `<warmup_steps>` is the number of warm-up steps at the start of training.
- `<training_steps>` is the total number of training steps.
- `<save_checkpoint_steps>` controls how often checkpoints are saved. The default is 5000 steps.
- `<create_logfile>` is a flag indicating whether output should be written to a log file (acceptable values are 'true' or 'false'; 'true' indicates output should be saved to a log file).
For example:

```bash
bash scripts/run_pretraining.sh 14 8 1e-4 fp16_xla 16 10000 1144000 5000 true
```

This trains BERT-large from scratch on a single DGX-2 using FP16 arithmetic and takes around 156 hours (6.5 days). Checkpoints are written out every 5000 steps and all printouts are saved to a log file.
#### Fine tuning

Fine tuning is performed using the `run_squad.py` script along with parameters defined in `scripts/run_squad.sh`.

The `run_squad.sh` script trains a model and performs evaluation on the SQuAD v1.1 dataset. By default, the training script:
- Uses 8 GPUs and a batch size of 10 on each GPU.
- Has FP16 precision enabled.
- Has XLA enabled.
- Runs for 2 epochs.
- Saves a checkpoint every 1000 iterations (keeps only the latest checkpoint) and at the end of training. All checkpoints, evaluation results and training logs are saved to the `/results` directory (in the container, which can be mounted to a local directory).
- Evaluates at the end of training. To skip evaluation, modify `--do_predict` to `False`.
This script outputs checkpoints to the `/results` directory, by default, inside the container. The mount point of `/results` can be changed in the `scripts/docker/launch.sh` file. The training log contains information about:
- Loss for the final step
- Training and evaluation performance
- F1 and exact match score on the dev set of SQuAD after evaluation.

The summary after training is printed in the following format:

```bash
I0312 23:10:45.137036 140287431493376 run_squad.py:1332] 0 Total Training Time = 3007.00 Training Time W/O start up overhead = 2855.92 Sentences processed = 175176
I0312 23:10:45.137243 140287431493376 run_squad.py:1333] 0 Training Performance = 61.3378 sentences/sec
I0312 23:14:00.550846 140287431493376 run_squad.py:1396] 0 Total Inference Time = 145.46 Inference Time W/O start up overhead = 131.86 Sentences processed = 10840
I0312 23:14:00.550973 140287431493376 run_squad.py:1397] 0 Inference Performance = 82.2095 sentences/sec
{"exact_match": 83.69914853358561, "f1": 90.8477003317459}
```
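
If you want to track these numbers across runs, the throughput lines are easy to scrape. A minimal sketch, assuming the exact log line format shown above (the format is not guaranteed to be stable across versions):

```python
import re

# Matches e.g. "0 Training Performance = 61.3378 sentences/sec"
PERF = re.compile(r"(Training|Inference) Performance = ([0-9.]+) sentences/sec")

def throughput(log_path):
    results = {}
    with open(log_path) as f:
        for line in f:
            m = PERF.search(line)
            if m:
                results[m.group(1).lower()] = float(m.group(2))
    return results  # e.g. {'training': 61.3378, 'inference': 82.2095}
```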

Multi-GPU training is enabled with the Horovod TensorFlow module. The following example runs training on 8 GPUs:

```bash
mpi_command="mpirun -np 8 -H localhost:8 \
    --allow-run-as-root -bind-to none -map-by slot \
    -x NCCL_DEBUG=INFO \
    -x LD_LIBRARY_PATH \
    -x PATH -mca pml ob1 -mca btl ^openib" \
python run_squad.py --horovod
```
### Enabling mixed precision

[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.

This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code. AMP enables mixed precision training on Volta and Turing GPUs automatically. The TensorFlow framework code makes all necessary model changes internally.

In TF-AMP, the computational graph is optimized to use as few casts as necessary and to maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing `tf.contrib` loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling.

For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
### Inference process

Inference on a fine tuned Question Answering system is performed using the `run_squad.py` script along with parameters defined in `scripts/run_squad_inference.sh`. Inference is supported on a single GPU at this moment.

The `run_squad_inference.sh` script runs inference on a trained model and performs evaluation on the SQuAD v1.1 dataset. By default, the inference script:
- Has FP16 precision enabled
- Has XLA enabled
- Evaluates the latest checkpoint present in `/results` with a batch size of 8

This script outputs a predictions file to `/results/predictions.json` and computes the F1 score and exact match score using SQuAD's `evaluate-v1.1.py`. The mount point of `/results` can be changed in the `scripts/docker/launch.sh` file.

The output log contains information about:
- Evaluation performance
- F1 and exact match score on the dev set of SQuAD after evaluation.
The summary after inference is printed in the following format:

```bash
I0312 23:14:00.550846 140287431493376 run_squad.py:1396] 0 Total Inference Time = 145.46 Inference Time W/O start up overhead = 131.86 Sentences processed = 10840
I0312 23:14:00.550973 140287431493376 run_squad.py:1397] 0 Inference Performance = 82.2095 sentences/sec
{"exact_match": 83.69914853358561, "f1": 90.8477003317459}
```
## Benchmarking

The following section shows how to run benchmarks measuring the model performance in training and inference modes.

Benchmarking can be performed for both training and inference. Both scripts run the BERT model for fine tuning. You can specify whether benchmarking is performed in FP16 or FP32 by passing it as an argument to the benchmarking scripts. Both of these benchmarking scripts enable you to run a number of epochs and extract performance numbers.

### Training performance benchmark

Training benchmarking can be performed by running the script:

```bash
scripts/finetune_train_benchmark.sh squad <fp16/fp32> <use_xla> <num_gpu> <batch_size/gpu> <lr>
```

### Inference performance benchmark

Inference benchmarking can be performed by running the script:

```bash
scripts/finetune_inference_benchmark.sh squad <fp16/fp32> <use_xla> <batch_size> <path-to-checkpoint>
```
## Results

The following sections provide details on how we achieved our performance and accuracy in training and inference for Question Answering fine tuning.

### Training accuracy results

Our results were obtained by running the `run_squad.py` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs.

| **Number of GPUs** | **Batch size per GPU** | **Training time with FP16 (Hrs)** | **Training time with FP32 (Hrs)** |
|:---:|:---:|:----:|:----:|
| 8 | 4 | | |

#### Training stability test

The following tables compare `F1` scores across 5 different training runs with different seeds, for both FP16 and FP32. The runs showcase consistent convergence on all 5 seeds with very little deviation.

| **FP16, 8x GPUs** | **seed #1** | **seed #2** | **seed #3** | **seed #4** | **seed #5** | **mean** | **std** |
|:-----------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|
| F1 | | | | | | | |
| Exact match | | | | | | | |

| **FP32, 8x GPUs** | **seed #1** | **seed #2** | **seed #3** | **seed #4** | **seed #5** | **mean** | **std** |
|:-----------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|
| F1 | | | | | | | |
| Exact match | | | | | | | |
### Training performance results

Our results were obtained by running batch sizes of up to 4 per GPU on a 16G V100 and up to 14 per GPU on a 32G V100 with mixed precision.
#### NVIDIA DGX-1 (8x V100 16G)

Our results were obtained by running the `scripts/run_pretraining.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.

| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|:---:|:---:|:------:|:-----:|:----:|:----:|:----:|
| 1 | 2 | 5.48 | 18.97 | 3.46 | 1.0 | 1.0 |
| 4 | 2 | 19.6 | 60.6 | 3.09 | 3.57 | 3.2 |
| 8 | 2 | 39.21 | 121.21 | 3.09 | 7.15 | 6.38 |

| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|:---:|:---:|:-----:|:-----:|:---:|:---:|:----:|
| 1 | 4 | - | 19.46 | - | - | 1.0 |
| 4 | 4 | - | 75.67 | - | - | 3.88 |
| 8 | 4 | - | 151.35 | - | - | 7.77 |

Note: The respective values for FP32 runs that use a batch size of 4 are not available due to out-of-memory errors. A batch size of 4 is only possible when using FP16.

To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-1 (8x V100 32G)

Our results were obtained by running the `scripts/run_pretraining.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.

| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|---|---|-----|-----|----|----|----|
| 1 | 7 | 7.56 | 24.29 | 3.21 | 1.0 | 1.0 |
| 4 | 7 | 28.84 | 86.24 | 2.99 | 3.81 | 3.55 |
| 8 | 7 | 57.68 | 172.48 | 2.99 | 7.62 | 7.10 |

| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|---|---|-----|-------|---|---|----|
| 1 | 14 | - | 26.04 | - | - | 1.0 |
| 4 | 14 | - | 99.68 | - | - | 3.87 |
| 8 | 14 | - | 199.35 | - | - | 7.65 |

Note: The respective values for FP32 runs that use a batch size of 14 are not available due to out-of-memory errors. A batch size of 14 is only possible when using FP16.

To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-2 (16x V100 32G)

Our results were obtained by running the `scripts/run_pretraining.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.

| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|---|---|------|------|----|-----|----|
| 1 | 7 | 8.47 | 26.04 | 3.07 | 1.0 | 1.0 |
| 4 | 7 | 32.2 | 92.68 | 2.87 | 3.8 | 3.80 |
| 8 | 7 | 63.84 | 183.68 | 2.87 | 7.53 | 7.05 |
| 16 | 7 | 126.56 | 365.12 | 2.87 | 14.94 | 14.02 |

| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|---|---|---|------|---|---|----|
| 1 | 14 | - | 28.28 | - | - | 1.0 |
| 4 | 14 | - | 103.6 | - | - | 3.66 |
| 8 | 14 | - | 208.32 | - | - | 7.36 |
| 16 | 14 | - | 416.64 | - | - | 14.73 |

Note: The respective values for FP32 runs that use a batch size of 14 are not available due to out-of-memory errors. A batch size of 14 is only possible when using FP16.

To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
### Inference performance results

#### NVIDIA DGX-1 16G (1x V100 16G)

Our results were obtained by running the `scripts/run_squad_inference.sh` inference script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 1x V100 16G GPU. Performance numbers (in sentences per second) were averaged over the entire inference run.

| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
|---|---|-----|------|----|
| 1 | 8 | | | |

To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.

#### NVIDIA DGX-1 32G (1x V100 32G)

Our results were obtained by running the `scripts/run_squad_inference.sh` inference script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 1x V100 32G GPU. Performance numbers (in sentences per second) were averaged over the entire inference run.

| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
|---|---|-----|------|----|
| 1 | 8 | | | |

To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.

#### NVIDIA DGX-2 32G (1x V100 32G)

Our results were obtained by running the `scripts/run_squad_inference.sh` inference script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-2 with 1x V100 32G GPU. Performance numbers (in sentences per second) were averaged over the entire inference run.

| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
|---|---|-----|------|----|
| 1 | 8 | | | |

To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
## Changelog

March 2019
- Initial release

## Known issues

There are no known issues with this model.
13
PyTorch/LanguageModeling/BERT/bert_config.json
Normal file
@ -0,0 +1,13 @@
{
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 512,
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "type_vocab_size": 2,
  "vocab_size": 30522
}
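
For reference, this configuration file (matching the BERT-Large row in the README table above) is plain JSON and can be inspected directly; a small sketch:

```python
import json

with open("bert_config.json") as f:
    config = json.load(f)

# e.g. 24 hidden layers, hidden size 1024, 16 attention heads for BERT-Large.
print(config["num_hidden_layers"], config["hidden_size"], config["num_attention_heads"])
```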
472
PyTorch/LanguageModeling/BERT/create_pretraining_data.py
Normal file
@ -0,0 +1,472 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence prediction examples for BERT pretraining."""
from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import collections
import os
import random
from io import open

import h5py
import numpy as np
from tqdm import tqdm

import tokenization
from tokenization import BertTokenizer


class TrainingInstance(object):
    """A single training instance (sentence pair)."""

    def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
                 is_random_next):
        self.tokens = tokens
        self.segment_ids = segment_ids
        self.is_random_next = is_random_next
        self.masked_lm_positions = masked_lm_positions
        self.masked_lm_labels = masked_lm_labels

    def __str__(self):
        s = ""
        s += "tokens: %s\n" % (" ".join(
            [tokenization.printable_text(x) for x in self.tokens]))
        s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
        s += "is_random_next: %s\n" % self.is_random_next
        s += "masked_lm_positions: %s\n" % (" ".join(
            [str(x) for x in self.masked_lm_positions]))
        s += "masked_lm_labels: %s\n" % (" ".join(
            [tokenization.printable_text(x) for x in self.masked_lm_labels]))
        s += "\n"
        return s

    def __repr__(self):
        return self.__str__()

def write_instance_to_example_file(instances, tokenizer, max_seq_length,
                                   max_predictions_per_seq, output_file):
    """Write `TrainingInstance`s to an HDF5 file."""

    total_written = 0
    features = collections.OrderedDict()

    num_instances = len(instances)
    features["input_ids"] = np.zeros([num_instances, max_seq_length], dtype="int32")
    features["input_mask"] = np.zeros([num_instances, max_seq_length], dtype="int32")
    features["segment_ids"] = np.zeros([num_instances, max_seq_length], dtype="int32")
    features["masked_lm_positions"] = np.zeros([num_instances, max_predictions_per_seq], dtype="int32")
    features["masked_lm_ids"] = np.zeros([num_instances, max_predictions_per_seq], dtype="int32")
    features["next_sentence_labels"] = np.zeros(num_instances, dtype="int32")

    for inst_index, instance in enumerate(tqdm(instances)):
        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
        input_mask = [1] * len(input_ids)
        segment_ids = list(instance.segment_ids)
        assert len(input_ids) <= max_seq_length

        # Pad up to the maximum sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        masked_lm_positions = list(instance.masked_lm_positions)
        masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
        masked_lm_weights = [1.0] * len(masked_lm_ids)  # kept for parity with the TF version; unused below

        while len(masked_lm_positions) < max_predictions_per_seq:
            masked_lm_positions.append(0)
            masked_lm_ids.append(0)
            masked_lm_weights.append(0.0)

        next_sentence_label = 1 if instance.is_random_next else 0

        features["input_ids"][inst_index] = input_ids
        features["input_mask"][inst_index] = input_mask
        features["segment_ids"][inst_index] = segment_ids
        features["masked_lm_positions"][inst_index] = masked_lm_positions
        features["masked_lm_ids"][inst_index] = masked_lm_ids
        features["next_sentence_labels"][inst_index] = next_sentence_label

        total_written += 1

    print("saving data")
    f = h5py.File(output_file, 'w')
    f.create_dataset("input_ids", data=features["input_ids"], dtype='i4', compression='gzip')
    f.create_dataset("input_mask", data=features["input_mask"], dtype='i1', compression='gzip')
    f.create_dataset("segment_ids", data=features["segment_ids"], dtype='i1', compression='gzip')
    f.create_dataset("masked_lm_positions", data=features["masked_lm_positions"], dtype='i4', compression='gzip')
    f.create_dataset("masked_lm_ids", data=features["masked_lm_ids"], dtype='i4', compression='gzip')
    f.create_dataset("next_sentence_labels", data=features["next_sentence_labels"], dtype='i1', compression='gzip')
    f.flush()
    f.close()

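
# Illustrative helper (not part of the original script): shows how a shard
# written by write_instance_to_example_file above can be read back with h5py.
def _example_load_shard(path):
    with h5py.File(path, 'r') as f:
        input_ids = f["input_ids"][:]
        next_sentence_labels = f["next_sentence_labels"][:]
    print("instances: %d, sequence length: %d" % input_ids.shape)
    return input_ids, next_sentence_labels
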
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        print("creating instance from {}".format(input_file))
        with open(input_file, "r") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # Empty lines are used as document delimiters.
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents.
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, max_seq_length, short_seq_prob,
                    masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

    rng.shuffle(instances)
    return instances

def create_instances_from_document(
        all_documents, document_index, max_seq_length, short_seq_prob,
        masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
    """Creates `TrainingInstance`s for a single document."""
    document = all_documents[document_index]

    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3

    # We *usually* want to fill up the entire sequence since we are padding
    # to `max_seq_length` anyways, so short sequences are generally wasted
    # computation. However, we *sometimes*
    # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
    # sequences to minimize the mismatch between pre-training and fine-tuning.
    # The `target_seq_length` is just a rough target however, whereas
    # `max_seq_length` is a hard limit.
    target_seq_length = max_num_tokens
    if rng.random() < short_seq_prob:
        target_seq_length = rng.randint(2, max_num_tokens)

    # We DON'T just concatenate all of the tokens from a document into a long
    # sequence and choose an arbitrary split point because this would make the
    # next sentence prediction task too easy. Instead, we split the input into
    # segments "A" and "B" based on the actual "sentences" provided by the user
    # input.
    instances = []
    current_chunk = []
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(document) - 1 or current_length >= target_seq_length:
            if current_chunk:
                # `a_end` is how many segments from `current_chunk` go into the `A`
                # (first) sentence.
                a_end = 1
                if len(current_chunk) >= 2:
                    a_end = rng.randint(1, len(current_chunk) - 1)

                tokens_a = []
                for j in range(a_end):
                    tokens_a.extend(current_chunk[j])

                tokens_b = []
                # Random next
                is_random_next = False
                if len(current_chunk) == 1 or rng.random() < 0.5:
                    is_random_next = True
                    target_b_length = target_seq_length - len(tokens_a)

                    # This should rarely go for more than one iteration for large
                    # corpora. However, just to be careful, we try to make sure that
                    # the random document is not the same as the document
                    # we're processing.
                    for _ in range(10):
                        random_document_index = rng.randint(0, len(all_documents) - 1)
                        if random_document_index != document_index:
                            break

                    random_document = all_documents[random_document_index]
                    random_start = rng.randint(0, len(random_document) - 1)
                    for j in range(random_start, len(random_document)):
                        tokens_b.extend(random_document[j])
                        if len(tokens_b) >= target_b_length:
                            break
                    # We didn't actually use these segments so we "put them back" so
                    # they don't go to waste.
                    num_unused_segments = len(current_chunk) - a_end
                    i -= num_unused_segments
                # Actual next
                else:
                    is_random_next = False
                    for j in range(a_end, len(current_chunk)):
                        tokens_b.extend(current_chunk[j])
                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)

                assert len(tokens_a) >= 1
                assert len(tokens_b) >= 1

                tokens = []
                segment_ids = []
                tokens.append("[CLS]")
                segment_ids.append(0)
                for token in tokens_a:
                    tokens.append(token)
                    segment_ids.append(0)

                tokens.append("[SEP]")
                segment_ids.append(0)

                for token in tokens_b:
                    tokens.append(token)
                    segment_ids.append(1)
                tokens.append("[SEP]")
                segment_ids.append(1)

                (tokens, masked_lm_positions,
                 masked_lm_labels) = create_masked_lm_predictions(
                     tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
                instance = TrainingInstance(
                    tokens=tokens,
                    segment_ids=segment_ids,
                    is_random_next=is_random_next,
                    masked_lm_positions=masked_lm_positions,
                    masked_lm_labels=masked_lm_labels)
                instances.append(instance)
            current_chunk = []
            current_length = 0
        i += 1

    return instances


MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])

def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng):
    """Creates the predictions for the masked LM objective."""

    cand_indexes = []
    for (i, token) in enumerate(tokens):
        if token == "[CLS]" or token == "[SEP]":
            continue
        cand_indexes.append(i)

    rng.shuffle(cand_indexes)

    output_tokens = list(tokens)

    num_to_predict = min(max_predictions_per_seq,
                         max(1, int(round(len(tokens) * masked_lm_prob))))

    masked_lms = []
    covered_indexes = set()
    for index in cand_indexes:
        if len(masked_lms) >= num_to_predict:
            break
        if index in covered_indexes:
            continue
        covered_indexes.add(index)

        masked_token = None
        # 80% of the time, replace with [MASK]
        if rng.random() < 0.8:
            masked_token = "[MASK]"
        else:
            # 10% of the time, keep original
            if rng.random() < 0.5:
                masked_token = tokens[index]
            # 10% of the time, replace with random word
            else:
                masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]

        output_tokens[index] = masked_token

        masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))

    masked_lms = sorted(masked_lms, key=lambda x: x.index)

    masked_lm_positions = []
    masked_lm_labels = []
    for p in masked_lms:
        masked_lm_positions.append(p.index)
        masked_lm_labels.append(p.label)

    return (output_tokens, masked_lm_positions, masked_lm_labels)

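
# Illustrative only (not in the original script): a toy call demonstrating the
# 80/10/10 masking policy implemented above. With a real tokenizer, vocab_words
# comes from tokenizer.vocab.
def _example_masking():
    rng = random.Random(12345)
    tokens = ["[CLS]", "the", "quick", "brown", "fox", "[SEP]"]
    vocab_words = ["the", "quick", "brown", "fox", "jumps"]
    output_tokens, positions, labels = create_masked_lm_predictions(
        tokens, masked_lm_prob=0.15, max_predictions_per_seq=2,
        vocab_words=vocab_words, rng=rng)
    # Exactly one position is selected here (round(6 * 0.15) = 1); it is
    # replaced by [MASK] 80% of the time, kept 10%, or randomized 10%.
    print(output_tokens, positions, labels)
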
||||||
|
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
|
||||||
|
"""Truncates a pair of sequences to a maximum sequence length."""
|
||||||
|
while True:
|
||||||
|
total_length = len(tokens_a) + len(tokens_b)
|
||||||
|
if total_length <= max_num_tokens:
|
||||||
|
break
|
||||||
|
|
||||||
|
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
|
||||||
|
assert len(trunc_tokens) >= 1
|
||||||
|
|
||||||
|
# We want to sometimes truncate from the front and sometimes from the
|
||||||
|
# back to add more randomness and avoid biases.
|
||||||
|
if rng.random() < 0.5:
|
||||||
|
del trunc_tokens[0]
|
||||||
|
else:
|
||||||
|
trunc_tokens.pop()
|
||||||
|
|
||||||
|
|
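
# Illustrative sketch (not part of the original commit): truncate_seq_pair trims
# the longer list in place, one token at a time, randomly from the front or the
# back. The inputs are made up; nothing in this file calls the helper.
def _demo_truncation_sketch():
    import random
    a = ["tok%d" % i for i in range(6)]
    b = ["tok%d" % i for i in range(3)]
    truncate_seq_pair(a, b, max_num_tokens=7, rng=random.Random(0))
    # Two tokens were removed, both from `a` since it was longer at each step.
    assert len(a) + len(b) == 7
    return a, b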


def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--vocab_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The vocabulary the BERT model will train on.")
    parser.add_argument("--input_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus. Can be a directory with .txt files or a path to a single file.")
    parser.add_argument("--output_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The output file where the model checkpoints will be written.")

    ## Other parameters

    # str
    parser.add_argument("--bert_model", default="bert-large-uncased", type=str, required=False,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    # int
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--dupe_factor",
                        default=10,
                        type=int,
                        help="Number of times to duplicate the input data (with different masks).")
    parser.add_argument("--max_predictions_per_seq",
                        default=20,
                        type=int,
                        help="Maximum number of masked LM predictions per sequence.")

    # floats
    parser.add_argument("--masked_lm_prob",
                        default=0.15,
                        type=float,
                        help="Masked LM probability.")
    parser.add_argument("--short_seq_prob",
                        default=0.1,
                        type=float,
                        help="Probability to create a sequence shorter than maximum sequence length.")

    parser.add_argument("--do_lower_case",
                        action='store_true',
                        default=True,
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument('--random_seed',
                        type=int,
                        default=12345,
                        help="random seed for initialization")

    args = parser.parse_args()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    input_files = []
    if os.path.isfile(args.input_file):
        input_files.append(args.input_file)
    elif os.path.isdir(args.input_file):
        input_files = [os.path.join(args.input_file, f)
                       for f in os.listdir(args.input_file)
                       if (os.path.isfile(os.path.join(args.input_file, f)) and f.endswith('.txt'))]
    else:
        raise ValueError("{} is not a valid path".format(args.input_file))

    rng = random.Random(args.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, args.max_seq_length, args.dupe_factor,
        args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq,
        rng)

    output_file = args.output_file

    write_instance_to_example_file(instances, tokenizer, args.max_seq_length,
                                   args.max_predictions_per_seq, output_file)


if __name__ == "__main__":
    main()
30
PyTorch/LanguageModeling/BERT/data/README.md
Normal file

@@ -0,0 +1,30 @@
Steps to reproduce the datasets from the web

1) Build the container
   * docker build -t bert_prep .
2) Run the container interactively
   * nvidia-docker run -it --ipc=host bert_prep
   * Optional: Mount data volumes
     * -v yourpath:/workspace/bert/data/wikipedia_corpus/download
     * -v yourpath:/workspace/bert/data/wikipedia_corpus/extracted_articles
     * -v yourpath:/workspace/bert/data/wikipedia_corpus/raw_data
     * -v yourpath:/workspace/bert/data/wikipedia_corpus/intermediate_files
     * -v yourpath:/workspace/bert/data/wikipedia_corpus/final_text_file_single
     * -v yourpath:/workspace/bert/data/wikipedia_corpus/final_text_files_sharded
     * -v yourpath:/workspace/bert/data/wikipedia_corpus/final_tfrecords_sharded
     * -v yourpath:/workspace/bert/data/bookcorpus/download
     * -v yourpath:/workspace/bert/data/bookcorpus/final_text_file_single
     * -v yourpath:/workspace/bert/data/bookcorpus/final_text_files_sharded
     * -v yourpath:/workspace/bert/data/bookcorpus/final_tfrecords_sharded
   * Optional: Select visible GPUs
     * -e CUDA_VISIBLE_DEVICES=0

**Inside of the container starting here**
3) Download pretrained weights (they contain vocab files for preprocessing)
   * cd data/pretrained_models_google && python3 download_models.py
4) "One-click" Wikipedia data download and prep (provides tfrecords)
   * Set your configuration in data/wikipedia_corpus/config.sh
   * cd /data/wikipedia_corpus && ./run_preprocessing.sh
5) "One-click" BookCorpus data download and prep (provides tfrecords)
   * Set your configuration in data/wikipedia_corpus/config.sh
   * cd /data/bookcorpus && ./run_preprocessing.sh
@@ -0,0 +1,23 @@

# NVIDIA

import glob
import os
import argparse

parser = argparse.ArgumentParser(description='Clean and merge downloaded bookcorpus files')

parser.add_argument('download_path', type=str)
parser.add_argument('output_file', type=str)

args = parser.parse_args()

download_path = args.download_path
output_file = args.output_file

with open(output_file, "w") as ofile:
    for filename in glob.glob('{}/*.txt'.format(download_path), recursive=True):
        with open(filename, mode='r', encoding="utf-8-sig") as file:
            for line in file:
                if line.strip() != "":
                    ofile.write(line.strip() + " ")
        ofile.write("\n\n")
9
PyTorch/LanguageModeling/BERT/data/bookcorpus/download_bookcorpus.sh
Executable file

@@ -0,0 +1,9 @@

#! /bin/bash

# Download books
mkdir -p ./download
python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out ./download --trash-bad-count

# Clean and prep (one book per line)
python3 ./clean_and_merge_text.py ./download bookcorpus.txt
38
PyTorch/LanguageModeling/BERT/data/create_datasets_from_start.sh
Executable file

@@ -0,0 +1,38 @@

#!/bin/bash

# Note: There are several directories created to make it clear what has been performed at each stage of preprocessing. The intermediate files may be useful if you want to further clean/prepare/augment the data for your own applications.
# NLTK was chosen as the default over spaCy simply due to speed of sentence segmentation on the large files.

MERGED_DIR=$1
args="${*:2}"

source utils/config.sh

mkdir -p ${MERGED_DIR}

corpus_file=${MERGED_DIR}/corpus.txt
## Shuffle the full corpus texts (only when more than one input file is given)
if [ ! -z "$3" ]
then
  echo "Merging $args"
  cat $args | sed "/^$/d" | shuf > $corpus_file
else
  corpus_file=$2
fi

# Split articles into one-sentence-per-line format for use with BERT scripts
echo "Applying sentence segmentation to get one sentence per line"
mkdir -p ${MERGED_DIR}/final_text_file_single
python3 utils/sentence_segmentation_nltk.py $corpus_file ${MERGED_DIR}/final_text_file_single/corpus.segmented.nltk.txt

## Shard finalized text so that it has a chance of fitting in memory when creating pretraining data into hdf5 (choose appropriate number of shards for distributed training)
echo "Shard text files - size is approximate to prevent splitting an article across shards"
mkdir -p ${MERGED_DIR}/final_text_files_sharded
python3 utils/shard_text_input_file.py ${MERGED_DIR}/final_text_file_single/corpus.segmented.nltk.txt ${MERGED_DIR}/final_text_files_sharded/corpus.segmented.part.

# Convert sharded text files into hdf5 that are ready for BERT pretraining
echo "Creating hdf5 for each text shard"
mkdir -p ${MERGED_DIR}/hdf5_shards
export TARGET_DIR=${MERGED_DIR}
. utils/preprocessing_xargs_wrapper.sh ${N_PROCS_PREPROCESS}
7
PyTorch/LanguageModeling/BERT/data/glue/download_mrpc.sh
Executable file

@@ -0,0 +1,7 @@

#!/usr/bin/env bash

echo "Downloading MRPC data"

wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py

python download_glue_data.py --data_dir . --tasks MRPC
29
PyTorch/LanguageModeling/BERT/data/merge_datasets_after_creation.sh
Executable file

@@ -0,0 +1,29 @@

#!/bin/bash

MERGED_DIR=$1 # e.g. wikipedia+bookcorpus
INPUTFILES=$2 # directories with hdf5 files separated by comma
NUM_SHARDS=$3

source utils/config.sh

META_DIR=$MERGED_DIR/meta
mkdir -p ${MERGED_DIR}
mkdir -p ${META_DIR}

echo "create mixed dataset ids"
echo "python utils/create_mixed_dataset_ids.py --input_files=${INPUTFILES} --num_output_shards=${NUM_SHARDS} --output_dir=${META_DIR} --random_seed=${SEED}"
python utils/create_mixed_dataset_ids.py --input_files=${INPUTFILES} --num_output_shards=${NUM_SHARDS} --output_dir=${META_DIR} --random_seed=${SEED}

echo "Creating hdf5 for each text shard"
mkdir -p ${MERGED_DIR}/hdf5_shards
echo "create mixed datasets with hdf5 files"
echo "python utils/create_mixed_dataset.py --input_files=${INPUTFILES} --output_dir=${MERGED_DIR}/hdf5_shards --lookup=${META_DIR}/lookup_table.pkl --indices_dir=${META_DIR} --index_range=0-$((NUM_SHARDS-1)) --random_seed=${SEED}"
python utils/create_mixed_dataset.py --input_files=${INPUTFILES} --output_dir=${MERGED_DIR}/hdf5_shards --lookup=${META_DIR}/lookup_table.pkl --indices_dir=${META_DIR} --index_range=0-$((NUM_SHARDS-1)) --random_seed=${SEED}

rm -rf ${META_DIR}
60
PyTorch/LanguageModeling/BERT/data/squad/squad_download.sh
Executable file

@@ -0,0 +1,60 @@

#!/usr/bin/env bash

echo "Downloading dataset for squad..."

# Download SQuAD

v1="v1.1"
mkdir $v1
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $v1/train-v1.1.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $v1/dev-v1.1.json
wget https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/ -O $v1/evaluate-v1.1.py

EXP_TRAIN_v1='981b29407e0affa3b1b156f72073b945  -'
EXP_DEV_v1='3e85deb501d4e538b6bc56f786231552  -'
EXP_EVAL_v1='afb04912d18ff20696f7f88eed49bea9  -'
CALC_TRAIN_v1=`cat ${v1}/train-v1.1.json |md5sum`
CALC_DEV_v1=`cat ${v1}/dev-v1.1.json |md5sum`
CALC_EVAL_v1=`cat ${v1}/evaluate-v1.1.py |md5sum`

v2="v2.0"
mkdir $v2
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O $v2/train-v2.0.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O $v2/dev-v2.0.json
wget https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -O $v2/evaluate-v2.0.py

EXP_TRAIN_v2='62108c273c268d70893182d5cf8df740  -'
EXP_DEV_v2='246adae8b7002f8679c027697b0b7cf8  -'
EXP_EVAL_v2='ff23213bed5516ea4a6d9edb6cd7d627  -'

CALC_TRAIN_v2=`cat ${v2}/train-v2.0.json |md5sum`
CALC_DEV_v2=`cat ${v2}/dev-v2.0.json |md5sum`
CALC_EVAL_v2=`cat ${v2}/evaluate-v2.0.py |md5sum`

echo "Squad data download done!"

echo "Verifying Dataset...."

if [ "$EXP_TRAIN_v1" != "$CALC_TRAIN_v1" ]; then
    echo "train-v1.1.json is corrupted! md5sum doesn't match"
fi

if [ "$EXP_DEV_v1" != "$CALC_DEV_v1" ]; then
    echo "dev-v1.1.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_EVAL_v1" != "$CALC_EVAL_v1" ]; then
    echo "evaluate-v1.1.py is corrupted! md5sum doesn't match"
fi

if [ "$EXP_TRAIN_v2" != "$CALC_TRAIN_v2" ]; then
    echo "train-v2.0.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_DEV_v2" != "$CALC_DEV_v2" ]; then
    echo "dev-v2.0.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_EVAL_v2" != "$CALC_EVAL_v2" ]; then
    echo "evaluate-v2.0.py is corrupted! md5sum doesn't match"
fi

echo "Complete!"
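
For reference, the checksum comparison the script above performs with `md5sum` can be sketched in Python (a hypothetical helper, not part of the original commit; the expected digest would be one of the EXP_* values above without the trailing " -"):

import hashlib

def md5_matches(path, expected_hex):
    # Stream the file so large datasets do not need to fit in memory.
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            h.update(chunk)
    return h.hexdigest() == expected_hex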
24
PyTorch/LanguageModeling/BERT/data/utils/config.sh
Executable file

@@ -0,0 +1,24 @@

#! /bin/bash

set -e

USE_BERT_LARGE=true
MAX_SEQUENCE_LENGTH=512
MAX_PREDICTIONS_PER_SEQUENCE=80
MASKED_LM_PROB=0.15
SEED=12345
DUPE_FACTOR=5
DO_LOWER_CASE="True"
N_LINES_PER_SHARD_APPROX=396000 # Default=396000 creates 256 shards

N_PROCS_PREPROCESS=4 # Adjust this based on memory requirements and available number of cores

BERT_BASE_DIR="/workspace/bert/vocab/uncased_L-12_H-768_A-12"
BERT_LARGE_DIR="/workspace/bert/vocab/uncased_L-24_H-1024_A-16"

if [ "$USE_BERT_LARGE" = true ] ; then
  VOCAB_FILE="${BERT_LARGE_DIR}/vocab.txt"
else
  VOCAB_FILE="${BERT_BASE_DIR}/vocab.txt"
fi
160
PyTorch/LanguageModeling/BERT/data/utils/create_mixed_dataset.py
Normal file

@@ -0,0 +1,160 @@

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import logging
import os
import random
from io import open
import h5py
import numpy as np
from tqdm import tqdm, trange
import collections
import math
import multiprocessing as mp
import pickle

"""
mixing hdf5 shards with each other
"""


def shard_files(output_files, l_instance_ids, lookuptable, files):

    l_input_ids = []
    l_input_masks = []
    l_segment_ids = []
    l_masked_lm_positions = []
    l_masked_lm_ids = []
    l_next_sentence_labels = []

    seq_len = 0
    pred_len = 0
    with h5py.File(files[0], 'r') as f:
        seq_len = f['input_ids'].shape[1]
        pred_len = f['masked_lm_positions'].shape[1]

    assert(seq_len > 0 and pred_len > 0)
    for i, output_file in enumerate(output_files):
        output_length = len(l_instance_ids[i])
        print("preparing to write {} instances to {}".format(output_length, output_file))
        input_ids = np.ones([output_length, seq_len], dtype=np.int32)
        input_masks = np.ones([output_length, seq_len], dtype=np.int8)
        segment_ids = np.ones([output_length, seq_len], dtype=np.int8)
        masked_lm_positions = np.ones([output_length, pred_len], dtype=np.int32)
        masked_lm_ids = np.ones([output_length, pred_len], dtype=np.int32)
        next_sentence_labels = np.ones(output_length, dtype=np.int8)
        l_input_ids.append(input_ids)
        l_input_masks.append(input_masks)
        l_segment_ids.append(segment_ids)
        l_masked_lm_positions.append(masked_lm_positions)
        l_masked_lm_ids.append(masked_lm_ids)
        l_next_sentence_labels.append(next_sentence_labels)
    for did, f in enumerate(tqdm(files)):
        h5_f = h5py.File(f, 'r')
        f_input_ids = h5_f['input_ids'][:]
        f_input_masks = h5_f['input_mask'][:]
        f_segment_ids = h5_f['segment_ids'][:]
        f_masked_lm_positions = h5_f['masked_lm_positions'][:]
        f_masked_lm_ids = h5_f['masked_lm_ids'][:]
        f_next_sentence_labels = h5_f['next_sentence_labels'][:]
        h5_f.close()
        for out_i, out_file in enumerate(output_files):
            instance_ids = l_instance_ids[out_i]
            for l, idx in enumerate(instance_ids):
                doc_id, line_id = lookuptable[idx]
                if doc_id == did:
                    l_input_ids[out_i][l] = f_input_ids[line_id]
                    l_input_masks[out_i][l] = f_input_masks[line_id]
                    l_segment_ids[out_i][l] = f_segment_ids[line_id]
                    l_masked_lm_positions[out_i][l] = f_masked_lm_positions[line_id]
                    l_masked_lm_ids[out_i][l] = f_masked_lm_ids[line_id]
                    l_next_sentence_labels[out_i][l] = f_next_sentence_labels[line_id]
    for out_i, out_file in enumerate(output_files):
        output_length = len(l_input_ids[out_i])
        print("writing {} instances to {}".format(output_length, out_file))
        with h5py.File(out_file, 'w') as f:
            f.create_dataset("input_ids", data=l_input_ids[out_i], dtype='i4', compression='gzip')
            f.create_dataset("input_mask", data=l_input_masks[out_i], dtype='i1', compression='gzip')
            f.create_dataset("segment_ids", data=l_segment_ids[out_i], dtype='i1', compression='gzip')
            f.create_dataset("masked_lm_positions", data=l_masked_lm_positions[out_i], dtype='i4', compression='gzip')
            f.create_dataset("masked_lm_ids", data=l_masked_lm_ids[out_i], dtype='i4', compression='gzip')
            f.create_dataset("next_sentence_labels", data=l_next_sentence_labels[out_i], dtype='i1', compression='gzip')


def main():

    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--input_files",
                        default=None,
                        type=str,
                        required=True,
                        help="comma separated list of file paths, each path can be either file or directory of files")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="directory for output shards")
    parser.add_argument("--lookup",
                        default=None,
                        type=str,
                        required=True,
                        help="path to lookup table")
    parser.add_argument("--indices_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="path to shuffled instance indices")
    parser.add_argument("--index_range",
                        default=None,
                        type=str,
                        required=True,
                        help="index range of output files to be written out, e.g. specify '0-100' to write out 0.hdf5, ..., 100.hdf5")
    parser.add_argument('--random_seed',
                        type=int,
                        default=12345,
                        help="random seed for initialization")

    args = parser.parse_args()

    rng = random.Random(args.random_seed)
    np.random.seed(args.random_seed)

    input_paths = args.input_files.strip().split(',')
    input_paths = [f for f in input_paths if f]

    input_files = []
    for path in input_paths:
        if os.path.isfile(path):
            assert (path.endswith('.hdf5')), "file must be hdf5 file"
            input_files.append(path)
        else:
            assert os.path.isdir(path)
            hdf5_files = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith('.hdf5')]
            input_files.extend(hdf5_files)

    input_files.sort()
    assert(os.path.isdir(args.output_dir))

    print("loading indices file")
    start_idx, end_idx = int(args.index_range.split('-')[0]), int(args.index_range.split('-')[1])
    index_files = []
    instance_ids = []
    for i in range(start_idx, end_idx + 1):
        index_files.append(os.path.join(args.indices_dir, "indices_" + str(i) + ".npy"))
        instance_ids.append(np.load(index_files[-1]))

    output_files = [os.path.join(args.output_dir, indices_file.split('.')[0].split('_')[-1] + ".hdf5") for indices_file in index_files]
    print("output_files", output_files)

    print("loading lookup table")
    # The lookup table is written with pickle.dump (see create_mixed_dataset_ids.py),
    # so load it with pickle rather than np.load.
    with open(args.lookup, 'rb') as f:
        lookup_table = pickle.load(f)
    shard_files(output_files, instance_ids, lookup_table, input_files)


if __name__ == "__main__":
    main()
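
# Illustrative sketch (not part of the original commit): the data layout the
# two mixing utilities agree on, using made-up numbers. `lookup_table` maps a
# global instance id to a (shard_file_index, row_within_shard) pair, and each
# indices_<i>.npy holds the global ids destined for output shard i.
def _demo_lookup_layout_sketch():
    lookup_table = {0: (0, 0), 1: (0, 1), 2: (1, 0)}  # 3 instances over 2 input shards
    shard_0_ids = [2, 0]  # output shard 0 pulls instance 2, then instance 0
    rows = [lookup_table[idx] for idx in shard_0_ids]
    assert rows == [(1, 0), (0, 0)]
    return rows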
134
PyTorch/LanguageModeling/BERT/data/utils/create_mixed_dataset_ids.py
Normal file

@@ -0,0 +1,134 @@

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import logging
import os
import random
from io import open
import h5py
import numpy as np
from tqdm import tqdm, trange
import collections
import math
import multiprocessing as mp
import pickle
import json

"""
mixing hdf5 shards with each other
"""


def load_and_prepare(input_files, num_shards):

    seq_len = None
    pred_len = None

    input_lengths = []
    for input_file in input_files:
        with h5py.File(input_file, 'r') as f:
            input_lengths.append(len(f['input_ids']))
            if seq_len is None:
                seq_len = f['input_ids'].shape[1]
                pred_len = f['masked_lm_ids'].shape[1]

    assert (isinstance(seq_len, int) and isinstance(pred_len, int))

    total_instances = sum(input_lengths)
    n_inst_per_file = math.ceil(total_instances * 1.0 / num_shards)
    permutation = np.random.permutation(total_instances)

    instance_indices = []
    for i in range(0, num_shards):
        start_pos = i * n_inst_per_file
        end_pos = min((i + 1) * n_inst_per_file, total_instances)
        instance_indices.append(permutation[start_pos:end_pos])

    return seq_len, pred_len, input_lengths, instance_indices
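
# Illustrative sketch (not part of the original commit): exercises
# load_and_prepare on two tiny hand-made hdf5 shards in a temp directory to
# show the returned shapes. All sizes here are made up; nothing calls this.
def _demo_load_and_prepare_sketch():
    import tempfile
    tmp = tempfile.mkdtemp()
    paths = []
    for i, n_rows in enumerate([3, 2]):
        p = os.path.join(tmp, "%d.hdf5" % i)
        with h5py.File(p, 'w') as f:
            f.create_dataset("input_ids", data=np.zeros((n_rows, 8), dtype=np.int32))
            f.create_dataset("masked_lm_ids", data=np.zeros((n_rows, 2), dtype=np.int32))
        paths.append(p)
    seq_len, pred_len, lengths, idx = load_and_prepare(paths, num_shards=2)
    assert (seq_len, pred_len, lengths) == (8, 2, [3, 2])
    assert sum(len(s) for s in idx) == 5  # all 5 instances assigned to shards
    return idx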


def main():

    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--input_files",
                        default=None,
                        type=str,
                        required=True,
                        help="comma separated list of file paths, each path can be either file or directory of hdf5 files")
    parser.add_argument("--num_output_shards",
                        default=None,
                        type=int,
                        required=True,
                        help="number of shards to be created. shards will be created as even as possible.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="directory for meta files")
    parser.add_argument('--random_seed',
                        type=int,
                        default=12345,
                        help="random seed for initialization")

    args = parser.parse_args()

    rng = random.Random(args.random_seed)
    np.random.seed(args.random_seed)

    input_paths = args.input_files.strip().split(',')
    input_paths = [f for f in input_paths if f]

    input_files = []
    for path in input_paths:
        if os.path.isfile(path):
            assert (path.endswith('.hdf5')), "file must be hdf5 file"
            input_files.append(path)
        else:
            assert os.path.isdir(path)
            hdf5_files = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith('.hdf5')]
            input_files.extend(hdf5_files)
    input_files.sort()

    assert(os.path.isdir(args.output_dir))

    print("load and prepare")
    seq_len, pred_len, input_lengths, output_inst_indices = load_and_prepare(input_files, args.num_output_shards)
    print("preparing lookup table")
    total_num_instances = sum(input_lengths)
    out_2_in = dict()
    length_so_far = 0
    for i, l in enumerate(input_lengths):
        for j in range(l):
            out_2_in[length_so_far + j] = (i, j)
        length_so_far += input_lengths[i]

    output_files = [os.path.join(args.output_dir, "indices_" + str(i) + ".npy") for i in range(args.num_output_shards)]
    print("save data")

    with open(os.path.join(args.output_dir, 'lookup_table.pkl'), 'wb') as f:
        pickle.dump(out_2_in, f)

    for i, out_file in enumerate(output_files):
        np.save(out_file, output_inst_indices[i])

    meta = {'seq_len': seq_len, 'pred_len': pred_len}

    with open(os.path.join(args.output_dir, 'meta_data.pkl'), 'wb') as f:
        pickle.dump(meta, f)


if __name__ == "__main__":
    main()
23
PyTorch/LanguageModeling/BERT/data/utils/preprocessing.sh
Executable file

@@ -0,0 +1,23 @@

#! /bin/bash

SHARD_INDEX=${1}
INPUT_FILE="${TARGET_DIR}/final_text_files_sharded/corpus.segmented.part.${SHARD_INDEX}.txt"

source /workspace/bert/data/utils/config.sh

OUTPUT_DIR=${TARGET_DIR}/hdf5_shards
mkdir -p ${OUTPUT_DIR}

OUTPUT_FILE="${OUTPUT_DIR}/${SHARD_INDEX}.hdf5"

python /workspace/bert/create_pretraining_data.py \
  --input_file=${INPUT_FILE} \
  --output_file=${OUTPUT_FILE} \
  --vocab_file=${VOCAB_FILE} \
  --do_lower_case \
  --max_seq_length=${MAX_SEQUENCE_LENGTH} \
  --max_predictions_per_seq=${MAX_PREDICTIONS_PER_SEQUENCE} \
  --masked_lm_prob=${MASKED_LM_PROB} \
  --random_seed=${SEED} \
  --dupe_factor=${DUPE_FACTOR}
15
PyTorch/LanguageModeling/BERT/data/utils/preprocessing_xargs_wrapper.sh
Executable file

@@ -0,0 +1,15 @@

#! /bin/bash

source /workspace/bert/data/utils/config.sh

SHARD_COUNT=0
rm -rf ${TARGET_DIR}/xarg_list.txt
touch ${TARGET_DIR}/xarg_list.txt
for file in ${TARGET_DIR}/final_text_files_sharded/*; do
  echo ${SHARD_COUNT} >> ${TARGET_DIR}/xarg_list.txt
  SHARD_COUNT=$((SHARD_COUNT+1))
done

xargs -n 1 --max-procs=${N_PROCS_PREPROCESS} --arg-file=${TARGET_DIR}/xarg_list.txt /workspace/bert/data/utils/preprocessing.sh

rm ${TARGET_DIR}/xarg_list.txt
28
PyTorch/LanguageModeling/BERT/data/utils/sentence_segmentation_nltk.py
Normal file

@@ -0,0 +1,28 @@

# NVIDIA

import argparse
import nltk
import os

nltk.download('punkt')

parser = argparse.ArgumentParser(description='Sentence Segmentation')

parser.add_argument('input_file', type=str)
parser.add_argument('output_file', type=str)

args = parser.parse_args()

input_file = args.input_file
output_file = args.output_file

doc_separator = "\n"

with open(input_file) as ifile:
    with open(output_file, "w") as ofile:
        for line in ifile:
            if line != "\n":
                sent_list = nltk.tokenize.sent_tokenize(line)
                for sent in sent_list:
                    ofile.write(sent + "\n")
                ofile.write(doc_separator)
47
PyTorch/LanguageModeling/BERT/data/utils/shard_text_input_file.py
Normal file

@@ -0,0 +1,47 @@

# NVIDIA

import os
import argparse

parser = argparse.ArgumentParser(description='Dataset sharding')

parser.add_argument('input_file', type=str)
parser.add_argument('output_file', type=str)

args = parser.parse_args()

input_file = args.input_file
output_file = args.output_file

doc_separator = "\n"

line_buffer = []
shard_size = 396000  # Approximate, will split at next article break
line_counter = 0
shard_index = 0

ifile_lines = 0
with open(input_file) as ifile:
    for line in ifile:
        ifile_lines += 1

print("Input file contains", ifile_lines, "lines.")

iline_counter = 1
with open(input_file) as ifile:
    for line in ifile:
        if line_counter < shard_size and iline_counter < ifile_lines:
            line_buffer.append(line)
            line_counter += 1
            iline_counter += 1
        elif line_counter >= shard_size and line != "\n" and iline_counter < ifile_lines:
            line_buffer.append(line)
            line_counter += 1
            iline_counter += 1
        else:
            # Flush the buffer; keep the current (boundary or final) line so
            # the last line of the input is not dropped.
            line_buffer.append(line)
            with open(output_file + str(shard_index) + ".txt", "w") as ofile:
                for oline in line_buffer:
                    ofile.write(oline)
            line_buffer = []
            line_counter = 0
            shard_index += 1
30
PyTorch/LanguageModeling/BERT/data/wikipedia_corpus/download_wikipedia.sh
Executable file

@@ -0,0 +1,30 @@

#! /bin/bash

WIKI_DUMP="ftp://ftpmirror.your.org/pub/wikimedia/dumps/enwiki/20190301/enwiki-20190301-pages-articles-multistream.xml.bz2"
N_PROCS_PREPROCESS=4 # Adjust this based on memory requirements and available number of cores

# Download Wikipedia dump file
mkdir -p ./download

# Not using --noclobber since it emits an error if exists (incompatible with bash 'set -e')
echo "Downloading Wikidump"
if [ ! -f ./download/wikidump.xml.bz2 ]; then
    wget -O ./download/wikidump.xml.bz2 ${WIKI_DUMP}
fi

# Extract dump
echo "Extracting Wikidump"
mkdir -p ./raw_data
if [ ! -f ./raw_data/wikidump.xml ]; then
    pv ./download/wikidump.xml.bz2 | bunzip2 -kdc > ./raw_data/wikidump.xml
fi

# Wikiextractor.py - Creates lots of folders/files in "doc format"
echo "Running Wikiextractor"
mkdir -p ./extracted_articles
/workspace/wikiextractor/WikiExtractor.py ./raw_data/wikidump.xml -b 1000M --processes ${N_PROCS_PREPROCESS} -o ./extracted_articles

# Remove XML Tags and extraneous titles (since they are not sentences)
# Also clean to remove lines between paragraphs within article and use space-separated articles
echo "Cleaning and formatting files (one article per line)"
python3 ./remove_tags_and_clean.py ./extracted_articles ./wikipedia_corpus.txt
39
PyTorch/LanguageModeling/BERT/data/wikipedia_corpus/remove_tags_and_clean.py
Normal file

@@ -0,0 +1,39 @@

# NVIDIA

import glob
import os
import argparse

parser = argparse.ArgumentParser(description='Clean extracted Wikipedia articles and merge them into a single file')

parser.add_argument('extracted_articles_path', type=str)
parser.add_argument('output_file', type=str)

args = parser.parse_args()

extracted_articles_path = args.extracted_articles_path
output_file = args.output_file

with open(output_file, "w") as ofile:
    for dirname in glob.glob('{}/*/'.format(extracted_articles_path), recursive=False):
        for filename in glob.glob(dirname + 'wiki_*', recursive=True):
            print(filename)
            article_lines = []
            article_open = False

            with open(filename, "r") as file:
                for line in file:
                    if "<doc id=" in line:
                        article_open = True
                    elif "</doc>" in line:
                        article_open = False
                        for oline in article_lines[1:]:
                            if oline != "\n":
                                ofile.write(oline.rstrip() + " ")
                        ofile.write("\n\n")
                        article_lines = []
                    else:
                        if article_open:
                            article_lines.append(line)
297
PyTorch/LanguageModeling/BERT/extract_features.py
Normal file

@@ -0,0 +1,297 @@

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from a PyTorch BERT model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import collections
import logging
import json
import re

import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from tokenization import BertTokenizer
from modeling import BertModel

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)


class InputExample(object):

    def __init__(self, unique_id, text_a, text_b):
        self.unique_id = unique_id
        self.text_a = text_a
        self.text_b = text_b


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
        self.unique_id = unique_id
        self.tokens = tokens
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.input_type_ids = input_type_ids


def convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("unique_id: %s" % (example.unique_id))
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))

        features.append(
            InputFeatures(
                unique_id=example.unique_id,
                tokens=tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids))
    return features


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def read_examples(input_file):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    with open(input_file, "r", encoding='utf-8') as reader:
        while True:
            line = reader.readline()
            if not line:
                break
            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line)
            if m is None:
                text_a = line
            else:
                text_a = m.group(1)
                text_b = m.group(2)
            examples.append(
                InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
            unique_id += 1
    return examples
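
# Illustrative sketch (not part of the original commit): the input format that
# read_examples expects, one example per line with an optional " ||| " between
# the two sentences of a pair. The file contents below are made up; nothing in
# this file calls the helper.
def _demo_read_examples_sketch():
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False, encoding='utf-8') as f:
        f.write("single sentence example\n")
        f.write("first sentence ||| second sentence\n")
        path = f.name
    examples = read_examples(path)
    assert examples[0].text_b is None
    assert (examples[1].text_a, examples[1].text_b) == ("first sentence", "second sentence")
    return examples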


def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    ## Other parameters
    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                             "than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    examples = read_examples(args.input_file)

    features = convert_examples_to_features(
        examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model = BertModel.from_pretrained(args.bert_model)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    model.eval()
    with open(args.output_file, "w", encoding='utf-8') as writer:
        for input_ids, input_mask, example_indices in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)

            all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)

            for b, example_index in enumerate(example_indices):
                feature = features[example_index.item()]
                unique_id = int(feature.unique_id)
                # feature = unique_id_to_feature[unique_id]
                output_json = collections.OrderedDict()
                output_json["linex_index"] = unique_id
                all_out_features = []
                for (i, token) in enumerate(feature.tokens):
                    all_layers = []
                    for (j, layer_index) in enumerate(layer_indexes):
                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                        layer_output = layer_output[b]
                        layers = collections.OrderedDict()
                        layers["index"] = layer_index
                        layers["values"] = [
                            round(x.item(), 6) for x in layer_output[i]
                        ]
                        all_layers.append(layers)
                    out_features = collections.OrderedDict()
                    out_features["token"] = token
                    out_features["layers"] = all_layers
                    all_out_features.append(out_features)
                output_json["features"] = all_out_features
                writer.write(json.dumps(output_json) + "\n")


if __name__ == "__main__":
    main()
249
PyTorch/LanguageModeling/BERT/file_utils.py
Normal file

@@ -0,0 +1,249 @@

"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)

import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open

import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

try:
    from pathlib import Path
    PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                                   Path.home() / '.pytorch_pretrained_bert'))
except AttributeError:
    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                              os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


def url_to_filename(url, etag=None):
    """
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
    """
    url_bytes = url.encode('utf-8')
    url_hash = sha256(url_bytes)
    filename = url_hash.hexdigest()

    if etag:
        etag_bytes = etag.encode('utf-8')
        etag_hash = sha256(etag_bytes)
        filename += '.' + etag_hash.hexdigest()

    return filename
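
# Illustrative sketch (not part of the original commit): how the cache filename
# is derived by url_to_filename. The URL and etag values are made up; nothing
# in this file calls the helper.
def _demo_cache_filename_sketch():
    name = url_to_filename("https://example.com/bert-base-uncased.tar.gz", etag='"abc123"')
    # sha256(url) hex digest, a period, then sha256(etag) hex digest
    assert len(name) == 64 + 1 + 64 and "." in name
    return name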
||||||
|
|
||||||
|
def filename_to_url(filename, cache_dir=None):
|
||||||
|
"""
|
||||||
|
Return the url and etag (which may be ``None``) stored for `filename`.
|
||||||
|
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
|
||||||
|
"""
|
||||||
|
if cache_dir is None:
|
||||||
|
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
|
||||||
|
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||||
|
cache_dir = str(cache_dir)
|
||||||
|
|
||||||
|
cache_path = os.path.join(cache_dir, filename)
|
||||||
|
if not os.path.exists(cache_path):
|
||||||
|
raise EnvironmentError("file {} not found".format(cache_path))
|
||||||
|
|
||||||
|
meta_path = cache_path + '.json'
|
||||||
|
if not os.path.exists(meta_path):
|
||||||
|
raise EnvironmentError("file {} not found".format(meta_path))
|
||||||
|
|
||||||
|
with open(meta_path, encoding="utf-8") as meta_file:
|
||||||
|
metadata = json.load(meta_file)
|
||||||
|
url = metadata['url']
|
||||||
|
etag = metadata['etag']
|
||||||
|
|
||||||
|
return url, etag
|
||||||
|
|
||||||
|
|
||||||
|
def cached_path(url_or_filename, cache_dir=None):
|
||||||
|
"""
|
||||||
|
Given something that might be a URL (or might be a local path),
|
||||||
|
determine which. If it's a URL, download the file and cache it, and
|
||||||
|
return the path to the cached file. If it's already a local path,
|
||||||
|
make sure the file exists and then return the path.
|
||||||
|
"""
|
||||||
|
if cache_dir is None:
|
||||||
|
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
|
||||||
|
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
|
||||||
|
url_or_filename = str(url_or_filename)
|
||||||
|
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
|
||||||
|
cache_dir = str(cache_dir)
|
||||||
|
|
||||||
|
parsed = urlparse(url_or_filename)
|
||||||
|
|
||||||
|
if parsed.scheme in ('http', 'https', 's3'):
|
||||||
|
# URL, so get it from the cache (downloading if necessary)
|
||||||
|
return get_from_cache(url_or_filename, cache_dir)
|
||||||
|
elif os.path.exists(url_or_filename):
|
||||||
|
# File, and it exists.
|
||||||
|
return url_or_filename
|
||||||
|
elif parsed.scheme == '':
|
||||||
|
# File, but it doesn't exist.
|
||||||
|
raise EnvironmentError("file {} not found".format(url_or_filename))
|
||||||
|
else:
|
||||||
|
# Something unknown
|
||||||
|
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
|
||||||
|
|
||||||
|
|
||||||
|
def split_s3_path(url):
|
||||||
|
"""Split a full s3 path into the bucket name and path."""
|
||||||
|
parsed = urlparse(url)
|
||||||
|
if not parsed.netloc or not parsed.path:
|
||||||
|
raise ValueError("bad s3 path {}".format(url))
|
||||||
|
bucket_name = parsed.netloc
|
||||||
|
s3_path = parsed.path
    # Remove '/' at beginning of path.
    if s3_path.startswith("/"):
        s3_path = s3_path[1:]
    return bucket_name, s3_path


def s3_request(func):
    """
    Wrapper function for S3 requests in order to create more helpful error
    messages.
    """

    @wraps(func)
    def wrapper(url, *args, **kwargs):
        try:
            return func(url, *args, **kwargs)
        except ClientError as exc:
            if int(exc.response["Error"]["Code"]) == 404:
                raise EnvironmentError("file {} not found".format(url))
            else:
                raise

    return wrapper


@s3_request
def s3_etag(url):
    """Check ETag on S3 object."""
    s3_resource = boto3.resource("s3")
    bucket_name, s3_path = split_s3_path(url)
    s3_object = s3_resource.Object(bucket_name, s3_path)
    return s3_object.e_tag


@s3_request
def s3_get(url, temp_file):
    """Pull a file directly from S3."""
    s3_resource = boto3.resource("s3")
    bucket_name, s3_path = split_s3_path(url)
    s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)


def http_get(url, temp_file):
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = tqdm(unit="B", total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()


def get_from_cache(url, cache_dir=None):
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # Get eTag to add to filename, if it exists.
    if url.startswith("s3://"):
        etag = s3_etag(url)
    else:
        response = requests.head(url, allow_redirects=True)
        if response.status_code != 200:
            raise IOError("HEAD request failed for url {} with status code {}"
                          .format(url, response.status_code))
        etag = response.headers.get("ETag")

    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)

            # GET file object
            if url.startswith("s3://"):
                s3_get(url, temp_file)
            else:
                http_get(url, temp_file)

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            logger.info("creating metadata file for %s", cache_path)
            meta = {'url': url, 'etag': etag}
            meta_path = cache_path + '.json'
            with open(meta_path, 'w', encoding="utf-8") as meta_file:
                json.dump(meta, meta_file)

            logger.info("removing temp file %s", temp_file.name)

    return cache_path


def read_set_from_file(filename):
    '''
    Extract a de-duped collection (set) of text from a file.
    Expected file format is one item per line.
    '''
    collection = set()
    with open(filename, 'r', encoding='utf-8') as file_:
        for line in file_:
            collection.add(line.rstrip())
    return collection


def get_file_extension(path, dot=True, lower=True):
    ext = os.path.splitext(path)[1]
    ext = ext if dot else ext[1:]
    return ext.lower() if lower else ext
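
For orientation, a minimal usage sketch of the caching helper above (the URL is a hypothetical placeholder; the cache directory defaults to PYTORCH_PRETRAINED_BERT_CACHE defined earlier in file_utils.py):

    from file_utils import get_from_cache

    # First call downloads to a temp file and copies it into the cache;
    # later calls with the same URL and unchanged ETag return the cached path.
    local_path = get_from_cache("https://example.com/bert-base-uncased-vocab.txt")
    print(local_path)  # <cache_dir>/<hash-of-url>.<hash-of-etag>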
205
PyTorch/LanguageModeling/BERT/fused_adam_local.py
Normal file
@@ -0,0 +1,205 @@
import types
import importlib

import math
import torch


def warmup_cosine(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    # math.cos rather than torch.cos: x is a Python float here, not a tensor
    return 0.5 * (1.0 + math.cos(math.pi * x))


def warmup_constant(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0


def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0 - x


SCHEDULES = {
    'warmup_cosine': warmup_cosine,
    'warmup_constant': warmup_constant,
    'warmup_linear': warmup_linear,
}


class FusedAdamBert(torch.optim.Optimizer):

    """Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
    ``python setup.py install --cuda_ext --cpp_ext``.
    It has been proposed in `Adam: A Method for Stochastic Optimization`_.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            (default: False) NOT SUPPORTED in FusedAdam!
        eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
            adds eps to the bias-corrected second moment estimate before
            evaluating square root instead of adding it to the square root of
            second moment estimate as in the original paper. (default: False)
    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, warmup=-1, t_total=-1, bias_correction=False,
                 betas=(0.9, 0.999), schedule='warmup_linear',
                 eps=1e-6, eps_inside_sqrt=False, weight_decay=0.,
                 max_grad_norm=1.0, amsgrad=False):
        # The fused CUDA kernel ships with Apex; import it lazily so the module
        # can be inspected without a CUDA build.
        global fused_adam_cuda
        fused_adam_cuda = importlib.import_module("fused_adam_cuda")

        if amsgrad:
            raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
        defaults = dict(lr=lr, bias_correction=bias_correction,
                        betas=betas, eps=eps, weight_decay=weight_decay,
                        max_grad_norm=max_grad_norm)
        super(FusedAdamBert, self).__init__(params, defaults)
        self.eps_mode = 0 if eps_inside_sqrt else 1
        self.schedule = schedule
        self.t_total = t_total
        self.warmup = warmup

    def get_lr(self):
        lr = []
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                if len(state) == 0:
                    return [0]
                # schedule/t_total/warmup live on the instance, not in the
                # per-group defaults of this optimizer
                if self.t_total != -1:
                    schedule_fct = SCHEDULES[self.schedule]
                    lr_scheduled = group['lr'] * schedule_fct(state['step']/self.t_total, self.warmup)
                else:
                    lr_scheduled = group['lr']
                lr.append(lr_scheduled)
        return lr

    def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
            grads (list of tensors, optional): weight gradient to use for the
                optimizer update. If gradients have type torch.half, parameters
                are expected to be in type torch.float. (default: None)
            output_params (list of tensors, optional): A reduced precision copy
                of the updated weights written out in addition to the regular
                updated weights. Have to be of same type as gradients. (default: None)
            scale (float, optional): factor to divide gradient tensor values
                by before applying to weights. (default: 1)
        """
        loss = None
        if closure is not None:
            loss = closure()

        if grads is None:
            grads_group = [None]*len(self.param_groups)
        # backward compatibility
        # assuming a list/generator of parameter means single group
        elif isinstance(grads, types.GeneratorType):
            grads_group = [grads]
        elif type(grads[0]) != list:
            grads_group = [grads]
        else:
            grads_group = grads

        if output_params is None:
            output_params_group = [None]*len(self.param_groups)
        elif isinstance(output_params, types.GeneratorType):
            output_params_group = [output_params]
        elif type(output_params[0]) != list:
            output_params_group = [output_params]
        else:
            output_params_group = output_params

        if grad_norms is None:
            grad_norms = [None]*len(self.param_groups)

        # Compute global norm across all groups (assumes the caller supplies
        # per-group grad_norms; with the default None this reduction fails).
        global_norm = 0.0
        for group, grads_this_group, output_params_this_group, grad_norm in zip(
                self.param_groups, grads_group, output_params_group, grad_norms):
            global_norm = (global_norm ** 2 + grad_norm ** 2) ** 0.5

        for group, grads_this_group, output_params_this_group, grad_norm in zip(
                self.param_groups, grads_group, output_params_group, grad_norms):
            if grads_this_group is None:
                grads_this_group = [None]*len(group['params'])
            if output_params_this_group is None:
                output_params_this_group = [None]*len(group['params'])

            # compute combined scale factor for this group
            combined_scale = scale
            if group['max_grad_norm'] > 0:
                # norm is in fact norm*scale
                clip = ((global_norm / scale) + 1e-6) / group['max_grad_norm']
                if clip > 1:
                    combined_scale = clip * scale

            bias_correction = 1 if group['bias_correction'] else 0

            for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group):
                # note: p.grad should not ever be set for correct operation of
                # mixed precision optimizer that sometimes sends None gradients
                if p.grad is None and grad is None:
                    continue
                if grad is None:
                    grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                out_p = torch.tensor([], dtype=torch.float) if output_param is None else output_param
                fused_adam_cuda.adam(p.data,
                                     out_p,
                                     exp_avg,
                                     exp_avg_sq,
                                     grad,
                                     group['lr'],
                                     beta1,
                                     beta2,
                                     group['eps'],
                                     combined_scale,
                                     state['step'],
                                     self.eps_mode,
                                     bias_correction,
                                     group['weight_decay'])
        return loss
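
A minimal sketch of driving this optimizer (the toy model is hypothetical; it requires a CUDA build of Apex so that fused_adam_cuda is importable, and grad_norms is passed explicitly because the global-norm reduction above expects it):

    import torch
    from fused_adam_local import FusedAdamBert

    model = torch.nn.Linear(10, 2).cuda()  # stand-in for BERT
    opt = FusedAdamBert(model.parameters(), lr=1e-4, warmup=0.01, t_total=1000)

    loss = model(torch.randn(4, 10, device="cuda")).sum()
    loss.backward()
    # measure (not clip) the gradient norm, one entry per param group
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1e9)
    opt.step(grad_norms=[norm])
    opt.zero_grad()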
1249
PyTorch/LanguageModeling/BERT/modeling.py
Normal file
File diff suppressed because it is too large
218
PyTorch/LanguageModeling/BERT/optimization.py
Normal file
@@ -0,0 +1,218 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for BERT model."""

import math
import torch
from torch.optim import Optimizer
from torch.optim.optimizer import required
from torch.nn.utils import clip_grad_norm_
#from fused_adam_local import FusedAdam
from apex.optimizers import FusedAdam


def warmup_cosine(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    # math.cos rather than torch.cos: x is a Python float here, not a tensor
    return 0.5 * (1.0 + math.cos(math.pi * x))


def warmup_constant(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0


def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    # linear decay from 1.0 at the end of warmup down to 0.0 at x == 1
    return max((x - 1.) / (warmup - 1.), 0.)


SCHEDULES = {
    'warmup_cosine': warmup_cosine,
    'warmup_constant': warmup_constant,
    'warmup_linear': warmup_linear,
}


class BertAdam(Optimizer):
    """Implements BERT version of Adam algorithm with weight decay fix.
    Params:
        lr: learning rate
        warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
        t_total: total number of training steps for the learning
            rate schedule, -1 means constant learning rate. Default: -1
        schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
        b1: Adams b1. Default: 0.9
        b2: Adams b2. Default: 0.999
        e: Adams epsilon. Default: 1e-6
        weight_decay: Weight decay. Default: 0.01
        max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
    """
    def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
                 max_grad_norm=1.0):
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        if schedule not in SCHEDULES:
            raise ValueError("Invalid schedule parameter: {}".format(schedule))
        if not 0.0 <= warmup < 1.0 and not warmup == -1:
            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
        if not 0.0 <= b1 < 1.0:
            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
        if not 0.0 <= b2 < 1.0:
            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
        if not e >= 0.0:
            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
                        b1=b1, b2=b2, e=e, weight_decay=weight_decay,
                        max_grad_norm=max_grad_norm)
        super(BertAdam, self).__init__(params, defaults)

    def get_lr(self):
        lr = []
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                if len(state) == 0:
                    return [0]
                if group['t_total'] != -1:
                    schedule_fct = SCHEDULES[group['schedule']]
                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
                else:
                    lr_scheduled = group['lr']
                lr.append(lr_scheduled)
        return lr

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['next_m'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['next_v'] = torch.zeros_like(p.data)

                next_m, next_v = state['next_m'], state['next_v']
                beta1, beta2 = group['b1'], group['b2']

                # Add grad clipping
                if group['max_grad_norm'] > 0:
                    clip_grad_norm_(p, group['max_grad_norm'])

                # Decay the first and second moment running average coefficient
                # In-place operations to update the averages at the same time
                next_m.mul_(beta1).add_(1 - beta1, grad)
                next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
                update = next_m / (next_v.sqrt() + group['e'])

                # Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                if group['weight_decay'] > 0.0:
                    update += group['weight_decay'] * p.data

                if group['t_total'] != -1:
                    schedule_fct = SCHEDULES[group['schedule']]
                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
                else:
                    lr_scheduled = group['lr']

                update_with_lr = lr_scheduled * update
                p.data.add_(-update_with_lr)

                state['step'] += 1

                # No bias correction
                # bias_correction1 = 1 - beta1 ** state['step']
                # bias_correction2 = 1 - beta2 ** state['step']

        return loss


# =======================================================================
class BertAdam_FP16(FusedAdam):
    """Implements BERT version of Adam algorithm with weight decay fix.
    Params:
        lr: learning rate
        warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
        t_total: total number of training steps for the learning
            rate schedule, -1 means constant learning rate. Default: -1
        schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
        b1: Adams b1. Default: 0.9
        b2: Adams b2. Default: 0.999
        e: Adams epsilon. Default: 1e-6
        weight_decay: Weight decay. Default: 0.01
        max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
    """
    def __init__(self, params, lr, warmup=-1, t_total=-1, bias_correction=False, schedule='warmup_linear',
                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
                 max_grad_norm=1.0):
        if not lr >= 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        if schedule not in SCHEDULES:
            raise ValueError("Invalid schedule parameter: {}".format(schedule))
        if not 0.0 <= warmup < 1.0 and not warmup == -1:
            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
        if not 0.0 <= b1 < 1.0:
            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
        if not 0.0 <= b2 < 1.0:
            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
        if not e >= 0.0:
            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
        super(BertAdam_FP16, self).__init__(params, lr=lr, bias_correction=bias_correction,
                                            betas=(b1, b2), eps=e, weight_decay=weight_decay,
                                            max_grad_norm=max_grad_norm)
        # FusedAdam's param-group defaults do not carry the schedule fields,
        # so keep them on the instance for get_lr().
        self.schedule = schedule
        self.t_total = t_total
        self.warmup = warmup

    def get_lr(self):
        lr = []
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                if len(state) == 0:
                    return [0]
                if self.t_total != -1:
                    schedule_fct = SCHEDULES[self.schedule]
                    lr_scheduled = group['lr'] * schedule_fct(state['step']/self.t_total, self.warmup)
                else:
                    lr_scheduled = group['lr']
                lr.append(lr_scheduled)
        return lr
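
To make the schedule concrete, a small sketch (the printed values follow directly from warmup_linear above; the optimizer line assumes a model variable defined elsewhere):

    from optimization import BertAdam, SCHEDULES

    warmup_linear = SCHEDULES['warmup_linear']
    # progress x = step / t_total; with warmup=0.1 the factor ramps 0 -> 1
    # over the first 10% of training, then decays linearly back to 0.
    print(warmup_linear(0.05, warmup=0.1))   # 0.5 (halfway through warmup)
    print(warmup_linear(0.10, warmup=0.1))   # 1.0 (warmup just finished)
    print(warmup_linear(0.55, warmup=0.1))   # 0.5 (halfway through the decay)

    optimizer = BertAdam(model.parameters(), lr=5e-5, warmup=0.1, t_total=1000)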
13
PyTorch/LanguageModeling/BERT/requirements.txt
Normal file
@@ -0,0 +1,13 @@
# progress bars in model download and training scripts
tqdm
# Accessing files from S3 directly.
boto3
# Used for downloading models over HTTP
requests
six
ipdb
# Data processing
h5py
html2text
nltk
progressbar
649
PyTorch/LanguageModeling/BERT/run_glue.py
Normal file
@@ -0,0 +1,649 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""

from __future__ import absolute_import, division, print_function

import argparse
import csv
import logging
import os
import random
import sys

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from tokenization import BertTokenizer
from optimization import BertAdam, warmup_linear

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
                sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
                Must be specified only for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines


class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, i)
            text_a = line[3]
            text_b = line[4]
            label = line[0]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples


class MnliProcessor(DataProcessor):
    """Processor for the MultiNLI data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
            "dev_matched")

    def get_labels(self):
        """See base class."""
        return ["contradiction", "entailment", "neutral"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, line[0])
            text_a = line[8]
            text_b = line[9]
            label = line[-1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples


class ColaProcessor(DataProcessor):
    """Processor for the CoLA data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[3]
            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
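
A quick sketch of what a processor yields (the data_dir path is hypothetical and must contain the GLUE MRPC TSV files):

    processor = MrpcProcessor()
    examples = processor.get_dev_examples("/workspace/glue/MRPC")  # hypothetical path
    ex = examples[0]
    print(ex.guid, ex.label)            # e.g. "dev-1 1"
    print(ex.text_a, "||", ex.text_b)   # the two sentences to compare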


def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0 0 0 0 0 0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label_map[example.label]
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label_id))
    return features
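
To illustrate the layout described in the comments above, a toy conversion (assumes the bert-base-uncased vocabulary is available to BertTokenizer; the sentences are arbitrary):

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    features = convert_examples_to_features(
        [InputExample(guid="toy-1", text_a="is this jacksonville ?",
                      text_b="no it is not .", label="0")],
        label_list=["0", "1"], max_seq_length=16, tokenizer=tokenizer)

    f = features[0]
    print(f.input_ids)    # [CLS] + tokens_a + [SEP] + tokens_b + [SEP], zero-padded to 16
    print(f.segment_ids)  # 0s over the first segment, 1s over the second, 0s over padding
    print(f.input_mask)   # 1 for real tokens, 0 for padding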


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)
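
Both helpers are easy to sanity-check in isolation (run in the context of this module, where np is imported):

    a, b = list("abcdef"), list("gh")
    _truncate_seq_pair(a, b, 5)
    print(a, b)  # ['a', 'b', 'c'] ['g', 'h'] - the longer list lost tokens

    logits = np.array([[0.1, 0.9], [0.8, 0.2]])
    print(accuracy(logits, np.array([1, 1])))  # 1 correct out of 2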


def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        required=True,
                        help="The checkpoint file from pretraining")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1.0, type=float,
                        help="Total number of training steps to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
    }

    num_labels_task = {
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        print("WARNING: Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          cache_dir=cache_dir,
                                                          num_labels=num_labels)
    model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if args.max_steps > 0 and global_step > args.max_steps:
                    break
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        model = BertForSequenceClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    else:
        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
        model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss/nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))


if __name__ == "__main__":
    main()
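
The entry point can also be driven programmatically for quick experiments; a minimal sketch (all paths and the checkpoint name are hypothetical, the flags are the ones defined above):

    import sys
    from run_glue import main

    # Equivalent to: python run_glue.py --task_name mrpc --data_dir ... --do_train ...
    sys.argv = ["run_glue.py",
                "--task_name", "mrpc",
                "--data_dir", "/workspace/glue/MRPC",
                "--bert_model", "bert-large-uncased",
                "--init_checkpoint", "/workspace/checkpoints/bert_large_pretrained.pt",
                "--output_dir", "/results/mrpc",
                "--do_train", "--do_eval", "--do_lower_case", "--fp16"]
    main()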
417
PyTorch/LanguageModeling/BERT/run_pretraining.py
Normal file
@@ -0,0 +1,417 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT pretraining runner."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import csv
import os
import logging
import argparse
import random
import math

import h5py
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from apex import amp

from tokenization import BertTokenizer
from modeling import BertForPreTraining, BertConfig
from optimization import BertAdam, BertAdam_FP16

from file_utils import PYTORCH_PRETRAINED_BERT_CACHE

from apex.optimizers import FusedAdam
from apex.parallel import DistributedDataParallel as DDP
from schedulers import LinearWarmUpScheduler

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


class pretraining_dataset(Dataset):

    def __init__(self, input_file, max_pred_length):
        self.input_file = input_file
        self.max_pred_length = max_pred_length
        f = h5py.File(input_file, "r")
        self.input_ids = np.asarray(f["input_ids"][:]).astype(np.int64)  # [num_instances x max_seq_length]
        self.input_masks = np.asarray(f["input_mask"][:]).astype(np.int64)  # [num_instances x max_seq_length]
        self.segment_ids = np.asarray(f["segment_ids"][:]).astype(np.int64)  # [num_instances x max_seq_length]
        self.masked_lm_positions = np.asarray(f["masked_lm_positions"][:]).astype(np.int64)  # [num_instances x max_pred_length]
        self.masked_lm_ids = np.asarray(f["masked_lm_ids"][:]).astype(np.int64)  # [num_instances x max_pred_length]
        self.next_sentence_labels = np.asarray(f["next_sentence_labels"][:]).astype(np.int64)  # [num_instances]
        f.close()

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.input_ids)

    def __getitem__(self, index):

        input_ids = torch.from_numpy(self.input_ids[index])  # [max_seq_length]
        input_mask = torch.from_numpy(self.input_masks[index])  # [max_seq_length]
        segment_ids = torch.from_numpy(self.segment_ids[index])  # [max_seq_length]
        masked_lm_positions = torch.from_numpy(self.masked_lm_positions[index])  # [max_pred_length]
        masked_lm_ids = torch.from_numpy(self.masked_lm_ids[index])  # [max_pred_length]
        next_sentence_labels = torch.from_numpy(np.asarray(self.next_sentence_labels[index]))  # [1]

        masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
        index = self.max_pred_length
        # store number of masked tokens in index
        if len((masked_lm_positions == 0).nonzero()) != 0:
            index = (masked_lm_positions == 0).nonzero()[0].item()
        masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]

        return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels]
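
A minimal sketch of consuming this dataset (the .hdf5 shard path is hypothetical; such files come from the data-preparation scripts that accompany this model):

    from torch.utils.data import DataLoader, RandomSampler

    train_data = pretraining_dataset("/workspace/data/wiki_shard_0.hdf5",  # hypothetical shard
                                     max_pred_length=80)
    loader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=32)

    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = next(iter(loader))
    # masked_lm_labels is -1 everywhere except the masked positions, which hold
    # the original token ids the model must predict.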
|
||||||
|
|
||||||
|
def main():

    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--input_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain .hdf5 files for the task.")
    parser.add_argument("--config_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The BERT model config")
    parser.add_argument("--bert_model", default="bert-large-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=512,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--max_predictions_per_seq",
                        default=80,
                        type=int,
                        help="The maximum total number of masked tokens per input sequence")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps",
                        default=1000,
                        type=float,
                        help="Total number of training steps to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.01,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0.0,
                        help='Loss scaling; positive powers of 2 can improve fp16 convergence. 0 means dynamic loss scaling.')
    parser.add_argument('--log_freq',
                        type=float, default=10.0,
                        help='frequency of logging loss.')
    parser.add_argument('--checkpoint_activations',
                        default=False,
                        action='store_true',
                        help="Whether to use gradient checkpointing")
    parser.add_argument("--resume_from_checkpoint",
                        default=False,
                        action='store_true',
                        help="Whether to resume training from checkpoint.")
    parser.add_argument('--resume_step',
                        type=int,
                        default=-1,
                        help="Step to resume training from.")
    parser.add_argument('--num_steps_per_checkpoint',
                        type=int,
                        default=2000,
                        help="Number of update steps until a model checkpoint is saved to disk.")

    args = parser.parse_args()
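    # One process per GPU when launched with torch.distributed; each process
    # pins itself to its local GPU below and joins the NCCL process group.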
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    assert(torch.cuda.is_available())

    if args.local_rank == -1:
        device = torch.device("cuda")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    if args.train_batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format(
            args.gradient_accumulation_steps, args.train_batch_size))

    # Per-step micro-batch size; the effective batch size stays at the requested value.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (
            os.listdir(args.output_dir) and os.listdir(args.output_dir) != ['logfile.txt']):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))

    if not args.resume_from_checkpoint:
        os.makedirs(args.output_dir, exist_ok=True)
    # Prepare model
    config = BertConfig.from_json_file(args.config_file)
    model = BertForPreTraining(config)

    if not args.resume_from_checkpoint:
        global_step = 0
    else:
        if args.resume_step == -1:
            # Pick the most recent checkpoint in the output directory.
            model_names = [f for f in os.listdir(args.output_dir) if f.endswith(".pt")]
            args.resume_step = max([int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names])

        global_step = args.resume_step

        checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu")
        model.load_state_dict(checkpoint['model'], strict=False)

        print("resume step from ", args.resume_step)

    model.to(device)
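    # Biases and LayerNorm parameters are conventionally excluded from weight decay.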
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    if args.fp16:
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              weight_decay=0.01,
                              max_grad_norm=1.0)

        if args.loss_scale == 0:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale="dynamic")
        else:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale)

        # Warmup is handled by the external scheduler rather than the fused optimizer.
        scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=args.max_steps)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=args.max_steps)

    if args.resume_from_checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])

    if args.local_rank != -1:
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
    files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
             if os.path.isfile(os.path.join(args.input_dir, f))]
    files.sort()

    num_files = len(files)

    logger.info("***** Running training *****")
    logger.info("  Batch size = %d", args.train_batch_size)
    print("  LR = ", args.learning_rate)

    model.train()
    print("Training. . .")

    most_recent_ckpts_paths = []

    tr_loss = 0.0  # total accumulated training loss
    average_loss = 0.0  # averaged loss every args.log_freq steps
    epoch = 0
    training_steps = 0
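    # Outer loop: epochs over the shard list. Inner loops: one DataLoader per
    # HDF5 shard, then batches within that shard. Training stops at max_steps.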
    while True:
        if not args.resume_from_checkpoint:
            random.shuffle(files)
            f_start_id = 0
        else:
            # Restore the shard ordering recorded in the checkpoint.
            f_start_id = checkpoint['files'][0]
            files = checkpoint['files'][1:]
            args.resume_from_checkpoint = False
        for f_id in range(f_start_id, len(files)):
            data_file = files[f_id]
            logger.info("file no %s file %s" % (f_id, data_file))
            train_data = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)

            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
                train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * n_gpu,
                                              num_workers=4, pin_memory=True)
            else:
                train_sampler = DistributedSampler(train_data)
                train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size,
                                              num_workers=4, pin_memory=True)
            for step, batch in enumerate(tqdm(train_dataloader, desc="File Iteration")):

                training_steps += 1
                batch = [t.to(device) for t in batch]
                input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                             masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels,
                             checkpoint_activations=args.checkpoint_activations)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                # detach() so the running total does not retain autograd state across steps
                tr_loss += loss.detach()
                average_loss += loss.item()

                if training_steps % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        scheduler.step()
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if training_steps == 1 * args.gradient_accumulation_steps:
                    logger.info("Step:{} Average Loss = {} Step Loss = {} LR {}".format(
                        global_step, average_loss, loss.item(), optimizer.param_groups[0]['lr']))

                if training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
                    logger.info("Step:{} Average Loss = {} Step Loss = {} LR {}".format(
                        global_step, average_loss / args.log_freq, loss.item(), optimizer.param_groups[0]['lr']))
                    average_loss = 0

                if global_step >= args.max_steps or training_steps == 1 * args.gradient_accumulation_steps or \
                        training_steps % (args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0:
                    if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)):
                        # Save a trained model (rank 0 only)
                        logger.info("** ** * Saving fine-tuned model ** ** * ")
                        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                        output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))

                        torch.save({'model': model_to_save.state_dict(),
                                    'optimizer': optimizer.state_dict(),
                                    'files': [f_id] + files}, output_save_file)

                        # Keep only the three most recent checkpoints on disk.
                        most_recent_ckpts_paths.append(output_save_file)
                        if len(most_recent_ckpts_paths) > 3:
                            ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
                            os.remove(ckpt_to_be_removed)

                if global_step >= args.max_steps:
                    tr_loss = tr_loss * args.gradient_accumulation_steps / training_steps
                    if (torch.distributed.is_initialized()):
                        tr_loss /= torch.distributed.get_world_size()
                        torch.distributed.all_reduce(tr_loss)
                    logger.info("Total Steps:{} Final Loss = {}".format(training_steps, tr_loss.item()))
                    return
            del train_dataloader
            del train_sampler
            del train_data

            torch.cuda.empty_cache()
        epoch += 1
if __name__ == "__main__":
    main()
300
PyTorch/LanguageModeling/BERT/run_pretraining_inference.py
Normal file
@ -0,0 +1,300 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT pretraining inference runner."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

#==================
import csv
import os
import logging
import argparse
import random
import h5py
from tqdm import tqdm, trange
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from torch.utils.data.distributed import DistributedSampler
import math
import time

from tokenization import BertTokenizer
from modeling import BertForPreTraining, BertConfig

from file_utils import PYTORCH_PRETRAINED_BERT_CACHE

from apex.parallel import DistributedDataParallel as DDP
import torch.distributed as dist

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)
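# Same shard-reading Dataset as in run_pretraining.py, duplicated here so the
# inference script stays self-contained.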
class pretraining_dataset(Dataset):

    def __init__(self, input_file, max_pred_length):
        self.input_file = input_file
        self.max_pred_length = max_pred_length
        f = h5py.File(input_file, "r")
        self.input_ids = np.asarray(f["input_ids"][:]).astype(np.int64)  # [num_instances x max_seq_length]
        self.input_masks = np.asarray(f["input_mask"][:]).astype(np.int64)  # [num_instances x max_seq_length]
        self.segment_ids = np.asarray(f["segment_ids"][:]).astype(np.int64)  # [num_instances x max_seq_length]
        self.masked_lm_positions = np.asarray(f["masked_lm_positions"][:]).astype(np.int64)  # [num_instances x max_pred_length]
        self.masked_lm_ids = np.asarray(f["masked_lm_ids"][:]).astype(np.int64)  # [num_instances x max_pred_length]
        self.next_sentence_labels = np.asarray(f["next_sentence_labels"][:]).astype(np.int64)  # [num_instances]
        f.close()

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.input_ids)

    def __getitem__(self, index):
        input_ids = torch.from_numpy(self.input_ids[index])  # [max_seq_length]
        input_mask = torch.from_numpy(self.input_masks[index])  # [max_seq_length]
        segment_ids = torch.from_numpy(self.segment_ids[index])  # [max_seq_length]
        masked_lm_positions = torch.from_numpy(self.masked_lm_positions[index])  # [max_pred_length]
        masked_lm_ids = torch.from_numpy(self.masked_lm_ids[index])  # [max_pred_length]
        next_sentence_labels = torch.from_numpy(np.asarray(self.next_sentence_labels[index]))  # [1]

        masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
        index = self.max_pred_length
        # store number of masked tokens in index
        if len((masked_lm_positions == 0).nonzero()) != 0:
            index = (masked_lm_positions == 0).nonzero()[0].item()
        masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]

        return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels]
def main():

    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--input_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain .hdf5 files for the task.")
    parser.add_argument("--config_file",
                        default="bert_config.json",
                        type=str,
                        required=False,
                        help="The BERT model config")
    parser.add_argument("--ckpt_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The ckpt directory, e.g. /results")

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--eval', dest='do_eval', action='store_true')
    group.add_argument('--prediction', dest='do_eval', action='store_false')
    ## Other parameters
    parser.add_argument("--bert_model", default="bert-large-uncased", type=str, required=False,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--max_seq_length",
                        default=512,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--max_predictions_per_seq",
                        default=80,
                        type=int,
                        help="The maximum total number of masked tokens per input sequence")
    parser.add_argument("--ckpt_step",
                        default=-1,
                        type=int,
                        required=False,
                        help="The model checkpoint iteration, e.g. 1000")

    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for evaluation.")
    parser.add_argument("--max_steps",
                        default=-1,
                        type=int,
                        help="Total number of eval steps to perform, otherwise use full dataset")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")

    args = parser.parse_args()
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        assert(args.local_rank != -1)  # only use torch.distributed for multi-gpu
    logger.info("device %s n_gpu %d distributed inference %r", device, n_gpu, bool(args.local_rank != -1))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    # Prepare model
    config = BertConfig.from_json_file(args.config_file)
    model = BertForPreTraining(config)

    if args.ckpt_step == -1:
        # retrieve latest model
        model_names = [f for f in os.listdir(args.ckpt_dir) if f.endswith(".pt")]
        args.ckpt_step = max([int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names])
    print("load model saved at iteration", args.ckpt_step)
    model_file = os.path.join(args.ckpt_dir, "ckpt_" + str(args.ckpt_step) + ".pt")
    # run_pretraining.py saves {'model': ..., 'optimizer': ..., 'files': ...},
    # so load the 'model' entry rather than the whole checkpoint dict.
    state_dict = torch.load(model_file, map_location="cpu")
    model.load_state_dict(state_dict['model'], strict=False)

    if args.fp16:
        model.half()  # all parameters and buffers are converted to half precision
    model.to(device)

    multi_gpu_training = args.local_rank != -1 and torch.distributed.is_initialized()
    if multi_gpu_training:
        model = DDP(model)
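    # Inference only: model.half() plus the no_grad context below is sufficient
    # for FP16 here; no loss scaling or optimizer wrapping is needed.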
    files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
             if os.path.isfile(os.path.join(args.input_dir, f))]
    files.sort()

    logger.info("***** Running evaluation *****")
    logger.info("  Batch size = %d", args.eval_batch_size)

    model.eval()
    print("Evaluation. . .")

    nb_instances = 0
    max_steps = args.max_steps if args.max_steps > 0 else np.inf
    global_step = 0
    with torch.no_grad():
        if args.do_eval:
            final_loss = 0.0
            for data_file in files:
                logger.info("file %s" % (data_file))
                dataset = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
                if not multi_gpu_training:
                    train_sampler = RandomSampler(dataset)
                    datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size,
                                               num_workers=4, pin_memory=True)
                else:
                    train_sampler = DistributedSampler(dataset)
                    datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size,
                                               num_workers=4, pin_memory=True)
                for step, batch in enumerate(tqdm(datasetloader, desc="Iteration")):
                    if global_step > max_steps:
                        break

                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                                 masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels)
                    final_loss += loss

                    global_step += 1

                torch.cuda.empty_cache()
                if global_step > max_steps:
                    break
            # Average over steps, then across ranks.
            final_loss /= global_step
            if multi_gpu_training:
                final_loss /= torch.distributed.get_world_size()
                dist.all_reduce(final_loss)
            if (not multi_gpu_training or (multi_gpu_training and torch.distributed.get_rank() == 0)):
                logger.info("Finished: Final Loss = {}".format(final_loss))

        else:  # inference
            for data_file in files:
                logger.info("file %s" % (data_file))
                dataset = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
                if not multi_gpu_training:
                    train_sampler = RandomSampler(dataset)
                    datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size,
                                               num_workers=4, pin_memory=True)
                else:
                    train_sampler = DistributedSampler(dataset)
                    datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size,
                                               num_workers=4, pin_memory=True)
                for step, batch in enumerate(tqdm(datasetloader, desc="Iteration")):
                    if global_step > max_steps:
                        break

                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch

                    lm_logits, nsp_logits = model(input_ids=input_ids, token_type_ids=segment_ids,
                                                  attention_mask=input_mask, masked_lm_labels=None,
                                                  next_sentence_label=None)

                    nb_instances += input_ids.size(0)

                    global_step += 1
                torch.cuda.empty_cache()
                if global_step > max_steps:
                    break
            if (not multi_gpu_training or (multi_gpu_training and torch.distributed.get_rank() == 0)):
                logger.info("Finished")


if __name__ == "__main__":
    main()
1143
PyTorch/LanguageModeling/BERT/run_squad.py
Normal file
File diff suppressed because it is too large
561
PyTorch/LanguageModeling/BERT/run_swag.py
Normal file
@ -0,0 +1,561 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""

import argparse
import csv
import logging
import os
import random
import sys
from io import open

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from modeling import BertForMultipleChoice, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from optimization import BertAdam, warmup_linear
from tokenization import BertTokenizer

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)
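# One SWAG example: a context sentence, a shared ending stem, and four
# candidate endings; `label` is the index of the correct ending.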
class SwagExample(object):
    """A single training/test example for the SWAG dataset."""
    def __init__(self,
                 swag_id,
                 context_sentence,
                 start_ending,
                 ending_0,
                 ending_1,
                 ending_2,
                 ending_3,
                 label=None):
        self.swag_id = swag_id
        self.context_sentence = context_sentence
        self.start_ending = start_ending
        self.endings = [
            ending_0,
            ending_1,
            ending_2,
            ending_3,
        ]
        self.label = label

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        l = [
            "swag_id: {}".format(self.swag_id),
            "context_sentence: {}".format(self.context_sentence),
            "start_ending: {}".format(self.start_ending),
            "ending_0: {}".format(self.endings[0]),
            "ending_1: {}".format(self.endings[1]),
            "ending_2: {}".format(self.endings[2]),
            "ending_3: {}".format(self.endings[3]),
        ]

        if self.label is not None:
            l.append("label: {}".format(self.label))

        return ", ".join(l)
class InputFeatures(object):
    def __init__(self,
                 example_id,
                 choices_features,
                 label):
        self.example_id = example_id
        self.choices_features = [
            {
                'input_ids': input_ids,
                'input_mask': input_mask,
                'segment_ids': segment_ids
            }
            for _, input_ids, input_mask, segment_ids in choices_features
        ]
        self.label = label
def read_swag_examples(input_file, is_training):
    with open(input_file, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        lines = []
        for line in reader:
            if sys.version_info[0] == 2:
                line = list(unicode(cell, 'utf-8') for cell in line)
            lines.append(line)

    if is_training and lines[0][-1] != 'label':
        raise ValueError(
            "For training, the input file must contain a label column."
        )

    examples = [
        SwagExample(
            swag_id=line[2],
            context_sentence=line[4],
            start_ending=line[5],  # in the swag dataset, the
                                   # common beginning of each
                                   # choice is stored in "sent2".
            ending_0=line[7],
            ending_1=line[8],
            ending_2=line[9],
            ending_3=line[10],
            label=int(line[11]) if is_training else None
        ) for line in lines[1:]  # we skip the line with the column names
    ]

    return examples
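# Column layout assumed above (SWAG csv): 2=id, 4=sent1 (context),
# 5=sent2 (ending stem), 7..10=the four endings, 11=gold label.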
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 is_training):
    """Loads a data file into a list of `InputBatch`s."""

    # Swag is a multiple choice task. To perform this task using Bert,
    # we will use the formatting proposed in "Improving Language
    # Understanding by Generative Pre-Training" and suggested by
    # @jacobdevlin-google in this issue
    # https://github.com/google-research/bert/issues/38.
    #
    # Each choice will correspond to a sample on which we run the
    # inference. For a given Swag example, we will create the 4
    # following inputs:
    # - [CLS] context [SEP] choice_1 [SEP]
    # - [CLS] context [SEP] choice_2 [SEP]
    # - [CLS] context [SEP] choice_3 [SEP]
    # - [CLS] context [SEP] choice_4 [SEP]
    # The model will output a single value for each input. To get the
    # final decision of the model, we will run a softmax over these 4
    # outputs.
    features = []
    for example_index, example in enumerate(examples):
        context_tokens = tokenizer.tokenize(example.context_sentence)
        start_ending_tokens = tokenizer.tokenize(example.start_ending)

        choices_features = []
        for ending_index, ending in enumerate(example.endings):
            # We create a copy of the context tokens in order to be
            # able to shrink it according to ending_tokens
            context_tokens_choice = context_tokens[:]
            ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
            # Modifies `context_tokens_choice` and `ending_tokens` in
            # place so that the total length is less than the
            # specified length. Account for [CLS], [SEP], [SEP] with
            # "- 3"
            _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)

            tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
            segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding = [0] * (max_seq_length - len(input_ids))
            input_ids += padding
            input_mask += padding
            segment_ids += padding

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            choices_features.append((tokens, input_ids, input_mask, segment_ids))

        label = example.label
        if example_index < 5:
            logger.info("*** Example ***")
            logger.info("swag_id: {}".format(example.swag_id))
            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
                logger.info("choice: {}".format(choice_idx))
                logger.info("tokens: {}".format(' '.join(tokens)))
                logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
                logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
                logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
            if is_training:
                logger.info("label: {}".format(label))

        features.append(
            InputFeatures(
                example_id=example.swag_id,
                choices_features=choices_features,
                label=label
            )
        )

    return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


def select_field(features, field):
    return [
        [
            choice[field]
            for choice in feature.choices_features
        ]
        for feature in features
    ]
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint",
                        default=None,
                        type=str,
                        required=True,
                        help="The checkpoint file from pretraining")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1.0, type=float,
                        help="Total number of training steps to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("WARNING: Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
    # Prepare model
    model = BertForMultipleChoice.from_pretrained(args.bert_model,
                                                  cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE,
                                                                         'distributed_{}'.format(args.local_rank)),
                                                  num_choices=4)
    model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used,
    # since it produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)
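    # With FP16_Optimizer, warmup is applied manually inside the training loop
    # (see warmup_linear below); BertAdam handles it internally otherwise.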
    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                # Terminate early for benchmarking
                if args.max_steps > 0 and global_step > args.max_steps:
                    break

                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_optimization_steps,
                                                                          args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        model = BertForMultipleChoice(config, num_choices=4)
        model.load_state_dict(torch.load(output_model_file))
    else:
        model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
        model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
    model.to(device)
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training=True)
        eval_features = convert_examples_to_features(
            eval_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  # tr_loss only exists when training ran in this invocation
                  'loss': tr_loss / nb_tr_steps if args.do_train else None}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))


if __name__ == "__main__":
    main()
92
PyTorch/LanguageModeling/BERT/schedulers.py
Normal file
@ -0,0 +1,92 @@
import math
import torch
from torch.optim.optimizer import Optimizer
from apex.optimizers import FP16_Optimizer
from torch.optim.lr_scheduler import _LRScheduler
class LRScheduler(_LRScheduler):
    def __init__(self, optimizer, last_epoch=-1):
        # Check if using mixed precision training
        self.mixed_training = False
        base_optimizer = optimizer
        if isinstance(optimizer, FP16_Optimizer):
            self.mixed_training = True
            self.fp16_optimizer = optimizer
            base_optimizer = optimizer.optimizer
        # Check that optimizer param is valid
        elif not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))

        super(LRScheduler, self).__init__(base_optimizer, last_epoch)

    def step(self, epoch=None):
        # Set the current training step
        # ('epoch' is used to be consistent with _LRScheduler)
        if self.mixed_training:
            # The assumption is that the step will be constant
            state_dict = self.optimizer.state[self.optimizer.param_groups[0]['params'][0]]
            if 'step' in state_dict:
                self.last_epoch = state_dict['step'] + 1
            else:
                self.last_epoch = 1
        else:
            self.last_epoch = epoch if epoch is not None else self.last_epoch + 1

        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr
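# The three schedulers below share the same linear warmup phase (LR rises from
# 0 to the base LR over the first `warmup` fraction of `total_steps`) and
# differ only in the decay applied afterwards.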
class CosineWarmupScheduler(LRScheduler):
    """
    Applies a warm up period to the learning rate.
    """

    def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
        self.warmup = warmup
        self.total_steps = total_steps
        super(CosineWarmupScheduler, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        progress = self.last_epoch / self.total_steps
        if progress < self.warmup:
            return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
        else:
            # Half-cosine decay from the base LR toward zero.
            return [base_lr * (0.5 * (1.0 + math.cos(math.pi * progress))) for base_lr in self.base_lrs]
class ConstantWarmupScheduler(LRScheduler):
    """
    Applies a warm up period to the learning rate.
    """

    def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
        self.warmup = warmup
        self.total_steps = total_steps
        super(ConstantWarmupScheduler, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        progress = self.last_epoch / self.total_steps
        if progress < self.warmup:
            return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
        else:
            # Hold the base LR constant after warmup.
            return self.base_lrs
class LinearWarmUpScheduler(LRScheduler):
    """
    Applies a warm up period to the learning rate.
    """

    def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
        self.warmup = warmup
        self.total_steps = total_steps
        super(LinearWarmUpScheduler, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        progress = self.last_epoch / self.total_steps
        if progress < self.warmup:
            return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
        else:
            # Linear decay from the base LR at the end of warmup to zero at total_steps.
            return [base_lr * max((progress - 1.0) / (self.warmup - 1.0), 0.) for base_lr in self.base_lrs]
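# A minimal usage sketch (hypothetical step count), mirroring how
# run_pretraining.py wires the scheduler in its fp16 path:
#
#   scheduler = LinearWarmUpScheduler(optimizer, warmup=0.01, total_steps=1000)
#   for _ in range(1000):
#       loss.backward()        # or amp.scale_loss(...) in mixed precision
#       scheduler.step()
#       optimizer.step()
#       optimizer.zero_grad()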
38
PyTorch/LanguageModeling/BERT/scripts/data_download.sh
Executable file
@ -0,0 +1,38 @@
#!/usr/bin/env bash
DATA_DIR=${1:-/workspace/bert/data}
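# Usage: bash scripts/data_download.sh [DATA_DIR]
# (run from the repository root; DATA_DIR defaults to /workspace/bert/data)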
# Check running from repository root
|
||||||
|
if [ ! -d .git ]; then
|
||||||
|
echo "Not running from repository root! Exiting."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Download vocab files from pretrained model
|
||||||
|
cd vocab && python3 download_models.py && rm *.zip && rm ./*/*.ckpt.*
|
||||||
|
|
||||||
|
# Download SQUAD
|
||||||
|
cd $DATA_DIR/squad && . squad_download.sh
|
||||||
|
|
||||||
|
# Download SWAG
|
||||||
|
git clone https://github.com/rowanz/swagaf.git $DATA_DIR/swag
|
||||||
|
|
||||||
|
# Download GLUE
|
||||||
|
cd $DATA_DIR/glue && . download_mrpc.sh
|
||||||
|
|
||||||
|
# WIKI Download
|
||||||
|
cd $DATA_DIR/wikipedia_corpus && . download_wikipedia.sh
|
||||||
|
|
||||||
|
# Bookcorpus Download
|
||||||
|
cd $DATA_DIR/bookcorpus && . download_bookcorpus.sh
|
||||||
|
|
||||||
|
cd $DATA_DIR
|
||||||
|
# Create HDF5 files for WIKI
|
||||||
|
bash create_datasets_from_start.sh wikipedia_corpus ./wikipedia_corpus/wikipedia_corpus.txt \
|
||||||
|
&& rm -r ./wikipedia_corpus/final_* \
|
||||||
|
|
||||||
|
# Create HDF5 files for Bookcorpus
|
||||||
|
bash create_datasets_from_start.sh bookcorpus ./bookcorpus/bookcorpus.txt \
|
||||||
|
&& rm -r ./bookcorpus/final_* \
|
||||||
|
|
||||||
|
# Create HDF5 files for inter sequence-pair mixed Wikipedia and Bookcorpus
|
||||||
|
bash merge_datasets_after_creation.sh merged_wiki+books wikipedia_corpus/hdf5_shards,bookcorpus/hdf5_shards 1024
|
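A typical invocation, run from the repository root (the .git check above enforces this); the data directory argument is optional and defaults to /workspace/bert/data:

    bash scripts/data_download.sh /workspace/bert/data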
9
PyTorch/LanguageModeling/BERT/scripts/docker/build.sh
Normal file
@@ -0,0 +1,9 @@
#!/bin/bash

# Check running from repository root
if [ ! -d .git ]; then
  echo "Not running from repository root! Exiting."
  exit 1
fi

docker build . --rm -t bert
23
PyTorch/LanguageModeling/BERT/scripts/docker/launch.sh
Normal file
@@ -0,0 +1,23 @@
#!/bin/bash

# Check running from repository root
if [ ! -d .git ]; then
  echo "Not running from repository root! Exiting."
  exit 1
fi

DATA_DIR=${1:-"/mnt/dldata/bert"}
VOCAB_DIR=${2:-"/mnt/dldata/bert/vocab"}
CHECKPOINT_DIR=${3:-"/mnt/dldata/bert/pretrained_models_nvidia_pytorch"}

docker run -it --rm \
  --runtime=nvidia \
  -p 8888:8888 \
  --shm-size=1g \
  --ulimit memlock=-1 \
  --ulimit stack=67108864 \
  -v $DATA_DIR:/workspace/bert/data \
  -v $CHECKPOINT_DIR:/workspace/checkpoints \
  -v $VOCAB_DIR:/workspace/bert/vocab \
  -v $PWD/results:/results \
  bert bash
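The two docker helper scripts are typically used back to back; the host paths below simply restate the defaults above:

    bash scripts/docker/build.sh
    bash scripts/docker/launch.sh /mnt/dldata/bert /mnt/dldata/bert/vocab /mnt/dldata/bert/pretrained_models_nvidia_pytorch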
184
PyTorch/LanguageModeling/BERT/scripts/run.sh
Executable file
@@ -0,0 +1,184 @@
#!/bin/bash
#SBATCH -p mlperf                 # partition
#SBATCH -N 1                      # number of nodes
#SBATCH -t 12:00:00               # wall time
#SBATCH -J image_classification   # job name
#SBATCH --exclusive               # exclusive node access
#SBATCH --mem=0                   # all mem avail
#SBATCH --mail-type=FAIL          # only send email on failure
#SBATCH --ntasks-per-node=8       # n tasks per machine (one task per gpu)
#SBATCH --threads-per-core=2      # HT is on
#SBATCH --cores-per-socket=20     # 20 cores on each socket
#SBATCH --overcommit

hostname
#DGXIBDEVICES=$(eval ls /dev/infiniband/ | tr " " "\n" | awk '{printf "--device=/dev/infiniband/%s ",$1}' | sed s'/.$//')
printf "DGXIBDEVICES=%s\n" "$DGXIBDEVICES"
printf "VOLS=%s\n" "$VOLS"
printf "EXTRA_PARAMS=%s\n" "$EXTRA_PARAMS"

cd $CODEDIR

VOLS+=" -v $CHKPTDIR/$SLURM_JOB_ID:/checkpoints"

mkdir -p $CHKPTDIR/$SLURM_JOB_ID

## DO NOT CHANGE ANYTHING BELOW -- DL params are in run_and_time.sh and config_<system>.sh files

DEBUG=1 # 1 = Print verbose messages for debugging

## Pre-warming the containers ##
hosts=( `scontrol show hostname | tr "\n" " "` )
pids=(); rets=(); for hostn in ${hosts[@]}; do
  timeout -k 600s 600s \
    srun -N 1 -n 1 -w $hostn \
    docker pull $CONT &
  pids+=($!); rets+=($?);
done
wait "${pids[@]}"
success=0; for s in ${rets[@]}; do ((success+=s)); done ; if [ $success -ne 0 ]; then echo "ERR: Container pull failed"; exit $success ; fi

IBDEVICES=${IBDEVICES:-$DGXIBDEVICES}

## Check whether we are running in a slurm env
INSLURM=1
if [[ -z "$SLURM_JOB_ID" ]]; then
  INSLURM=0
  export SLURM_JOB_ID="${DATESTAMP}"
  export SLURM_NNODES=1
fi
if [[ -z "$SLURM_JOB_ID" || $SLURM_NNODES -eq 1 ]]; then
  # don't need IB if not multi-node
  export IBDEVICES=""
fi

# Create results directory
LOGFILE_BASE="${LOGDIR}/${DATESTAMP}"
mkdir -p $(dirname "${LOGFILE_BASE}")

export CONTNAME="${SLURM_JOB_ID}"
export DOCKEREXEC="nvidia-docker run --rm --net=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined $IBDEVICES"
MPICMD="mpirun -np $((SLURM_NNODES*DGXNGPU)) -x EXTRA_PARAMS=\"${EXTRA_PARAMS}\" -x NCCL_LL_THRESHOLD=0 -x NCCL_DEBUG=INFO -x NCCL_NET_GDR_READ=1 -x NCCL_SOCKET_IFNAME=^docker0,bond0,lo $BIND ./run_pretraining.sh"
echo $MPICMD

mkdir -m 777 -p $LOGDIR
echo $MPICMD | tee -a $LOGDIR/$DATESTAMP.log
echo "slurm job id" $SLURM_JOB_ID >> $LOGDIR/$DATESTAMP.log

MASTER_IP=`getent hosts \`hostname\` | cut -d ' ' -f1`
SSH=''
SRUN=''
if [[ $INSLURM -eq 0 ]]; then
  export hosts=( `hostname` )
else
  export hosts=( `scontrol show hostname | tr "\n" " "` )
  SSH='ssh -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $hostn'
  SRUN='srun -N 1 -n 1 -w $hostn'
fi
unique_hosts=( $(echo "${hosts[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' ' ) )
export MASTER_HOST=${hosts[0]}

VARS="-e OMPI_MCA_mca_base_param_files=/dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf -e EXTRA_PARAMS -e GPUS -e BATCHSIZE -e CONT -e DGXSYSTEM=$DGXSYSTEM -e MASTER_HOST -e MASTER_IP -e SLURM_JOB_NUM_NODES -e SLURM_NNODES -e SLURM_NTASKS_PER_NODE -w /workspace/bert"

RUNSLEEPCMD=""

[[ "${PULL}" -eq "1" ]] && docker pull $CONT

## Setting up MPI
# MPI support files - in /dev/shm/mpi/<jobid>
# 1. Copy user keys to /dev/shm/mpi/<jobid>
# 2. Create mca_params.conf
# 3. Create sshentry.sh to support launching into containers on worker nodes
# 4. Create mpi_hosts file
# 5. Copy standard ssh

if [[ $SLURM_NNODES -ne "1" ]]; then

  # Make keys and copy
  echo

  [[ $DEBUG == 1 ]] && echo "Setting up ssh keys and config"

  mkdir -p ${HOME}/.ssh/sbatch/${SLURM_JOB_ID}
  ssh-keygen -t rsa -b 2048 -N "" -f "${HOME}/.ssh/sbatch/${SLURM_JOB_ID}/sshkey.rsa" -C "mxnet_${SLURM_JOB_ID}_" &>/dev/null
  echo command=no-port-forwarding,no-agent-forwarding,no-X11-forwarding $(cat ${HOME}/.ssh/sbatch/${SLURM_JOB_ID}/sshkey.rsa.pub) >> ${HOME}/.ssh/authorized_keys
  chmod 600 ~/.ssh/authorized_keys

  [[ $DEBUG == 1 ]] && echo "Copy keys: srun -n $SLURM_JOB_NUM_NODES && cp -R ${HOME}/.ssh/sbatch/${SLURM_JOB_ID} /dev/shm/mpi && chmod 700 /dev/shm/mpi/${SLURM_JOB_ID}"

  srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 bash -c "mkdir -p /dev/shm/mpi/${SLURM_JOB_ID}; cp -R ${HOME}/.ssh/sbatch/${SLURM_JOB_ID} /dev/shm/mpi; chmod 700 /dev/shm/mpi/${SLURM_JOB_ID}"

  sleep 2 # Making copy

  [[ $DEBUG == 1 ]] && ls /dev/shm

  # Create mpi config file
  srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 tee /dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf <<EOF
plm_rsh_agent = /usr/bin/ssh
plm_rsh_args = -i /dev/shm/mpi/${SLURM_JOB_ID}/sshkey.rsa -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -oLogLevel=ERROR -l ${USER}
orte_default_hostfile = /dev/shm/mpi/${SLURM_JOB_ID}/mpi_hosts
btl_openib_warn_default_gid_prefix = 0
mpi_warn_on_fork = 0
allow_run_as_root = 1
EOF

  [[ $DEBUG == 1 ]] && echo "::mca_params.conf=" && cat /dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf

  # Create ssh helper script that transfers an ssh into a compute node into the running container on that node
  srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 tee /dev/shm/mpi/${SLURM_JOB_ID}/sshentry.sh <<EOF
#!/bin/bash
echo "::sshentry: entered \$(hostname)"
[[ -f $CONTNAME ]] && echo "::worker container not found error" && exit 1
echo "::sshentry: running \$SSH_ORIGINAL_COMMAND"
exec docker exec $CONTNAME /bin/bash -c "\$SSH_ORIGINAL_COMMAND"
EOF

  [[ $DEBUG == 1 ]] && echo "::sshentry=" && cat /dev/shm/mpi/${SLURM_JOB_ID}/sshentry.sh

  # Create mpi hostlist
  for h in ${hosts[@]}; do
    echo "$h slots=${SLURM_NTASKS_PER_NODE}" >> /dev/shm/mpi/${SLURM_JOB_ID}/mpi_hosts
  done

  [[ $DEBUG == 1 ]] && echo '::mpi-host file=' && cat /dev/shm/mpi/${SLURM_JOB_ID}/mpi_hosts

  srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 bash -c "cp $(which ssh) /dev/shm/mpi/${SLURM_JOB_ID}/.; chmod 755 /dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf; chmod 755 /dev/shm/mpi/${SLURM_JOB_ID}/sshentry.sh"

  # Check that ssh/mpi dir has correct number of files
  [[ $(ls /dev/shm/mpi/${SLURM_JOB_ID} | wc -w) -lt 5 ]] && echo "ERR: /dev/shm/mpi/${SLURM_JOB_ID} doesn't exist or missing ssh/mpi files" && exit 1

fi

# Container launch
if [[ $INSLURM -eq 1 ]]; then

  # Launch containers behind srun

  [[ $DEBUG == 1 ]] && echo "" && echo ":Launch containers: srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 $DOCKEREXEC --name $CONTNAME $VOLS $VARS $CONT bash -c 'sleep infinity'"
  srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 $DOCKEREXEC --name $CONTNAME $VOLS $VARS $CONT bash -c 'sleep infinity' & rv=$?
else
  $DOCKEREXEC --name $CONTNAME $VOLS $VARS $CONT bash -c 'sleep infinity' & rv=$?
fi
[[ $rv -ne 0 ]] && echo "ERR: Launch sleep containers failed." && exit $rv
echo "sleep 60 while we pull our container, good golly!"
sleep 60

# Run benchmarks
echo "sleep again for 20"
sleep 20
export EXTRA_PARAMS

(
  # Launching app
  echo
  echo "Launching user script on master node:"
  hostn=$MASTER_HOST
  $(eval echo $SSH) docker exec $VARS $CONTNAME $MPICMD ; rv=$?
  [[ $rv -ne 0 ]] && echo "ERR: User script failed." && exit $rv
) |& tee ${LOGFILE_BASE}_$nrun.log

# Clean up (note: on SLURM we skip this, as the epilogue will take care of it)
if [[ $INSLURM -eq 0 ]]; then
  docker rm -f $CONTNAME
fi
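run.sh is an sbatch script configured entirely through environment variables; a hand-rolled submission would need to export at least the variables it reads (the values below are illustrative placeholders, not defaults from this commit):

    export CODEDIR=$PWD CHKPTDIR=/raid/checkpoints LOGDIR=$PWD/results
    export CONT=nvcr.io/nvidia/pytorch:19.05-py3 DGXNGPU=8 DGXSYSTEM=DGX1
    export DATESTAMP=`date +'%y%m%d%H%M%S'`
    sbatch -N 1 scripts/run.sh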
63
PyTorch/LanguageModeling/BERT/scripts/run_glue.sh
Executable file
@@ -0,0 +1,63 @@
#!/bin/bash

MRPC_DIR=/workspace/bert/data/glue/MRPC
OUT_DIR=/results/MRPC

mkdir -p $OUT_DIR

echo "Container nvidia build = " $NVIDIA_BUILD_ID

init_checkpoint=${1}
mode=${2:-"train"}
max_steps=${3:-"-1.0"} # if < 0, has no effect
batch_size=${4:-"12"}
learning_rate=${5:-"5e-6"}
precision=${6:-"fp32"}
num_gpu=${7:-"8"}
epochs=${8:-"2"}

if [ "$mode" != "train" ] ; then
  num_gpu=1
fi

use_fp16=""
if [ "$precision" = "fp16" ] ; then
  echo "fp16 activated!"
  use_fp16="--fp16"
fi

if [ "$num_gpu" = "1" ] ; then
  mpi_command=""
else
  mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu"
fi

CMD="python $mpi_command run_glue.py "
CMD+="--task_name MRPC "
if [ "$mode" = "train" ] ; then
  CMD+="--do_train "
  CMD+="--train_batch_size=$batch_size "
else
  CMD+="--do_eval "
  CMD+="--eval_batch_size=$batch_size "
fi
CMD+="--do_lower_case "
CMD+="--data_dir $MRPC_DIR "
CMD+="--bert_model bert-large-uncased "
CMD+="--init_checkpoint $init_checkpoint "
CMD+="--max_seq_length 128 "
CMD+="--learning_rate $learning_rate "
CMD+="--num_train_epochs $epochs "
CMD+="--max_steps $max_steps "
CMD+="--output_dir $OUT_DIR "
CMD+="$use_fp16"

LOGFILE=$OUT_DIR/logfile
$CMD |& tee $LOGFILE

# Replace carriage returns and ANSI cursor-up escapes with newlines
sed -r 's/\r|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit

throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`

echo "throughput: $throughput"
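Example invocation: fine-tune on MRPC from a checkpoint in fp16 on 8 GPUs for 2 epochs (the checkpoint path is a placeholder; the remaining arguments follow the positional order defined above):

    bash scripts/run_glue.sh /workspace/checkpoints/bert_uncased.pt train -1.0 12 5e-6 fp16 8 2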
152
PyTorch/LanguageModeling/BERT/scripts/run_pretraining.sh
Normal file
@@ -0,0 +1,152 @@
#!/bin/bash

echo "Container nvidia build = " $NVIDIA_BUILD_ID

DATASET=wikipedia_corpus # change this for other datasets

DATA_DIR=data/${DATASET}/hdf5_shards/
BERT_CONFIG=bert_config.json
RESULTS_DIR=/results
CHECKPOINTS_DIR=/results/checkpoints

mkdir -p $CHECKPOINTS_DIR

if [ ! -d "$DATA_DIR" ] ; then
  echo "Warning! $DATA_DIR directory missing. Training cannot start"
fi
if [ ! -d "$RESULTS_DIR" ] ; then
  echo "Error! $RESULTS_DIR directory missing."
  exit -1
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
  echo "Warning! $CHECKPOINTS_DIR directory missing."
  echo "Checkpoints will be written to $RESULTS_DIR instead."
  CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
  echo "Error! BERT large configuration file not found at $BERT_CONFIG"
  exit -1
fi

train_batch_size=${1:-14}
learning_rate=${2:-"0.4375e-4"}
precision=${3:-"fp16"}
num_gpus=${4:-8}
warmup_proportion=${5:-"0.01"}
train_steps=${6:-2285714}
save_checkpoint_steps=${7:-2000}
resume_training=${8:-"false"}
create_logfile=${9:-"true"}
checkpoint_activations=${10:-"false"}
seed=${11:-42}

PREC=""
if [ "$precision" = "fp16" ] ; then
  PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
  PREC=""
else
  echo "Unknown <precision> argument"
  exit -2
fi

CHECKPOINT_ACTIVATIONS=""
if [ "$checkpoint_activations" == "true" ] ; then
  CHECKPOINT_ACTIVATIONS="--checkpoint_activations"
fi

CHECKPOINT=""
if [ "$resume_training" == "true" ] ; then
  CHECKPOINT="--resume_from_checkpoint"
fi

echo $DATA_DIR
INPUT_DIR=$DATA_DIR
CMD=" /workspace/bert/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --do_train"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$train_steps"
CMD+=" --warmup_proportion=$warmup_proportion"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $CHECKPOINT_ACTIVATIONS"
CMD+=" $CHECKPOINT"

if [ "$num_gpus" -gt 1 ] ; then
  CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
  CMD="python3 $CMD"
fi

if [ "$create_logfile" = "true" ] ; then
  export GBS=$(expr $train_batch_size \* $num_gpus)
  printf -v TAG "pyt_bert_pretraining_%s_gbs%d" "$precision" $GBS
  DATESTAMP=`date +'%y%m%d%H%M%S'`
  LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
  printf "Logs written to %s\n" "$LOGFILE"
fi

set -x
if [ -z "$LOGFILE" ] ; then
  $CMD
else
  (
    $CMD
  ) |& tee $LOGFILE
fi

set +x

echo "finished pretraining, starting benchmarking"

target_loss=15
THROUGHPUT=10
THRESHOLD=0.9

throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F's/it' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`

echo "throughput: $throughput s/it"
echo "average loss: $loss"
echo "final loss: $final_loss"

ACCURACY_TEST_RESULT=$(awk 'BEGIN {print ('${loss}' <= '${target_loss}')}')

if [ $ACCURACY_TEST_RESULT == 1 ]; then
  echo "&&&& ACCURACY TEST PASSED"
else
  echo "&&&& ACCURACY TEST FAILED"
fi

PERFORMANCE_TEST_RESULT=$(awk 'BEGIN {print ('${throughput}' <= ('${THROUGHPUT}' * '${THRESHOLD}'))}')

if [ $PERFORMANCE_TEST_RESULT == 1 ]; then
  echo "&&&& PERFORMANCE TEST PASSED"
else
  echo "&&&& PERFORMANCE TEST FAILED"
fi

if [ $ACCURACY_TEST_RESULT == 1 -a $PERFORMANCE_TEST_RESULT == 1 ]; then
  echo "&&&& PASSED"
  exit 0
else
  echo "&&&& FAILED"
  exit 1
fi
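Example invocation, matching the positional defaults above (batch size 14 per GPU, fp16, 8 GPUs):

    bash scripts/run_pretraining.sh 14 0.4375e-4 fp16 8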
|
@ -0,0 +1,146 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
echo "Container nvidia build = " $NVIDIA_BUILD_ID
|
||||||
|
|
||||||
|
DATASET=wikipedia_corpus # change this for other datasets
|
||||||
|
|
||||||
|
DATA_DIR=data/${DATASET}/hdf5_shards/
|
||||||
|
BERT_CONFIG=bert_config.json
|
||||||
|
RESULTS_DIR=/results
|
||||||
|
CHECKPOINTS_DIR=/results/checkpoints
|
||||||
|
|
||||||
|
|
||||||
|
if [ ! -d "$DATA_DIR" ] ; then
|
||||||
|
echo "Warning! $DATA_DIR directory missing. Inference cannot start"
|
||||||
|
fi
|
||||||
|
if [ ! -d "$RESULTS_DIR" ] ; then
|
||||||
|
echo "Error! $RESULTS_DIR directory missing."
|
||||||
|
exit -1
|
||||||
|
fi
|
||||||
|
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
|
||||||
|
echo "Warning! $CHECKPOINTS_DIR directory missing."
|
||||||
|
echo "Checkpoints will be loaded from $RESULTS_DIR instead."
|
||||||
|
CHECKPOINTS_DIR=$RESULTS_DIR
|
||||||
|
fi
|
||||||
|
if [ ! -f "$BERT_CONFIG" ] ; then
|
||||||
|
echo "Error! BERT large configuration file not found at $BERT_CONFIG"
|
||||||
|
exit -1
|
||||||
|
fi
|
||||||
|
|
||||||
|
eval_batch_size=${1:-14}
|
||||||
|
precision=${2:-"fp16"}
|
||||||
|
num_gpus=${3:-8}
|
||||||
|
inference_mode=${4:-"eval"}
|
||||||
|
model_checkpoint=${5:-"-1"}
|
||||||
|
inference_steps=${6:-"-1"}
|
||||||
|
create_logfile=${7:-"true"}
|
||||||
|
seed=${8:-42}
|
||||||
|
|
||||||
|
PREC=""
|
||||||
|
if [ "$precision" = "fp16" ] ; then
|
||||||
|
PREC="--fp16"
|
||||||
|
elif [ "$precision" = "fp32" ] ; then
|
||||||
|
PREC=""
|
||||||
|
else
|
||||||
|
echo "Unknown <precision> argument"
|
||||||
|
exit -2
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
MODE=""
|
||||||
|
if [ "$inference_mode" = "eval" ] ; then
|
||||||
|
MODE="--eval"
|
||||||
|
elif [ "$inference_mode" = "prediction" ] ; then
|
||||||
|
MODE="--prediction"
|
||||||
|
else
|
||||||
|
echo "Unknown <inference_mode> argument"
|
||||||
|
exit -2
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo $DATA_DIR
|
||||||
|
CMD=" /workspace/bert/run_pretraining_inference.py"
|
||||||
|
CMD+=" --input_dir=$DATA_DIR"
|
||||||
|
CMD+=" --ckpt_dir=$CHECKPOINTS_DIR"
|
||||||
|
CMD+=" --config_file=$BERT_CONFIG"
|
||||||
|
CMD+=" --bert_model=bert-large-uncased"
|
||||||
|
CMD+=" --eval_batch_size=$eval_batch_size"
|
||||||
|
CMD+=" --max_seq_length=512"
|
||||||
|
CMD+=" --max_predictions_per_seq=80"
|
||||||
|
CMD+=" --max_steps=$inference_steps"
|
||||||
|
CMD+=" --ckpt_step=$model_checkpoint"
|
||||||
|
CMD+=" --seed=$seed"
|
||||||
|
CMD+=" $PREC"
|
||||||
|
CMD+=" $MODE"
|
||||||
|
|
||||||
|
if [ "$num_gpus" -gt 1 ] ; then
|
||||||
|
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
|
||||||
|
else
|
||||||
|
CMD="python3 $CMD"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$create_logfile" = "true" ] ; then
|
||||||
|
export GBS=$((eval_batch_size * num_gpus))
|
||||||
|
printf -v TAG "pyt_bert_pretraining_inference_%s_gbs%d" "$precision" $GBS
|
||||||
|
DATESTAMP=`date +'%y%m%d%H%M%S'`
|
||||||
|
LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
|
||||||
|
printf "Logs written to %s\n" "$LOGFILE"
|
||||||
|
fi
|
||||||
|
|
||||||
|
set -x
|
||||||
|
if [ -z "$LOGFILE" ] ; then
|
||||||
|
$CMD
|
||||||
|
else
|
||||||
|
(
|
||||||
|
$CMD
|
||||||
|
) |& tee $LOGFILE
|
||||||
|
fi
|
||||||
|
set +x
|
||||||
|
|
||||||
|
target_loss=15
|
||||||
|
THROUGHPUT=1.0
|
||||||
|
THRESHOLD=0.9
|
||||||
|
|
||||||
|
throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
|
||||||
|
|
||||||
|
|
||||||
|
echo "throughput: $throughput it/s"
|
||||||
|
|
||||||
|
|
||||||
|
PERFORMANCE_TEST_RESULT=$(awk 'BEGIN {print ('${throughput}' >= \
|
||||||
|
('${THROUGHPUT}' * '${THRESHOLD}'))}')
|
||||||
|
|
||||||
|
if [ $PERFORMANCE_TEST_RESULT == 1 ];
|
||||||
|
then
|
||||||
|
echo "&&&& PERFORMANCE TEST PASSED"
|
||||||
|
else
|
||||||
|
echo "&&&& PERFORMANCE TEST FAILED"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
if [ "$inference_mode" = "eval" ] ; then
|
||||||
|
loss=`cat $LOGFILE | grep Finished | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
|
||||||
|
|
||||||
|
|
||||||
|
echo "final loss: $loss"
|
||||||
|
|
||||||
|
|
||||||
|
ACCURACY_TEST_RESULT=$(awk 'BEGIN {print ('${loss}' <= '${target_loss}')}')
|
||||||
|
|
||||||
|
if [ $ACCURACY_TEST_RESULT == 1 ];
|
||||||
|
then
|
||||||
|
echo "&&&& ACCURACY TEST PASSED"
|
||||||
|
else
|
||||||
|
echo "&&&& ACCURACY TEST FAILED"
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
if [ $ACCURACY_TEST_RESULT == 1 -a $PERFORMANCE_TEST_RESULT == 1 ];
|
||||||
|
then
|
||||||
|
echo "&&&& PASSED"
|
||||||
|
exit 0
|
||||||
|
else
|
||||||
|
echo "&&&& FAILED"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
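The file header for this script was lost in this dump; judging by its log tag it is the pretraining inference script, so an invocation might look like (script path assumed, arguments follow the positional order defined above):

    bash scripts/run_pretraining_inference.sh 14 fp16 8 eval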
88
PyTorch/LanguageModeling/BERT/scripts/run_squad.sh
Executable file
@@ -0,0 +1,88 @@
#!/usr/bin/env bash

#OUT_DIR=/results/SQuAD

echo "Container nvidia build = " $NVIDIA_BUILD_ID

init_checkpoint=${1:-"/workspace/checkpoints/bert_uncased.pt"}
epochs=${2:-"2.0"}
batch_size=${3:-"24"}
learning_rate=${4:-"3e-5"}
precision=${5:-"fp16"}
num_gpu=${6:-"8"}
seed=${7:-"42"}
squad_dir=${8:-"/workspace/bert/data/squad/v1.1"}
vocab_file=${9:-"/workspace/bert/vocab/vocab"}
OUT_DIR=${10:-"/results/SQuAD"}
mode=${11:-"train eval"}
CONFIG_FILE=${12:-"/workspace/bert/bert_config.json"}
max_steps=${13:-"-1"}

echo "out dir is $OUT_DIR"
mkdir -p $OUT_DIR
if [ ! -d "$OUT_DIR" ]; then
  echo "ERROR: non existing $OUT_DIR"
  exit 1
fi

use_fp16=""
if [ "$precision" = "fp16" ] ; then
  echo "fp16 activated!"
  use_fp16=" --fp16 "
fi

if [ "$num_gpu" = "1" ] ; then
  export CUDA_VISIBLE_DEVICES=0
  mpi_command=""
else
  unset CUDA_VISIBLE_DEVICES
  mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu"
fi

CMD="python $mpi_command run_squad.py "
CMD+="--init_checkpoint=$init_checkpoint "
if [ "$mode" = "train" ] ; then
  CMD+="--do_train "
  CMD+="--train_file=$squad_dir/train-v1.1.json "
  CMD+="--train_batch_size=$batch_size "
elif [ "$mode" = "eval" ] ; then
  CMD+="--do_predict "
  CMD+="--predict_file=$squad_dir/dev-v1.1.json "
  CMD+="--predict_batch_size=$batch_size "
else
  CMD+=" --do_train "
  CMD+=" --train_file=$squad_dir/train-v1.1.json "
  CMD+=" --train_batch_size=$batch_size "
  CMD+="--do_predict "
  CMD+="--predict_file=$squad_dir/dev-v1.1.json "
  CMD+="--predict_batch_size=$batch_size "
fi
CMD+=" --do_lower_case "
# CMD+=" --old "
# CMD+=" --loss_scale=128 "
CMD+=" --bert_model=bert-large-uncased "
CMD+=" --learning_rate=$learning_rate "
CMD+=" --seed=$seed "
CMD+=" --num_train_epochs=$epochs "
CMD+=" --max_seq_length=384 "
CMD+=" --doc_stride=128 "
CMD+=" --output_dir=$OUT_DIR "
CMD+=" --vocab_file=$vocab_file "
CMD+=" --config_file=$CONFIG_FILE "
CMD+=" --max_steps=$max_steps "
CMD+=" $use_fp16"

LOGFILE=$OUT_DIR/logfile.txt
echo "$CMD |& tee $LOGFILE"
time $CMD |& tee $LOGFILE

# Replace carriage returns and ANSI cursor-up escapes with newlines
#sed -r 's/\r|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)' | head -1 | egrep -o '[0-9.]+'`

if [ "$mode" != "train" ]; then
  python $squad_dir/evaluate-v1.1.py $squad_dir/dev-v1.1.json $OUT_DIR/predictions.json |& tee -a $LOGFILE
fi

echo "throughput: $throughput"
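Example invocation using the default checkpoint, data, and vocab paths defined above (train then evaluate on SQuAD v1.1):

    bash scripts/run_squad.sh /workspace/checkpoints/bert_uncased.pt 2.0 24 3e-5 fp16 8 42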
62
PyTorch/LanguageModeling/BERT/scripts/run_swag.sh
Executable file
@@ -0,0 +1,62 @@
#!/bin/bash

SWAG_DIR=/workspace/bert/data/swag
OUT_DIR=/results/SWAG

mkdir -p $OUT_DIR

echo "Container nvidia build = " $NVIDIA_BUILD_ID

init_checkpoint=${1}
mode=${2:-"train"}
max_steps=${3:-"-1.0"} # if < 0, has no effect
batch_size=${4:-"12"}
learning_rate=${5:-"5e-6"}
precision=${6:-"fp32"}
num_gpu=${7:-"8"}
epochs=${8:-"2"}

if [ "$mode" != "train" ] ; then
  num_gpu=1
fi

use_fp16=""
if [ "$precision" = "fp16" ] ; then
  echo "fp16 activated!"
  use_fp16="--fp16"
fi

if [ "$num_gpu" = "1" ] ; then
  mpi_command=""
else
  mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu"
fi

CMD="python $mpi_command run_swag.py "
CMD+="--init_checkpoint=$init_checkpoint "
if [ "$mode" = "train" ] ; then
  CMD+="--do_train "
  CMD+="--train_batch_size=$batch_size "
else
  CMD+="--do_eval "
  CMD+="--eval_batch_size=$batch_size "
fi
CMD+="--do_lower_case "
CMD+="--data_dir $SWAG_DIR/data/ "
CMD+="--bert_model bert-large-uncased "
CMD+="--max_seq_length 128 "
CMD+="--learning_rate $learning_rate "
CMD+="--num_train_epochs $epochs "
CMD+="--max_steps $max_steps "
CMD+="--output_dir $OUT_DIR "
CMD+="$use_fp16"

LOGFILE=$OUT_DIR/logfile
$CMD |& tee $LOGFILE

# Replace carriage returns and ANSI cursor-up escapes with newlines
sed -r 's/\r|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit

throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`

echo "throughput: $throughput"
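Example invocation; the argument order mirrors run_glue.sh above, and the checkpoint path is a placeholder:

    bash scripts/run_swag.sh /workspace/checkpoints/bert_uncased.pt train -1.0 12 5e-6 fp16 8 2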
89
PyTorch/LanguageModeling/BERT/scripts/start_pretraining.sh
Normal file
@@ -0,0 +1,89 @@
#!/bin/bash
# purpose: for multinode training on slurm clusters
node_type=${1:-"dgx1"}
num_nodes=${2:-1}
partition=${3:-"default"}
wall_time=${4:-"12:00:00"}
job_name=${5:-"pyt_bert"}
root_dir=${6:-"$PWD"}
train_batch_size=${7:-4}
eval_batch_size=${8:-4}
train_steps=${9:-1000000}
warmup_proportion=${10:-0.01}
learning_rate=${11:-1e-4}
precision=${12:-"fp16"}
save_checkpoint_steps=${13:-5000}
results_dir=${14:-"$root_dir/results"}
checkpoints_dir=${15:-"$root_dir/checkpoints"}

CONT=${CONT:-"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.02-py3-devel"}

BENCHMARK=${BENCHMARK:-"bert"}
BENCHMARK_NAME="bert"

if [ "$node_type" = "dgx1" ] ; then
  echo "Running on dgx1 systems"
  DGXSYSTEM="DGX1"
  DGXNGPU=8
  DGXSOCKETCORES=20
  DGXNSOCKET=2
  DGXHT=2 # HT is on is 2, HT off is 1
  DGXIBDEVICES='--device=/dev/infiniband --device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/ucm0 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/uverbs0 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1 --device=/dev/infiniband/issm0 --device=/dev/infiniband/umad0'
elif [ "$node_type" = "dgx2h" ] ; then
  echo "Running on dgx2h systems"
  DGXSYSTEM="DGX2H"
  DGXNGPU=16
  DGXSOCKETCORES=24
  DGXNSOCKET=2
  DGXHT=2 # HT is on is 2, HT off is 1
  DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'
else
  echo "Unknown <node_type>, must be either dgx1 or dgx2h"
  exit -1
fi

printf -v EXTRA_PARAMS "%d %d %e %s 1 %s %d %d false" $train_batch_size $eval_batch_size $learning_rate "$precision" $warmup_proportion $train_steps $save_checkpoint_steps

export ROOTDIR=$root_dir
export DATA_DIR=${DATA_DIR:-$CODEDIR/data/wikipedia_corpus/pyt_hdf5_shards}

VOLS="-v $ROOTDIR:/workspace/bert"
VOLS+=" -v $DATA_DIR:/workspace/bert/data/wikipedia_corpus/pyt_hdf5_shards"
# VOLS+=" -v $BOOKS_DIR:/workspace/bert/data/bookcorpus/final_tfrecord_sharded"
VOLS+=" -v $results_dir:/results"
VOLS+=" -v $checkpoints_dir:/checkpoints"

export VOLS
export CONT
export DGXSYSTEM
export DGXNGPU
export DGXIBDEVICES
export EXTRA_PARAMS

set -x
cd $CODEDIR
pwd

PART=""
if [ "$partition" != "default" ] ; then
  printf -v PART "%s" "-p $partition"
fi

export GBS=$(expr $num_nodes \* $train_batch_size \* $DGXNGPU)
printf -v TAG "%s_%dn_%s_gbs%d" "$job_name" $num_nodes "$precision" $GBS
export DATESTAMP=`date +'%y%m%d%H%M%S'`

sbatch $PART \
  -N $num_nodes \
  -t $wall_time \
  -J $job_name \
  --exclusive \
  --mem=0 \
  --mail-type=FAIL \
  --ntasks-per-node=$DGXNGPU \
  --threads-per-core=$DGXHT \
  --cores-per-socket=$DGXSOCKETCORES \
  --output=$LOGDIR/$TAG.$DATESTAMP.log \
  $CODEDIR/scripts/run.sub
set +x
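Example submission: a 4-node DGX-1 job on the mlperf partition; CODEDIR and LOGDIR must be exported beforehand, since the script uses them unconditionally:

    export CODEDIR=$PWD LOGDIR=$PWD/results
    bash scripts/start_pretraining.sh dgx1 4 mlperf 12:00:00 pyt_bert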
391
PyTorch/LanguageModeling/BERT/tokenization.py
Normal file
@@ -0,0 +1,391 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""

from __future__ import absolute_import, division, print_function, unicode_literals

import collections
import logging
import os
import unicodedata
import six
from io import open

from file_utils import cached_path

logger = logging.getLogger(__name__)

PRETRAINED_VOCAB_ARCHIVE_MAP = {
    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
    'bert-base-uncased': 512,
    'bert-large-uncased': 512,
    'bert-base-cased': 512,
    'bert-large-cased': 512,
    'bert-base-multilingual-uncased': 512,
    'bert-base-multilingual-cased': 512,
    'bert-base-chinese': 512,
}
VOCAB_NAME = 'vocab.txt'


def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        elif isinstance(text, unicode):
            return text
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python 2 or Python 3?")


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with open(vocab_file, "r", encoding="utf-8") as reader:
        while True:
            token = reader.readline()
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BertTokenizer(object):
    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""

    def __init__(self, vocab_file, do_lower_case=True, max_len=None,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                              never_split=never_split)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def tokenize(self, text):
        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])
        if len(ids) > self.max_len:
            raise ValueError(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
                " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
            )
        return ids

    def convert_ids_to_tokens(self, ids):
        """Converts a sequence of ids into wordpiece tokens using the vocab."""
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
        else:
            vocab_file = pretrained_model_name_or_path
        if os.path.isdir(vocab_file):
            vocab_file = os.path.join(vocab_file, VOCAB_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    vocab_file))
            return None
        if resolved_vocab_file == vocab_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
            # than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
        return tokenizer


class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

    def __init__(self,
                 do_lower_case=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        """Constructs a BasicTokenizer.

        Args:
            do_lower_case: Whether to lower case the input.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split

    def tokenize(self, text):
        """Tokenizes a piece of text."""
        text = self._clean_text(text)
        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. This is also applied to the English models now, but it doesn't
        # matter since the English models were not trained on any Chinese data
        # and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia).
        text = self._tokenize_chinese_chars(text)
        orig_tokens = whitespace_tokenize(text)
        split_tokens = []
        for token in orig_tokens:
            if self.do_lower_case and token not in self.never_split:
                token = token.lower()
                token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        if text in self.never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and are handled
        # like all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or
                (cp >= 0x3400 and cp <= 0x4DBF) or
                (cp >= 0x20000 and cp <= 0x2A6DF) or
                (cp >= 0x2A700 and cp <= 0x2B73F) or
                (cp >= 0x2B740 and cp <= 0x2B81F) or
                (cp >= 0x2B820 and cp <= 0x2CEAF) or
                (cp >= 0xF900 and cp <= 0xFAFF) or
                (cp >= 0x2F800 and cp <= 0x2FA1F)):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xfffd or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens.
        """

        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            is_bad = False
            start = 0
            sub_tokens = []
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end

            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False
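A short usage sketch for the tokenizer above (the model name resolves through PRETRAINED_VOCAB_ARCHIVE_MAP; the wordpiece split shown is indicative):

    from tokenization import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokens = tokenizer.tokenize("unaffable weather")    # e.g. ['un', '##aff', '##able', 'weather']
    ids = tokenizer.convert_tokens_to_ids(tokens)
    assert tokenizer.convert_ids_to_tokens(ids) == tokens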
123
PyTorch/LanguageModeling/BERT/vocab/download_models.py
Normal file
123
PyTorch/LanguageModeling/BERT/vocab/download_models.py
Normal file
|
@ -0,0 +1,123 @@
|
||||||
|
# NVIDIA
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import urllib.request
|
||||||
|
import zipfile
|
||||||
|
|
||||||
|
# Download urls
|
||||||
|
model_urls = {
|
||||||
|
'bert_base_uncased' : ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'),
|
||||||
|
'bert_large_uncased' : ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'),
|
||||||
|
'bert_base_cased' : ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'),
|
||||||
|
'bert_large_cased' : ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'),
|
||||||
|
'bert_base_multilingual_cased' : ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'),
|
||||||
|
'bert_large_multilingual_uncased' : ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'),
|
||||||
|
'bert_base_chinese' : ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip')
|
||||||
|
}
|
||||||
|
|
||||||
|
# SHA256sum verification for file download integrity (and checking for changes from the download source over time)
|
||||||
|
bert_base_uncased_sha = {
|
||||||
|
'bert_config.json' : '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc',
|
||||||
|
'bert_model.ckpt.data-00000-of-00001' : '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84',
|
||||||
|
'bert_model.ckpt.index' : '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b',
|
||||||
|
'bert_model.ckpt.meta' : 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e',
|
||||||
|
    'vocab.txt' : '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}

bert_large_uncased_sha = {
    'bert_config.json' : 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb',
    'bert_model.ckpt.data-00000-of-00001' : 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1',
    'bert_model.ckpt.index' : '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093',
    'bert_model.ckpt.meta' : '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1',
    'vocab.txt' : '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}

bert_base_cased_sha = {
    'bert_config.json' : 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc',
    'bert_model.ckpt.data-00000-of-00001' : '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea',
    'bert_model.ckpt.index' : '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1',
    'bert_model.ckpt.meta' : '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98',
    'vocab.txt' : 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}

bert_large_cased_sha = {
    'bert_config.json' : '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57',
    'bert_model.ckpt.data-00000-of-00001' : '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0',
    'bert_model.ckpt.index' : 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf',
    'bert_model.ckpt.meta' : 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1',
    'vocab.txt' : 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}

bert_base_multilingual_cased_sha = {
    'bert_config.json' : 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0',
    'bert_model.ckpt.data-00000-of-00001' : '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5',
    'bert_model.ckpt.index' : '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37',
    'bert_model.ckpt.meta' : '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa',
    'vocab.txt' : 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c',
}

bert_large_multilingual_uncased_sha = {
    'bert_config.json' : '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624',
    'bert_model.ckpt.data-00000-of-00001' : '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429',
    'bert_model.ckpt.index' : '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7',
    'bert_model.ckpt.meta' : '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29',
    'vocab.txt' : '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f',
}

bert_base_chinese_sha = {
    'bert_config.json' : '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015',
    'bert_model.ckpt.data-00000-of-00001' : '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba',
    'bert_model.ckpt.index' : '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e',
    'bert_model.ckpt.meta' : 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047',
    'vocab.txt' : '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c',
}

# Relate each model name to its SHA dictionary for the verification loop below
model_sha = {
    'bert_base_uncased' : bert_base_uncased_sha,
    'bert_large_uncased' : bert_large_uncased_sha,
    'bert_base_cased' : bert_base_cased_sha,
    'bert_large_cased' : bert_large_cased_sha,
    'bert_base_multilingual_cased' : bert_base_multilingual_cased_sha,
    'bert_large_multilingual_uncased' : bert_large_multilingual_uncased_sha,
    'bert_base_chinese' : bert_base_chinese_sha
}

# Helper to get the sha256sum of a file, hashing in 128 KiB chunks so that
# even the large checkpoint files are never held in memory all at once
def sha256sum(filename):
    h = hashlib.sha256()
    b = bytearray(128*1024)
    mv = memoryview(b)
    with open(filename, 'rb', buffering=0) as f:
        for n in iter(lambda : f.readinto(mv), 0):
            h.update(mv[:n])
    return h.hexdigest()

# Iterate over urls: download, unzip, verify sha256sum
found_mismatch_sha = False
for model in model_urls:
    url = model_urls[model][0]
    file = model_urls[model][1]

    print("Downloading", url)
    response = urllib.request.urlopen(url)
    with open(file, "wb") as handle:
        handle.write(response.read())

    print("Unzipping", file)
    zip = zipfile.ZipFile(file, 'r')
    zip.extractall()
    zip.close()

    # each archive unpacks into a directory named after the zip file
    sha_dict = model_sha[model]
    for extracted_file in sha_dict:
        sha = sha_dict[extracted_file]
        if sha != sha256sum(file[:-4] + "/" + extracted_file):
            found_mismatch_sha = True
            print("SHA256sum does not match on file:", extracted_file, "from download url:", url)
        else:
            print(file[:-4] + "/" + extracted_file, "\t", "verified")

if not found_mismatch_sha:
    print("All downloads pass sha256sum verification.")
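One possible refinement of the download step above: `response.read()` buffers each archive fully in memory before writing it out, which is wasteful for multi-gigabyte checkpoints. A hedged sketch of a streaming variant (an assumption, not the script's current behavior; `shutil` is standard library):

```python
# Streaming download: copy the HTTP response to disk in fixed-size chunks
# instead of holding the whole archive in memory.
import shutil
import urllib.request

def download(url, filename, chunk_bytes=1 << 20):
    with urllib.request.urlopen(url) as response, open(filename, "wb") as handle:
        shutil.copyfileobj(response, handle, chunk_bytes)
```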
30522
PyTorch/LanguageModeling/BERT/vocab/vocab
Normal file
File diff suppressed because it is too large
@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM nvcr.io/nvidia/pytorch:19.05-py3

RUN apt-get update && \
    apt-get install -y unzip
@ -1,6 +1,52 @@
# Neural Collaborative Filtering (NCF) for PyTorch

This repository provides a script and recipe to train the Neural Collaborative Filtering (NCF)
model to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.

Table of Contents
=================

* [The model](#the-model)
  * [Model architecture](#model-architecture)
  * [Default configuration](#default-configuration)
  * [Feature support matrix](#feature-support-matrix)
    * [Features](#features)
* [Setup](#setup)
  * [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Details](#details)
  * [Scripts and sample code](#scripts-and-sample-code)
  * [Command-line options](#command-line-options)
  * [Getting the data](#getting-the-data)
    * [Dataset guidelines](#dataset-guidelines)
    * [Multi-dataset](#multi-dataset)
    * [ML-1m](#ml-1m)
  * [Training process](#training-process)
  * [Inference process](#inference-process)
* [Mixed precision training](#mixed-precision-training)
  * [Enabling mixed precision](#enabling-mixed-precision)
* [Benchmarking](#benchmarking)
  * [Training performance benchmark](#training-performance-benchmark)
  * [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
  * [Training accuracy results](#training-accuracy-results)
    * [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g)
    * [Training stability test](#training-stability-test)
  * [Training performance results](#training-performance-results)
    * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
    * [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g-1)
    * [NVIDIA DGX-2 (16x V100 32G)](#nvidia-dgx-2-16x-v100-32g)
  * [Inference performance results](#inference-performance-results)
    * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g-1)
    * [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g-2)
    * [NVIDIA DGX-2 (16x V100 32G)](#nvidia-dgx-2-16x-v100-32g-1)
* [Changelog](#changelog)
* [Known issues](#known-issues)
  * [Scaling beyond 8 GPUs](#scaling-beyond-8-gpus)
  * [Memory usage](#memory-usage)

## The model

The NCF model focuses on providing recommendations, also known as collaborative filtering, with implicit feedback. The training data for this model should contain binary information about whether a user interacted with a specific item.
NCF was first described by Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu and Tat-Seng Chua in the [Neural Collaborative Filtering paper](https://arxiv.org/abs/1708.05031).

@ -8,6 +54,23 @@ The implementation in this repository focuses on the NeuMF instantiation of the
We modified it to use dropout in the FullyConnected layers. This reduces overfitting and increases the final accuracy.
Training the other two instantiations of NCF (GMF and MLP) is not supported.

Contrary to the original paper, we benchmark the model on the larger [ML-20m dataset](https://grouplens.org/datasets/movielens/20m/)
instead of the smaller [ML-1m dataset](https://grouplens.org/datasets/movielens/1m/), as we think this is more representative of production-type environments.
However, using the ML-1m dataset is also supported.

This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. Multi-GPU training is also supported. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.

### Model architecture

This model is based mainly on Embedding and FullyConnected layers. The control flow is divided into two branches:
* Multi Layer Perceptron (MLP) branch, which transforms the input through FullyConnected layers with ReLU activations and dropout.
* Matrix Factorization (MF) branch, which performs collaborative filtering factorization.
Each user and each item has two embedding vectors associated with it -- one for the MLP branch and the other for the MF branch.

The outputs from those branches are concatenated and fed to the final FullyConnected layer with sigmoid activation.
This can be interpreted as a probability of a user interacting with a given item. A simplified sketch of this two-branch design follows the figure below.

<p align="center">
<img width="70%" src="./img/ncf_diagram.png" />
@ -16,252 +79,483 @@ Figure 1. The architecture of a Neural Collaborative Filtering model. Taken from
</p>
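To make the two-branch control flow concrete, here is a heavily simplified, self-contained sketch of such an architecture. It is an illustration only, not the repository's `neumf.py`; the class name and all layer sizes are made up:

```python
import torch
import torch.nn as nn

class TinyNeuMF(nn.Module):
    def __init__(self, nb_users, nb_items, mf_dim=8, mlp_dims=(32, 16, 8)):
        super().__init__()
        # one embedding pair per branch, as described above
        self.mf_user = nn.Embedding(nb_users, mf_dim)
        self.mf_item = nn.Embedding(nb_items, mf_dim)
        self.mlp_user = nn.Embedding(nb_users, mlp_dims[0] // 2)
        self.mlp_item = nn.Embedding(nb_items, mlp_dims[0] // 2)
        layers = []
        for in_dim, out_dim in zip(mlp_dims[:-1], mlp_dims[1:]):
            layers += [nn.Linear(in_dim, out_dim), nn.ReLU(), nn.Dropout(p=0.5)]
        self.mlp = nn.Sequential(*layers)
        self.final = nn.Linear(mf_dim + mlp_dims[-1], 1)

    def forward(self, user, item):
        mf = self.mf_user(user) * self.mf_item(item)  # MF branch: elementwise product
        mlp = self.mlp(torch.cat((self.mlp_user(user), self.mlp_item(item)), dim=-1))
        # concatenate both branches; sigmoid yields an interaction probability
        return torch.sigmoid(self.final(torch.cat((mf, mlp), dim=-1)))

scores = TinyNeuMF(nb_users=100, nb_items=50)(torch.tensor([0, 1]), torch.tensor([3, 7]))
```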
### Default configuration

The following features were implemented in this model:
* Automatic Mixed Precision (AMP)
* Data-parallel multi-GPU training and evaluation
* Dropout
* Gradient accumulation

The following performance optimizations were implemented in this model:
* FusedAdam optimizer
* Approximate train negative sampling
* Caching all the positive training samples in the device memory

### Feature support matrix

The following features are supported by this model:

| **Feature** | **NCF PyTorch** |
|:---:|:--------:|
| Automatic Mixed Precision (AMP) | Yes |
| Multi-GPU training with Distributed Data Parallel (DDP) | Yes |
| Fused Adam | Yes |

#### Features

* Automatic Mixed Precision - This implementation of NCF uses AMP to implement mixed precision training. It allows us to use FP16 training with FP32 master weights by modifying just 3 lines of code.
* Multi-GPU training with Distributed Data Parallel - uses Apex's DDP to implement efficient multi-GPU training with NCCL.
* Fused Adam - We use a fused implementation of the Adam optimizer provided by the Apex package. It fuses some operations for faster weight updates. Since NCF is a relatively lightweight model with a large number of parameters, we've observed significant performance improvements from using FusedAdam; a short sketch of the swap follows this list.
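A hedged sketch of the FusedAdam swap mentioned above (not the exact code in `ncf.py`): `FusedAdam` is used as a drop-in replacement for `torch.optim.Adam`. The tiny model is a placeholder and the hyperparameter values are illustrative, set in the real script via `--learning_rate`, `--beta1`, `--beta2` and `--eps`:

```python
import torch
from apex.optimizers import FusedAdam

model = torch.nn.Linear(64, 1).cuda()  # placeholder model; FusedAdam requires CUDA parameters
optimizer = FusedAdam(model.parameters(), lr=0.0045, betas=(0.25, 0.5), eps=1e-8)
```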
## Setup

The following section lists the requirements in order to start training the Neural Collaborative Filtering model.

### Requirements

This repository contains a Dockerfile which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.05-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
* NVIDIA Volta or Turing based GPU

For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)

For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned NVIDIA Container Support Matrix.

### Quick Start Guide

1. Clone the repository.

```bash
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/Recommendation/NCF
```

2. Build an NCF PyTorch Docker container.

After Docker is set up, you can build the NCF image with:

```bash
docker build . -t nvidia_ncf
```

3. Start an interactive session in the NGC container to run preprocessing, training and inference.

The NCF PyTorch container can be launched with:

```bash
mkdir data
docker run --runtime=nvidia -it --rm --ipc=host -v ${PWD}/data:/data nvidia_ncf bash
```

This will launch the container and mount the `./data` directory as a volume to the `/data` directory inside the container.
Any datasets and experiment results (logs, checkpoints, etc.) saved to `/data` will be accessible
in the `./data` directory on the host.

4. Download and preprocess the data.

Preprocessing consists of downloading the data, filtering out users that have less than 20 ratings (by default), sorting the data and dropping the duplicates.
The preprocessed train and test data is then saved in PyTorch binary format to be loaded just before training.

Note: Preprocessing requires PyTorch and should therefore be run inside the Docker container.

No data augmentation techniques are used.

To download and preprocess the ML-20m dataset you can run:

```bash
./prepare_dataset.sh
```

Note: This command will return immediately without downloading anything if the data is already present in the `./data` directory.

This will store the preprocessed training and evaluation data in the `./data` directory so that it can be later
used to train the model (by passing the appropriate `--data` argument to the `ncf.py` script).

5. Start training.

After the Docker container is launched, the training with the default hyperparameters can be started with:

```bash
./prepare_dataset.sh
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m
```

This will result in a checkpoint file being written to `/data/checkpoints/model.pth`.

6. Start validation/evaluation.

The trained model can be evaluated by passing the `--mode test` flag to the `ncf.py` script:

```bash
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m --mode test --checkpoint_dir /data/checkpoints/
```

## Details

The following sections provide greater details of the dataset, running training and inference, and the training results.

### Scripts and sample code

The `ncf.py` script contains most of the training and validation logic. Data loading and preprocessing code is located in `dataloading.py`.
The model architecture is defined in `neumf.py`. Some initial data preprocessing is located in `convert.py`.
The `logger` directory contains simple bookkeeping utilities for storing training results.

### Command-line options

To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example:

`python ncf.py --help`

The following example output is printed when running the sample:

```
usage: ncf.py [-h] [--data DATA] [-e EPOCHS] [-b BATCH_SIZE]
              [--valid_batch_size VALID_BATCH_SIZE] [-f FACTORS]
              [--layers LAYERS [LAYERS ...]] [-n NEGATIVE_SAMPLES]
              [-l LEARNING_RATE] [-k TOPK] [--seed SEED]
              [--threshold THRESHOLD] [--valid_negative VALID_NEGATIVE]
              [--beta1 BETA1] [--beta2 BETA2] [--eps EPS] [--dropout DROPOUT]
              [--checkpoint_dir CHECKPOINT_DIR] [--mode {train,test}]
              [--grads_accumulated GRADS_ACCUMULATED] [--opt_level {O0,O2}]
              [--local_rank LOCAL_RANK]

Train a Neural Collaborative Filtering model:

optional arguments:
  -h, --help            show this help message and exit
  --data DATA           Path to test and training data files
  -e EPOCHS, --epochs EPOCHS
                        Number of epochs for training
  -b BATCH_SIZE, --batch_size BATCH_SIZE
                        Number of examples for each iteration
  --valid_batch_size VALID_BATCH_SIZE
                        Number of examples in each validation chunk
  -f FACTORS, --factors FACTORS
                        Number of predictive factors
  --layers LAYERS [LAYERS ...]
                        Sizes of hidden layers for MLP
  -n NEGATIVE_SAMPLES, --negative_samples NEGATIVE_SAMPLES
                        Number of negative examples per interaction
  -l LEARNING_RATE, --learning_rate LEARNING_RATE
                        Learning rate for optimizer
  -k TOPK, --topk TOPK  Rank for test examples to be considered a hit
  --seed SEED, -s SEED  Manually set random seed for torch
  --threshold THRESHOLD, -t THRESHOLD
                        Stop training early at threshold
  --valid_negative VALID_NEGATIVE
                        Number of negative samples for each positive test
                        example
  --beta1 BETA1, -b1 BETA1
                        Beta1 for Adam
  --beta2 BETA2, -b2 BETA2
                        Beta2 for Adam
  --eps EPS             Epsilon for Adam
  --dropout DROPOUT     Dropout probability, if equal to 0 will not use
                        dropout at all
  --checkpoint_dir CHECKPOINT_DIR
                        Path to the directory storing the checkpoint file
  --mode {train,test}   Passing "test" will only run a single evaluation,
                        otherwise full training will be performed
  --grads_accumulated GRADS_ACCUMULATED
                        Number of gradients to accumulate before performing an
                        optimization step
  --opt_level {O0,O2}   Optimization level for Automatic Mixed Precision
  --local_rank LOCAL_RANK
                        Necessary for multi-GPU training
```

### Getting the data

The NCF model was trained on the ML-20m dataset.
For each user, the interaction with the latest timestamp was included in the test set and the rest of the examples are used as the training data.

This repository contains the `./prepare_dataset.sh` script which will automatically download and preprocess the training and validation datasets.
By default, data will be downloaded to the `/data` directory. The preprocessed data will be placed in `/data/cache`.

#### Dataset guidelines

The required format of the data is a CSV file with three columns: `user_id`, `item_id` and `timestamp`. This CSV should contain only the positive examples, in other words,
the ones for which an interaction between a user and an item occurred. The negatives will be sampled during the training and validation. A minimal sketch of this format and of the preprocessing is shown below.
#### Multi-dataset

This implementation is tuned for the ML-20m and ML-1m datasets.
Using other datasets might require tuning some hyperparameters (for example, learning rate, beta1 and beta2).

If you'd like to use your custom dataset, you can do so by adding support for it in the `prepare_dataset.sh` and `download_dataset.sh` scripts.

The performance of the model depends on the dataset size.
Generally, the model should scale better for datasets containing more data points.
For a smaller dataset you might experience slower performance.

#### ML-1m

To download, preprocess and train on the ML-1m dataset run:

```bash
./prepare_dataset.sh ml-1m
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-1m
```

### Training process

The name of the training script is `ncf.py`. Because of the multi-GPU support, it should always be run with the torch distributed launcher like this:

```bash
python -m torch.distributed.launch --nproc_per_node=<number_of_gpus> ncf.py --data <path_to_dataset> [other_parameters]
```

The main result of the training are checkpoints stored by default in `/data/checkpoints/`. This location can be controlled
by the `--checkpoint_dir` command-line argument.

The validation metric is Hit Rate at 10 (HR@10) with 100 test negative samples. This means that for each positive sample in
the test set, 100 negatives are sampled. All resulting 101 samples are then scored by the model. If the true positive sample is
among the 10 samples with the highest scores, we have a "hit" and the metric is equal to 1; otherwise it's equal to 0.
The HR@10 metric is the number of hits in the entire test set divided by the number of samples in the test set. A small standalone illustration of this metric follows.
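The following is a simplified stand-in for the metric described above, not the exact evaluation code from `ncf.py` (which also has to deal with duplicate negatives and multi-GPU sharding):

```python
import torch

def hit_rate_at_10(pos_scores, neg_scores, k=10):
    # pos_scores: [N] model scores of the true positives
    # neg_scores: [N, 100] model scores of the sampled negatives
    all_scores = torch.cat((neg_scores, pos_scores.unsqueeze(1)), dim=1)  # positive is the last column
    _, top_k = all_scores.topk(k, dim=1)
    hits = (top_k == all_scores.size(1) - 1).any(dim=1).float()
    return hits.mean().item()  # fraction of test cases with the positive in the top 10

print(hit_rate_at_10(torch.randn(5), torch.randn(5, 100)))
```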
### Inference process

Inference can be launched with the same script used for training by passing the `--mode test` flag:

```bash
python -m torch.distributed.launch --nproc_per_node=<number_of_gpus> ncf.py --data <path_to_dataset> --mode test [other_parameters]
```

The script will then:
* Load the checkpoint from the directory specified by the `--checkpoint_dir` argument
* Run inference on the test dataset
* Compute and print the validation metric

## Mixed precision training

Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.

The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.

For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) blog.

### Enabling mixed precision

Using the Automatic Mixed Precision (AMP) package requires two modifications in the source code.
The first one is to initialize the model and the optimizer using the `amp.initialize` function:

```python
model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
                                  keep_batchnorm_fp32=False, loss_scale='dynamic')
```

The second one is to use AMP's loss scaling context manager:

```python
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
```
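Putting the two modifications together, a minimal training loop might look as follows. This is a hedged, self-contained sketch: the tiny linear model, random data and the fixed `O2` level are placeholders, not the repository's code; it assumes Apex is installed and a GPU is available:

```python
import torch
from apex import amp

model = torch.nn.Linear(16, 1).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level='O2',
                                  keep_batchnorm_fp32=False, loss_scale='dynamic')
criterion = torch.nn.BCEWithLogitsLoss()

for _ in range(10):
    x = torch.randn(32, 16).cuda()
    y = torch.randint(0, 2, (32, 1)).float().cuda()
    optimizer.zero_grad()
    loss = criterion(model(x).float(), y)   # compute the loss in FP32
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()              # backward on the dynamically scaled loss
    optimizer.step()
```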
## Benchmarking

### Training performance benchmark

NCF training on NVIDIA DGX systems is very fast; therefore, in order to measure train and validation throughput, you can simply run the full training job with:

```bash
./prepare_dataset.sh
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m --epochs 5
```

At the end of the script, a line reporting the best train throughput is printed.

### Inference performance benchmark

Validation throughput can be measured by running the full training job with:

```bash
./prepare_dataset.sh
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m --epochs 5
```

The best validation throughput is reported to the standard output.

## Results

The following sections provide details on how we achieved our performance and accuracy in training and inference.

### Training accuracy results

#### NVIDIA DGX-1 (8x V100 32G)

Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs.

The following table lists the best hit rate at 10 for DGX-1 with 8 V100 32G GPUs:

| **Number of GPUs** | **Single precision HR@10** | **Mixed precision HR@10** |
|:---:|:--------:|:-------:|
| 1 | 0.95847 | 0.95845 |
| 4 | 0.95887 | 0.95841 |
| 8 | 0.95850 | 0.95885 |

Here's an example validation accuracy curve for mixed precision vs single precision on DGX-1 with 8 V100 32G GPUs:

![ValidationAccuracy](./img/dgx1v_32_curve.png)

To reproduce this result, start the NCF Docker container interactively and run:

```bash
./prepare_dataset.sh
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m
```

Training accuracy results on a DGX-1 with 8 V100 16G GPUs and on DGX-2 should be the same.

#### Training stability test

The histogram below shows the best HR@10 achieved for 400 experiments using mixed precision and 400 experiments using single precision.
Mean HR@10 for mixed precision was equal to 0.95868 and for single precision it was equal to 0.95867.

![hr_histogram](./img/hr_histogram.png)

### Training performance results

This example is based on [our submission for the MLPerf v0.5 benchmark](https://github.com/mlperf/results/tree/master/v0.5.0/nvidia/submission/code/recommendation/pytorch). Please note that we've introduced some improvements to this version that make time-to-train not directly comparable between it and our MLPerf submission:
- This version uses a more efficient multi-GPU sharding algorithm.
- We added dropout operations here to achieve better accuracy.
- This version uses 100 negatives by default during the evaluation phase, as was done in the original NCF paper. The MLPerf version used 999.
- We save the model checkpoints in this version. This might make the training a few seconds slower depending on the speed of your storage.

#### NVIDIA DGX-1 (8x V100 16G)

Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.

The following table shows the best training throughput:

| **Number of GPUs** | **Batch size per GPU** | **Mixed precision throughput (samples/sec)** | **Single precision throughput (samples/sec)** | **Speed-up with mixed precision** | **Multi-GPU strong scaling with mixed precision** | **Multi-GPU strong scaling with FP32** |
|:---:|:--------:|:-----:|:-----------:|:-----:|:----:|:---:|
| 1 | 1048576 | 20,459,365 | 9,777,551 | 2.09 | 1 | 1 |
| 4 | 262144 | 61,782,125 | 32,583,924 | 1.90 | 3.02 | 3.33 |
| 8 | 131072 | 98,464,084 | 55,365,147 | 1.78 | 4.81 | 5.66 |

The following table shows the average time to reach HR@10 of 0.9562 across 5 random seeds. The training time was measured excluding data downloading, preprocessing, validation data generation and library initialization times.

| **Number of GPUs** | **Batch size per GPU** | **Mixed precision (seconds)** | **Single precision (seconds)** | **Speed-up with mixed precision** |
|:---:|:----:|:---------:|:-----------:|:-----:|
| 1 | 1048576 | 67.03 | 142.31 | 2.12 |
| 4 | 262144 | 23.92 | 47.57 | 1.99 |
| 8 | 131072 | 18.82 | 31.48 | 1.67 |

#### NVIDIA DGX-1 (8x V100 32G)

Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs.

The following table shows the best training throughput:

| **Number of GPUs** | **Batch size per GPU** | **Mixed precision throughput (samples/sec)** | **Single precision throughput (samples/sec)** | **Speed-up with mixed precision** | **Multi-GPU strong scaling with mixed precision** | **Multi-GPU strong scaling with FP32** |
|:---:|:----:|:---------:|:-----------:|:-----:|:---:|:---:|
| 1 | 1048576 | 19,314,944 | 9,464,431 | 2.04 | 1 | 1 |
| 4 | 262144 | 58,579,745 | 31,577,085 | 1.86 | 3.03 | 3.34 |
| 8 | 131072 | 92,964,306 | 53,972,811 | 1.72 | 4.81 | 5.70 |

The following table shows the average time to reach HR@10 of 0.9562 across 5 random seeds. The training time was measured excluding data downloading, preprocessing, validation data generation and library initialization times.

| **Number of GPUs** | **Mixed precision (seconds)** | **Single precision (seconds)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 70.49 | 146.68 | 2.08 |
| 4 | 24.61 | 49.01 | 1.99 |
| 8 | 19.72 | 32.25 | 1.64 |

#### NVIDIA DGX-2 (16x V100 32G)

Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G GPUs.

The following table shows the best training throughput:

| **Number of GPUs** | **Batch size per GPU** | **Mixed precision throughput (samples/sec)** | **Single precision throughput (samples/sec)** | **Speed-up with mixed precision** | **Multi-GPU strong scaling with mixed precision** | **Multi-GPU strong scaling with FP32** |
|:---:|:-----:|:-------:|:-----------:|:-----:|:---:|:---:|
| 1 | 1048576 | 20,645,544 | 10,145,873 | 2.03 | 1 | 1 |
| 4 | 262144 | 63,608,950 | 34,758,369 | 1.83 | 3.08 | 3.43 |
| 8 | 131072 | 98,887,103 | 57,251,418 | 1.73 | 4.79 | 5.64 |
| 16 | 65536 | 128,976,394 | 82,932,545 | 1.56 | 6.25 | 8.17 |

The following table shows the average time to reach HR@10 of 0.9562 across 5 random seeds. The training time was measured excluding data downloading, preprocessing, validation data generation and library initialization times.

| **Number of GPUs** | **Mixed precision (seconds)** | **Single precision (seconds)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 65.99 | 134.93 | 2.04 |
| 4 | 26.21 | 41.12 | 1.57 |
| 8 | 21.96 | 29.71 | 1.35 |
| 16 | 22.15 | 28.99 | 1.31 |

### Inference performance results

#### NVIDIA DGX-1 (8x V100 16G)

Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.

The following table shows the best inference throughput:

| **Number of GPUs** | **Mixed precision (samples/sec)** | **Single precision (samples/sec)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 57,163,273 | 28,877,257 | 1.98 |

#### NVIDIA DGX-1 (8x V100 32G)

Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs.

The following table shows the best inference throughput:

| **Number of GPUs** | **Mixed precision (samples/sec)** | **Single precision (samples/sec)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 54,570,476 | 28,085,521 | 1.94 |

#### NVIDIA DGX-2 (16x V100 32G)

Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G GPUs.

The following table shows the best inference throughput:

| **Number of GPUs** | **Mixed precision (samples/sec)** | **Single precision (samples/sec)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 58,383,216 | 30,018,043 | 1.94 |

## Changelog

1. January 22, 2018
  * Initial release
2. May 2019
  * Lower memory consumption (down from about 18GB to 10GB for batch size 1M on a single NVIDIA Tesla V100). Achieved by using an approximate method for generating negatives for training.
  * Automatic Mixed Precision (AMP) with dynamic loss scaling instead of a custom mixed-precision optimizer.
  * Performance numbers for NVIDIA DGX-2.
  * Data loading code cleanup.
  * Default container updated to PyTorch 19.05-py3.
  * Updated README.md.

## Known issues

### Scaling beyond 8 GPUs

Neural Collaborative Filtering is a relatively lightweight model that trains quickly on the relatively small ML-20m dataset.
Because of that, the high ratio of communication to computation makes it difficult to
efficiently use more than 8 GPUs. Typically, this is not an issue because when using 8
GPUs with FP16 precision, the training is sufficiently fast. However, if you'd like to
scale the training to 16 GPUs and beyond, you might try modifying the model so that
the communication-computation ratio facilitates better scaling. This could be done, for example,
by finding hyperparameters that enable using a larger batch size or by reducing the
number of trainable parameters.

### Memory usage

In the default settings, the additional memory beyond 16G may not be fully utilized.
This is because we set the default batch size for the ML-20m dataset to 1M,
which is too small to completely fill up multiple 32G GPUs.
1M is the batch size for which we experienced the best convergence on the ML-20m dataset.
However, on other datasets, even faster performance can be possible by finding hyperparameters that work well for larger batches and leverage additional GPU memory.

Conversely, training on a single GPU with less than 16GB of memory or switching off FP16 mode might result in out-of-memory errors. To reduce memory usage you can use a smaller batch size.
However, since we're using the Adam optimizer, this might require changing hyperparameters such as the learning rate, beta1 and beta2.
To circumvent this, you can use gradient accumulation to combine multiple gradients computed from smaller batches into a single weight update.
This should keep the "effective" batch size the same as the original and enable using the default hyperparameters with much lower memory usage:

```bash
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m --grads_accumulated 2 --batch_size 524288
```
158
PyTorch/Recommendation/NCF/dataloading.py
Normal file
@ -0,0 +1,158 @@
# Copyright (c) 2018, deepakn94, codyaustun, robieta. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# -----------------------------------------------------------------------
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time

import torch
import tqdm


class _TestNegSampler:
    def __init__(self, train_ratings, nb_neg):
        self.nb_neg = nb_neg
        self.nb_users = int(train_ratings[:, 0].max()) + 1
        self.nb_items = int(train_ratings[:, 1].max()) + 1

        # compute unique ids so that a single hash set gives fast positive lookups
        ids = (train_ratings[:, 0] * self.nb_items) + train_ratings[:, 1]
        self.set = set(ids)

    def generate(self, batch_size=128*1024):
        # one candidate slot per (user, negative) pair
        users = torch.arange(0, self.nb_users).reshape([1, -1]).repeat([self.nb_neg, 1]).transpose(0, 1).reshape(-1)

        items = [-1] * len(users)

        random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
        print('Generating validation negatives...')
        for idx, u in enumerate(tqdm.tqdm(users.tolist())):
            if not random_items:
                random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
            j = random_items.pop()
            # resample until the candidate is not a known positive interaction
            while u * self.nb_items + j in self.set:
                if not random_items:
                    random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
                j = random_items.pop()

            items[idx] = j
        items = torch.LongTensor(items)
        return items


def create_test_data(train_ratings, test_ratings, args):
    test_users = test_ratings[:, 0]
    test_pos = test_ratings[:, 1].reshape(-1, 1)

    begin = time.time()
    sampler = _TestNegSampler(train_ratings.cpu().numpy(), args.valid_negative)
    test_negs = sampler.generate().cuda()
    end = time.time()
    print('Generating validation negatives took: ', end - begin)
    del train_ratings

    # create items with the real (positive) sample at the last position
    test_users = test_users.reshape(-1, 1).repeat(1, 1 + args.valid_negative)
    test_items = torch.cat((test_negs.reshape(-1, args.valid_negative), test_pos), dim=1)
    del test_ratings, test_negs

    # generate dup mask and real indices for exactly the same behavior on duplicates as the reference;
    # here we need a sort that is stable (keeps the order of duplicates)
    sorted_items, indices = torch.sort(test_items)                              # [1,1,1,2], [3,1,0,2]
    sum_item_indices = sorted_items.float() + indices.float()/len(indices[0])   # [1.75,1.25,1.0,2.5]
    indices_order = torch.sort(sum_item_indices)[1]                             # [2,1,0,3]
    stable_indices = torch.gather(indices, 1, indices_order)                    # [0,1,3,2]
    # produce -1 mask
    dup_mask = (sorted_items[:, 0:-1] == sorted_items[:, 1:])
    dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask), dim=1)
    dup_mask = torch.gather(dup_mask, 1, stable_indices.sort()[1])
    # produce real sample indices to later check in topk
    sorted_items, indices = (test_items != test_pos).sort()
    sum_item_indices = sorted_items.float() + indices.float()/len(indices[0])
    indices_order = torch.sort(sum_item_indices)[1]
    stable_indices = torch.gather(indices, 1, indices_order)
    real_indices = stable_indices[:, 0]

    if args.distributed:
        test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
        test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
        dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
        real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank]

    test_users = test_users.view(-1).split(args.valid_batch_size)
    test_items = test_items.view(-1).split(args.valid_batch_size)

    return test_users, test_items, dup_mask, real_indices


def prepare_epoch_train_data(train_ratings, nb_items, args):
    # create labels: ones for the positives, zeros for the sampled negatives
    train_label = torch.ones_like(train_ratings[:, 0], dtype=torch.float32)
    neg_label = torch.zeros_like(train_label, dtype=torch.float32)
    neg_label = neg_label.repeat(args.negative_samples)
    train_label = torch.cat((train_label, neg_label))
    del neg_label

    train_users = train_ratings[:, 0]
    train_items = train_ratings[:, 1]

    train_users_per_worker = len(train_label) / args.world_size
    train_users_begin = int(train_users_per_worker * args.local_rank)
    train_users_end = int(train_users_per_worker * (args.local_rank + 1))

    # prepare data for epoch: negatives are drawn uniformly at random without
    # checking against the positives (the approximate sampling mentioned in the README)
    neg_users = train_users.repeat(args.negative_samples)
    neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_(0, nb_items)

    epoch_users = torch.cat((train_users, neg_users))
    epoch_items = torch.cat((train_items, neg_items))

    del neg_users, neg_items

    # shuffle prepared data and split into batches
    epoch_indices = torch.randperm(train_users_end - train_users_begin, device='cuda:{}'.format(args.local_rank))
    epoch_indices += train_users_begin

    epoch_users = epoch_users[epoch_indices]
    epoch_items = epoch_items[epoch_indices]
    epoch_label = train_label[epoch_indices]

    if args.distributed:
        local_batch = args.batch_size // args.world_size
    else:
        local_batch = args.batch_size

    epoch_users = epoch_users.split(local_batch)
    epoch_items = epoch_items.split(local_batch)
    epoch_label = epoch_label.split(local_batch)

    # the last batch will almost certainly be smaller, drop it
    epoch_users = epoch_users[:-1]
    epoch_items = epoch_items[:-1]
    epoch_label = epoch_label[:-1]

    return epoch_users, epoch_items, epoch_label
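For context on the `sum_item_indices` trick in `create_test_data` above: `torch.sort` makes no stability guarantee, so the original index, scaled into (0, 1), is folded into the sort key to break ties deterministically. A tiny standalone illustration (not part of the repository):

```python
import torch

x = torch.tensor([[1., 1., 2., 1.]])
sorted_items, indices = torch.sort(x)              # values [1,1,1,2]; tie order unspecified
keys = sorted_items + indices.float() / x.size(1)  # tie-break by original position
order = torch.sort(keys)[1]
stable_indices = torch.gather(indices, 1, order)   # [[0, 1, 3, 2]]: ties keep input order
print(stable_indices)
```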
@ -3,16 +3,19 @@ RAW_DATADIR=$2

function download_20m {
  echo "Download ml-20m"
  cd ${RAW_DATADIR}
  curl -O http://files.grouplens.org/datasets/movielens/ml-20m.zip
  cd -
}

function download_1m {
  echo "Downloading ml-1m"
  cd ${RAW_DATADIR}
  curl -O http://files.grouplens.org/datasets/movielens/ml-1m.zip
  cd -
}

if [[ ${DATASET_NAME} == "ml-1m" ]]
then
  download_1m
Binary file not shown. (Size before: 42 KiB; after: 41 KiB)
Binary file not shown. (Size before: 28 KiB; after: 29 KiB)
|
@@ -60,12 +60,11 @@ def collect_by_scope(loglines):

     # gather eval_accuracy
     eval_accuracy_dup = [l.value for l in loglines if l.tag == tags.EVAL_ACCURACY]
-    eval_accuracy = list({l['value']:l for l in eval_accuracy_dup})
+    eval_accuracy = [l['value'] for l in eval_accuracy_dup]
     epoch_stats['eval_accuracy'] = eval_accuracy

     # gather it_per_sec
     eval_it_per_sec = [l.value for l in loglines if l.tag == tags.PERF_IT_PER_SEC]
-    #eval_it_per_sec = list({l['value']:l for l in eval_it_per_sec_dup})
     epoch_stats['it_per_sec'] = eval_it_per_sec
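The dropped idiom on the `-` line built a dict keyed by value and then listed it, which yields only the (deduplicated) keys; the `+` line keeps every reading. A tiny sketch of the difference, with made-up log entries:

```python
# Hypothetical accuracy log entries, for illustration only.
loglines = [{'value': 0.90}, {'value': 0.95}, {'value': 0.95}]

# Old: iterating the dict yields its keys, so repeated values are silently dropped.
deduped = list({l['value']: l for l in loglines})   # [0.9, 0.95]
# New: a plain list comprehension keeps one entry per logged epoch.
kept = [l['value'] for l in loglines]               # [0.9, 0.95, 0.95]
```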
@@ -35,23 +35,21 @@ import os

 import sys
 import math
 import time
-from datetime import datetime
-from collections import OrderedDict
 from argparse import ArgumentParser

 import torch
 import torch.nn as nn

 import utils
+import dataloading
 from neumf import NeuMF

 from logger.logger import LOGGER, timed_block, timed_function
 from logger import tags
 from logger.autologging import log_hardware, log_args

-from fp_optimizers import Fp16Optimizer
 from apex.parallel import DistributedDataParallel as DDP
+from apex import amp

 LOGGER.model = 'ncf'
@@ -60,30 +58,28 @@ def parse_args():

                                      " Filtering model")
     parser.add_argument('--data', type=str,
                         help='Path to test and training data files')
-    parser.add_argument('-e', '--epochs', type=int, default=40,
+    parser.add_argument('-e', '--epochs', type=int, default=30,
                         help='Number of epochs for training')
-    parser.add_argument('-b', '--batch-size', type=int, default=1048576,
+    parser.add_argument('-b', '--batch_size', type=int, default=2**20,
                         help='Number of examples for each iteration')
-    parser.add_argument('--valid-batch-size', type=int, default=2**20,
+    parser.add_argument('--valid_batch_size', type=int, default=2**20,
                         help='Number of examples in each validation chunk')
     parser.add_argument('-f', '--factors', type=int, default=64,
                         help='Number of predictive factors')
     parser.add_argument('--layers', nargs='+', type=int,
                         default=[256, 256, 128, 64],
                         help='Sizes of hidden layers for MLP')
-    parser.add_argument('-n', '--negative-samples', type=int, default=4,
+    parser.add_argument('-n', '--negative_samples', type=int, default=4,
                         help='Number of negative examples per interaction')
-    parser.add_argument('-l', '--learning-rate', type=float, default=0.0045,
+    parser.add_argument('-l', '--learning_rate', type=float, default=0.0045,
                         help='Learning rate for optimizer')
     parser.add_argument('-k', '--topk', type=int, default=10,
                         help='Rank for test examples to be considered a hit')
-    parser.add_argument('--seed', '-s', type=int, default=0,
+    parser.add_argument('--seed', '-s', type=int, default=1,
                         help='Manually set random seed for torch')
     parser.add_argument('--threshold', '-t', type=float, default=1.0,
                         help='Stop training early at threshold')
-    parser.add_argument('--no-fp16', action='store_false', dest='fp16',
-                        help='Do not use fp16')
-    parser.add_argument('--valid-negative', type=int, default=100,
+    parser.add_argument('--valid_negative', type=int, default=100,
                         help='Number of negative samples for each positive test example')
     parser.add_argument('--beta1', '-b1', type=float, default=0.25,
                         help='Beta1 for Adam')
@@ -93,14 +89,15 @@ def parse_args():

                         help='Epsilon for Adam')
     parser.add_argument('--dropout', type=float, default=0.5,
                         help='Dropout probability, if equal to 0 will not use dropout at all')
-    parser.add_argument('--loss-scale', default=8192, type=int,
-                        help='Loss scale to use for mixed precision training')
-    parser.add_argument('--checkpoint-dir', default='/data/checkpoints/', type=str,
+    parser.add_argument('--checkpoint_dir', default='/data/checkpoints/', type=str,
                         help='Path to the directory storing the checkpoint file')
     parser.add_argument('--mode', choices=['train', 'test'], default='train', type=str,
                         help='Passing "test" will only run a single evaluation, otherwise full training will be performed')
     parser.add_argument('--grads_accumulated', default=1, type=int,
                         help='Number of gradients to accumulate before performing an optimization step')
+    parser.add_argument('--opt_level', default='O2', type=str,
+                        help='Optimization level for Automatic Mixed Precision',
+                        choices=['O0', 'O2'])
     parser.add_argument('--local_rank', default=0, type=int, help='Necessary for multi-GPU training')
     return parser.parse_args()
@@ -133,12 +130,8 @@ def init_distributed(local_rank=0):

     return distributed, int(os.environ['WORLD_SIZE'])


-def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user, output=None,
+def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user,
               epoch=None, distributed=False):

-    start = datetime.now()
-    log_2 = math.log(2)
-
     model.eval()

     with torch.no_grad():
@@ -146,80 +139,36 @@ def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user

         for u,n in zip(x,y):
             p.append(model(u, n, sigmoid=True).detach())

-        del x
-        del y
         temp = torch.cat(p).view(-1,samples_per_user)
-        del p
+        del x, y, p

         # set duplicate results for the same item to -1 before topk
         temp[dup_mask] = -1
         out = torch.topk(temp,K)[1]
         # topk in pytorch is stable(if not sort)
-        # key(item):value(predicetion) pairs are ordered as original key(item) order
+        # key(item):value(prediction) pairs are ordered as original key(item) order
         # so we need the first position of real item(stored in real_indices) to check if it is in topk
         ifzero = (out == real_indices.view(-1,1))
         hits = ifzero.sum()
-        ndcg = (log_2 / (torch.nonzero(ifzero)[:,1].view(-1).to(torch.float)+2).log_()).sum()
+        ndcg = (math.log(2) / (torch.nonzero(ifzero)[:,1].view(-1).to(torch.float)+2).log_()).sum()

     LOGGER.log(key=tags.EVAL_SIZE, value={"epoch": epoch, "value": num_user * samples_per_user})
     LOGGER.log(key=tags.EVAL_HP_NUM_USERS, value=num_user)
     LOGGER.log(key=tags.EVAL_HP_NUM_NEG, value=samples_per_user - 1)

-    end = datetime.now()
-
     if distributed:
         torch.distributed.all_reduce(hits, op=torch.distributed.reduce_op.SUM)
         torch.distributed.all_reduce(ndcg, op=torch.distributed.reduce_op.SUM)

-    hits = hits.item()
-    ndcg = ndcg.item()
+    hr = hits.item() / num_user
+    ndcg = ndcg.item() / num_user

-    if output is not None:
-        result = OrderedDict()
-        result['timestamp'] = datetime.now()
-        result['duration'] = end - start
-        result['epoch'] = epoch
-        result['K'] = K
-        result['hit_rate'] = hits/num_user
-        result['NDCG'] = ndcg/num_user
-        utils.save_result(result, output)
-
     model.train()
-    return hits/num_user, ndcg/num_user
+    return hr, ndcg
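In formula form, the two metrics returned here are, for $N$ evaluated users and $\mathrm{rank}_u$ the zero-based position of user $u$'s held-out item in the scored candidate list (a user contributes to the NDCG sum only when that item lands in the top $K$):

```latex
\[
\mathrm{HR@}K = \frac{1}{N}\sum_{u=1}^{N}\mathbf{1}\!\left[\mathrm{rank}_u < K\right],
\qquad
\mathrm{NDCG@}K = \frac{1}{N}\sum_{u:\,\mathrm{rank}_u < K}\frac{\ln 2}{\ln\!\left(\mathrm{rank}_u + 2\right)}
\]
```

The $\ln 2/\ln(\mathrm{rank}_u+2)$ term is simply $1/\log_2(\mathrm{rank}_u+2)$, matching the `math.log(2) / (... + 2).log_()` expression in the code.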

-def generate_neg(users, true_mat, item_range, num_neg, sort=False):
-    # assuming 1-d tensor input
-    # for each user in 'users', generate 'num_neg' negative samples in [0, item_range)
-    # also make sure negative sample is not in true sample set with mask
-    # true_mat store a mask matrix where true_mat(user, item) = 0 for true sample
-    # return (neg_user, neg_item)
-
-    # list to append iterations of result
-    neg_u = []
-    neg_i = []
-
-    neg_users = users.repeat(num_neg)
-    while len(neg_users) > 0:  # generate then filter loop
-        neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_(0, item_range)
-        neg_mask = true_mat[neg_users, neg_items]
-        neg_u.append(neg_users.masked_select(neg_mask))
-        neg_i.append(neg_items.masked_select(neg_mask))
-
-        neg_users = neg_users.masked_select(1-neg_mask)
-
-    neg_users = torch.cat(neg_u)
-    neg_items = torch.cat(neg_i)
-    if sort == False:
-        return neg_users, neg_items
-
-    sorted_users, sort_indices = torch.sort(neg_users)
-    return sorted_users, neg_items[sort_indices]
-
-
 def main():
     log_hardware()

     args = parse_args()
     args.distributed, args.world_size = init_distributed(args.local_rank)
     log_args(args)
@@ -229,90 +178,35 @@ def main():

     if args.seed is not None:
         torch.manual_seed(args.seed)

-    # Save configuration to file
     print("Saving results to {}".format(args.checkpoint_dir))
     if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
         os.makedirs(args.checkpoint_dir, exist_ok=True)
     checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')

-    # more like load trigger timer now
     LOGGER.log(key=tags.PREPROC_HP_NUM_EVAL, value=args.valid_negative)
     # The default of np.random.choice is replace=True, so does pytorch random_()
     LOGGER.log(key=tags.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
     LOGGER.log(key=tags.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)
     LOGGER.log(key=tags.INPUT_STEP_EVAL_NEG_GEN)

-    # sync worker before timing.
+    # sync workers before timing
     if args.distributed:
         torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
         torch.cuda.synchronize()

     LOGGER.log(key=tags.RUN_START)
-    run_start_time = time.time()

-    # load not converted data, just seperate one for test
     train_ratings = torch.load(args.data+'/train_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
     test_ratings = torch.load(args.data+'/test_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))

-    # get input data
-    # get dims
     nb_maxs = torch.max(train_ratings, 0)[0]
-    nb_users = nb_maxs[0].item()+1
-    nb_items = nb_maxs[1].item()+1
+    nb_users = nb_maxs[0].item() + 1
+    nb_items = nb_maxs[1].item() + 1
+    LOGGER.log(key=tags.INPUT_SIZE, value=len(train_ratings))
-    train_users = train_ratings[:,0]
-    train_items = train_ratings[:,1]
-    del nb_maxs, train_ratings
-    LOGGER.log(key=tags.INPUT_SIZE, value=len(train_users))
-    # produce things not change between epoch
-    # mask for filtering duplicates with real sample
-    # note: test data is removed before create mask, same as reference
-    mat = torch.cuda.ByteTensor(nb_users, nb_items).fill_(1)
-    mat[train_users, train_items] = 0
-    # create label
-    train_label = torch.ones_like(train_users, dtype=torch.float32)
-    neg_label = torch.zeros_like(train_label, dtype=torch.float32)
-    neg_label = neg_label.repeat(args.negative_samples)
-    train_label = torch.cat((train_label,neg_label))
-    del neg_label
-    if args.fp16:
-        train_label = train_label.half()

-    # produce validation negative sample on GPU
     all_test_users = test_ratings.shape[0]

-    test_users = test_ratings[:,0]
+    test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(train_ratings, test_ratings, args)
-    test_pos = test_ratings[:,1].reshape(-1,1)
-    test_negs = generate_neg(test_users, mat, nb_items, args.valid_negative, True)[1]
-
-    # create items with real sample at last position
-    test_users = test_users.reshape(-1,1).repeat(1,1+args.valid_negative)
-    test_items = torch.cat((test_negs.reshape(-1,args.valid_negative), test_pos), dim=1)
-    del test_ratings, test_negs
-
-    # generate dup mask and real indice for exact same behavior on duplication compare to reference
-    # here we need a sort that is stable(keep order of duplicates)
-    # this is a version works on integer
-    sorted_items, indices = torch.sort(test_items) # [1,1,1,2], [3,1,0,2]
-    sum_item_indices = sorted_items.float()+indices.float()/len(indices[0]) #[1.75,1.25,1.0,2.5]
-    indices_order = torch.sort(sum_item_indices)[1] #[2,1,0,3]
-    stable_indices = torch.gather(indices, 1, indices_order) #[0,1,3,2]
-    # produce -1 mask
-    dup_mask = (sorted_items[:,0:-1] == sorted_items[:,1:])
-    dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask),dim=1)
-    dup_mask = torch.gather(dup_mask,1,stable_indices.sort()[1])
-    # produce real sample indices to later check in topk
-    sorted_items, indices = (test_items != test_pos).sort()
-    sum_item_indices = sorted_items.float()+indices.float()/len(indices[0])
-    indices_order = torch.sort(sum_item_indices)[1]
-    stable_indices = torch.gather(indices, 1, indices_order)
-    real_indices = stable_indices[:,0]
-    del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos
-
-    if args.distributed:
-        test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
-        test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
-        dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
-        real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank]

     # make pytorch memory behavior more consistent later
     torch.cuda.empty_cache()
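The deleted block above (now living behind `dataloading.create_test_data`) relies on a tie-breaking trick to get a stable sort out of `torch.sort`, which did not guarantee stability on equal integers. A standalone sketch, reusing the example values from the deleted comments:

```python
import torch

test_items = torch.tensor([[1, 1, 2, 1]])  # hypothetical row of item ids

# torch.sort may order equal values arbitrarily...
sorted_items, indices = torch.sort(test_items)
# ...so add (original index / row width) as a fractional tie-breaker:
# equal values now compare by original position, i.e. a stable sort.
key = sorted_items.float() + indices.float() / test_items.size(1)
indices_order = torch.sort(key)[1]
stable_indices = torch.gather(indices, 1, indices_order)
print(stable_indices)  # tensor([[0, 1, 3, 2]]) regardless of the first sort's tie order
```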
@@ -320,36 +214,33 @@ def main():

     LOGGER.log(key=tags.INPUT_BATCH_SIZE, value=args.batch_size)
     LOGGER.log(key=tags.INPUT_ORDER)  # we shuffled later with randperm

-    print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d'
-          % (time.time()-run_start_time, nb_users, nb_items, len(train_users),
-             nb_users))
-
     # Create model
     model = NeuMF(nb_users, nb_items,
-                  mf_dim=args.factors, mf_reg=0.,
+                  mf_dim=args.factors,
                   mlp_layer_sizes=args.layers,
-                  mlp_layer_regs=[0. for i in args.layers],
                   dropout=args.dropout)

-    if args.fp16:
-        model = model.half()
+    optimizer = FusedAdam(model.parameters(), lr=args.learning_rate,
+                          betas=(args.beta1, args.beta2), eps=args.eps, eps_inside_sqrt=False)
+    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid copy to host
+    # Move model and loss to GPU
+    model = model.cuda()
+    criterion = criterion.cuda()
+
+    if args.opt_level == "O2":
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
+                                          keep_batchnorm_fp32=False, loss_scale='dynamic')
+
+    if args.distributed:
+        model = DDP(model)
+
+    local_batch = args.batch_size // args.world_size
+    traced_criterion = torch.jit.trace(criterion.forward,
+                                       (torch.rand(local_batch,1), torch.rand(local_batch,1)))

     print(model)
     print("{} parameters".format(utils.count_parameters(model)))

-    # Save model text description
-    with open(os.path.join(args.checkpoint_dir, 'model.txt'), 'w') as file:
-        file.write(str(model))
-
-    # Add optimizer and loss to graph
-    if args.fp16:
-        fp_optimizer = Fp16Optimizer(model, args.loss_scale)
-        params = fp_optimizer.fp32_params
-    else:
-        params = model.parameters()
-
-    optimizer = FusedAdam(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps, eps_inside_sqrt=False)
-    criterion = nn.BCEWithLogitsLoss(reduction='none')  # use torch.mean() with dim later to avoid copy to host
     LOGGER.log(key=tags.OPT_LR, value=args.learning_rate)
     LOGGER.log(key=tags.OPT_NAME, value="Adam")
     LOGGER.log(key=tags.OPT_HP_ADAM_BETA1, value=args.beta1)
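Two details of the new setup are easy to miss: the criterion keeps `reduction='none'` and is averaged explicitly on device, and its forward pass is JIT-traced once at the fixed local batch size. A self-contained sketch of that pattern (the batch size here is illustrative):

```python
import torch
import torch.nn as nn

local_batch = 8  # illustrative; in the commit this is args.batch_size // args.world_size
criterion = nn.BCEWithLogitsLoss(reduction='none')

# Trace once with dummy tensors of the real training shape; subsequent calls
# run the recorded graph instead of re-dispatching through Python each step.
traced_criterion = torch.jit.trace(
    criterion.forward,
    (torch.rand(local_batch, 1), torch.rand(local_batch, 1)))

logits = torch.randn(local_batch, 1)                     # stand-in model outputs
labels = torch.randint(0, 2, (local_batch, 1)).float()

# Per-element loss, then an explicit mean; the scalar stays on device.
loss = torch.mean(traced_criterion(logits, labels).view(-1), 0)
```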
@@ -357,53 +248,22 @@ def main():

     LOGGER.log(key=tags.OPT_HP_ADAM_EPSILON, value=args.eps)
     LOGGER.log(key=tags.MODEL_HP_LOSS_FN, value=tags.VALUE_BCE)

-    # Move model and loss to GPU
-    model = model.cuda()
-    criterion = criterion.cuda()
-
-    if args.distributed:
-        model = DDP(model)
-        local_batch = args.batch_size // int(os.environ['WORLD_SIZE'])
-    else:
-        local_batch = args.batch_size
-    traced_criterion = torch.jit.trace(criterion.forward, (torch.rand(local_batch,1),torch.rand(local_batch,1)))
-
-    train_users_per_worker = len(train_label) / int(os.environ['WORLD_SIZE'])
-    train_users_begin = int(train_users_per_worker * args.local_rank)
-    train_users_end = int(train_users_per_worker * (args.local_rank + 1))
-
-    # Create files for tracking training
-    valid_results_file = os.path.join(args.checkpoint_dir, 'valid_results.csv')
-    # Calculate initial Hit Ratio and NDCG
-    test_x = test_users.view(-1).split(args.valid_batch_size)
-    test_y = test_items.view(-1).split(args.valid_batch_size)
-
     if args.mode == 'test':
         state_dict = torch.load(checkpoint_path)
         model.load_state_dict(state_dict)
-
-    begin = time.time()
-    LOGGER.log(key=tags.EVAL_START, value=-1)
-
-    hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk, samples_per_user=test_items.size(1),
-                         num_user=all_test_users, distributed=args.distributed)
-    val_time = time.time() - begin
-    print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, valid_time: {val_time:.4f}'
-          .format(K=args.topk, hit_rate=hr, ndcg=ndcg, val_time=val_time))
-
-    LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": -1, "value": hr})
-    LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
-    LOGGER.log(key=tags.EVAL_STOP, value=-1)
-
-    if args.mode == 'test':
-        return
+        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
+                             samples_per_user=args.valid_negative + 1,
+                             num_user=all_test_users, distributed=args.distributed)
+        print('HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'
+              .format(K=args.topk, hit_rate=hr, ndcg=ndcg))
+        return

     success = False
     max_hr = 0
-    LOGGER.log(key=tags.TRAIN_LOOP)
-    train_throughputs = []
-    eval_throughputs = []
+    train_throughputs, eval_throughputs = [], []
+
+    LOGGER.log(key=tags.TRAIN_LOOP)
     for epoch in range(args.epochs):

         LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)
@@ -412,68 +272,43 @@ def main():

         begin = time.time()

-        # prepare data for epoch
-        neg_users, neg_items = generate_neg(train_users, mat, nb_items, args.negative_samples)
-        epoch_users = torch.cat((train_users,neg_users))
-        epoch_items = torch.cat((train_items,neg_items))
-
-        del neg_users, neg_items
-
-        # shuffle prepared data and split into batches
-        epoch_indices = torch.randperm(train_users_end - train_users_begin, device='cuda:{}'.format(args.local_rank))
-        epoch_indices += train_users_begin
-
-        epoch_users = epoch_users[epoch_indices]
-        epoch_items = epoch_items[epoch_indices]
-        epoch_label = train_label[epoch_indices]
-
-        epoch_users_list = epoch_users.split(local_batch)
-        epoch_items_list = epoch_items.split(local_batch)
-        epoch_label_list = epoch_label.split(local_batch)
-
-        # only print progress bar on rank 0
-        num_batches = len(epoch_users_list)
-        # handle extremely rare case where last batch size < number of worker
-        if len(epoch_users) % args.batch_size < args.world_size:
-            print("epoch_size % batch_size < number of worker!")
-            exit(1)
+        epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(train_ratings, nb_items, args)
+        num_batches = len(epoch_users)

         for i in range(num_batches // args.grads_accumulated):
             for j in range(args.grads_accumulated):
                 batch_idx = (args.grads_accumulated * i) + j
-                user = epoch_users_list[batch_idx]
-                item = epoch_items_list[batch_idx]
-                label = epoch_label_list[batch_idx].view(-1,1)
+                user = epoch_users[batch_idx]
+                item = epoch_items[batch_idx]
+                label = epoch_label[batch_idx].view(-1,1)

                 outputs = model(user, item)
                 loss = traced_criterion(outputs, label).float()
                 loss = torch.mean(loss.view(-1), 0)
-                if args.fp16:
-                    fp_optimizer.backward(loss)
+                if args.opt_level == "O2":
+                    with amp.scale_loss(loss, optimizer) as scaled_loss:
+                        scaled_loss.backward()
                 else:
                     loss.backward()
-
-            if args.fp16:
-                fp_optimizer.step(optimizer)
-            else:
-                optimizer.step()
+            optimizer.step()

             for p in model.parameters():
                 p.grad = None

-        del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label
+        del epoch_users, epoch_items, epoch_label
         train_time = time.time() - begin
         begin = time.time()

-        epoch_samples = len(train_users) * (args.negative_samples + 1)
+        epoch_samples = len(train_ratings) * (args.negative_samples + 1)
         train_throughput = epoch_samples / train_time
         train_throughputs.append(train_throughput)
         LOGGER.log(key='train_throughput', value=train_throughput)
         LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
         LOGGER.log(key=tags.EVAL_START, value=epoch)

-        hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk, samples_per_user=test_items.size(1),
-                             num_user=all_test_users, output=valid_results_file, epoch=epoch, distributed=args.distributed)
+        hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
+                             samples_per_user=args.valid_negative + 1,
+                             num_user=all_test_users, epoch=epoch, distributed=args.distributed)

         val_time = time.time() - begin
         print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
@@ -486,7 +321,7 @@ def main():

         LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
         LOGGER.log(key=tags.EVAL_STOP, value=epoch)

-        eval_size = all_test_users * test_items.size(1)
+        eval_size = all_test_users * (args.valid_negative + 1)
         eval_throughput = eval_size / val_time
         eval_throughputs.append(eval_throughput)
         LOGGER.log(key='eval_throughput', value=eval_throughput)
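For reference, the two throughput quantities logged above reduce to simple per-epoch ratios, with $|R|$ the number of training ratings, $n$ the number of train negatives per interaction, $|U|$ the number of test users, and $m$ the number of validation negatives per positive:

```latex
\[
\text{train\_throughput} = \frac{|R|\,(n+1)}{t_{\text{train}}},
\qquad
\text{eval\_throughput} = \frac{|U|\,(m+1)}{t_{\text{val}}}
\]
```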
@@ -34,8 +34,8 @@ import torch.nn as nn

 import sys
 from os.path import abspath, join, dirname
-# enabling modules discovery from global entrypoint
-sys.path.append(abspath(dirname(__file__)+'/'))
+# enabling modules discovery from the global entrypoint
+sys.path.append(abspath(dirname(__file__) + '/'))

 from logger.logger import LOGGER
 from logger import tags

@@ -44,12 +44,8 @@ LOGGER.model = 'ncf'

 class NeuMF(nn.Module):
     def __init__(self, nb_users, nb_items,
-                 mf_dim, mf_reg,
-                 mlp_layer_sizes, mlp_layer_regs,
-                 dropout=0):
-
-        if len(mlp_layer_sizes) != len(mlp_layer_regs):
-            raise RuntimeError('u dummy, layer_sizes != layer_regs!')
+                 mf_dim, mlp_layer_sizes, dropout=0):
         if mlp_layer_sizes[0] % 2 != 0:
             raise RuntimeError('u dummy, mlp_layer_sizes[0] % 2 != 0')
         super(NeuMF, self).__init__()
@@ -31,10 +31,11 @@

 #!/bin/bash
 set -e
+set -x

 DATASET_NAME=${1:-'ml-20m'}
-RAW_DATADIR='/data'
-CACHED_DATADIR='/data/cache/'${DATASET_NAME}
+RAW_DATADIR=${2:-'/data'}
+CACHED_DATADIR="${RAW_DATADIR}/cache/${DATASET_NAME}"

 # you can add another option to this case in order to support other datasets
 case ${DATASET_NAME} in

@@ -51,9 +52,17 @@ case ${DATASET_NAME} in

     exit 1
 esac

-mkdir -p ${RAW_DATADIR}
-mkdir -p ${CACHED_DATADIR}
-rm -f log
+if [ ! -d ${RAW_DATADIR} ]; then
+    mkdir -p ${RAW_DATADIR}
+fi
+
+if [ ! -d ${CACHED_DATADIR} ]; then
+    mkdir -p ${CACHED_DATADIR}
+fi
+
+if [ -f log ]; then
+    rm -f log
+fi

 if [ ! -f ${ZIP_PATH} ]; then
     echo 'Dataset not found, downloading...'

@@ -76,6 +85,6 @@ else

 fi

 echo "Dataset $DATASET_NAME successfully prepared at: $CACHED_DATADIR\n"
-echo 'You can now run the training with: python -m torch.distributed.launch --nproc_per_node=<number_of_GPUs> ncf.py --data /data/cache/ml-20m'
+echo "You can now run the training with: python -m torch.distributed.launch --nproc_per_node=<number_of_GPUs> ncf.py --data ${CACHED_DATADIR}"
@@ -1 +1,2 @@

 pandas
+tqdm
@@ -10,6 +10,12 @@ import torch.distributed as dist

 from maskrcnn_benchmark.utils.comm import get_world_size
 from maskrcnn_benchmark.utils.metric_logger import MetricLogger

+try:
+    from apex import amp
+    use_amp = True
+except ImportError:
+    print('Use APEX for multi-precision via apex.amp')
+    use_amp = False

 def reduce_loss_dict(loss_dict):
     """

@@ -80,7 +86,7 @@ def do_train(
             # Note: If mixed precision is not used, this ends up doing nothing
             # Otherwise apply loss scaling for mixed-precision recipe
             if use_amp:
-                with optimizer.scale_loss(losses) as scaled_losses:
+                with amp.scale_loss(losses, optimizer) as scaled_losses:
                     scaled_losses.backward()
             else:
                 losses.backward()
@@ -2,9 +2,14 @@

 import os
 import sys

-from torch.utils.model_zoo import _download_url_to_file
-from torch.utils.model_zoo import urlparse
-from torch.utils.model_zoo import HASH_REGEX
+try:
+    from torch.utils.model_zoo import _download_url_to_file
+    from torch.utils.model_zoo import urlparse
+    from torch.utils.model_zoo import HASH_REGEX
+except:
+    from torch.hub import _download_url_to_file
+    from torch.hub import urlparse
+    from torch.hub import HASH_REGEX

 from maskrcnn_benchmark.utils.comm import is_main_process
 from maskrcnn_benchmark.utils.comm import synchronize
@@ -97,14 +97,9 @@ def train(cfg, local_rank, distributed):

     if use_amp:
         # Initialize mixed-precision training
         use_mixed_precision = cfg.DTYPE == "float16"
-        amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)
-
-        # wrap the optimizer for mixed precision
-        if cfg.SOLVER.ACCUMULATE_GRAD:
-            # also specify number of steps to accumulate over
-            optimizer = amp_handle.wrap_optimizer(optimizer, num_loss=cfg.SOLVER.ACCUMULATE_STEPS)
-        else:
-            optimizer = amp_handle.wrap_optimizer(optimizer)
+        amp_opt_level = 'O1' if use_mixed_precision else 'O0'
+        model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

     if distributed:
         if use_apex_ddp:
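The hunk above migrates from the deprecated `amp.init()` / `amp_handle.wrap_optimizer()` handle API to `amp.initialize()`; paired with the `amp.scale_loss` change in `trainer.py` earlier in this commit, the end-to-end pattern looks roughly like this sketch (model and data are stand-ins):

```python
import torch
from apex import amp

model = torch.nn.Linear(4, 1).cuda()                  # stand-in model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# 'O1' enables mixed precision, 'O0' keeps pure FP32 - mirroring
# the amp_opt_level selection in the hunk above.
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

loss = model(torch.randn(8, 4).cuda()).pow(2).mean()

with amp.scale_loss(loss, optimizer) as scaled_loss:  # replaces optimizer.scale_loss(...)
    scaled_loss.backward()
optimizer.step()
```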
@@ -1,5 +1,10 @@

-FROM nvcr.io/nvidia/pytorch:18.12.1-py3
+FROM nvcr.io/nvidia/pytorch:19.03-py3

 ADD . /workspace/tacotron2
 WORKDIR /workspace/tacotron2
 RUN pip install -r requirements.txt
+RUN cd /workspace; \
+    git clone https://github.com/NVIDIA/apex.git; \
+    cd /workspace/apex; \
+    pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
+WORKDIR /workspace/tacotron2
|
||||||
# Tacotron 2 And WaveGlow v1.0 For PyTorch
|
# Tacotron 2 And WaveGlow v1.5 For PyTorch
|
||||||
|
|
||||||
This repository provides a script and recipe to train Tacotron 2 and WaveGlow v1.0 to achieve state of the art accuracy, and is tested and maintained by NVIDIA.
|
This repository provides a script and recipe to train Tacotron 2 and WaveGlow
|
||||||
|
v1.5 models to achieve state of the art accuracy, and is tested and maintained by
|
||||||
|
NVIDIA.
|
||||||
|
|
||||||
## Table Of Contents
|
Table of Contents
|
||||||
|
=================
|
||||||
* [The model](#the-model)
|
* [The model](#the-model)
|
||||||
|
* [Model architecture](#model-architecture)
|
||||||
* [Default configuration](#default-configuration)
|
* [Default configuration](#default-configuration)
|
||||||
|
* [Feature support matrix](#feature-support-matrix)
|
||||||
|
* [Features](#features)
|
||||||
* [Setup](#setup)
|
* [Setup](#setup)
|
||||||
* [Requirements](#requirements)
|
* [Requirements](#requirements)
|
||||||
* [Quick Start Guide](#quick-start-guide)
|
* [Quick Start Guide](#quick-start-guide)
|
||||||
* [Details](#details)
|
* [Details](#details)
|
||||||
* [Training process](#training-process)
|
* [Scripts and sample code](#scripts-and-sample-code)
|
||||||
* [Hyperparameters and command line arguments](#hyperparameters-and-command-line-arguments)
|
* [Parameters](#parameters)
|
||||||
* [Shared parameters](#shared-parameters)
|
* [Shared parameters](#shared-parameters)
|
||||||
* [Shared audio/STFT parameters](#shared-audiostft-parameters)
|
* [Shared audio/STFT parameters](#shared-audiostft-parameters)
|
||||||
* [Tacotron 2 parameters](#tacotron-2-parameters)
|
* [Tacotron 2 parameters](#tacotron-2-parameters)
|
||||||
* [WaveGlow parameters](#waveglow-parameters)
|
* [WaveGlow parameters](#waveglow-parameters)
|
||||||
* [Enabling mixed precision](#enabling-mixed-precision)
|
* [Command-line options](#command-line-options)
|
||||||
|
* [Getting the data](#getting-the-data)
|
||||||
|
* [Dataset guidelines](#dataset-guidelines)
|
||||||
|
* [Multi-dataset](#multi-dataset)
|
||||||
|
* [Training process](#training-process)
|
||||||
* [Inference process](#inference-process)
|
* [Inference process](#inference-process)
|
||||||
|
* [Mixed precision training](#mixed-precision-training)
|
||||||
|
* [Enabling mixed precision](#enabling-mixed-precision)
|
||||||
* [Benchmarking](#benchmarking)
|
* [Benchmarking](#benchmarking)
|
||||||
* [Inference performance benchmark](#inference-performance-benchmark)
|
|
||||||
* [Training performance benchmark](#training-performance-benchmark)
|
* [Training performance benchmark](#training-performance-benchmark)
|
||||||
|
* [Inference performance benchmark](#inference-performance-benchmark)
|
||||||
* [Results](#results)
|
* [Results](#results)
|
||||||
* [Training accuracy results](#training-accuracy-results)
|
* [Training accuracy results](#training-accuracy-results)
|
||||||
|
* [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
|
||||||
* [Training performance results](#training-performance-results)
|
* [Training performance results](#training-performance-results)
|
||||||
|
* [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
|
||||||
* [Expected training time](#expected-training-time)
|
* [Expected training time](#expected-training-time)
|
||||||
* [Inference performance results](#inference-performance-results)
|
* [Inference performance results](#inference-performance-results)
|
||||||
|
* [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
|
||||||
* [Changelog](#changelog)
|
* [Changelog](#changelog)
|
||||||
* [Known issues](#known-issues)
|
* [Known issues](#known-issues)
|
||||||
|
|
||||||
|
## The model
|
||||||
|
|
||||||
|
This text-to-speech (TTS) system is a combination of two neural network
|
||||||
|
models:
|
||||||
|
|
||||||
# The model
|
* a modified Tacotron 2 model from the [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)
|
||||||
This text-to-speech (TTS) system is a combination of two neural network models:
|
paper and
|
||||||
* a modified Tacotron 2 model from the [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884) paper and
|
|
||||||
* a flow-based neural network model from the [WaveGlow: A Flow-based Generative Network for Speech Synthesis](https://arxiv.org/abs/1811.00002) paper.
|
* a flow-based neural network model from the [WaveGlow: A Flow-based Generative Network for Speech Synthesis](https://arxiv.org/abs/1811.00002) paper.
|
||||||
|
|
||||||
The Tacotron 2 and WaveGlow model form a text-to-speech system that enables
|
The Tacotron 2 and WaveGlow models form a text-to-speech system that enables
|
||||||
user to synthesise a natural sounding speech from raw transcripts without
|
users to synthesize natural sounding speech from raw transcripts without
|
||||||
any additional prosody information.
|
any additional information such as patterns and/or rhythms of speech.
|
||||||
|
|
||||||
Our implementation of Tacotron 2 model differs from the model described in the
|
Our implementation of Tacotron 2 models differs from the model described in the
|
||||||
paper. Our implementation uses Dropout instead of Zoneout to regularize the LSTM layers.
|
paper. Our implementation uses Dropout instead of Zoneout to regularize the
|
||||||
Also, the original text-to-speech system proposed in the paper used the [WaveNet](https://arxiv.org/abs/1609.03499)
|
LSTM layers. Also, the original text-to-speech system proposed in the paper
|
||||||
model to synthesize waveforms.
|
uses the [WaveNet](https://arxiv.org/abs/1609.03499) model to synthesize
|
||||||
In our implementation, we use the WaveGlow model for this purpose.
|
waveforms. In our implementation, we use the WaveGlow model for this purpose.
|
||||||
|
|
||||||
Both models are based on implementations of NVIDIA GitHub repositories
|
Both models are based on implementations of NVIDIA GitHub repositories
|
||||||
[Tacotron 2](https://github.com/NVIDIA/tacotron2) and
|
[Tacotron 2](https://github.com/NVIDIA/tacotron2) and
|
||||||
[WaveGlow](https://github.com/NVIDIA/waveglow), and are trained on a publicly
|
[WaveGlow](https://github.com/NVIDIA/waveglow), and are trained on a publicly
|
||||||
available [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/).
|
available [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/).
|
||||||
|
|
||||||
This model trains with mixed precision tensor cores on Volta, therefore researchers
|
The Tacotron 2 and WaveGlow model enables you to efficiently synthesize high
|
||||||
can get results much faster than training without tensor cores. This model is
|
quality speech from text.
|
||||||
tested against each NGC monthly container release to ensure consistent accuracy
|
|
||||||
and performance over time.
|
|
||||||
|
|
||||||
## Default configuration
|
Both models are trained with mixed precision using Tensor Cores on NVIDIA
|
||||||
The Tacotron 2 model produces mel spectrograms from input text using
|
Volta and Turing GPUs. Therefore, researchers can get results 1.5x faster for Tacotron 2
|
||||||
encoder-decoder architecture. WaveGlow is a flow-based model that consumes the
|
and 2.2x faster for WaveGlow than training without Tensor Cores, while
|
||||||
mel spectrograms to generate speech. Both models support multi-gpu and mixed
|
experiencing the benefits of mixed precision training. The models are tested
|
||||||
precision training with dynamic loss scaling (see Apex code [here](https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/loss_scaler.py)),
|
against each NGC monthly container release to ensure consistent accuracy and
|
||||||
as well as mixed precision inference.
|
performance over time.
|
||||||
|
|
||||||
# Setup
|
### Model architecture
|
||||||
The following sections list the requirements in order to
|
|
||||||
start training the Tacotron 2 and WaveGlow models.
|
|
||||||
|
|
||||||
## Requirements
|
The Tacotron 2 model is a recurrent sequence-to-sequence model with attention that
|
||||||
This repository contains `Dockerfile` which extends the PyTorch NGC container
|
predicts mel-spectrograms from text. The encoder (blue blocks in the figure
|
||||||
|
below) transforms the whole text into a fixed-size hidden feature
|
||||||
|
representation. This feature representation is then consumed by the
|
||||||
|
autoregressive decoder (orange blocks) that produces one spectrogram frame at
|
||||||
|
a time. In our implementation, the autoregressive WaveNet (green block) is
|
||||||
|
replaced by the flow-based generative WaveGlow.
|
||||||
|
|
||||||
|
![](./img/tacotron2_arch.png "Tacotron 2 architecture")
|
||||||
|
|
||||||
|
Figure 1. Architecture of the Tacotron 2 model. Taken from the
|
||||||
|
[Tacotron 2](https://arxiv.org/abs/1712.05884) paper.
|
||||||
|
|
||||||
|
The WaveGlow model is a flow-based generative model that generates audio
|
||||||
|
samples from Gaussian distribution using mel-spectrogram conditioning (Figure
|
||||||
|
2). During training, the model learns to transform the dataset distribution
|
||||||
|
into spherical Gaussian distribution through a series of flows. One step of a
|
||||||
|
flow consists of an invertible convolution, followed by a modified WaveNet
|
||||||
|
architecture that serves as an affine coupling layer. During inference, the
|
||||||
|
network is inverted and audio samples are generated from the Gaussian
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
![](./img/waveglow_arch.png "WaveGlow architecture")
|
||||||
|
|
||||||
|
Figure 2. Architecture of the WaveGlow model. Taken from the
|
||||||
|
[WaveGlow](https://arxiv.org/abs/1811.00002) paper.
|
||||||
|
|
||||||
|
|
||||||
|
### Default configuration
|
||||||
|
|
||||||
|
Both models support multi-GPU and mixed precision training with dynamic loss
|
||||||
|
scaling (see Apex code
|
||||||
|
[here](https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/loss_scaler.py)),
|
||||||
|
as well as mixed precision inference. To speed up Tacotron 2 training,
|
||||||
|
reference mel-spectrograms are generated during a preprocessing step and read
|
||||||
|
directly from disk during training, instead of being generated during training.
|
||||||
|
|
||||||
|
The following features were implemented in this model:
|
||||||
|
|
||||||
|
* data-parallel multi-GPU training
|
||||||
|
* dynamic loss scaling with backoff for Tensor Cores (mixed precision)
|
||||||
|
training.
|
||||||
|
|
||||||
|
### Feature support matrix
|
||||||
|
|
||||||
|
The following features are supported by this model.
|
||||||
|
|
||||||
|
| Feature | Tacotron 2 | and WaveGlow |
|
||||||
|
|:-------|---------:|-----------:|
|
||||||
|
|[AMP](https://nvidia.github.io/apex/amp.html) | Yes | Yes |
|
||||||
|
|[Apex DistributedDataParallel](https://nvidia.github.io/apex/parallel.html) | Yes | Yes |
|
||||||
|
|
||||||
|
#### Features
|
||||||
|
|
||||||
|
AMP - a tool that enables Tensor Core-accelerated training. Please refer to section [Enabling mixed precision](#enabling-mixed-precision) for more details.
|
||||||
|
|
||||||
|
Apex DistributedDataParallel - a module wrapper that enables easy multiprocess distributed data parallel training, similar to `torch.nn.parallel.DistributedDataParallel`. `DistributedDataParallel` is optimized for use with NCCL. It achieves high performance by overlapping communication with computation during backward() and bucketing smaller gradient transfers to reduce the total number of transfers required.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
The following section lists the requirements in order to start training the
|
||||||
|
Tacotron 2 and WaveGlow models.
|
||||||
|
|
||||||
|
### Requirements
|
||||||
|
|
||||||
|
This repository contains Dockerfile which extends the PyTorch NGC container
|
||||||
and encapsulates some dependencies. Aside from these dependencies, ensure you
|
and encapsulates some dependencies. Aside from these dependencies, ensure you
|
||||||
have the following components:
|
have the following components:
|
||||||
|
|
||||||
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
|
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
|
||||||
* [PyTorch 19.05-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch) or newer
|
* [PyTorch 19.04-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
|
||||||
* [NVIDIA Volta based GPU](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
|
or newer
|
||||||
|
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
|
||||||
|
|
||||||
|
|
||||||
For more information about how to get started with NGC containers, see the
|
For more information about how to get started with NGC containers, see the
|
||||||
|
@ -84,35 +162,49 @@ Documentation:
|
||||||
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
|
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
|
||||||
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
|
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
|
||||||
|
|
||||||
# Quick Start Guide
|
For those unable to use the PyTorch NGC container, to set up the required
|
||||||
To train your model using mixed precision with tensor cores or using FP32,
|
environment or create your own container, see the versioned
|
||||||
perform the following steps using the default parameters of the Tacrotron 2
|
[NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html).
|
||||||
and WaveGlow model on the [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) dataset.
|
|
||||||
|
|
||||||
## 1. Clone the repository.
|
## Quick Start Guide
|
||||||
|
|
||||||
|
To train your model using mixed precision with Tensor Cores or using FP32,
|
||||||
|
perform the following steps using the default parameters of the Tacrotron 2
|
||||||
|
and WaveGlow model on the [LJ Speech](https://keithito.com/LJ-Speech-Dataset/)
|
||||||
|
dataset.
|
||||||
|
|
||||||
|
1. Clone the repository.
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/NVIDIA/DeepLearningExamples.git
|
git clone https://github.com/NVIDIA/DeepLearningExamples.git
|
||||||
cd DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2
|
cd DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2
|
||||||
```
|
```
|
||||||
|
|
||||||
## 2. Download and preprocess the dataset.
|
2. Download and preprocess the dataset.
|
||||||
Use the `./scripts/prepare-dataset.sh` download script to automatically
|
Use the `./scripts/prepare-dataset.sh` download script to automatically
|
||||||
download and preprocess the training, validation and test datasets. To run this script, issue:
|
download and preprocess the training, validation and test datasets. To run
|
||||||
|
this script, issue:
|
||||||
```bash
|
```bash
|
||||||
bash scripts/prepare-dataset.sh
|
bash scripts/prepare-dataset.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
|
To preprocess the datasets for Tacotron 2 training, use the
|
||||||
|
`./scripts/prepare-mels.sh` script:
|
||||||
|
```bash
|
||||||
|
bash scripts/prepare_mels.sh
|
||||||
|
```
|
||||||
|
|
||||||
Data is downloaded to the `./LJSpeech-1.1` directory (on the host). The
|
Data is downloaded to the `./LJSpeech-1.1` directory (on the host). The
|
||||||
`./LJSpeech-1.1` directory is mounted to the `/workspace/tacotron2/LJSpeech-1.1`
|
`./LJSpeech-1.1` directory is mounted to the `/workspace/tacotron2/LJSpeech-1.1`
|
||||||
location in the NGC container. The script will also generate the necessary
|
location in the NGC container. The preprocessed mel-spectrograms are stored in the
|
||||||
filelists for training and validation in `./filelists` if they are not already present.
|
`./LJSpeech-1.1/mels` directory.
|
||||||
|
|
||||||
## 3. Build the Tacotron 2 and WaveGlow PyTorch NGC container.
|
3. Build the Tacotron 2 and WaveGlow PyTorch NGC container.
|
||||||
```bash
|
```bash
|
||||||
bash scripts/docker/build.sh
|
bash scripts/docker/build.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
## 4. Start an interactive session in the NGC container to run training/inference.
|
4. Start an interactive session in the NGC container to run training/inference.
|
||||||
After you build the container image, you can start an interactive CLI session with
|
After you build the container image, you can start an interactive CLI session with:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
bash scripts/docker/interactive.sh
|
bash scripts/docker/interactive.sh
|
||||||
|
@@ -121,23 +213,24 @@ bash scripts/docker/interactive.sh

 The `interactive.sh` script requires that the location on the dataset is specified.
 For example, `LJSpeech-1.1`.

-## 5. Start training.
-To run Tacotron 2 training, run:
+5. Start training.
+To start Tacotron 2 training, run:
 ```bash
 bash scripts/train_tacotron2.sh
 ```

-To run WaveGlow training, run:
+To start WaveGlow training, run:
 ```bash
 bash scripts/train_waveglow.sh
 ```

-## 6. Start validation/evaluation.
+6. Start validation/evaluation.
 Ensure your loss values are comparable to those listed in the table in the
-Results section. For both models, the loss values are stored in the
+[Results](#results) section. For both models, the loss values are stored in the
 `./output/nvlog.json` log file.

-After you have trained the Tacotron 2 and WaveGlow models, you should get audio results similar to the
+After you have trained the Tacotron 2 model for 1500 epochs and the
+WaveGlow model for 800 epochs, you should get audio results similar to the
 samples in the `./audio` folder. For details about generating audio, see the
 [Inference process](#inference-process) section below.
@ -145,130 +238,217 @@ The training scripts automatically run the validation after each training
|
||||||
epoch. The results from the validation are printed to the standard output
|
epoch. The results from the validation are printed to the standard output
|
||||||
(`stdout`) and saved to the log files.
|
(`stdout`) and saved to the log files.
|
||||||
|
|
||||||
## 7. Start inference.
|
7. Start inference.
|
||||||
After you have trained the Tacotron 2 and WaveGlow models, you can perform
|
After you have trained the Tacotron 2 and WaveGlow models, you can perform
|
||||||
inference using the respective checkpoints that are passed as `--tacotron2`
|
inference using the respective checkpoints that are passed as `--tacotron2`
|
||||||
and `--waveglow` arguments.
|
and `--waveglow` arguments.
|
||||||
|
|
||||||
To run inference issue:
|
To run inference issue:
|
||||||
```bash
|
```bash
|
||||||
python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i phrase.txt --fp16-run
|
python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i text.txt --fp16-run
|
||||||
```
|
```
|
||||||
The speech is generated from text file passed with `-i` argument.
|
The speech is generated from a text file that is passed with `-i` argument. To run
|
||||||
If no file is provided or if the provided file cannot be opened, speech will be
|
inference in mixed precision, use the `--amp-run` flag. The output audio will
|
||||||
generated from a default text located in the `inference.py` file. To run
|
be stored in the path specified by the `-o` argument.
|
||||||
inference in mixed precision, use `--fp16-run` flag. The output audio will
|
|
||||||
be stored in the path specified by `-o` argument.
|
|
||||||
## Details

The following sections provide greater details of the dataset, running training and inference, and the training results.

### Scripts and sample code

The sample code for Tacotron 2 and WaveGlow has scripts specific to a particular model, located in the `./tacotron2` and `./waveglow` directories, as well as scripts common to both models, located in the `./common` directory. The model-specific scripts are as follows:

* `<model_name>/model.py` - the model architecture, definition of forward and inference functions
* `<model_name>/arg_parser.py` - argument parser for parameters specific to a given model
* `<model_name>/data_function.py` - data loading functions
* `<model_name>/loss_function.py` - loss function for the model

The common scripts contain layer definitions common to both models (`common/layers.py`), some utility scripts (`common/utils.py`) and scripts for audio processing (`common/audio_processing.py` and `common/stft.py`). In the root directory `./` of this repository, the `./train.py` script is used for training while inference can be executed with the `./inference.py` script. The scripts `./models.py`, `./data_functions.py` and `./loss_functions.py` call the respective scripts in the `<model_name>` directory, depending on what model is trained using the `train.py` script.

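As a rough illustration of this dispatch pattern, a model-selection helper can be sketched as follows; the function name and constructor signatures here are assumptions for illustration, not the repository's exact API:

```python
# Illustrative sketch of the dispatch described above; names and
# signatures are assumptions, not the repository's exact API.
from tacotron2.model import Tacotron2
from waveglow.model import WaveGlow


def get_model(model_name, **model_args):
    """Select the model-specific implementation by name."""
    if model_name == "Tacotron2":
        return Tacotron2(**model_args)
    if model_name == "WaveGlow":
        return WaveGlow(**model_args)
    raise ValueError(f"unknown model: {model_name}")
```
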
### Parameters

In this section, we list the most important hyperparameters and command-line arguments, together with their default values that are used to train the Tacotron 2 and WaveGlow models.

#### Shared parameters

* `--epochs` - number of epochs (Tacotron 2: 1500, WaveGlow: 1000)
* `--learning-rate` - learning rate (Tacotron 2: 1e-3, WaveGlow: 1e-4)
* `--batch-size` - batch size (Tacotron 2 FP16/FP32: 80/48, WaveGlow FP16/FP32: 8/4)
* `--amp-run` - use mixed precision training

#### Shared audio/STFT parameters

* `--sampling-rate` - sampling rate in Hz of input and output audio (22050)
* `--filter-length` - filter length (FFT size) for STFT (1024)
* `--hop-length` - hop length for FFT, i.e., sample stride between consecutive FFTs (256)
* `--win-length` - window size for FFT (1024)
* `--mel-fmin` - lowest frequency in Hz (0.0)
* `--mel-fmax` - highest frequency in Hz (8000.0)

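To make these parameters concrete, here is a minimal sketch of how they map onto an STFT call; this assumes a recent PyTorch and is not the repository's own preprocessing code:

```python
import torch

# Illustrative values matching the defaults listed above.
sampling_rate = 22050
filter_length = 1024   # FFT size
hop_length = 256       # sample stride between consecutive frames
win_length = 1024      # analysis window size

audio = torch.randn(sampling_rate)  # one second of dummy audio
spec = torch.stft(
    audio,
    n_fft=filter_length,
    hop_length=hop_length,
    win_length=win_length,
    window=torch.hann_window(win_length),
    return_complex=True,
)
# One frame per hop: about sampling_rate / hop_length ≈ 86 frames per second.
print(spec.shape)  # torch.Size([513, 87]) -> (filter_length // 2 + 1, frames)
```
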
#### Tacotron 2 parameters

* `--anneal-steps` - epochs at which to anneal the learning rate (500 1000 1500)
* `--anneal-factor` - factor by which to anneal the learning rate (FP16/FP32: 0.3/0.1)

#### WaveGlow parameters

* `--segment-length` - segment length of input audio processed by the neural network (8000)

### Command-line options

To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
```bash
python train.py --help
```

### Getting the data

The Tacotron 2 and WaveGlow models were trained on the LJSpeech-1.1 dataset. This repository contains the `./scripts/prepare_dataset.sh` script, which automatically downloads and extracts the whole dataset. By default, data is extracted to the `./LJSpeech-1.1` directory. The dataset directory contains a `README` file, a `wavs` directory with all audio samples, and a `metadata.csv` file that contains audio file names and the corresponding transcripts.

#### Dataset guidelines

The LJSpeech dataset has 13,100 clips that amount to about 24 hours of speech. Since the original dataset keeps all transcripts in the `metadata.csv` file, this repository provides file lists in the `./filelists` directory that determine the training and validation subsets: `ljs_audio_text_train_filelist.txt` lists the clips used for training and `ljs_audio_text_val_filelist.txt` lists the clips used for validation.

#### Multi-dataset

To use datasets different from the default LJSpeech dataset:

1. Prepare a directory with all audio files and pass it to the `--dataset-path` command-line option.

2. Add two text files containing file lists: one for the training subset (`--training-files`) and one for the validation subset (`--validation-files`). The structure of the file lists should be as follows:
```bash
<audio file path>|<transcript>
```

The `<audio file path>` is relative to the path provided by the `--dataset-path` option.

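For illustration, here is how such an entry splits into its two fields; the file name and transcript below are hypothetical:

```python
# Hypothetical file-list entry: a pipe separates the relative audio
# path from its transcript.
entry = "wavs/LJ001-0001.wav|Printing, in the only sense with which we are at present concerned."

audio_path, transcript = entry.split("|", 1)
print(audio_path)   # wavs/LJ001-0001.wav
print(transcript)   # Printing, in the only sense ...
```
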
### Training process

The Tacotron 2 and WaveGlow models are trained separately and independently. Both models obtain mel-spectrograms from the short-time Fourier transform (STFT) during training. These mel-spectrograms are used for loss computation in the case of Tacotron 2 and as conditioning input to the network in the case of WaveGlow.

The training loss is averaged over an entire training epoch, whereas the validation loss is averaged over the validation dataset. Performance is reported in total input tokens per second for the Tacotron 2 model and in total output samples per second for the WaveGlow model. Both measures are recorded as `train_iter_items/sec` (after each iteration) and `train_epoch_items/sec` (averaged over the epoch) in the output log file `./output/nvlog.json`. The result is averaged over an entire training epoch and summed over all GPUs that were included in the training.

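If you want to extract these throughput numbers programmatically, the following is a rough sketch; it assumes the log is a sequence of one-JSON-record-per-line entries with a flat `data` payload carrying the `train_epoch_items/sec` key, which may not match the exact schema written by the logging library:

```python
import json

# Rough sketch under the schema assumption stated above; adjust to the
# actual structure of your ./output/nvlog.json.
throughputs = []
with open("output/nvlog.json") as log:
    for line in log:
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue  # skip any non-JSON lines
        value = record.get("data", {}).get("train_epoch_items/sec")
        if value is not None:
            throughputs.append(value)

print(f"epochs logged: {len(throughputs)}")
```
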
Even though the training script uses all available GPUs, you can change this behavior by setting the `CUDA_VISIBLE_DEVICES` variable in your environment or by setting the `NV_GPU` variable at the Docker container launch ([see section "GPU isolation"](https://github.com/NVIDIA/nvidia-docker/wiki/nvidia-docker#gpu-isolation)).

### Inference process

You can run inference using the `./inference.py` script. This script takes text as input and runs Tacotron 2 and then WaveGlow inference to produce an audio file. It requires pre-trained checkpoints from the Tacotron 2 and WaveGlow models and input text as a text file, with one phrase per line.

To run inference, issue:
```bash
python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i text.txt --amp-run
```

Here, `Tacotron2_checkpoint` and `WaveGlow_checkpoint` are pre-trained checkpoints for the respective models, and `text.txt` contains input phrases. Audio will be saved in the output folder.

You can find all the available options by calling `python inference.py --help`.

## Mixed precision training

*Mixed precision* is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:

1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.

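As a toy illustration of these two steps (this is not the repository's code, which delegates both to AMP as described below), static loss scaling can be written as:

```python
import torch

model = torch.nn.Linear(8, 8).cuda().half()   # step 1: FP16 weights
x = torch.randn(4, 8, device="cuda", dtype=torch.float16)
scale = 128.0                                  # step 2: static loss scale

loss = model(x).pow(2).mean()
(loss * scale).backward()        # scaled backward keeps small gradients alive
for p in model.parameters():
    p.grad.div_(scale)           # unscale before the optimizer update
```
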
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.

For information about:
* How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
* Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
* How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
* APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) blog.

### Enabling mixed precision

Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP) library from [APEX](https://github.com/NVIDIA/apex) that casts variables to half-precision upon retrieval, while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be easily applied by using the `scale_loss()` method provided by AMP. The scaling value to be used can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.

By default, the `train_tacotron2.sh` and `train_waveglow.sh` scripts launch mixed precision training with Tensor Cores. You can change this behavior by removing the `--amp-run` flag from the `train.py` invocation.

To enable mixed precision, the following steps were performed in the Tacotron 2 and WaveGlow models:

* Import AMP from APEX:
  ```python
  from apex import amp
  amp.lists.functional_overrides.FP32_FUNCS.remove('softmax')
  amp.lists.functional_overrides.FP16_FUNCS.append('softmax')
  ```
* Initialize AMP:
  ```python
  model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
  ```
* If running on multi-GPU, wrap the model with `DistributedDataParallel`:
  ```python
  from apex.parallel import DistributedDataParallel as DDP
  model = DDP(model)
  ```
* Scale loss before backpropagation (assuming the loss is stored in a variable called `losses`):
    * Default backpropagate for FP32:
      ```python
      losses.backward()
      ```
    * Scale loss and backpropagate with AMP:
      ```python
      with amp.scale_loss(losses, optimizer) as scaled_losses:
          scaled_losses.backward()
      ```

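Putting these pieces together, a minimal end-to-end sketch of an AMP training step looks as follows; the model, data, and optimizer are stand-ins, not the repository's actual training loop:

```python
import torch
from apex import amp

# Stand-in model and optimizer; train.py builds the real ones.
model = torch.nn.Linear(80, 80).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

inputs = torch.randn(16, 80).cuda()
targets = torch.randn(16, 80).cuda()

for step in range(10):
    optimizer.zero_grad()
    losses = torch.nn.functional.mse_loss(model(inputs), targets)
    # Loss scaling preserves small gradient magnitudes in FP16.
    with amp.scale_loss(losses, optimizer) as scaled_losses:
        scaled_losses.backward()
    optimizer.step()
```
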
## Benchmarking

The following section shows how to run benchmarks measuring the model performance in training and inference mode.

### Training performance benchmark

To benchmark the training performance on a specific batch size, run:

**Tacotron 2**

* For 1 GPU
    * FP32
      ```bash
      python train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_2500_filelist.txt --dataset-path <dataset-path>
      ```
    * FP16
      ```bash
      python train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_2500_filelist.txt --dataset-path <dataset-path> --amp-run
      ```
* For multiple GPUs
    * FP32
      ```bash
      python -m multiproc train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_2500_filelist.txt --dataset-path <dataset-path>
      ```
    * FP16
      ```bash
      python -m multiproc train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_2500_filelist.txt --dataset-path <dataset-path> --amp-run
      ```

**WaveGlow**

* For 1 GPU
    * FP32
      ```bash
      python train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path>
      ```
    * FP16
      ```bash
      python train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path> --amp-run
      ```
* For multiple GPUs
    * FP32
      ```bash
      python -m multiproc train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path>
      ```
    * FP16
      ```bash
      python -m multiproc train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path> --amp-run
      ```

Each of these scripts runs for 10 epochs and for each epoch measures the average number of items per second. The performance results can be read from the `nvlog.json` files produced by the commands. A minimal sketch of this kind of throughput measurement follows.

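Conceptually, the measurement these scripts report reduces to the following sketch; this is an illustration, not the repository's actual instrumentation:

```python
import time

def items_per_second(step_fn, batches):
    """Return throughput in items/sec over an epoch.

    `step_fn` processes one batch and returns the number of items in
    it (input tokens for Tacotron 2, output samples for WaveGlow).
    """
    total_items = 0
    start = time.perf_counter()
    for batch in batches:
        total_items += step_fn(batch)
    elapsed = time.perf_counter() - start
    return total_items / elapsed
```
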
### Inference performance benchmark

To benchmark the inference performance on a batch size of 1, run:

* For FP32
  ```bash
  python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i text.txt --log-file=output/nvlog_fp32.json
  ```
* For FP16
  ```bash
  python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i text.txt --amp-run --log-file=output/nvlog_fp16.json
  ```

The log files contain performance numbers for the Tacotron 2 model (number of input tokens per second, reported as `tacotron2_items_per_sec`) and for the WaveGlow model (number of output samples per second, reported as `waveglow_items_per_sec`).

## Results

The following sections provide details on how we achieved our performance and accuracy in training and inference.

### Training accuracy results

##### NVIDIA DGX-1 (8x V100 16G)

Our results were obtained by running the `./platform/train_{tacotron2,waveglow}_{FP16,FP32}_DGX1_16GB_8GPU.sh` training script in the PyTorch-19.04-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.

All of the results were produced using the `train.py` script as described in the [Training process](#training-process) section of this document.

| Loss (Model/Epoch) | 1 | 250 | 500 | 750 | 1000 |
| :----------------: | ------: | ------: | ------: | ------: | ------: |
| Tacotron 2 FP16 | 13.0732 | 0.5736 | 0.4408 | 0.3923 | 0.3735 |
| Tacotron 2 FP32 | 8.5776 | 0.4807 | 0.3875 | 0.3421 | 0.3308 |
| WaveGlow FP16 | -2.2054 | -5.7602 | -5.901 | -5.9706 | -6.0258 |
| WaveGlow FP32 | -3.0327 | -5.858 | -6.0056 | -6.0613 | -6.1087 |

Tacotron 2 FP16 loss - batch size 80 (mean and std over 16 runs)
![](./img/tacotron2_amp_loss.png "Tacotron 2 FP16 loss")

Tacotron 2 FP32 loss - batch size 48 (mean and std over 16 runs)
![](./img/tacotron2_fp32_loss.png "Tacotron 2 FP32 loss")

WaveGlow FP16 loss - batch size 8 (mean and std over 16 runs)
![](./img/waveglow_fp16_loss.png "WaveGlow FP16 loss")

WaveGlow FP32 loss - batch size 4 (mean and std over 16 runs)
![](./img/waveglow_fp32_loss.png "WaveGlow FP32 loss")

### Training performance results

##### NVIDIA DGX-1 (8x V100 16G)

Our results were obtained by running the `./platform/train_{tacotron2,waveglow}_{FP16,FP32}_DGX1_16GB_8GPU.sh` training script in the PyTorch-19.04-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs. Performance numbers (in input tokens per second for Tacotron 2 and output samples per second for WaveGlow) were averaged over an entire training epoch.

This table shows the results for Tacotron 2:

|Number of GPUs|Batch size per GPU|Mixed precision tokens/sec|FP32 tokens/sec|Speed-up with mixed precision|Multi-GPU weak scaling with mixed precision|Multi-GPU weak scaling with FP32|
|---:|---:|---:|---:|---:|---:|---:|
|1|128@FP16, 64@FP32|3,746|2,087|1.79|1.00|1.00|
|4|128@FP16, 64@FP32|13,264|8,052|1.65|3.54|3.86|
|8|128@FP16, 64@FP32|25,056|15,863|1.58|6.69|7.60|

The following table shows the results for WaveGlow:

|Number of GPUs|Batch size per GPU|Mixed precision samples/sec|FP32 samples/sec|Speed-up with mixed precision|Multi-GPU weak scaling with mixed precision|Multi-GPU weak scaling with FP32|
|---:|---:|---:|---:|---:|---:|---:|
|1|10@FP16, 4@FP32|79,249|35,696|2.22|1.00|1.00|
|4|10@FP16, 4@FP32|275,310|126,498|2.18|3.47|3.54|
|8|10@FP16, 4@FP32|576,709|255,155|2.26|7.28|7.15|

To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).

#### Expected training time

The following table shows the expected training time for convergence for Tacotron 2 (1500 epochs). The speed-up column is the FP32 training time divided by the mixed precision training time; for example, 227.66 / 137.33 ≈ 1.66.

|Number of GPUs|Batch size per GPU|Time to train with mixed precision (Hrs)|Time to train with FP32 (Hrs)|Speed-up with mixed precision|
|---:|---:|---:|---:|---:|
|1|128@FP16, 64@FP32|137.33|227.66|1.66|
|4|128@FP16, 64@FP32|40.68|63.99|1.57|
|8|128@FP16, 64@FP32|20.74|32.47|1.57|

The following table shows the expected training time for convergence for WaveGlow (1000 epochs):

|Number of GPUs|Batch size per GPU|Time to train with mixed precision (Hrs)|Time to train with FP32 (Hrs)|Speed-up with mixed precision|
|---:|---:|---:|---:|---:|
|1|10@FP16, 4@FP32|358.00|793.97|2.22|
|4|10@FP16, 4@FP32|103.10|223.59|2.17|
|8|10@FP16, 4@FP32|50.40|109.45|2.17|

### Inference performance results

##### NVIDIA DGX-1 (8x V100 16G)

Our results were obtained by running the `./inference.py` inference script in the PyTorch-18.12.1-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs. Performance numbers (in input tokens per second for Tacotron 2 and output samples per second for WaveGlow) were averaged over 16 runs.

The following table shows the inference performance results for Tacotron 2. Results are measured in the number of input tokens per second.

|Number of GPUs|Mixed precision tokens/sec|FP32 tokens/sec|Speed-up with mixed precision|
|---:|---:|---:|---:|
|1|168|173|0.97|

The following table shows the inference performance results for WaveGlow. Results are measured in the number of output audio samples per second.<sup>1</sup>

|Number of GPUs|Mixed precision samples/sec|FP32 samples/sec|Speed-up with mixed precision|
|---:|---:|---:|---:|
|1|583318|553380|1.05|

<sup>1</sup>With a sampling rate of 22050 Hz, one second of audio is generated from 22050 samples; a throughput of 583,318 samples per second therefore corresponds to roughly 26 seconds of audio per second of compute.

To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).

## Changelog

March 2019
* Initial release

June 2019
* AMP support
* Data preprocessing for Tacotron 2 training
* Fixed dropouts on LSTMCells

## Known issues

There are no known issues in this release.