Updating models

Przemek Strzelczyk 2019-07-08 22:51:28 +02:00
parent f89dcca19d
commit 0663b67c1a
283 changed files with 112904 additions and 133470 deletions

.gitignore vendored Normal file

@ -0,0 +1,3 @@
repos.cfg
repos_init.cfg
nvtool*


@ -1,4 +1,3 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/


@ -1,5 +1,7 @@
# -----------------------------------------------------------------------
# Copyright 2017-2018 The Apache Software Foundation
#
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information


@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:19.03-py3
+FROM nvcr.io/nvidia/pytorch:19.05-py3
# Set working directory
WORKDIR /mlperf


@ -1,31 +0,0 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32, 64, 128],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 191.25867003414876
},
"4": {
"images_per_second": 340.9537905548054
},
"8": {
"images_per_second": 517.2612062140391
},
"16": {
"images_per_second": 711.5516679788083
},
"32": {
"images_per_second": 812.9203401838566
},
"64": {
"images_per_second": 951.7432815456556
},
"128": {
"images_per_second": 876.1868813828711
}
}
}
}


@ -1,31 +0,0 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32, 64, 128],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 174.58768325581374
},
"4": {
"images_per_second": 254.24180710755593
},
"8": {
"images_per_second": 308.95847419165545
},
"16": {
"images_per_second": 419.60746029488445
},
"32": {
"images_per_second": 453.81433823995565
},
"64": {
"images_per_second": 592.6385687558369
},
"128": {
"images_per_second": 603.8453409148115
}
}
}
}


@ -1,59 +0,0 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32, 64],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 40.71944999694824
},
"4": {
"images_per_second": 68.22257804870605
},
"8": {
"images_per_second": 121.42024612426758
},
"16": {
"images_per_second": 159.56442260742188
},
"32": {
"images_per_second": 185.69010543823242
}
},
"4": {
"2": {
"images_per_second": 40.75998783111572
},
"4": {
"images_per_second": 75.58991050720215
},
"8": {
"images_per_second": 142.64888381958008
},
"16": {
"images_per_second": 256.07005310058594
},
"32": {
"images_per_second": 300.8989944458008
}
},
"8": {
"2": {
"images_per_second": 61.28578186035156
},
"4": {
"images_per_second": 119.46021270751953
},
"8": {
"images_per_second": 231.7295379638672
},
"16": {
"images_per_second": 430.5494079589844
},
"32": {
"images_per_second": 454.2975769042969
}
}
}
}


@ -1,59 +0,0 @@
{
"model": "",
"ngpus": [1, 4, 8],
"bs": [2, 4, 8, 16, 32],
"metric_keys": ["images_per_second"],
"metrics": {
"1": {
"2": {
"images_per_second": 48.635780334472656
},
"4": {
"images_per_second": 66.06407419840494
},
"8": {
"images_per_second": 83.91736857096353
},
"16": {
"images_per_second": 102.67040761311848
},
"32": {
"images_per_second": 110.02347819010416
}
},
"4": {
"2": {
"images_per_second": 41.199180603027344
},
"4": {
"images_per_second": 79.85076141357422
},
"8": {
"images_per_second": 145.39981587727863
},
"16": {
"images_per_second": 247.95855712890625
},
"32": {
"images_per_second": 341.29132080078125
}
},
"8": {
"2": {
"images_per_second": 63.07561111450195
},
"4": {
"images_per_second": 123.25757344563802
},
"8": {
"images_per_second": 237.3413340250651
},
"16": {
"images_per_second": 376.59598795572913
},
"32": {
"images_per_second": 507.9451497395833
}
}
}
}


@ -1,34 +0,0 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 470.099200788709
},
"2" : {
"images_per_second" : 163.117099093173
},
"32" : {
"images_per_second" : 520.538879400471
},
"4" : {
"images_per_second" : 296.604178917743
},
"8" : {
"images_per_second" : 412.522394180558
}
}
},
"ngpus" : [
1
]
}


@ -1,34 +0,0 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 280.570005994299
},
"2" : {
"images_per_second" : 147.914221468741
},
"32" : {
"images_per_second" : 302.430594818483
},
"4" : {
"images_per_second" : 201.622430560779
},
"8" : {
"images_per_second" : 228.159516872363
}
}
},
"ngpus" : [
1
]
}


@ -1,52 +0,0 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 192.623916625977
},
"2" : {
"images_per_second" : 48.7488899230957
},
"32" : {
"images_per_second" : 204.250648498535
},
"4" : {
"images_per_second" : 95.4697418212891
},
"8" : {
"images_per_second" : 164.66495513916
}
},
"4" : {
"16" : {
"images_per_second" : 701.366027832031
},
"2" : {
"images_per_second" : 154.449935913086
},
"32" : {
"images_per_second" : 771.171325683594
},
"4" : {
"images_per_second" : 300.332641601562
},
"8" : {
"images_per_second" : 550.924163818359
}
}
},
"ngpus" : [
1,
4
]
}


@ -1,45 +0,0 @@
{
"bs" : [
2,
4,
8,
16
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 121.772495269775
},
"2" : {
"images_per_second" : 56.0
},
"4" : {
"images_per_second" : 90.5315437316895
},
"8" : {
"images_per_second" : 103.113033294678
}
},
"4" : {
"16" : {
"images_per_second" : 472.226806640625
},
"2" : {
"images_per_second" : 184.061141967773
},
"4" : {
"images_per_second" : 324.639801025391
},
"8" : {
"images_per_second" : 391.055908203125
}
}
},
"ngpus" : [
1,
4
]
}


@ -1,34 +0,0 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 478.225033
},
"2" : {
"images_per_second" : 148.5965123
},
"32" : {
"images_per_second" : 531.1827376
},
"4" : {
"images_per_second" : 283.3305197
},
"8" : {
"images_per_second" : 418.7012914
}
}
},
"ngpus" : [
1
]
}


@ -1,34 +0,0 @@
{
"bs" : [
2,
4,
8,
16,
32
],
"metric_keys" : [
"images_per_second"
],
"metrics" : {
"1" : {
"16" : {
"images_per_second" : 280.4733254
},
"2" : {
"images_per_second" : 143.8231571
},
"32" : {
"images_per_second" : 305.4504603
},
"4" : {
"images_per_second" : 202.6915644
},
"8" : {
"images_per_second" : 230.262872
}
}
},
"ngpus" : [
1
]
}


@ -1,81 +0,0 @@
import argparse
import subprocess
from qa.qa_utils import compare_benchmarks, load_json, save_json, OKBLUE, ENDC, FAIL
# parsing
def parse_testscript_args():
parser = argparse.ArgumentParser(description='PyTorch Benchmark Tests')
parser.add_argument('--bs', default=[1], type=int, nargs='+')
parser.add_argument('--ngpus', default=[1], type=int, nargs='+')
parser.add_argument('--benchmark-mode', default='training', choices=['training', 'inference'],
help='benchmark training or inference', required=True)
parser.add_argument('--bench-iterations', type=int, default=20, metavar='N',
help='Run N iterations while benchmarking (ignored when training and validation)')
parser.add_argument('--bench-warmup', type=int, default=10, metavar='N',
help='Number of warmup iterations for benchmarking')
parser.add_argument('--fp16', action='store_true', help='Run model in mixed precision.')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers')
parser.add_argument('--data', type=str, metavar='<PATH>', required=True,
help='path to the dataset')
parser.add_argument('--results-file', default='experiment_raport.json', type=str,
help='file in which to store the JSON experiment report')
parser.add_argument('--benchmark-file', type=str, metavar='FILE', required=True,
help='path to the file with baselines')
return parser.parse_args()
# job command
command_template = 'python3 {launcher} qa/qa_perf_main.py --bs {bs} --ebs {bs} ' \
'--benchmark-mode {mode} --benchmark-warmup {bw} --benchmark-iterations {bi} {fp16} ' \
'--backbone resnet50 --seed 1 --data {data} --results-file {results_file} --benchmark-file {benchmark_file}'
if __name__ == '__main__':
args = parse_testscript_args()
fp16 = '--fp16' if args.fp16 else ''
# create results json file
# todo: maybe some template json file?
results = {'ngpus': args.ngpus,
'bs': args.bs,
'metric_keys': ['images_per_second'],
'metrics': {}}
for gpu in args.ngpus:
results['metrics'][str(gpu)] = {}
for bs in args.bs:
results['metrics'][str(gpu)][str(bs)] = {'images_per_second': None}
save_json(args.results_file, results)
# run qa_perf_main.py tests one by one
for gpu in args.ngpus:
launcher = '' if gpu == 1 else '-m torch.distributed.launch --nproc_per_node={}'.format(gpu)
for bs in args.bs:
print('#' * 80)
command = command_template.format(launcher=launcher, bs=bs, workers=args.workers, mode=args.benchmark_mode,
bw=args.bench_warmup, bi=args.bench_iterations, fp16=fp16,
data=args.data, results_file=args.results_file,
benchmark_file=args.benchmark_file)
print('Running "{}"'.format(command))
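# Launch the benchmark as a subprocess; qa_perf_main.py writes its measurement back into the shared results file.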
process = subprocess.Popen(command, shell=True)
output, error = process.communicate()
if error is not None:
print(FAIL + 'Program exited with status {}. Data has not been collected'.format(error) + ENDC)
# elif results['metrics'][str(gpu)][str(bs)]['images_per_second'] is None:
# print(WARNING + 'Program did not end successfully. Data has not been collected.' + ENDC)
else:
print(OKBLUE + 'Program ended successfully. Data has been collected.' + ENDC)
results_data = load_json(args.results_file)
benchmark_data = load_json(args.benchmark_file)
exit_code = compare_benchmarks(results_data, benchmark_data, args, 0.16 if args.benchmark_mode == 'inference' else 0.1)
print(exit_code)
exit(exit_code)


@ -1 +0,0 @@
{"metric_keys": ["train.loss", "val.acc"], "metrics": {"train.loss": [8.812795396454991, 5.914838795058071, 6, 5.092440919584583, 4.887887316499735, 4.744666463422983, 4.694560192557922, 4.567333741479565, 4.492525351620137, 6, 4.408311570055099, 4.334232046614567, 6, 4.263646488106407, 4.2514614595596445, 4.2171871953656055, 4.206751160226014, 4.1795772798196715, 4.156515416099515, 6, 4.108870625495911, 4.0985876759066855, 4.075221928967139, 4.080158276849438, 6, 4.033980131669857, 4.037739227952915, 6, 3.99941903534935, 6, 3.9875937877263565, 3.971811039999583, 3.980771179282509, 3.953947089124455, 3.9305202960968018, 3.9366443781873546, 3.9252991879350754, 3.8827156307395367, 3.9388060424005102, 3.88922161618695, 3.8874285418914396, 6, 3.8936942113018453, 3.537499847891029, 3.4058184228089177, 6, 6, 3.3219671837627627, 3.295458280363458, 3.262115957955606, 6, 6, 6, 3.2190717260910433, 3.213117691627236, 3.1739242191397987, 3.1791626058811704, 3.2088054501854177, 3.1719801842385507, 3.187761370792139, 3.1809213312432236, 3.1823803410259397, 3.1752594631311677, 3.1709555600928425, 3.1823559530957817], "val.acc": [0.025120322205631106, 0.06065902615325462, 0.08224594352985645, 0.09868630608427395, 0.11402055039858493, 0.11779455253460233, 0.1232203941357061, 0.13708232144631768, 0.13614397127135028, 0.13289094380937685, 0.14004009449749777, 0.1369843423424096, 0.13877603069457692, 0.15418866425831707, 0.1500001994042602, 0.1542573219664272, 0.14771151227315413, 0.15896497766306272, 0.1600724682809656, 0.15881491661088476, 0.16213217020726906, 0.16466781280171408, 0.15738430149539484, 0.16634155547369375, 0.1623110334880526, 0.16394517553182106, 0.1494171026560053, 0.16762167601953265, 0.16063595691096758, 0.16982898253523193, 0.17321918229909394, 0.17242960413896102, 0.1625123530546557, 0.18330429802960516, 0.16333127233412115, 0.17973452067250242, 0.16699022570278652, 0.17183956548028687, 0.17168756775917593, 0.17547718325478198, 0.1750019046551496, 0.18416070771679066, 0.1711460087987496, 0.231325087097653, 0.23716038401167305, 0.23886896590018106, 0.2403412383214709, 0.24380227870861898, 0.24383605475007317, 0.2449733300818802, 0.24508423152154857, 0.24252172333110344, 0.24566254540226004, 0.24661345705692578, 0.25123807624083877, 0.25184439401895475, 0.2519010236397111, 0.25191664071239706, 0.2522156441636805, 0.25215053241008767, 0.2525434296889651, 0.2524917808636186, 0.2527410425201369, 0.2534121449798447, 0.25279479287831214]}, "bs": [64], "model": "", "ngpus": [8]}


@ -1 +0,0 @@
{"metric_keys": ["train.loss", "val.acc"], "metrics": {"train.loss": [9.887425426832973, 6.30290542835752, 5.566619733535567, 5.192713968618468, 4.943981836976963, 4.777146058311629, 4.682364774062644, 4.566371860462505, 4.479279315107254, 5, 4.398730874582149, 4.31779890601812, 4.293896813580043, 4.250142149529603, 4.219812418175577, 4.21572122303159, 4.187492328960302, 4.147948342119242, 4.134799897931028, 4.131298205737984, 4.071315974647822, 4.074750597299968, 4.0595350983882055, 4.042616275720722, 4.029284068070124, 4.02082926113012, 3.9983501902834298, 4.00984974094874, 3.9730074155799167, 5, 3.9646901324326294, 3.952598022061144, 3.944574903713043, 3.9182081201711596, 3.9252539055836775, 3.907297405092997, 3.8867245969813986, 3.87151758639573, 3.8793927009449254, 3.8687505586699107, 3.8750464156204956, 5, 3.8645522469516402, 3.504709825765618, 3.3920036476251862, 3.318732707260998, 5, 3.295415750237011, 3.2602547589347872, 5, 5, 5, 5, 3.199645553613854, 3.1623374312205086, 5, 3.147109237820821, 3.158245995575684, 3.1465386938319977, 3.1480963979746055, 3.151234711101482, 3.146022343739672, 3.1410668343956294, 3.142435818259893, 3.123337645718104], "val.acc": [0.01106397969239677, 0.04958324872172423, 0.07470961174804201, 0.08412781056028416, 0.1052591997157941, 0.11592629309116805, 0.1275672396324061, 0.12472585915140484, 0.13138377072048255, 0.1262696666605193, 0.13354663690485083, 0.14424123617821044, 0.14059169419863984, 0.14768715602101368, 0.15450788443085858, 0.14792122925940135, 0.1508861356435794, 0.157419558440425, 0.15279118544884585, 0.16075469826863828, 0.14747077091644412, 0.16340857637480236, 0.14427366437395484, 0.15709914018423293, 0.16324391683493303, 0.16440443232887508, 0.16479726175439752, 0.17508843799046686, 0.16142292492169025, 0.1643848499786872, 0.16912610131976924, 0.16376330941842296, 0.16894551721633602, 0.17771765128166106, 0.1749561896689298, 0.1695538322677119, 0.16778561571905298, 0.16380194923909086, 0.16994188486879763, 0.1716953661397215, 0.17755697810460197, 0.17187995479426885, 0.1742018462295355, 0.23426649845846764, 0.23613136034024038, 0.24175797706337981, 0.2425279583355936, 0.24352550398110506, 0.24411115979837528, 0.24656561042490024, 0.24383524308920906, 0.24686666489675338, 0.24814559219197632, 0.24840393696219026, 0.251965847689631, 0.25254138256097747, 0.2523565615073023, 0.2529904738785998, 0.253555154014026, 0.2530651493203877, 0.25358174010109197, 0.2537683728256746, 0.2539384684886946, 0.2540280117408162, 0.2534652864501853]}, "bs": [32], "model": "", "ngpus": [8]}


@ -1,20 +0,0 @@
{
"metrics" : {
"val.acc" : [
0.0100971670737651
],
"train.loss" : [
9.85026645043801
]
},
"ngpus" : [
8
],
"metric_keys" : [
"train.loss",
"val.acc"
],
"bs" : [
64
]
}


@ -1,20 +0,0 @@
{
"bs" : [
32
],
"metrics" : {
"train.loss" : [
8.79916159380589
],
"val.acc" : [
0.0238952010105531
]
},
"metric_keys" : [
"train.loss",
"val.acc"
],
"ngpus" : [
8
]
}


@ -1,73 +0,0 @@
# core imports
import os
import numpy as np
# pytorch imports
import torch
import torch.utils.data.distributed
# Apex imports
try:
from apex.parallel.LARC import LARC
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
except ImportError:
raise ImportError("Please install APEX from https://github.com/nvidia/apex")
# project imports
from src.train import train_loop
from main import train, make_parser
from src.logger import Logger
from qa.qa_utils import load_json, create_json_file, compare_acc, save_json
RESULT = None
def add_benchmark_args(parser):
parser.add_argument('--benchmark-mode', type=str, default='epoch-accuracy',
choices=['full-accuracy', 'epoch-accuracy'], required=True)
parser.add_argument('--benchmark-file', type=str, default=None, metavar='FILE',
help='path to the file with baselines', required=True)
return parser
def main(args):
if args.local_rank == 0:
os.makedirs('./models', exist_ok=True)
if args.seed is not None:
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed)
np.random.seed(seed=args.seed)
torch.backends.cudnn.benchmark = True
if args.benchmark_mode == 'epoch-accuracy':
args.epochs = 1
train_loop_func = train_loop
logger = Logger('Accuracy test', print_freq=10)
args.evaluation = list(range(90))
train(train_loop_func, logger, args)
exit_code = 0
if args.local_rank == 0:
train_loss_results, val_acc_results, train_time_results = logger.print_results()
print(train_time_results)
print(train_loss_results)
print(val_acc_results)
measured_results = create_json_file(val_acc_results, train_loss_results, ngpus=8, bs=args.batch_size)
save_json('/results/results.json', measured_results)
print(measured_results)
benchmark_results = load_json(args.benchmark_file)
exit_code = compare_acc(measured_results, benchmark_results, args)
exit(exit_code)
if __name__ == "__main__":
parser = make_parser()
parser = add_benchmark_args(parser)
args = parser.parse_args()
print(args)
main(args)


@ -1,199 +0,0 @@
# core imports
import os
import numpy as np
import json
from pprint import pprint
import time
# pytorch imports
import torch
import torch.utils.data.distributed
from torch.autograd import Variable
# Apex imports
try:
from apex.parallel.LARC import LARC
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
except ImportError:
raise ImportError("Please install APEX from https://github.com/nvidia/apex")
# project imports
from main import train, make_parser
from src.logger import BenchLogger
# from src.train import benchmark_inference_loop, benchmark_train_loop
from SSD import _C as C
RESULT = None
def add_benchmark_args(parser):
parser.add_argument('--benchmark-mode', type=str, choices=['training', 'inference'],
default='inference', required=True)
parser.add_argument('--results-file', default='experiment_raport.json', type=str,
help='file in which to store the JSON experiment report')
parser.add_argument('--benchmark-file', type=str, default=None, metavar='FILE',
help='path to the file with baselines')
return parser
def benchmark_train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
start_time = None
# tensor for results
result = torch.zeros((1,)).cuda()
for i, data in enumerate(loop(train_dataloader)):
if i >= args.benchmark_warmup:
start_time = time.time()
img = data[0][0][0]
bbox = data[0][1][0]
label = data[0][2][0]
label = label.type(torch.cuda.LongTensor)
bbox_offsets = data[0][3][0]
# handle random flipping outside of DALI for now
bbox_offsets = bbox_offsets.cuda()
img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False)
if not args.no_cuda:
img = img.cuda()
bbox = bbox.cuda()
label = label.cuda()
bbox_offsets = bbox_offsets.cuda()
img.sub_(mean).div_(std)
N = img.shape[0]
if bbox_offsets[-1].item() == 0:
print("No labels in batch")
continue
bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5)
M = bbox.shape[0] // N
bbox = bbox.view(N, M, 4)
label = label.view(N, M)
ploc, plabel = model(img)
ploc, plabel = ploc.float(), plabel.float()
trans_bbox = bbox.transpose(1, 2).contiguous().cuda()
if not args.no_cuda:
label = label.cuda()
gloc = Variable(trans_bbox, requires_grad=False)
glabel = Variable(label, requires_grad=False)
loss = loss_func(ploc, plabel, gloc, glabel)
# loss scaling
if args.fp16:
if args.amp:
with optim.scale_loss(loss) as scale_loss:
scale_loss.backward()
else:
optim.backward(loss)
else:
loss.backward()
optim.step()
optim.zero_grad()
iteration += 1
# reduce all results from every gpu
if i >= args.benchmark_warmup + args.benchmark_iterations:
result.data[0] = logger.print_result()
if args.N_gpu > 1:
torch.distributed.reduce(result, 0)
if args.local_rank == 0:
global RESULT
RESULT = float(result.data[0])
return
if i >= args.benchmark_warmup:
logger.update(args.batch_size, time.time() - start_time)
def loop(dataloader):
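# Cycle over the dataloader endlessly so benchmarking is not limited by the epoch length.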
while True:
for data in dataloader:
yield data
def benchmark_inference_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
assert args.N_gpu == 1, 'Inference benchmark only on 1 gpu'
start_time = None
model.eval()
i=-1
dataloader = loop(val_dataloader)
while True:
i+=1
with torch.no_grad():
torch.cuda.synchronize()
if i >= args.benchmark_warmup:
start_time = time.time()
data = next(dataloader)
img = data[0]
if not args.no_cuda:
img = img.cuda()
if args.fp16:
img = img.half()
img.sub_(mean).div_(std)
img = Variable(img, requires_grad=False)
_ = model(img)
torch.cuda.synchronize()
if i >= args.benchmark_warmup + args.benchmark_iterations:
global RESULT
RESULT = logger.print_result()
return
if i >= args.benchmark_warmup:
logger.update(args.batch_size, time.time() - start_time)
def main(args):
if args.local_rank == 0:
os.makedirs('./models', exist_ok=True)
if args.seed is not None:
print("Using seed = {}".format(args.seed))
torch.manual_seed(args.seed)
np.random.seed(seed=args.seed)
torch.backends.cudnn.benchmark = True
if args.benchmark_mode == 'training':
train_loop_func = benchmark_train_loop
logger = BenchLogger('Training benchmark')
else:
train_loop_func = benchmark_inference_loop
logger = BenchLogger('Inference benchmark')
args.epochs = 1
train(train_loop_func, logger, args)
if args.local_rank == 0:
global RESULT
with open(args.results_file) as f:
results = json.load(f)
results['metrics'][str(args.N_gpu)][str(args.batch_size)] = {'images_per_second': RESULT}
pprint(results)
with open(args.results_file, 'w') as f:
json.dump(results, f)
if __name__ == "__main__":
parser = make_parser()
parser = add_benchmark_args(parser)
args = parser.parse_args()
print(args)
main(args)


@ -1,115 +0,0 @@
import json
# terminal stdout colors
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
# load results and benchmark
def load_json(filepath):
with open(filepath) as f:
data = json.load(f)
return data
def save_json(filepath, data):
with open(filepath, 'w') as f:
json.dump(data, f)
# compare func
def compare(measured_value, true_value, pmargin=0.1):
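# Passes when measured_value is no more than pmargin (as a fraction) below true_value.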
assert 0 < pmargin < 1, 'Margin should be in range [0, 1]'
return (1 - pmargin) * true_value < measured_value
# compare 2 benchmark json files
def compare_benchmarks(results, benchmark, args, pmargin=0.1):
# sanity check
for metric in results['metric_keys']:
if metric not in benchmark['metric_keys']:
assert False, "You want to compare {} metric which doesn't appear in benchmark file".format(metric)
assert len(args.bs) <= len(benchmark['bs']), 'len(args.bs) <= len(benchmark["bs"]) ({} <= {})'.format(len(args.bs), len(benchmark['bs']))
assert len(args.bs) == len(results['bs']), 'len(args.bs) == len(results["bs"]) ({} == {})'.format(len(args.bs), len(results['bs']))
for bs in results['bs']:
if bs not in benchmark['bs']:
assert False, "You want to compare batch size = {} which doesn't appear in benchmark file".format(bs)
assert len(args.ngpus) <= len(benchmark['ngpus']), 'len(args.ngpus) <= len(benchmark["ngpus"]) ({} <= {})'.format(len(args.ngpus), len(benchmark['ngpus']))
assert len(args.ngpus) == len(results['ngpus']), 'len(args.ngpus) == len(results["ngpus"]) ({} == {})'.format(len(args.ngpus), len(results['ngpus']))
for gpu in results['ngpus']:
if gpu not in benchmark['ngpus']:
assert False, "You want to compare {} gpus results which don't appear in benchmark file".format(gpu)
# compare measured numbers with benchmark
exit = 0
for metric in results['metric_keys']:
for gpu in results['ngpus']:
for bs in results['bs']:
measured_metric = results['metrics'][str(gpu)][str(bs)][metric]
ground_truth_metric = benchmark['metrics'][str(gpu)][str(bs)][metric]
ok = compare(measured_metric, ground_truth_metric, pmargin)
if ok:
print(OKGREEN + 'BENCHMARK PASSED: metric={} gpu={} bs={}'.format(metric, gpu, bs) + ENDC)
else:
print(FAIL + 'BENCHMARK NOT PASSED: metric={} gpu={} bs={}'.format(metric, gpu, bs) + ENDC)
exit = 1
return exit
# compare 2 benchmark json files
def compare_acc(results, benchmark, args):
# sanity check
for metric in results['metric_keys']:
if metric not in benchmark['metric_keys']:
assert False, "You want to compare {} metric which doesn't appear in benchmark file".format(metric)
for bs in results['bs']:
if bs not in benchmark['bs']:
assert False, "You want to compare batch size = {} which doesn't appear in benchmark file".format(bs)
for gpu in results['ngpus']:
if gpu not in benchmark['ngpus']:
assert False, "You want to compare {} gpus results which don't appear in benchmark file".format(gpu)
# compare measured numbers with benchmark
for i, (result, ground_truth) in enumerate(zip(results['metrics']['val.acc'], benchmark['metrics']['val.acc'])):
if i > 43: # before first decay accuracy tends to vary more than 15% at ~30th epoch
if ground_truth * 0.9 > result:
print(FAIL + 'ACCURACY TEST NOT PASSED' + ENDC)
return 1
# compare measured numbers with benchmark
for i, (result, ground_truth) in enumerate(zip(results['metrics']['train.loss'], benchmark['metrics']['train.loss'])):
if i > 43:
if ground_truth * 1.1 < result:
print(FAIL + 'LOSS TEST NOT PASSED' + ENDC)
return 1
print(OKGREEN + 'ACCURACY TEST PASSED' + ENDC)
return 0
def create_json_file(val_acc_results, train_loss_results, ngpus=8, bs=32):
results = {"ngpus": [ngpus],
"bs": [bs],
"metric_keys": ["train.loss", "val.acc"],
"metrics": {
"train.loss": [],
"val.acc": []
}
}
for i, ((epoch1, acc), (epoch2, loss)) in enumerate(zip(val_acc_results, train_loss_results)):
assert i == epoch1 == epoch2
results['metrics']['train.loss'].append(loss)
results['metrics']['val.acc'].append(acc)
return results


@ -1,4 +0,0 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 64 --fp16 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode epoch-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_19.01_fp16_1epoch_run_acc_baseline.json --data $1


@ -1,4 +0,0 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 64 --fp16 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode full-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_18.08_fp16_full_run_acc_baseline.json --data $1


@ -1,4 +0,0 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 32 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode epoch-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_19.01_fp32_1epoch_run_acc_baseline.json --data $1


@ -1,4 +0,0 @@
#!/bin/bash
python3 -m torch.distributed.launch --nproc_per_node=8 qa/qa_accuracy_main.py --bs 32 --warmup 300 --learning-rate 2.6e-3 --seed 1 --benchmark-mode full-accuracy --benchmark-file qa/curve_baselines/SSD300_pytorch_18.08_fp32_full_run_acc_baseline.json --data $1


@ -1,3 +0,0 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode inference --ngpus 1 --bs 2 4 8 16 32 --fp16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.05_inference_fp16.json --data $1


@ -1,3 +0,0 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode inference --ngpus 1 --bs 2 4 8 16 32 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.05_inference_fp32.json --data $1


@ -1,3 +0,0 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode training --ngpus 1 4 --bs 2 4 8 16 32 --fp16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_training_fp16.json --data $1


@ -1,3 +0,0 @@
#!/bin/bash
python ./qa/benchmark_performance.py --benchmark-mode training --ngpus 1 4 --bs 2 4 8 16 --bench-warmup 100 --bench-iterations 200 --benchmark-file qa/benchmark_baselines/SSD300_pytorch_19.01_training_fp32.json --data $1


@ -35,9 +35,9 @@ class COCOPipeline(Pipeline):
super(COCOPipeline, self).__init__(batch_size=batch_size, device_id=device_id,
num_threads=num_threads, seed = seed)
-try:
+if torch.distributed.is_initialized():
shard_id = torch.distributed.get_rank()
-except RuntimeError:
+else:
shard_id = 0
self.input = ops.COCOReader(file_root = file_root, annotations_file = annotations_file,


@ -0,0 +1,3 @@
data/
vocab/
results/

PyTorch/LanguageModeling/BERT/.gitignore vendored Normal file

@ -0,0 +1,129 @@
# Initially taken from Github's Python gitignore file
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
#Data
data/*/*/
data/*/*.zip
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# vscode
.vscode
# TF code
tensorflow_code
# Models
models


@ -0,0 +1,27 @@
ARG FROM_IMAGE_NAME=gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.05-py3-devel
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
#WORKDIR /opt
#RUN cd pytorch/apex \
# && git fetch origin pull/182/head:norm_fix \
# && git checkout norm_fix \
# && python setup.py develop --cuda_ext --cpp_ext
WORKDIR /opt
RUN cd pytorch/apex ; \
pip uninstall apex; \
pip uninstall apex; \
git checkout master; \
git pull; \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
WORKDIR /workspace
RUN git clone https://github.com/attardi/wikiextractor.git
RUN git clone https://github.com/soskek/bookcorpus.git
WORKDIR /workspace/bert
COPY . .
RUN pip install tqdm boto3 requests six ipdb h5py html2text nltk progressbar


@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@ -0,0 +1,554 @@
# BERT For PyTorch
This repository provides scripts and recipes to pretrain BERT from a dataset of choice and achieve state-of-the-art accuracy on relevant fine-tuning tasks. It is tested and maintained by NVIDIA.
## Table Of Contents:
* [The model](#the-model)
* [Default configuration](#default-configuration)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick start guide](#quick-start-guide)
* [Details](#details)
* [Command line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Training process](#training-process)
* [Pre-training](#pre-training)
* [Fine tuning](#fine-tuning)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Inference process](#inference-process)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training stability test](#training-stability-test)
* [Training performance results](#training-performance-results)
* [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
* [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g)
* [NVIDIA DGX-2 (16x V100 32G)](#nvidia-dgx-2-16x-v100-32g)
* [Inference performance results](#inference-performance-results)
* [NVIDIA DGX-1 16G (1x V100 16G)](#nvidia-dgx-1-16g-1x-v100-16g)
* [NVIDIA DGX-1 32G (1x V100 32G)](#nvidia-dgx-1-32g-1x-v100-32g)
* [NVIDIA DGX-2 32G (1x V100 32G)](#nvidia-dgx-2-32g-1x-v100-32g)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## The model
BERT, or Bidirectional Encoder Representations from Transformers, is a new method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) paper. NVIDIA's BERT 19.04 is an optimized version of [Google's official implementation](https://github.com/google-research/bert), leveraging mixed precision arithmetic and tensor cores on V100 GPUs for faster training times while maintaining target accuracy.
The repository also contains scripts to interactively launch data download, training, benchmarking and inference routines in a Docker container for both pretraining and fine tuning for Question Answering. The major differences between the official implementation of the paper and our version of BERT are as follows:
- [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) previously required two steps:
  1. Porting the model to use the FP16 data type where appropriate.
  2. Manually adding loss scaling to preserve small gradient values.
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision (AMP) library from [APEX](https://github.com/NVIDIA/apex), which casts variables to half-precision upon retrieval, while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In PyTorch, loss scaling can be easily applied by using the scale_loss() method provided by AMP. The scaling value to be used can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.
For an in-depth walk-through on AMP, check out the sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains utility libraries, such as AMP, which require minimal network code changes to leverage tensor core performance. A minimal illustrative sketch of this pattern is included below, after this list.
- Scripts to download dataset for
- Pretraining - [Wikipedia](https://dumps.wikimedia.org/), [BookCorpus](http://yknzhu.wixsite.com/mbweb)
- Fine-tuning - [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) (Stanford Question Answering Dataset), pretrained weights from Google
- Custom fused CUDA kernels for faster computations
- Multi-GPU/Multi-Node support using [APEX DDP](https://github.com/NVIDIA/apex#2-distributed-training)
These techniques and optimizations improve model performance and reduce training time, allowing you to perform various NLP tasks with no additional effort.
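As a rough illustration of the AMP pattern described above, here is a minimal training-step sketch showing AMP initialization and loss scaling. The toy model, dummy data, and the `O1` opt level are assumptions chosen for illustration only, not the settings used by this repository's scripts:
```python
import torch
from apex import amp

# Toy model, optimizer and data purely for illustration; a real network is wrapped the same way.
model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
loss_fn = torch.nn.MSELoss()

# Wrap once at startup; "O1" (mixed precision with dynamic loss scaling) is an assumed
# opt level here, not necessarily the one used by the scripts in this repository.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

for _ in range(10):
    inputs = torch.randn(4, 10).cuda()
    targets = torch.randn(4, 10).cuda()
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    # scale_loss() scales the loss before backward() so small FP16 gradients survive,
    # then the gradients are unscaled before the optimizer step.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
```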
Other publicly available implementations of BERT include:
1. [Hugging Face](https://github.com/huggingface/pytorch-pretrained-BERT)
2. [codertimo](https://github.com/codertimo/BERT-pytorch)
This model trains with mixed precision tensor cores on Volta, therefore researchers can get results much faster than training without tensor cores. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Default configuration
BERT's model architecture is a multi-layer bidirectional Transformer encoder. Based on the model size, we have the following two default configurations of BERT.
| **Model** | **Hidden layers** | **Hidden unit size** | **Attention heads** | **Feedforward filter size** | **Max sequence length** | **Parameters** |
|:---------:|:----------:|:----:|:---:|:--------:|:---:|:----:|
|BERTBASE |12 encoder| 768| 12|4 x 768|512|110M|
|BERTLARGE|24 encoder|1024| 16|4 x 1024|512|330M|
## Setup
The following section lists the requirements needed to start training the BERT model.
### Requirements
This repository contains a `Dockerfile` which extends the PyTorch NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [PyTorch 19.04-py3](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) NGC container
- [NVIDIA Volta based GPU](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
- [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
## Quick start guide
To pretrain or fine tune your model for Question Answering using mixed precision with tensor cores or using FP32, perform the following steps using the default parameters of the BERT model.
### 1. Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/LanguageModeling/BERT
```
### 2. Build the BERT PyTorch NGC container.
```bash
bash scripts/docker/build.sh
```
### 3. Download and preprocess the dataset.
This repository provides scripts to download, verify and extract various datasets:
SQuAD and SWAG for fine-tuning, as well as Wikipedia and BookCorpus for pretraining. If you only want to do fine-tuning, you can also download the pretrained weights.
To download, verify, and extract required datasets:
```bash
bash scripts/data_download.sh
```
The script launches a Docker container with the current directory mounted and downloads the datasets to the `data/` folder on the host.
Datasets can also be mixed before being used for training or inference; see the Mixing datasets section below.
### 4. Start an interactive session in the NGC container to run training/inference.
After you build the container image and download the data, you can start an interactive CLI session as follows:
```bash
bash scripts/docker/launch.sh
```
The `launch.sh` script assumes that the datasets are in the following locations by default after downloading data.
- SQuAD v1.1 - `data/squad/v1.1`
- BERT - `data/pretrained_models_google/uncased_L-24_H-1024_A-16`
- Wikipedia - `data/wikipedia_corpus/hdf5_shards`
- BookCorpus - `data/bookcorpus/hdf5_shards`
### 5. Start pre-training.
BERT is designed to pre-train deep bidirectional language representations. The following scripts replicate pretraining on Wikipedia + BookCorpus from the [paper](https://arxiv.org/pdf/1810.04805.pdf). These scripts are general and can be used for pretraining language representations on any corpus of choice.
From within the container, you can use the following script to run pre-training.
```bash
bash scripts/run_pretraining.sh <train_batch_size_per_gpu> <learning_rate> <precision> <num_gpus> <warmup_proportion> <train_steps> <save_checkpoint_steps> <create_logfile>
```
<!-- For FP16 training with XLA using a DGX-1 V100 32G, run:
```bash
bash scripts/run_pretraining.sh 14 8 5e-5 fp16_xla 8 5000 2285000 5000 true
```
For FP32 training without XLA using a DGX-1 V100 32G, run:
```bash
bash scripts/run_pretraining.sh 6 6 2e-5 fp32 8 2000 5333333 5000 true
``` -->
### 6. Start fine tuning.
The above pretrained BERT representations can be fine-tuned with just one additional output layer for a state-of-the-art Question Answering system. From within the container, you can use the following script to run fine-tuning for SQuAD.
```bash
bash scripts/run_squad.sh <batch_size_per_gpu> <learning_rate_per_gpu> <precision> <num_gpus> <checkpoint> <epochs>
```
For FP32 training using a DGX-1 V100 32G, run:
```bash
bash scripts/run_squad.sh 5 5e-6 fp32 8 /bert/bert_model.ckpt 2
```
### 7. Start validation/evaluation.
The `run_squad_inference.sh` script runs inference on a checkpoint fine-tuned for SQuAD and evaluates the quality of the predictions on the basis of exact match and F1 score.
```bash
bash scripts/run_squad_inference.sh <init_checkpoint> <batch_size> <precision>
```
For FP32 inference without XLA using a DGX-1 V100 32G, run:
```bash
bash scripts/run_squad_inference.sh /results/model.ckpt 8 fp32
```
## Details
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Command line options
To see the full list of available options and their descriptions, use the -h or --help command line option, for example:
```bash
python run_pretraining.py --help
python run_squad.py --help
```
Aside from options to set hyperparameters, the relevant options to control the behaviour of the `run_pretraining.py` script are:
```bash
--[no]amp: Whether to enable AMP ops.(default: 'false')
--[no]amp_fastmath: Whether to enable AMP fastmath ops.(default: 'false')
--bert_config_file: The config json file corresponding to the pre-trained BERT model. This specifies the model architecture.
--[no]do_eval: Whether to run evaluation on the dev set.(default: 'false')
--[no]do_train: Whether to run training.(default: 'false')
--eval_batch_size: Total batch size for eval.(default: '8')(an integer)
--[no]fastmath: Whether to enable loss scaler for fastmath ops.(default: 'false')
--[no]horovod: Whether to use Horovod for multi-gpu runs(default: 'false')
--init_checkpoint: Initial checkpoint (usually from a pre-trained BERT model).
--input_file: Input TF example files (can be a glob or comma separated).
--iterations_per_loop: How many steps to make in each estimator call.(default: '1000')
```
Aside from options to set hyperparameters, some relevant options to control the behaviour of the run_squad.py script are:
```bash
--bert_config_file: The config json file corresponding to the pre-trained BERT model. This specifies the model architecture.
--[no]do_predict: Whether to run evaluation on the dev set. (default: 'false')
--[no]do_train: Whether to run training. (default: 'false')
--learning_rate: The initial learning rate for Adam.(default: '5e-06')(a number)
--max_answer_length: The maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.(default: '30')(an integer)
--max_query_length: The maximum number of tokens for the question. Questions longer than this will be truncated to this length.(default: '64')(an integer)
--max_seq_length: The maximum total input sequence length after WordPiece tokenization. Sequences longer than this will be truncated, and sequences shorter than this will be padded.(default: '384')(an integer)
--predict_batch_size: Total batch size for predictions.(default: '8')(an integer)
--train_batch_size: Total batch size for training.(default: '8')(an integer)
--[no]use_fp16: Whether to use fp32 or fp16 arithmetic on GPU.(default: 'false')
--[no]use_xla: Whether to enable XLA JIT compilation.(default: 'false')
--[no]verbose_logging: If true, all of the warnings related to data processing will be printed. A number of warnings are expected for a normal SQuAD evaluation.(default: 'false')
--[no]version_2_with_negative: If true, the SQuAD examples contain some that do not have an answer.(default: 'false')
```
### Getting the data
For pre-training BERT, we use the concatenation of Wikipedia (2500M words) and BookCorpus (800M words). For Wikipedia, we extract only the text passages from [here](ftp://ftpmirror.your.org/pub/wikimedia/dumps/enwiki/20190301/enwiki-20190301-pages-articles-multistream.xml.bz2) and ignore headers, lists and tables. It is structured as a document-level corpus rather than a shuffled sentence-level corpus because it is critical to extract long contiguous sentences. The next step is to run `create_pretraining_data.py` with the document-level corpus as input, which generates input data and labels for the masked language modeling and next sentence prediction tasks. Pre-training can also be performed on any corpus of your choice. The collection of data generation scripts is intended to be modular to allow modifications for additional preprocessing steps or to use additional data.
#### Mixing datasets
The repository provides tools to mix datasets for both training and finetuning.
In case of training there are two options:
a) inter sequence-pair mixing (after pretraining data is created)
In the `data/` directory, `merge_datasets_after_creation.sh` is a tool to mix data from multiple source corpora. To perform this mixing, the source corpora need to be already in the format of pretraining data, i.e. .hdf5 files. To call the script, use:
```bash
cd data
bash merge_datasets_after_creation.sh <destination_folder> <input_directories> <num_shards>
```
For example, to merge the BookCorpus and Wikipedia corpora provided with this repository and create 1024 new shards containing the mixed training instances, first make sure that `data/bookcorpus/hdf5_shards/` and `data/wikipedia_corpus/hdf5_shards/` exist and are filled with .hdf5 files, then run:
```
cd data
bash merge_datasets_after_creation.sh inter_instance_merged_wiki+books bookcorpus/hdf5_shards/,wikipedia_corpus/hdf5_shards/ 1024
```
b) intra sequence-pair mixing (before pretraining data is created)
In the `data/` directory, `merge_datasets_from_start.sh` is a tool to mix data from multiple source corpora. To perform this mixing, the source corpora must each be condensed into a single file that contains the entire corpus text, with each line within the file corresponding to a document in the corpus. The script is then called as follows:
```bash
cd data
bash merge_datasets_from_start.sh DESTINATION_FOLDER CORPUS_1 CORPUS_2 CORPUS_3 ...
```
For example, to merge the bookcorpus and Wikipedia corpora provided with this repository, first make sure that `data/bookcorpus/intermediate_files/bookcorpus.txt` and `data/wikipedia_corpus/intermediate_files/wikipedia.txt` exist, then run:
```bash
cd data
bash merge_datasets_from_start.sh intra_instance_merged_wiki+books bookcorpus/intermediate_files/bookcorpus.txt wikipedia_corpus/intermediate_files/wikipedia.txt
```
Note that `merge_datasets_from_start.sh` has a few dependencies, so it may be preferable to modify `data_download_helper.sh` to call the merging script and run `data_download.sh` so that the mixing process is done in a container.
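A minimal sketch of that containerized workflow, assuming `data_download_helper.sh` has already been edited to call the merging script as described above:
```bash
cd data
# Runs the download and preprocessing pipeline in a container; the (modified)
# data_download_helper.sh is invoked as part of this flow.
bash data_download.sh
```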
#### Fine Tuning datasets
We can use a pre-trained BERT model for other fine-tuning tasks such as Question Answering. We use SQuAD for this task. SQuAD v1.1 has 100,000+ question-answer pairs on 500+ articles. SQuAD v2.0 combines v1.1 with an additional 50,000 new unanswerable questions, so a model must not only answer questions but also determine when no answer is available.
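The SQuAD data itself is publicly available; as a minimal sketch (the destination directory below is arbitrary), the v1.1 training and dev sets can be fetched directly:
```bash
mkdir -p squad/v1.1
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O squad/v1.1/train-v1.1.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O squad/v1.1/dev-v1.1.json
```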
### Training process
The training process consists of two steps: pre-training and fine tuning.
#### Pre-training
Pre-training is performed using the `run_pretraining.py` script along with parameters defined in the `scripts/run_pretraining.sh`.
The `run_pretraining.sh` script runs a job on a single node that trains the BERT-large model from scratch using the Wikipedia and Book corpus datasets as training data. By default, the training script:
- Runs on 8 GPUs with training batch size of 14 and evaluation batch size of 8 per GPU.
- Has FP16 precision enabled.
- Runs for 1144000 steps with 10000 warm-up steps.
- Saves a checkpoint every 5000 iterations (keeps only the latest checkpoint) and at the end of training. All checkpoints, evaluation results and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
- Creates the log file containing all the output.
- Evaluates the model at the end of training. To skip evaluation, modify `--do_eval` to `False`.
These parameters will train Wikipedia + BooksCorpus to reasonable accuracy on a DGX-1 with 32GB V100 cards. If you want to match Google's best results from the BERT paper, you should either train for twice as many steps (2,288,000 steps) on a DGX-1, or train on 16 GPUs on a DGX-2. The DGX-2, having 16 GPUs, can fit a global batch size twice as large as a DGX-1 (224 vs. 112), and can therefore finish in half as many steps.
For example:
```bash
run_pretraining.sh <training_batch_size> <eval_batch_size> <learning_rate> <precision> <num_gpus> <warmup_steps> <training_steps> <save_checkpoint_steps> <create_logfile>
```
Where:
- <training_batch_size> is the per-GPU batch size used for training. The maximum batch size varies with <precision>; larger batch sizes run more efficiently but require more memory.
- <eval_batch_size> is the per-GPU batch size used for evaluation after training.
- <learning_rate> is the initial learning rate. The default rate of 1e-4 is good for a global batch size of 256.
- <precision> is the type of math used by the model and can be either fp32, fp16, fp16_xla, fastmath, amp_fm, amp_fm_xla, amp, or amp_xla. The options mean:
  - fp32: 32-bit IEEE single precision floats.
  - fp16: Hand-coded mixed precision 16- and 32-bit floats.
  - fp16_xla: Hand-coded mixed precision floats, JIT compiled with XLA.
  - fastmath: Matmuls done by tensor cores in mixed precision, the rest done in FP32.
  - amp_fm: An alternative FastMath implementation that works by manipulating TensorFlow's compute graph.
  - amp_fm_xla: The amp_fm flag plus XLA JIT compilation.
  - amp: Automatic rewrite of the TensorFlow compute graph to take advantage of 16-bit arithmetic whenever that is safe.
  - amp_xla: The amp flag plus XLA JIT compilation.
- <num_gpus> Number of GPUs to use for training. Must be equal to or smaller than the number of GPUs attached to your node.
- <warmup_steps> Number of warm-up steps at the start of training.
- <training_steps> Total number of training steps.
- <save_checkpoint_steps> Controls how often checkpoints are saved. Default is 5000 steps.
- <create_logfile> Flag indicating whether output should be written to a logfile (acceptable values are true or false; true indicates output should be saved to a logfile).
For example:
```bash
bert_tf/scripts/run_pretraining.sh 14 8 1e-4 fp16_xla 16 10000 1144000 5000 true
```
Trains BERT-large from scratch on a single DGX-2 using FP16 arithmetic. This will take around 156 hours / 6.5 days. Checkpoints are written out every 5000 steps and all printouts are saved to a logfile.
#### Fine tuning
Fine tuning is performed using the `run_squad.py` script along with parameters defined in `scripts/run_squad.sh`.
The `run_squad.sh` script trains a model and performs evaluation on the SQuAD v1.1 dataset. By default, the training script:
- Uses 8 GPUs with a batch size of 10 per GPU.
- Has FP16 precision enabled.
- Has XLA enabled.
- Runs for 2 epochs.
- Saves a checkpoint every 1000 iterations (keeps only the latest checkpoint) and at the end of training. All checkpoints, evaluation results and training logs are saved to the `/results` directory (in the container which can be mounted to a local directory).
- Evaluation is done at the end of training. To skip evaluation, modify `--do_predict` to `False`.
This script outputs checkpoints to the `/results` directory, by default, inside the container. Mount point of `/results` can be changed in the `scripts/docker/launch.sh` file. The training log contains information about:
- Loss for the final step
- Training and evaluation performance
- F1 and exact match score on the Dev Set of SQuAD after evaluation.
The summary after training is printed in the following format:
```bash
I0312 23:10:45.137036 140287431493376 run_squad.py:1332] 0 Total Training Time = 3007.00 Training Time W/O start up overhead = 2855.92 Sentences processed = 175176
I0312 23:10:45.137243 140287431493376 run_squad.py:1333] 0 Training Performance = 61.3378 sentences/sec
I0312 23:14:00.550846 140287431493376 run_squad.py:1396] 0 Total Inference Time = 145.46 Inference Time W/O start up overhead = 131.86 Sentences processed = 10840
I0312 23:14:00.550973 140287431493376 run_squad.py:1397] 0 Inference Performance = 82.2095 sentences/sec
{"exact_match": 83.69914853358561, "f1": 90.8477003317459}
```
Multi-gpu training is enabled with the Horovod TensorFlow module. The following example runs training on 8 GPUs:
```bash
mpirun -np 8 -H localhost:8 \
    --allow-run-as-root -bind-to none -map-by slot \
    -x NCCL_DEBUG=INFO \
    -x LD_LIBRARY_PATH \
    -x PATH -mca pml ob1 -mca btl ^openib \
    python run_squad.py --horovod
```
### Enabling mixed precision
[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Manually adding loss scaling to preserve small gradient values.
This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code. AMP enables mixed precision training on Volta and Turing GPUs automatically. The TensorFlow framework code makes all necessary model changes internally.
In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing `tf.contrib` loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling.
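As a minimal sketch (assuming the NGC TensorFlow container, where TF-AMP is controlled through environment variables), mixed precision can be enabled without any code changes:
```bash
# Enable the automatic mixed precision graph rewrite together with automatic loss scaling
export TF_ENABLE_AUTO_MIXED_PRECISION=1

# Alternatively, apply only the graph rewrite and keep an existing loss scaling manager,
# as described above (variable name as exposed in the NGC container):
# export TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE=1
```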
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
### Inference process
Inference on a fine-tuned Question Answering system is performed using the `run_squad.py` script along with parameters defined in `scripts/run_squad_inference.sh`. Inference is currently supported on a single GPU.
The `run_squad_inference.sh` script runs inference with a fine-tuned model and performs evaluation on the SQuAD v1.1 dataset. By default, the inferencing script:
- Has FP16 precision enabled
- Is XLA enabled
- Evaluates the latest checkpoint present in `/results` with a batch size of 8
This script outputs the predictions file to `/results/predictions.json` and computes the F1 and exact match scores using SQuAD's `evaluate-v1.1.py`. The mount point of `/results` can be changed in the `scripts/docker/launch.sh` file.
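If needed, the same scores can be reproduced manually with the official SQuAD evaluation script (the dev set path below is a placeholder):
```bash
python evaluate-v1.1.py /path/to/dev-v1.1.json /results/predictions.json
```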
The output log contains information about:
- Evaluation performance
- F1 and exact match score on the Dev Set of SQuAD after evaluation.
The summary after inference is printed in the following format:
```bash
I0312 23:14:00.550846 140287431493376 run_squad.py:1396] 0 Total Inference Time = 145.46 Inference Time W/O start up overhead = 131.86 Sentences processed = 10840
I0312 23:14:00.550973 140287431493376 run_squad.py:1397] 0 Inference Performance = 82.2095 sentences/sec
{"exact_match": 83.69914853358561, "f1": 90.8477003317459}
```
## Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
Benchmarking can be performed for both training and inference. Both scripts run the BERT model for fine tuning. You can choose whether benchmarking is performed in FP16 or FP32 by passing it as an argument to the benchmarking scripts.
Both of these benchmarking scripts enable you to run a number of epochs and extract performance numbers.
### Training performance benchmark
Training benchmarking can be performed by running the script:
```bash
scripts/finetune_train_benchmark.sh squad <fp16/fp32> <use_xla> <num_gpu> <batch_size/gpu> <lr>
```
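For example, a benchmark run with the placeholders filled in (the values are illustrative only, not recommended settings) might look like:
```bash
bash scripts/finetune_train_benchmark.sh squad fp16 true 8 4 5e-6
```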
### Inference performance benchmark
Inference benchmarking can be performed by running the script:
```bash
scripts/finetune_inference_benchmark.sh squad <fp16/fp32> <use_xla> <batch_size> <path-to-checkpoint>
```
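Similarly, an illustrative invocation (the checkpoint path is a placeholder) is:
```bash
bash scripts/finetune_inference_benchmark.sh squad fp16 true 8 /results/model.ckpt
```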
## Results
The following sections provide details on how we achieved our performance and accuracy in training and inference for Question Answering fine tuning.
### Training accuracy results
Our results were obtained by running the `run_squad.py` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs.
| **Number of GPUs** | **Batch size per GPU** | **Training time with FP16 (Hrs)** | **Training time with FP32 (Hrs)** |
|:---:|:---:|:----:|:----:|
| 8 | 4 |||
#### Training stability test
The following tables compare `F1` scores across 5 different training runs with different seeds, for both FP16 and FP32 respectively. The runs showcase consistent convergence on all 5 seeds with very little deviation.
| **FP16, 8x GPUs** | **seed #1** | **seed #2** | **seed #3** | **seed #4** | **seed #5** | **mean** | **std** |
|:-----------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|
|F1 ||
|Exact match||
| **FP32, 8x GPUs** | **seed #1** | **seed #2** | **seed #3** | **seed #4** | **seed #5** | **mean** | **std** |
|:-----------:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|
|F1 | |
|Exact match| |
### Training performance results
Our results were obtained with mixed precision, running per-GPU batch sizes of up to 3 on a 16GB V100 and up to 10 on a 32GB V100.
#### NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the `scripts/run_pretraining.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|:---:|:---:|:------:|:-----:|:----:|:----:|:----:|
| 1 | 2 | 5.48 |18.97|3.46 |1.0 |1.0 |
| 4 | 2 |19.6|60.6|3.09|3.57 |3.2|
| 8 | 2 |39.21 |121.21|3.09|7.15|6.38|
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|:---:|:---:|:-----:|:-----:|:---:|:---:|:----:|
| 1 | 4 | - |19.46| - | - |1.0 |
| 4 | 4 | - |75.67| - | - |3.88|
| 8 | 4 | - |151.35| - | - |7.77 |
Note: The respective values for FP32 runs that use a batch size of 4 are not available due to out-of-memory errors. A batch size of 4 is only available when using FP16.
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by running the `scripts/run_pretraining.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|---|---|-----|-----|----|----|----|
| 1 | 7 | 7.56|24.29|3.21|1.0 |1.0 |
| 4 | 7 |28.84|86.24|2.99|3.81|3.55|
| 8 | 7 |57.68|172.48|2.99|7.62|7.10|
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|---|---|-----|-------|---|---|----|
| 1 | 14| - | 26.04 | - | - |1.0 |
| 4 | 14| - | 99.68| - | - |3.87|
| 8 | 14| - |199.35 | - | - |7.65 |
Note: The respective values for FP32 runs that use a batch size of 14 are not available due to out-of-memory errors. A batch size of 14 is only available when using FP16.
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by running the `scripts/run_pretraining.sh` training script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G GPUs. Performance numbers (in sentences per second) were averaged over an entire training epoch.
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|---|---|------|------|----|-----|----|
| 1| 7 | 8.47| 26.04|3.07| 1.0 |1.0 |
| 4| 7 | 32.2 | 92.68|2.87| 3.8|3.80|
| 8| 7 | 63.84|183.68|2.87| 7.53|7.05|
| 16| 7 |126.56|365.12|2.87|14.94|14.02|
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speed-up with mixed precision** | **Multi-gpu weak scaling with FP32** | **Multi-gpu weak scaling with FP16** |
|---|---|---|------|---|---|----|
| 1| 14| - | 28.28| - | - |1.0 |
| 4| 14| - | 103.6| - | - |3.66|
| 8| 14| - |208.32| - | - |7.36|
| 16| 14| - |416.64| - | - |14.73|
Note: The respective values for FP32 runs that use a batch size of 14 are not available due to out-of-memory errors. A batch size of 14 is only available when using FP16.
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
### Inference performance results
#### NVIDIA DGX-1 16G (1x V100 16G)
Our results were obtained by running the `scripts/run_squad_inference.sh` inference script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 1x V100 16G GPU. Performance numbers (in sentences per second) were averaged over the entire inference run.
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
|---|---|-----|------|----|
| 1 | 8 ||
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-1 32G (1x V100 32G)
Our results were obtained by running the `scripts/run_squad_inference.sh` inference script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-1 with 1x V100 32G GPU. Performance numbers (in sentences per second) were averaged over the entire inference run.
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
|---|---|-----|------|----|
| 1 | 8 ||
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
#### NVIDIA DGX-2 32G (1x V100 32G)
Our results were obtained by running the `scripts/run_squad_inference.sh` inference script in the TensorFlow 19.03-py3 NGC container on NVIDIA DGX-2 with 1x V100 32G GPU. Performance numbers (in sentences per second) were averaged over the entire inference run.
| **Number of GPUs** | **Batch size per GPU** | **FP32 sentences/sec** | **FP16 sentences/sec** | **Speedup** |
|---|---|-----|------|----|
| 1 | 8 ||
To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
## Changelog
March 2019
- Initial release
## Known issues
There are no known issues with this model.

View File

@ -0,0 +1,13 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"max_position_embeddings": 512,
"num_attention_heads": 16,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 30522
}

View File

@ -0,0 +1,472 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Create masked LM/next sentence masked_lm TF examples for BERT."""
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import os
import random
from io import open
import h5py
import numpy as np
from tqdm import tqdm, trange
from tokenization import BertTokenizer
import tokenization as tokenization
import random
import collections
class TrainingInstance(object):
"""A single training instance (sentence pair)."""
def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
is_random_next):
self.tokens = tokens
self.segment_ids = segment_ids
self.is_random_next = is_random_next
self.masked_lm_positions = masked_lm_positions
self.masked_lm_labels = masked_lm_labels
def __str__(self):
s = ""
s += "tokens: %s\n" % (" ".join(
[tokenization.printable_text(x) for x in self.tokens]))
s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
s += "is_random_next: %s\n" % self.is_random_next
s += "masked_lm_positions: %s\n" % (" ".join(
[str(x) for x in self.masked_lm_positions]))
s += "masked_lm_labels: %s\n" % (" ".join(
[tokenization.printable_text(x) for x in self.masked_lm_labels]))
s += "\n"
return s
def __repr__(self):
return self.__str__()
def write_instance_to_example_file(instances, tokenizer, max_seq_length,
max_predictions_per_seq, output_file):
"""Create TF example files from `TrainingInstance`s."""
total_written = 0
features = collections.OrderedDict()
num_instances = len(instances)
features["input_ids"] = np.zeros([num_instances, max_seq_length], dtype="int32")
features["input_mask"] = np.zeros([num_instances, max_seq_length], dtype="int32")
features["segment_ids"] = np.zeros([num_instances, max_seq_length], dtype="int32")
features["masked_lm_positions"] = np.zeros([num_instances, max_predictions_per_seq], dtype="int32")
features["masked_lm_ids"] = np.zeros([num_instances, max_predictions_per_seq], dtype="int32")
features["next_sentence_labels"] = np.zeros(num_instances, dtype="int32")
for inst_index, instance in enumerate(tqdm(instances)):
input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
input_mask = [1] * len(input_ids)
segment_ids = list(instance.segment_ids)
assert len(input_ids) <= max_seq_length
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
masked_lm_positions = list(instance.masked_lm_positions)
masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
masked_lm_weights = [1.0] * len(masked_lm_ids)
while len(masked_lm_positions) < max_predictions_per_seq:
masked_lm_positions.append(0)
masked_lm_ids.append(0)
masked_lm_weights.append(0.0)
next_sentence_label = 1 if instance.is_random_next else 0
features["input_ids"][inst_index] = input_ids
features["input_mask"][inst_index] = input_mask
features["segment_ids"][inst_index] = segment_ids
features["masked_lm_positions"][inst_index] = masked_lm_positions
features["masked_lm_ids"][inst_index] = masked_lm_ids
features["next_sentence_labels"][inst_index] = next_sentence_label
total_written += 1
# if inst_index < 20:
# tf.logging.info("*** Example ***")
# tf.logging.info("tokens: %s" % " ".join(
# [tokenization.printable_text(x) for x in instance.tokens]))
# for feature_name in features.keys():
# feature = features[feature_name]
# values = []
# if feature.int64_list.value:
# values = feature.int64_list.value
# elif feature.float_list.value:
# values = feature.float_list.value
# tf.logging.info(
# "%s: %s" % (feature_name, " ".join([str(x) for x in values])))
print("saving data")
f= h5py.File(output_file, 'w')
f.create_dataset("input_ids", data=features["input_ids"], dtype='i4', compression='gzip')
f.create_dataset("input_mask", data=features["input_mask"], dtype='i1', compression='gzip')
f.create_dataset("segment_ids", data=features["segment_ids"], dtype='i1', compression='gzip')
f.create_dataset("masked_lm_positions", data=features["masked_lm_positions"], dtype='i4', compression='gzip')
f.create_dataset("masked_lm_ids", data=features["masked_lm_ids"], dtype='i4', compression='gzip')
f.create_dataset("next_sentence_labels", data=features["next_sentence_labels"], dtype='i1', compression='gzip')
f.flush()
f.close()
def create_training_instances(input_files, tokenizer, max_seq_length,
dupe_factor, short_seq_prob, masked_lm_prob,
max_predictions_per_seq, rng):
"""Create `TrainingInstance`s from raw text."""
all_documents = [[]]
# Input file format:
# (1) One sentence per line. These should ideally be actual sentences, not
# entire paragraphs or arbitrary spans of text. (Because we use the
# sentence boundaries for the "next sentence prediction" task).
# (2) Blank lines between documents. Document boundaries are needed so
# that the "next sentence prediction" task doesn't span between documents.
for input_file in input_files:
print("creating instance from {}".format(input_file))
with open(input_file, "r") as reader:
while True:
line = tokenization.convert_to_unicode(reader.readline())
if not line:
break
line = line.strip()
# Empty lines are used as document delimiters
if not line:
all_documents.append([])
tokens = tokenizer.tokenize(line)
if tokens:
all_documents[-1].append(tokens)
# Remove empty documents
all_documents = [x for x in all_documents if x]
rng.shuffle(all_documents)
vocab_words = list(tokenizer.vocab.keys())
instances = []
for _ in range(dupe_factor):
for document_index in range(len(all_documents)):
instances.extend(
create_instances_from_document(
all_documents, document_index, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
rng.shuffle(instances)
return instances
def create_instances_from_document(
all_documents, document_index, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
"""Creates `TrainingInstance`s for a single document."""
document = all_documents[document_index]
# Account for [CLS], [SEP], [SEP]
max_num_tokens = max_seq_length - 3
# We *usually* want to fill up the entire sequence since we are padding
# to `max_seq_length` anyways, so short sequences are generally wasted
# computation. However, we *sometimes*
# (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
# sequences to minimize the mismatch between pre-training and fine-tuning.
# The `target_seq_length` is just a rough target however, whereas
# `max_seq_length` is a hard limit.
target_seq_length = max_num_tokens
if rng.random() < short_seq_prob:
target_seq_length = rng.randint(2, max_num_tokens)
# We DON'T just concatenate all of the tokens from a document into a long
# sequence and choose an arbitrary split point because this would make the
# next sentence prediction task too easy. Instead, we split the input into
# segments "A" and "B" based on the actual "sentences" provided by the user
# input.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = rng.randint(1, len(current_chunk) - 1)
tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])
tokens_b = []
# Random next
is_random_next = False
if len(current_chunk) == 1 or rng.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
# This should rarely go for more than one iteration for large
# corpora. However, just to be careful, we try to make sure that
# the random document is not the same as the document
# we're processing.
for _ in range(10):
random_document_index = rng.randint(0, len(all_documents) - 1)
if random_document_index != document_index:
break
random_document = all_documents[random_document_index]
random_start = rng.randint(0, len(random_document) - 1)
for j in range(random_start, len(random_document)):
tokens_b.extend(random_document[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
(tokens, masked_lm_positions,
masked_lm_labels) = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
instance = TrainingInstance(
tokens=tokens,
segment_ids=segment_ids,
is_random_next=is_random_next,
masked_lm_positions=masked_lm_positions,
masked_lm_labels=masked_lm_labels)
instances.append(instance)
current_chunk = []
current_length = 0
i += 1
return instances
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
["index", "label"])
def create_masked_lm_predictions(tokens, masked_lm_prob,
max_predictions_per_seq, vocab_words, rng):
"""Creates the predictions for the masked LM objective."""
cand_indexes = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
cand_indexes.append(i)
rng.shuffle(cand_indexes)
output_tokens = list(tokens)
num_to_predict = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
masked_lms = []
covered_indexes = set()
for index in cand_indexes:
if len(masked_lms) >= num_to_predict:
break
if index in covered_indexes:
continue
covered_indexes.add(index)
masked_token = None
# 80% of the time, replace with [MASK]
if rng.random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep original
if rng.random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
else:
masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
output_tokens[index] = masked_token
masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
masked_lms = sorted(masked_lms, key=lambda x: x.index)
masked_lm_positions = []
masked_lm_labels = []
for p in masked_lms:
masked_lm_positions.append(p.index)
masked_lm_labels.append(p.label)
return (output_tokens, masked_lm_positions, masked_lm_labels)
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
"""Truncates a pair of sequences to a maximum sequence length."""
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1
# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if rng.random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--vocab_file",
default=None,
type=str,
required=True,
help="The vocabulary the BERT model will train on.")
parser.add_argument("--input_file",
default=None,
type=str,
required=True,
help="The input train corpus. can be directory with .txt files or a path to a single file")
parser.add_argument("--output_file",
default=None,
type=str,
required=True,
help="The output file where the model checkpoints will be written.")
## Other parameters
# str
parser.add_argument("--bert_model", default="bert-large-uncased", type=str, required=False,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
#int
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--dupe_factor",
default=10,
type=int,
help="Number of times to duplicate the input data (with different masks).")
parser.add_argument("--max_predictions_per_seq",
default=20,
type=int,
help="Maximum number of masked LM predictions per sequence.")
# floats
parser.add_argument("--masked_lm_prob",
default=0.15,
type=float,
help="Masked LM probability.")
parser.add_argument("--short_seq_prob",
default=0.1,
type=float,
help="Probability to create a sequence shorter than maximum sequence length")
parser.add_argument("--do_lower_case",
action='store_true',
default=True,
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument('--random_seed',
type=int,
default=12345,
help="random seed for initialization")
args = parser.parse_args()
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
input_files = []
if os.path.isfile(args.input_file):
input_files.append(args.input_file)
elif os.path.isdir(args.input_file):
input_files = [os.path.join(args.input_file, f) for f in os.listdir(args.input_file) if (os.path.isfile(os.path.join(args.input_file, f)) and f.endswith('.txt') )]
else:
raise ValueError("{} is not a valid path".format(args.input_file))
rng = random.Random(args.random_seed)
instances = create_training_instances(
input_files, tokenizer, args.max_seq_length, args.dupe_factor,
args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq,
rng)
output_file = args.output_file
write_instance_to_example_file(instances, tokenizer, args.max_seq_length,
args.max_predictions_per_seq, output_file)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,30 @@
Steps to reproduce datasets from web
1) Build the container
* docker build -t bert_prep .
2) Run the container interactively
* nvidia-docker run -it --ipc=host bert_prep
* Optional: Mount data volumes
* -v yourpath:/workspace/bert/data/wikipedia_corpus/download
* -v yourpath:/workspace/bert/data/wikipedia_corpus/extracted_articles
* -v yourpath:/workspace/bert/data/wikipedia_corpus/raw_data
* -v yourpath:/workspace/bert/data/wikipedia_corpus/intermediate_files
* -v yourpath:/workspace/bert/data/wikipedia_corpus/final_text_file_single
* -v yourpath:/workspace/bert/data/wikipedia_corpus/final_text_files_sharded
* -v yourpath:/workspace/bert/data/wikipedia_corpus/final_tfrecords_sharded
* -v yourpath:/workspace/bert/data/bookcorpus/download
* -v yourpath:/workspace/bert/data/bookcorpus/final_text_file_single
* -v yourpath:/workspace/bert/data/bookcorpus/final_text_files_sharded
* -v yourpath:/workspace/bert/data/bookcorpus/final_tfrecords_sharded
* Optional: Select visible GPUs
* -e CUDA_VISIBLE_DEVICES=0
** Inside of the container starting here**
3) Download pretrained weights (they contain vocab files for preprocessing)
* cd data/pretrained_models_google && python3 download_models.py
4) "One-click" Wikipedia data download and prep (provides tfrecords)
* Set your configuration in data/wikipedia_corpus/config.sh
* cd /data/wikipedia_corpus && ./run_preprocessing.sh
5) "One-click" BookCorpus data download and prep (provided tfrecords)
* Set your configuration in data/wikipedia_corpus/config.sh
* cd /data/bookcorpus && ./run_preprocessing.sh

View File

@ -0,0 +1,23 @@
# NVIDIA
import glob
import os
import argparse
parser = argparse.ArgumentParser(description='Cleaning and merge downloaded bookcorpus files')
parser.add_argument('download_path', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
download_path = args.download_path
output_file = args.output_file
with open(output_file, "w") as ofile:
for filename in glob.glob('{}/*.txt'.format(download_path), recursive=True):
with open(filename, mode='r', encoding="utf-8-sig") as file:
for line in file:
if line.strip() != "":
ofile.write(line.strip() + " ")
ofile.write("\n\n")

View File

@ -0,0 +1,9 @@
#! /bin/bash
# Download books
mkdir -p ./download
python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out ./download --trash-bad-count
# Clean and prep (one book per line)
python3 ./clean_and_merge_text.py ./download bookcorpus.txt

View File

@ -0,0 +1,38 @@
#!/bin/bash
# Note: There are several directories created to make it clear what has been performed at each stage of preprocessing. The intermediate files may be useful if you want to further clean/prepare/augment the data for your own applications.
# NLTK was chosen as the default over spaCy simply due to speed of sentence segmentation on the large files.
MERGED_DIR=$1
args="${*:2}"
source utils/config.sh
mkdir -p ${MERGED_DIR}
corpus_file=${MERGED_DIR}/corpus.txt
## Shuffle the full corpus texts
if [ ! -z $3 ]
then
echo "Merging $args"
cat $args | sed "/^$/d" | shuf > $corpus_file
else
corpus_file=$2
fi
# Split articles into one-sentence-per-line format for use with BERT scripts
echo "Applying sentence segmentation to get one sentence per line"
mkdir -p ${MERGED_DIR}/final_text_file_single
python3 utils/sentence_segmentation_nltk.py $corpus_file ${MERGED_DIR}/final_text_file_single/corpus.segmented.nltk.txt
## Shard finalized text so that it has a chance of fitting in memory when creating pretraining data into hdf5 (choose appropriate number of shards for distributed training)
echo "Shard text files - size is approximate to prevent splitting an article across shards"
mkdir -p ${MERGED_DIR}/final_text_files_sharded
python3 utils/shard_text_input_file.py ${MERGED_DIR}/final_text_file_single/corpus.segmented.nltk.txt ${MERGED_DIR}/final_text_files_sharded/corpus.segmented.part.
# Convert sharded text files into hdf5 that are ready for BERT pretraining
echo "Creating hdf5 for each text shard"
mkdir -p ${MERGED_DIR}/hdf5_shards
export TARGET_DIR=${MERGED_DIR}
. utils/preprocessing_xargs_wrapper.sh ${N_PROCS_PREPROCESS}

View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
echo "Downloading MRPC data"
wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
python download_glue_data.py --data_dir . --tasks MRPC

View File

@ -0,0 +1,29 @@
#!/bin/bash
MERGED_DIR=$1 # e.g wikipedia+bookcorpus
INPUTFILES=$2 # directories with hdf5 files separated by comma
NUM_SHARDS=$3
source utils/config.sh
META_DIR=$MERGED_DIR/meta
mkdir -p ${MERGED_DIR}
mkdir -p ${META_DIR}
echo "create mixed dataset ids"
echo "python utils/create_mixed_dataset_ids.py --input_files=${INPUTFILES} --num_output_shards=${NUM_SHARDS} --output_dir=${META_DIR} --random_seed=${SEED}"
python utils/create_mixed_dataset_ids.py --input_files=${INPUTFILES} --num_output_shards=${NUM_SHARDS} --output_dir=${META_DIR} --random_seed=${SEED}
echo "Creating hdf5 for each text shard"
mkdir -p ${MERGED_DIR}/hdf5_shards
echo "create mixed datasets with hdf5 files"
echo "python utils/create_mixed_dataset.py --input_files=${INPUTFILES} --output_dir=${MERGED_DIR}/hdf5_shards --lookup=${META_DIR}/lookup_table.pkl --indices_dir=${META_DIR} --index_range=0-${NUM_SHARDS} --random_seed=${SEED}"
python utils/create_mixed_dataset.py --input_files=${INPUTFILES} --output_dir=${MERGED_DIR}/hdf5_shards --lookup=${META_DIR}/lookup_table.pkl --indices_dir=${META_DIR} --index_range=0-$((NUM_SHARDS-1)) --random_seed=${SEED}
rm -rf ${META_DIR}

View File

@ -0,0 +1,60 @@
#!/usr/bin/env bash
echo "Downloading dataset for squad..."
# Download SQuAD
v1="v1.1"
mkdir $v1
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $v1/train-v1.1.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $v1/dev-v1.1.json
wget https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/ -O $v1/evaluate-v1.1.py
EXP_TRAIN_v1='981b29407e0affa3b1b156f72073b945 -'
EXP_DEV_v1='3e85deb501d4e538b6bc56f786231552 -'
EXP_EVAL_v1='afb04912d18ff20696f7f88eed49bea9 -'
CALC_TRAIN_v1=`cat ${v1}/train-v1.1.json |md5sum`
CALC_DEV_v1=`cat ${v1}/dev-v1.1.json |md5sum`
CALC_EVAL_v1=`cat ${v1}/evaluate-v1.1.py |md5sum`
v2="v2.0"
mkdir $v2
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O $v2/train-v2.0.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O $v2/dev-v2.0.json
wget https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -O $v2/evaluate-v2.0.py
EXP_TRAIN_v2='62108c273c268d70893182d5cf8df740 -'
EXP_DEV_v2='246adae8b7002f8679c027697b0b7cf8 -'
EXP_EVAL_v2='ff23213bed5516ea4a6d9edb6cd7d627 -'
CALC_TRAIN_v2=`cat ${v2}/train-v2.0.json |md5sum`
CALC_DEV_v2=`cat ${v2}/dev-v2.0.json |md5sum`
CALC_EVAL_v2=`cat ${v2}/evaluate-v2.0.py |md5sum`
echo "Squad data download done!"
echo "Verifying Dataset...."
if [ "$EXP_TRAIN_v1" != "$CALC_TRAIN_v1" ]; then
echo "train-v1.1.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_DEV_v1" != "$CALC_DEV_v1" ]; then
echo "dev-v1.1.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_EVAL_v1" != "$CALC_EVAL_v1" ]; then
echo "evaluate-v1.1.py is corrupted! md5sum doesn't match"
fi
if [ "$EXP_TRAIN_v2" != "$CALC_TRAIN_v2" ]; then
echo "train-v2.0.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_DEV_v2" != "$CALC_DEV_v2" ]; then
echo "dev-v2.0.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_EVAL_v2" != "$CALC_EVAL_v2" ]; then
echo "evaluate-v2.0.py is corrupted! md5sum doesn't match"
fi
echo "Complete!"

View File

@ -0,0 +1,24 @@
#! /bin/bash
set -e
USE_BERT_LARGE=true
MAX_SEQUENCE_LENGTH=512
MAX_PREDICTIONS_PER_SEQUENCE=80
MASKED_LM_PROB=0.15
SEED=12345
DUPE_FACTOR=5
DO_LOWER_CASE="True"
N_LINES_PER_SHARD_APPROX=396000 # Default=396000 creates 256 shards
N_PROCS_PREPROCESS=4 # Adjust this based on memory requirements and available number of cores
BERT_BASE_DIR="/workspace/bert/vocab/uncased_L-12_H-768_A-12"
BERT_LARGE_DIR="/workspace/bert/vocab/uncased_L-24_H-1024_A-16"
if [ "$USE_BERT_LARGE" = true ] ; then
VOCAB_FILE="${BERT_LARGE_DIR}/vocab.txt"
else
VOCAB_FILE="${BERT_BASE_DIR}/vocab.txt"
fi

View File

@ -0,0 +1,160 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import os
import random
from io import open
import h5py
import numpy as np
from tqdm import tqdm, trange
import random
import collections
import math
import multiprocessing as mp
"""
mixing hdf5 shards with each other
"""
def shard_files(output_files, l_instance_ids, lookuptable, files):
l_input_ids = []
l_input_masks = []
l_segment_ids = []
l_masked_lm_positions = []
l_masked_lm_ids = []
l_next_sentence_labels = []
seq_len = 0
pred_len = 0
with h5py.File(files[0], 'r') as f:
seq_len = f['input_ids'].shape[1]
pred_len = f['masked_lm_positions'].shape[1]
assert(seq_len > 0 and pred_len > 0)
for i, output_file in enumerate(output_files):
output_length = len(l_instance_ids[i])
print("preparing to write {} instances to {}".format(output_length, output_file))
input_ids = np.ones([output_length, seq_len], dtype=np.int32)
input_masks = np.ones([output_length, seq_len], dtype=np.int8)
segment_ids = np.ones([output_length, seq_len], dtype=np.int8)
masked_lm_positions = np.ones([output_length, pred_len], dtype=np.int32)
masked_lm_ids= np.ones([output_length, pred_len], dtype=np.int32)
next_sentence_labels = np.ones(output_length, dtype=np.int8)
l_input_ids.append(input_ids)
l_input_masks.append(input_masks)
l_segment_ids.append(segment_ids)
l_masked_lm_positions.append(masked_lm_positions)
l_masked_lm_ids.append(masked_lm_ids)
l_next_sentence_labels.append(next_sentence_labels)
for did, f in enumerate(tqdm(files)):
h5_f = h5py.File(f, 'r')
f_input_ids = h5_f['input_ids'][:]
f_input_masks = h5_f['input_mask'][:]
f_segment_ids = h5_f['segment_ids'][:]
f_masked_lm_positions = h5_f['masked_lm_positions'][:]
f_masked_lm_ids = h5_f['masked_lm_ids'][:]
f_next_sentence_labels = h5_f['next_sentence_labels'][:]
h5_f.close()
for out_i, out_file in enumerate(output_files):
instance_ids = l_instance_ids[out_i]
for l, idx in enumerate(instance_ids):
doc_id, line_id = lookuptable[idx]
if doc_id == did:
l_input_ids[out_i][l] = f_input_ids[line_id]
l_input_masks[out_i][l] = f_input_masks[line_id]
l_segment_ids[out_i][l] = f_segment_ids[line_id]
l_masked_lm_positions[out_i][l] = f_masked_lm_positions[line_id]
l_masked_lm_ids[out_i][l] = f_masked_lm_ids[line_id]
l_next_sentence_labels[out_i][l] = f_next_sentence_labels[line_id]
for out_i, out_file in enumerate(output_files):
output_length = len(l_input_ids[out_i])
print("writing {} instances to {}".format(output_length, out_file))
with h5py.File(out_file, 'w') as f:
f.create_dataset("input_ids", data=l_input_ids[out_i], dtype='i4', compression='gzip')
f.create_dataset("input_mask", data=l_input_masks[out_i], dtype='i1', compression='gzip')
f.create_dataset("segment_ids", data=l_segment_ids[out_i], dtype='i1', compression='gzip')
f.create_dataset("masked_lm_positions", data=l_masked_lm_positions[out_i], dtype='i4', compression='gzip')
f.create_dataset("masked_lm_ids", data=l_masked_lm_ids[out_i], dtype='i4', compression='gzip')
f.create_dataset("next_sentence_labels", data=l_next_sentence_labels[out_i], dtype='i1', compression='gzip')
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_files",
default=None,
type=str,
required=True,
help="comma seperated list of file paths, each path can be either file or directory of files")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="directory for output shards")
parser.add_argument("--lookup",
default=None,
type=str,
required=True,
help="path to lookup table")
parser.add_argument("--indices_dir",
default=None,
type=str,
required=True,
help="path to shuffled instance indices")
parser.add_argument("--index_range",
default=None,
type=str,
required=True,
help="index range of output files to be written out, e.g specify '0-100' for writing out 0.hdf5 , ..., 100.hdf5")
parser.add_argument('--random_seed',
type=int,
default=12345,
help="random seed for initialization")
args = parser.parse_args()
rng = random.Random(args.random_seed)
np.random.seed(args.random_seed)
input_paths = args.input_files.strip().split(',')
input_paths = [f for f in input_paths if f]
input_files = []
for path in input_paths:
if os.path.isfile(path):
assert (path.endswith('.hdf5')), "file must be hdf5 file"
input_files.append(path)
else:
assert os.path.isdir(path)
hdf5_files = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith('.hdf5')]
input_files.extend(hdf5_files)
input_files.sort()
assert(os.path.isdir(args.output_dir))
print("loading indices file")
start_idx, end_idx= int(args.index_range.split('-')[0]), int(args.index_range.split('-')[1])
index_files = []
instance_ids = []
for i in range(start_idx, end_idx + 1):
index_files.append(os.path.join(args.indices_dir, "indices_" + str(i) + ".npy"))
instance_ids.append( np.load(index_files[-1]))
output_files = [os.path.join(args.output_dir, indices_file.split('.')[0].split('_')[-1] + ".hdf5") for indices_file in index_files]
print("output_files", output_files)
print("loading lookup table")
lookup_table = np.load(args.lookup)
shard_files(output_files, instance_ids, lookup_table, input_files)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,134 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import os
import random
from io import open
import h5py
import numpy as np
from tqdm import tqdm, trange
import random
import collections
import math
from tqdm import tqdm
import multiprocessing as mp
import pickle
import json
"""
mixing hdf5 shards with each other
"""
def load_and_prepare(input_files, num_shards):
seq_len = None
pred_len = None
input_lengths = []
for input_file in input_files:
with h5py.File(input_file, 'r') as f:
input_lengths.append(len(f['input_ids']))
if seq_len is None:
seq_len = f['input_ids'].shape[1]
pred_len = f['masked_lm_ids'].shape[1]
assert (isinstance(seq_len, int) and isinstance(pred_len, int))
total_instances = sum(input_lengths)
n_inst_per_file = math.ceil(total_instances * 1.0 / num_shards)
permutation = np.random.permutation(total_instances)
instance_indices = []
for i in range(0, num_shards):
start_pos = i * n_inst_per_file
end_pos = min((i+1) * n_inst_per_file, total_instances)
instance_indices.append(permutation[start_pos:end_pos])
return seq_len, pred_len, input_lengths, instance_indices
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_files",
default=None,
type=str,
required=True,
help="comma seperated list of file paths, each path can be either file or directory of hdf5 files")
parser.add_argument("--num_output_shards",
default=None,
type=int,
required=True,
help="number of shards to be created. shards will be created as even as possible.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="directory for meta files")
parser.add_argument('--random_seed',
type=int,
default=12345,
help="random seed for initialization")
args = parser.parse_args()
rng = random.Random(args.random_seed)
np.random.seed(args.random_seed)
input_paths = args.input_files.strip().split(',')
input_paths = [f for f in input_paths if f]
input_files = []
for path in input_paths:
if os.path.isfile(path):
assert (path.endswith('.hdf5')), "file must be hdf5 file"
input_files.append(path)
else:
assert os.path.isdir(path)
hdf5_files = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith('.hdf5')]
input_files.extend(hdf5_files)
input_files.sort()
assert(os.path.isdir(args.output_dir))
print("load and prepare")
seq_len, pred_len, input_lengths, output_inst_indices = load_and_prepare(input_files, args.num_output_shards)
print("preparing lookup table")
total_num_instances = sum(input_lengths)
out_2_in = dict()
length_so_far = 0
for i, l in enumerate(input_lengths):
for j in range(l):
out_2_in[length_so_far + j] = (i, j)
length_so_far += input_lengths[i]
output_files = [os.path.join(args.output_dir, "indices_" + str(i) + ".npy") for i in range(args.num_output_shards)]
print("save data")
with open(os.path.join(args.output_dir, 'lookup_table.pkl'), 'wb') as f:
pickle.dump(out_2_in, f)
for i, out_file in enumerate(output_files):
np.save(out_file, output_inst_indices[i])
meta = {'seq_len': seq_len, 'pred_len':pred_len}
with open(os.path.join(args.output_dir, 'meta_data.pkl'), 'wb') as f:
pickle.dump(meta, f)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,23 @@
#! /bin/bash
SHARD_INDEX=${1}
INPUT_FILE="${TARGET_DIR}/final_text_files_sharded/corpus.segmented.part.${SHARD_INDEX}.txt"
source /workspace/bert/data/utils/config.sh
OUTPUT_DIR=${TARGET_DIR}/hdf5_shards
mkdir -p ${OUTPUT_DIR}
OUTPUT_FILE="${OUTPUT_DIR}/${SHARD_INDEX}.hdf5"
python /workspace/bert/create_pretraining_data.py \
--input_file=${INPUT_FILE} \
--output_file=${OUTPUT_FILE} \
--vocab_file=${VOCAB_FILE} \
--do_lower_case \
--max_seq_length=${MAX_SEQUENCE_LENGTH} \
--max_predictions_per_seq=${MAX_PREDICTIONS_PER_SEQUENCE} \
--masked_lm_prob=${MASKED_LM_PROB} \
--random_seed=${SEED} \
--dupe_factor=${DUPE_FACTOR}

View File

@ -0,0 +1,15 @@
#! /bin/bash
source /workspace/bert/data/utils/config.sh
SHARD_COUNT=0
rm -rf ${TARGET_DIR}/xarg_list.txt
touch ${TARGET_DIR}/xarg_list.txt
for file in ${TARGET_DIR}/final_text_files_sharded/*; do
echo ${SHARD_COUNT} >> ${TARGET_DIR}/xarg_list.txt
SHARD_COUNT=$((SHARD_COUNT+1))
done
xargs -n 1 --max-procs=${N_PROCS_PREPROCESS} --arg-file=${TARGET_DIR}/xarg_list.txt /workspace/bert/data/utils/preprocessing.sh
rm ${TARGET_DIR}/xarg_list.txt

View File

@ -0,0 +1,28 @@
# NVIDIA
import argparse
import nltk
import os
nltk.download('punkt')
parser = argparse.ArgumentParser(description='Sentence Segmentation')
parser.add_argument('input_file', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
input_file = args.input_file
output_file = args.output_file
doc_seperator = "\n"
with open(input_file) as ifile:
with open(output_file, "w") as ofile:
for line in ifile:
if line != "\n":
sent_list = nltk.tokenize.sent_tokenize(line)
for sent in sent_list:
ofile.write(sent + "\n")
ofile.write(doc_seperator)

View File

@ -0,0 +1,47 @@
# NVIDIA
import os
import argparse
parser = argparse.ArgumentParser(description='Dataset sharding')
parser.add_argument('input_file', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
input_file = args.input_file
output_file = args.output_file
doc_seperator = "\n"
line_buffer = []
shard_size = 396000 # Approximate, will split at next article break
line_counter = 0
shard_index = 0
ifile_lines = 0
with open(input_file) as ifile:
for line in ifile:
ifile_lines += 1
print("Input file contains", ifile_lines, "lines.")
iline_counter = 1
with open(input_file) as ifile:
for line in ifile:
if line_counter < shard_size and iline_counter < ifile_lines:
line_buffer.append(line)
line_counter += 1
iline_counter += 1
elif line_counter >= shard_size and line != "\n" and iline_counter < ifile_lines:
line_buffer.append(line)
line_counter += 1
iline_counter += 1
else:
with open(output_file + str(shard_index) + ".txt", "w") as ofile:
for oline in line_buffer:
ofile.write(oline)
line_buffer = []
line_counter = 0
shard_index += 1

View File

@ -0,0 +1,30 @@
#! /bin/bash
WIKI_DUMP="ftp://ftpmirror.your.org/pub/wikimedia/dumps/enwiki/20190301/enwiki-20190301-pages-articles-multistream.xml.bz2"
N_PROCS_PREPROCESS=4 # Adjust this based on memory requirements and available number of cores
# Download Wikipedia dump file
mkdir -p ./download
# Not using --noclobber since it emits an error if exists (incompatible with bash 'set -e')
echo "Downloading Wikidump"
if [ ! -f ./download/wikidump.xml.bz2 ]; then
wget -O ./download/wikidump.xml.bz2 ${WIKI_DUMP}
fi
# Extract dump
echo "Extracting Wikidump"
mkdir -p ./raw_data
if [ ! -f ./raw_data/wikidump.xml ]; then
pv ./download/wikidump.xml.bz2 | bunzip2 -kdc > ./raw_data/wikidump.xml
fi
# Wikiextractor.py - Creates lots of folders/files in "doc format"
echo "Running Wikiextractor"
mkdir -p ./extracted_articles
/workspace/wikiextractor/WikiExtractor.py ./raw_data/wikidump.xml -b 1000M --processes ${N_PROCS_PREPROCESS} -o ./extracted_articles
# Remove XML Tags and extraneous titles (since they are not sentences)
# Also clean to remove lines between paragraphs within article and use space-separated articles
echo "Cleaning and formatting files (one article per line)"
python3 ./remove_tags_and_clean.py ./extracted_articles ./wikipedia_corpus.txt

View File

@ -0,0 +1,39 @@
# NVIDIA
import glob
import os
import argparse
parser = argparse.ArgumentParser(description='Cleaning and merge downloaded bookcorpus files')
parser.add_argument('extracted_articles_path', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
extracted_articles_path = args.extracted_articles_path
output_file = args.output_file
with open(output_file, "w") as ofile:
for dirname in glob.glob('{}/*/'.format(extracted_articles_path), recursive=False):
for filename in glob.glob(dirname + 'wiki_*', recursive=True):
print(filename)
article_lines = []
article_open = False
with open(filename, "r") as file:
for line in file:
if "<doc id=" in line:
article_open = True
elif "</doc>" in line:
article_open = False
for oline in article_lines[1:]:
if oline != "\n":
ofile.write(oline.rstrip() + " ")
ofile.write("\n\n")
article_lines = []
else:
if article_open:
article_lines.append(line)

View File

@ -0,0 +1,297 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract pre-computed feature vectors from a PyTorch BERT model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import collections
import logging
import json
import re
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tokenization import BertTokenizer
from modeling import BertModel
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class InputExample(object):
def __init__(self, unique_id, text_a, text_b):
self.unique_id = unique_id
self.text_a = text_a
self.text_b = text_b
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
self.unique_id = unique_id
self.tokens = tokens
self.input_ids = input_ids
self.input_mask = input_mask
self.input_type_ids = input_type_ids
def convert_examples_to_features(examples, seq_length, tokenizer):
"""Loads a data file into a list of `InputBatch`s."""
features = []
for (ex_index, example) in enumerate(examples):
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > seq_length - 2:
tokens_a = tokens_a[0:(seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
input_type_ids = []
tokens.append("[CLS]")
input_type_ids.append(0)
for token in tokens_a:
tokens.append(token)
input_type_ids.append(0)
tokens.append("[SEP]")
input_type_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
input_type_ids.append(1)
tokens.append("[SEP]")
input_type_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < seq_length:
input_ids.append(0)
input_mask.append(0)
input_type_ids.append(0)
assert len(input_ids) == seq_length
assert len(input_mask) == seq_length
assert len(input_type_ids) == seq_length
if ex_index < 5:
logger.info("*** Example ***")
logger.info("unique_id: %s" % (example.unique_id))
logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
"input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
features.append(
InputFeatures(
unique_id=example.unique_id,
tokens=tokens,
input_ids=input_ids,
input_mask=input_mask,
input_type_ids=input_type_ids))
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def read_examples(input_file):
"""Read a list of `InputExample`s from an input file."""
examples = []
unique_id = 0
with open(input_file, "r", encoding='utf-8') as reader:
while True:
line = reader.readline()
if not line:
break
line = line.strip()
text_a = None
text_b = None
m = re.match(r"^(.*) \|\|\| (.*)$", line)
if m is None:
text_a = line
else:
text_a = m.group(1)
text_b = m.group(2)
examples.append(
InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
unique_id += 1
return examples
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_file", default=None, type=str, required=True)
parser.add_argument("--output_file", default=None, type=str, required=True)
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
## Other parameters
parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
parser.add_argument("--max_seq_length", default=128, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
"than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
parser.add_argument("--local_rank",
type=int,
default=-1,
help = "local_rank for distributed training on gpus")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))
layer_indexes = [int(x) for x in args.layers.split(",")]
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
examples = read_examples(args.input_file)
features = convert_examples_to_features(
examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)
unique_id_to_feature = {}
for feature in features:
unique_id_to_feature[feature.unique_id] = feature
model = BertModel.from_pretrained(args.bert_model)
model.to(device)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
output_device=args.local_rank)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
if args.local_rank == -1:
eval_sampler = SequentialSampler(eval_data)
else:
eval_sampler = DistributedSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
model.eval()
with open(args.output_file, "w", encoding='utf-8') as writer:
for input_ids, input_mask, example_indices in eval_dataloader:
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
for b, example_index in enumerate(example_indices):
feature = features[example_index.item()]
unique_id = int(feature.unique_id)
# feature = unique_id_to_feature[unique_id]
output_json = collections.OrderedDict()
output_json["linex_index"] = unique_id
all_out_features = []
for (i, token) in enumerate(feature.tokens):
all_layers = []
for (j, layer_index) in enumerate(layer_indexes):
layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
layer_output = layer_output[b]
layers = collections.OrderedDict()
layers["index"] = layer_index
layers["values"] = [
round(x.item(), 6) for x in layer_output[i]
]
all_layers.append(layers)
out_features = collections.OrderedDict()
out_features["token"] = token
out_features["layers"] = all_layers
all_out_features.append(out_features)
output_json["features"] = all_out_features
writer.write(json.dumps(output_json) + "\n")
if __name__ == "__main__":
main()
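The script writes one JSON object per input line: a "linex_index" plus a "features" list holding, for every token, the requested encoder layers and their values. A small sketch of reading those records back (the features.jsonl name is illustrative; use whatever was passed as --output_file):

import json
import numpy as np

with open('features.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        tokens = [feat['token'] for feat in record['features']]
        # With the default --layers=-1,-2,-3,-4, layers[0] is the final encoder layer.
        last_layer = np.array([feat['layers'][0]['values'] for feat in record['features']])
        print(record['linex_index'], len(tokens), last_layer.shape)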

View File

@ -0,0 +1,249 @@
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)
import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open
import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
try:
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
Path.home() / '.pytorch_pretrained_bert'))
except AttributeError:
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
"""
url_bytes = url.encode('utf-8')
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode('utf-8')
etag_hash = sha256(etag_bytes)
filename += '.' + etag_hash.hexdigest()
return filename
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`.
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise EnvironmentError("file {} not found".format(cache_path))
meta_path = cache_path + '.json'
if not os.path.exists(meta_path):
raise EnvironmentError("file {} not found".format(meta_path))
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata['url']
etag = metadata['etag']
return url, etag
def cached_path(url_or_filename, cache_dir=None):
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
parsed = urlparse(url_or_filename)
if parsed.scheme in ('http', 'https', 's3'):
# URL, so get it from the cache (downloading if necessary)
return get_from_cache(url_or_filename, cache_dir)
elif os.path.exists(url_or_filename):
# File, and it exists.
return url_or_filename
elif parsed.scheme == '':
# File, but it doesn't exist.
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
def split_s3_path(url):
"""Split a full s3 path into the bucket name and path."""
parsed = urlparse(url)
if not parsed.netloc or not parsed.path:
raise ValueError("bad s3 path {}".format(url))
bucket_name = parsed.netloc
s3_path = parsed.path
# Remove '/' at beginning of path.
if s3_path.startswith("/"):
s3_path = s3_path[1:]
return bucket_name, s3_path
def s3_request(func):
"""
Wrapper function for s3 requests in order to create more helpful error
messages.
"""
@wraps(func)
def wrapper(url, *args, **kwargs):
try:
return func(url, *args, **kwargs)
except ClientError as exc:
if int(exc.response["Error"]["Code"]) == 404:
raise EnvironmentError("file {} not found".format(url))
else:
raise
return wrapper
@s3_request
def s3_etag(url):
"""Check ETag on S3 object."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_object = s3_resource.Object(bucket_name, s3_path)
return s3_object.e_tag
@s3_request
def s3_get(url, temp_file):
"""Pull a file directly from S3."""
s3_resource = boto3.resource("s3")
bucket_name, s3_path = split_s3_path(url)
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
def http_get(url, temp_file):
req = requests.get(url, stream=True)
content_length = req.headers.get('Content-Length')
total = int(content_length) if content_length is not None else None
progress = tqdm(unit="B", total=total)
for chunk in req.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def get_from_cache(url, cache_dir=None):
"""
Given a URL, look for the corresponding dataset in the local cache.
If it's not there, download it. Then return the path to the cached file.
"""
if cache_dir is None:
cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
# Get eTag to add to filename, if it exists.
if url.startswith("s3://"):
etag = s3_etag(url)
else:
response = requests.head(url, allow_redirects=True)
if response.status_code != 200:
raise IOError("HEAD request failed for url {} with status code {}"
.format(url, response.status_code))
etag = response.headers.get("ETag")
filename = url_to_filename(url, etag)
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with tempfile.NamedTemporaryFile() as temp_file:
logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
# GET file object
if url.startswith("s3://"):
s3_get(url, temp_file)
else:
http_get(url, temp_file)
# we are copying the file before closing it, so flush to avoid truncation
temp_file.flush()
# shutil.copyfileobj() starts at the current position, so go to the start
temp_file.seek(0)
logger.info("copying %s to cache at %s", temp_file.name, cache_path)
with open(cache_path, 'wb') as cache_file:
shutil.copyfileobj(temp_file, cache_file)
logger.info("creating metadata file for %s", cache_path)
meta = {'url': url, 'etag': etag}
meta_path = cache_path + '.json'
with open(meta_path, 'w', encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
logger.info("removing temp file %s", temp_file.name)
return cache_path
def read_set_from_file(filename):
'''
Extract a de-duped collection (set) of text from a file.
Expected file format is one item per line.
'''
collection = set()
with open(filename, 'r', encoding='utf-8') as file_:
for line in file_:
collection.add(line.rstrip())
return collection
def get_file_extension(path, dot=True, lower=True):
ext = os.path.splitext(path)[1]
ext = ext if dot else ext[1:]
return ext.lower() if lower else ext
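A minimal sketch of the caching helpers above in use (the URL is purely illustrative and would need to point at a real file):

from file_utils import cached_path, url_to_filename

# A URL is downloaded once into the PYTORCH_PRETRAINED_BERT_CACHE directory;
# repeated calls return the same cached path.
local_path = cached_path('https://example.com/vocab.txt')

# An existing local path is validated and returned unchanged.
local_path = cached_path(local_path)

# Cache filenames are the sha256 of the URL, plus the sha256 of the ETag when available.
print(url_to_filename('https://example.com/vocab.txt', etag='abc123'))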

View File

@ -0,0 +1,205 @@
import types
import importlib
import math
import torch
def warmup_cosine(x, warmup=0.002):
if x < warmup:
return x/warmup
return 0.5 * (1.0 + math.cos(math.pi * x))
def warmup_constant(x, warmup=0.002):
if x < warmup:
return x/warmup
return 1.0
def warmup_linear(x, warmup=0.002):
if x < warmup:
return x/warmup
return 1.0 - x
SCHEDULES = {
'warmup_cosine':warmup_cosine,
'warmup_constant':warmup_constant,
'warmup_linear':warmup_linear,
}
class FusedAdamBert(torch.optim.Optimizer):
"""Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
It has been proposed in `Adam: A Method for Stochastic Optimization`_.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in FusedAdam!
eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
adds eps to the bias-corrected second moment estimate before
evaluating square root instead of adding it to the square root of
second moment estimate as in the original paper. (default: False)
.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
# def __init__(self, params,
# lr=1e-3, bias_correction = True,
# betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False,
# weight_decay=0., max_grad_norm=0., amsgrad=False):
def __init__(self, params, lr=1e-3, warmup=-1, t_total=-1, bias_correction=False, betas=(0.9, 0.999), schedule='warmup_linear',
eps=1e-6, eps_inside_sqrt = False, weight_decay=0., max_grad_norm=1.0, amsgrad=False):
global fused_adam_cuda
fused_adam_cuda = importlib.import_module("fused_adam_cuda")
if amsgrad:
raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
super(FusedAdamBert, self).__init__(params, defaults)
print("LOCAL FUSED ADAM")
self.eps_mode = 0 if eps_inside_sqrt else 1
self.schedule = schedule
self.t_total = t_total
self.warmup = warmup
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
return [0]
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
lr.append(lr_scheduled)
print("LR {}".format(lr_scheduled))
return lr
def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
grads (list of tensors, optional): weight gradient to use for the
optimizer update. If gradients have type torch.half, parameters
are expected to be in type torch.float. (default: None)
output params (list of tensors, optional): A reduced precision copy
of the updated weights written out in addition to the regular
updated weights. Have to be of same type as gradients. (default: None)
scale (float, optional): factor to divide gradient tensor values
by before applying to weights. (default: 1)
"""
loss = None
if closure is not None:
loss = closure()
if grads is None:
grads_group = [None]*len(self.param_groups)
# backward compatibility
# assuming a list/generator of parameter means single group
elif isinstance(grads, types.GeneratorType):
grads_group = [grads]
elif type(grads[0])!=list:
grads_group = [grads]
else:
grads_group = grads
if output_params is None:
output_params_group = [None]*len(self.param_groups)
elif isinstance(output_params, types.GeneratorType):
output_params_group = [output_params]
elif type(output_params[0])!=list:
output_params_group = [output_params]
else:
output_params_group = output_params
if grad_norms is None:
grad_norms = [None]*len(self.param_groups)
#Compute global norm
global_norm = 0.0
for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group,
output_params_group, grad_norms):
global_norm = (global_norm ** 2 + grad_norm ** 2) ** 0.5
for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group, output_params_group, grad_norms):
if grads_this_group is None:
grads_this_group = [None]*len(group['params'])
if output_params_this_group is None:
output_params_this_group = [None]*len(group['params'])
# compute combined scale factor for this group
combined_scale = scale
if group['max_grad_norm'] > 0:
# norm is in fact norm*scale
clip = ((global_norm / scale) + 1e-6) / group['max_grad_norm']
if clip > 1:
combined_scale = clip * scale
bias_correction = 1 if group['bias_correction'] else 0
for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group):
#note: p.grad should not ever be set for correct operation of mixed precision optimizer that sometimes sends None gradients
if p.grad is None and grad is None:
continue
if grad is None:
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
out_p = torch.tensor([], dtype = torch.float) if output_param is None else output_param
#Changes sharath
schedule_fct = SCHEDULES[self.schedule]
#schedule_fct(state['step']/self.t_total, self.warmup)
#step_lr = group['lr'] * schedule_fct(state['step']/self.t_total, self.warmup)
#step_lr = group['lr'] * scale#schedule_fct(state['step']/self.t_total, self.warmup)# schedule_fct(state['step']/group['t_total'], group['warmup'])
#print(scale, step_lr)
#print(group['lr'])
fused_adam_cuda.adam(p.data,
out_p,
exp_avg,
exp_avg_sq,
grad,
group['lr'], #step_lr,#group['lr'],
beta1,
beta2,
group['eps'],
combined_scale,
state['step'],
self.eps_mode,
bias_correction,
group['weight_decay'])
return loss
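The schedule functions at the top of this file map training progress x in [0, 1] to a learning-rate multiplier: a linear ramp from 0 to 1 over the warmup fraction, then (for warmup_linear) a linear decay toward 0. A standalone sketch of the same arithmetic, using the default warmup of 0.002:

# Same formula as the warmup_linear schedule defined above.
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup
    return 1.0 - x

for x in (0.0, 0.001, 0.002, 0.5, 1.0):
    print(x, warmup_linear(x))  # 0.0, 0.5, 0.998, 0.5, 0.0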

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,218 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for BERT model."""
import math
import torch
from torch.optim import Optimizer
from torch.optim.optimizer import required
from torch.nn.utils import clip_grad_norm_
#from fused_adam_local import FusedAdam
from apex.optimizers import FusedAdam
def warmup_cosine(x, warmup=0.002):
if x < warmup:
return x/warmup
return 0.5 * (1.0 + math.cos(math.pi * x))
def warmup_constant(x, warmup=0.002):
if x < warmup:
return x/warmup
return 1.0
def warmup_linear(x, warmup=0.002):
if x < warmup:
return x/warmup
# return (1.0 - x)
return max((x - 1.) / (warmup - 1.), 0.)
SCHEDULES = {
'warmup_cosine':warmup_cosine,
'warmup_constant':warmup_constant,
'warmup_linear':warmup_linear,
}
class BertAdam(Optimizer):
"""Implements BERT version of Adam algorithm with weight decay fix.
Params:
lr: learning rate
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
t_total: total number of training steps for the learning
rate schedule, -1 means constant learning rate. Default: -1
schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
b1: Adam's beta1. Default: 0.9
b2: Adam's beta2. Default: 0.999
e: Adam's epsilon. Default: 1e-6
weight_decay: Weight decay. Default: 0.01
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
"""
def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
max_grad_norm=1.0):
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0.0 <= warmup < 1.0 and not warmup == -1:
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
if not e >= 0.0:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
b1=b1, b2=b2, e=e, weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
super(BertAdam, self).__init__(params, defaults)
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
return [0]
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
lr.append(lr_scheduled)
return lr
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['next_m'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['next_v'] = torch.zeros_like(p.data)
next_m, next_v = state['next_m'], state['next_v']
beta1, beta2 = group['b1'], group['b2']
# Add grad clipping
if group['max_grad_norm'] > 0:
clip_grad_norm_(p, group['max_grad_norm'])
# Decay the first and second moment running average coefficient
# In-place operations to update the averages at the same time
next_m.mul_(beta1).add_(1 - beta1, grad)
next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
update = next_m / (next_v.sqrt() + group['e'])
# Just adding the square of the weights to the loss function is *not*
# the correct way of using L2 regularization/weight decay with Adam,
# since that will interact with the m and v parameters in strange ways.
#
# Instead we want to decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if group['weight_decay'] > 0.0:
update += group['weight_decay'] * p.data
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
update_with_lr = lr_scheduled * update
p.data.add_(-update_with_lr)
state['step'] += 1
# step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
# No bias correction
# bias_correction1 = 1 - beta1 ** state['step']
# bias_correction2 = 1 - beta2 ** state['step']
return loss
# =======================================================================
class BertAdam_FP16(FusedAdam):
"""Implements BERT version of Adam algorithm with weight decay fix.
Params:
lr: learning rate
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
t_total: total number of training steps for the learning
rate schedule, -1 means constant learning rate. Default: -1
schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
b1: Adam's beta1. Default: 0.9
b2: Adam's beta2. Default: 0.999
e: Adam's epsilon. Default: 1e-6
weight_decay: Weight decay. Default: 0.01
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
"""
def __init__(self, params, lr, warmup=-1, t_total=-1, bias_correction=False, schedule='warmup_linear',
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
max_grad_norm=1.0):
if not lr >= 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0.0 <= warmup < 1.0 and not warmup == -1:
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
if not e >= 0.0:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
# defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
# b1=b1, b2=b2, e=e, weight_decay=weight_decay,
# max_grad_norm=max_grad_norm)
super(BertAdam_FP16, self).__init__(params, lr=lr, bias_correction=bias_correction, betas=(b1, b2), eps=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm)#defaults)
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
print("returning", state)
return [0]
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
lr.append(lr_scheduled)
print("LR {}".format(lr_scheduled))
return lr
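A sketch of how BertAdam is typically constructed downstream: biases and LayerNorm parameters are excluded from weight decay, mirroring the fine-tuning script later in this commit (the tiny module below is a placeholder for a real BERT model):

import torch
from optimization import BertAdam

class TinyModel(torch.nn.Module):
    # Stand-in for a BERT model; the attribute name mirrors BERT's LayerNorm naming.
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(4, 4)
        self.LayerNorm = torch.nn.LayerNorm(4)

model = TinyModel()
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=5e-5,       # initial learning rate
                     warmup=0.1,    # linear warmup over the first 10% of steps
                     t_total=1000)  # total number of optimizer steps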

View File

@ -0,0 +1,13 @@
# progress bars in model download and training scripts
tqdm
# Accessing files from S3 directly.
boto3
# Used for downloading models over HTTP
requests
six
ipdb
#Data processing
h5py
html2text
nltk
progressbar

View File

@ -0,0 +1,649 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import, division, print_function
import argparse
import csv
import logging
import os
import random
import sys
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from tokenization import BertTokenizer
from optimization import BertAdam, warmup_linear
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class InputExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self, guid, text_a, text_b=None, label=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""
def get_train_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
return lines
class MrpcProcessor(DataProcessor):
"""Processor for the MRPC data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = line[3]
text_b = line[4]
label = line[0]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class MnliProcessor(DataProcessor):
"""Processor for the MultiNLI data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
"dev_matched")
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[8]
text_b = line[9]
label = line[-1]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class ColaProcessor(DataProcessor):
"""Processor for the CoLA data set (GLUE version)."""
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
guid = "%s-%s" % (set_type, i)
text_a = line[3]
label = line[1]
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
return examples
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
"""Loads a data file into a list of `InputBatch`s."""
label_map = {label : i for i, label in enumerate(label_list)}
features = []
for (ex_index, example) in enumerate(examples):
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[:(max_seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
segment_ids = [0] * len(tokens)
if tokens_b:
tokens += tokens_b + ["[SEP]"]
segment_ids += [1] * (len(tokens_b) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
label_id = label_map[example.label]
if ex_index < 5:
logger.info("*** Example ***")
logger.info("guid: %s" % (example.guid))
logger.info("tokens: %s" % " ".join(
[str(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
"segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
logger.info("label: %s (id = %d)" % (example.label, label_id))
features.append(
InputFeatures(input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id))
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--task_name",
default=None,
type=str,
required=True,
help="The name of the task to train.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints will be written.")
parser.add_argument("--init_checkpoint",
default=None,
type=str,
required=True,
help="The checkpoint file from pretraining")
## Other parameters
parser.add_argument("--cache_dir",
default="",
type=str,
help="Where do you want to store the pre-trained models downloaded from s3")
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--do_train",
action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval",
action='store_true',
help="Whether to run eval on the dev set.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Set this flag if you are using an uncased model.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
default=8,
type=int,
help="Total batch size for eval.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1.0, type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion",
default=0.1,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--fp16',
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
args = parser.parse_args()
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
processors = {
"cola": ColaProcessor,
"mnli": MnliProcessor,
"mrpc": MrpcProcessor,
}
num_labels_task = {
"cola": 2,
"mnli": 3,
"mrpc": 2,
}
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train and not args.do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
print("WARNING: Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
task_name = args.task_name.lower()
if task_name not in processors:
raise ValueError("Task not found: %s" % (task_name))
processor = processors[task_name]()
num_labels = num_labels_task[task_name]
label_list = processor.get_labels()
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_optimization_steps = None
if args.do_train:
train_examples = processor.get_train_examples(args.data_dir)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
model = BertForSequenceClassification.from_pretrained(args.bert_model,
cache_dir=cache_dir,
num_labels = num_labels)
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
if args.fp16:
model.half()
model.to(device)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
nb_tr_steps = 0
tr_loss = 0
if args.do_train:
train_features = convert_examples_to_features(
train_examples, label_list, args.max_seq_length, tokenizer)
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_examples))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
model.train()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
tr_loss = 0
nb_tr_examples, nb_tr_steps = 0, 0
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
if args.max_steps > 0 and global_step > args.max_steps:
break
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, label_ids = batch
loss = model(input_ids, segment_ids, input_mask, label_ids)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used that handles this automatically
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
if args.do_train:
# Save a trained model and the associated configuration
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())
# Load a trained model and config that you have fine-tuned
config = BertConfig(output_config_file)
model = BertForSequenceClassification(config, num_labels=num_labels)
model.load_state_dict(torch.load(output_model_file))
else:
model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
model.to(device)
if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
eval_examples = processor.get_dev_examples(args.data_dir)
eval_features = convert_examples_to_features(
eval_examples, label_list, args.max_seq_length, tokenizer)
logger.info("***** Running evaluation *****")
logger.info(" Num examples = %d", len(eval_examples))
logger.info(" Batch size = %d", args.eval_batch_size)
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
label_ids = label_ids.to(device)
with torch.no_grad():
tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
logits = model(input_ids, segment_ids, input_mask)
logits = logits.detach().cpu().numpy()
label_ids = label_ids.to('cpu').numpy()
tmp_eval_accuracy = accuracy(logits, label_ids)
eval_loss += tmp_eval_loss.mean().item()
eval_accuracy += tmp_eval_accuracy
nb_eval_examples += input_ids.size(0)
nb_eval_steps += 1
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples
loss = tr_loss/nb_tr_steps if args.do_train else None
result = {'eval_loss': eval_loss,
'eval_accuracy': eval_accuracy,
'global_step': global_step,
'loss': loss}
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
if __name__ == "__main__":
main()
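The truncation heuristic used when converting sequence pairs always trims the longer of the two token lists, one token at a time, so short sequences keep as much signal as possible. A self-contained sketch of that behaviour (the function body is copied from _truncate_seq_pair above; the token lists are toy data):

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

a = ['is', 'this', 'jack', '##son', '##ville', '?']
b = ['no', 'it', 'is', 'not', '.']
_truncate_seq_pair(a, b, max_length=8)
print(a)  # ['is', 'this', 'jack', '##son']
print(b)  # ['no', 'it', 'is', 'not']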

View File

@ -0,0 +1,417 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
#==================
import csv
import os
import logging
import argparse
import random
import h5py
from tqdm import tqdm, trange
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from torch.utils.data.distributed import DistributedSampler
import math
from apex import amp
from tokenization import BertTokenizer
from modeling import BertForPreTraining, BertConfig
from optimization import BertAdam, BertAdam_FP16
# from fused_adam_local import FusedAdamBert
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from apex.optimizers import FusedAdam #, FP16_Optimizer
#from apex.optimizers import FusedAdam
from apex.parallel import DistributedDataParallel as DDP
from schedulers import LinearWarmUpScheduler
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class pretraining_dataset(Dataset):
def __init__(self, input_file, max_pred_length):
self.input_file = input_file
self.max_pred_length = max_pred_length
f = h5py.File(input_file, "r")
self.input_ids = np.asarray(f["input_ids"][:]).astype(np.int64)  # [num_instances x max_seq_length]
self.input_masks = np.asarray(f["input_mask"][:]).astype(np.int64)  # [num_instances x max_seq_length]
self.segment_ids = np.asarray(f["segment_ids"][:]).astype(np.int64)  # [num_instances x max_seq_length]
self.masked_lm_positions = np.asarray(f["masked_lm_positions"][:]).astype(np.int64)  # [num_instances x max_pred_length]
self.masked_lm_ids = np.asarray(f["masked_lm_ids"][:]).astype(np.int64)  # [num_instances x max_pred_length]
self.next_sentence_labels = np.asarray(f["next_sentence_labels"][:]).astype(np.int64)  # [num_instances]
f.close()
def __len__(self):
'Denotes the total number of samples'
return len(self.input_ids)
def __getitem__(self, index):
input_ids = torch.from_numpy(self.input_ids[index])  # [max_seq_length]
input_mask = torch.from_numpy(self.input_masks[index])  # [max_seq_length]
segment_ids = torch.from_numpy(self.segment_ids[index])  # [max_seq_length]
masked_lm_positions = torch.from_numpy(self.masked_lm_positions[index])  # [max_pred_length]
masked_lm_ids = torch.from_numpy(self.masked_lm_ids[index])  # [max_pred_length]
next_sentence_labels = torch.from_numpy(np.asarray(self.next_sentence_labels[index]))  # [1]
masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
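# Positions that were not masked keep the label -1, which the masked-LM loss treats as an ignore index.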
index = self.max_pred_length
# store number of masked tokens in index
if len((masked_lm_positions == 0).nonzero()) != 0:
index = (masked_lm_positions == 0).nonzero()[0].item()
masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels]
def main():
print("IN NEW MAIN XD\n")
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain .hdf5 files for the task.")
parser.add_argument("--config_file",
default=None,
type=str,
required=True,
help="The BERT model config")
parser.add_argument("--bert_model", default="bert-large-uncased", type=str,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
## Other parameters
parser.add_argument("--max_seq_length",
default=512,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--max_predictions_per_seq",
default=80,
type=int,
help="The maximum total of masked tokens in input sequence")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps",
default=1000,
type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion",
default=0.01,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumualte before performing a backward/update pass.")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type=float, default=0.0,
help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
parser.add_argument('--log_freq',
type=float, default=10.0,
help='frequency of logging loss.')
parser.add_argument('--checkpoint_activations',
default=False,
action='store_true',
help="Whether to use gradient checkpointing")
parser.add_argument("--resume_from_checkpoint",
default=False,
action='store_true',
help="Whether to resume training from checkpoint.")
parser.add_argument('--resume_step',
type=int,
default=-1,
help="Step to resume training from.")
parser.add_argument('--num_steps_per_checkpoint',
type=int,
default=2000,
help="Number of update steps until a model checkpoint is saved to disk.")
args = parser.parse_args()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
assert(torch.cuda.is_available())
if args.local_rank == -1:
device = torch.device("cuda")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl', init_method='env://')
logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
if args.train_batch_size % args.gradient_accumulation_steps != 0:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format(
args.gradient_accumulation_steps, args.train_batch_size))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (os.listdir(args.output_dir) and os.listdir(args.output_dir)!=['logfile.txt']):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not args.resume_from_checkpoint:
os.makedirs(args.output_dir, exist_ok=True)
# Prepare model
config = BertConfig.from_json_file(args.config_file)
model = BertForPreTraining(config)
if not args.resume_from_checkpoint:
global_step = 0
else:
if args.resume_step == -1:
model_names = [f for f in os.listdir(args.output_dir) if f.endswith(".pt")]
args.resume_step = max([int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names])
global_step = args.resume_step
checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu")
model.load_state_dict(checkpoint['model'], strict=False)
print("resume step from ", args.resume_step)
model.to(device)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
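# apply weight decay to all parameters except biases and LayerNorm weights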
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
#warmup=args.warmup_proportion,
#t_total=args.max_steps,
bias_correction=False,
weight_decay=0.01,
max_grad_norm=1.0)
if args.loss_scale == 0:
# optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale="dynamic")
else:
# optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale)
scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=args.max_steps)
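# in the fp16/amp path the warmup/decay schedule is applied manually via
# scheduler.step() in the training loop; in the fp32 path BertAdam handles it internally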
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=args.max_steps)
if args.resume_from_checkpoint:
optimizer.load_state_dict(checkpoint['optimizer']) # , strict=False)
if args.local_rank != -1:
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f))]
files.sort()
num_files = len(files)
logger.info("***** Running training *****")
# logger.info(" Num examples = %d", len(train_data))
logger.info(" Batch size = %d", args.train_batch_size)
print(" LR = ", args.learning_rate)
model.train()
print("Training. . .")
most_recent_ckpts_paths = []
print("Training. . .")
tr_loss = 0.0 # total added training loss
average_loss = 0.0 # averaged loss every args.log_freq steps
epoch = 0
training_steps = 0
while True:
if not args.resume_from_checkpoint:
random.shuffle(files)
f_start_id = 0
else:
f_start_id = checkpoint['files'][0]
files = checkpoint['files'][1:]
args.resume_from_checkpoint = False
for f_id in range(f_start_id, len(files)):
data_file = files[f_id]
logger.info("file no %s file %s" %(f_id, data_file))
train_data = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * n_gpu, num_workers=4, pin_memory=True)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=4, pin_memory=True)
for step, batch in enumerate(tqdm(train_dataloader, desc="File Iteration")):
training_steps += 1
batch = [t.to(device) for t in batch]
input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels, checkpoint_activations=args.checkpoint_activations)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
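# scaling here makes the accumulated gradient the average over the micro-batches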
if args.fp16:
# optimizer.backward(loss)
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
tr_loss += loss.detach()  # keep a tensor for the final all_reduce without retaining the autograd graph
average_loss += loss.item()
if training_steps % args.gradient_accumulation_steps == 0:
if args.fp16:
scheduler.step()
optimizer.step()
optimizer.zero_grad()
global_step += 1
if training_steps == 1 * args.gradient_accumulation_steps:
logger.info("Step:{} Average Loss = {} Step Loss = {} LR {}".format(global_step, average_loss,
loss.item(), optimizer.param_groups[0]['lr']))
if training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
logger.info("Step:{} Average Loss = {} Step Loss = {} LR {}".format(global_step, average_loss / args.log_freq,
loss.item(), optimizer.param_groups[0]['lr']))
average_loss = 0
if global_step >= args.max_steps or training_steps == 1 * args.gradient_accumulation_steps or training_steps % (args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0:
if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)):
# Save a trained model
logger.info("** ** * Saving fine - tuned model ** ** * ")
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
torch.save({'model' : model_to_save.state_dict(),
'optimizer' : optimizer.state_dict(),
'files' : [f_id] + files }, output_save_file)
most_recent_ckpts_paths.append(output_save_file)
if len(most_recent_ckpts_paths) > 3:
ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
os.remove(ckpt_to_be_removed)
if global_step >= args.max_steps:
tr_loss = tr_loss * args.gradient_accumulation_steps / training_steps
if (torch.distributed.is_initialized()):
tr_loss /= torch.distributed.get_world_size()
torch.distributed.all_reduce(tr_loss)
logger.info("Total Steps:{} Final Loss = {}".format(training_steps, tr_loss.item()))
return
del train_dataloader
del train_sampler
del train_data
#for obj in gc.get_objects():
# if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
# del obj
torch.cuda.empty_cache()
epoch += 1
if __name__ == "__main__":
main()

View File

@ -0,0 +1,300 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
#==================
import csv
import os
import logging
import argparse
import random
import h5py
from tqdm import tqdm, trange
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset
from torch.utils.data.distributed import DistributedSampler
import math
import time
from tokenization import BertTokenizer
from modeling import BertForPreTraining, BertConfig
# from fused_adam_local import FusedAdamBert
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from apex.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class pretraining_dataset(Dataset):
def __init__(self, input_file, max_pred_length):
self.input_file = input_file
self.max_pred_length = max_pred_length
f = h5py.File(input_file, "r")
self.input_ids = np.asarray(f["input_ids"][:]).astype(np.int64)#[num_instances x max_seq_length])
self.input_masks = np.asarray(f["input_mask"][:]).astype(np.int64) #[num_instances x max_seq_length]
self.segment_ids = np.asarray(f["segment_ids"][:]).astype(np.int64) #[num_instances x max_seq_length]
self.masked_lm_positions = np.asarray(f["masked_lm_positions"][:]).astype(np.int64) #[num_instances x max_pred_length]
self.masked_lm_ids= np.asarray(f["masked_lm_ids"][:]).astype(np.int64) #[num_instances x max_pred_length]
self.next_sentence_labels = np.asarray(f["next_sentence_labels"][:]).astype(np.int64) # [num_instances]
f.close()
def __len__(self):
'Denotes the total number of samples'
return len(self.input_ids)
def __getitem__(self, index):
input_ids= torch.from_numpy(self.input_ids[index]) # [max_seq_length]
input_mask = torch.from_numpy(self.input_masks[index]) #[max_seq_length]
segment_ids = torch.from_numpy(self.segment_ids[index])# [max_seq_length]
masked_lm_positions = torch.from_numpy(self.masked_lm_positions[index]) #[max_pred_length]
masked_lm_ids = torch.from_numpy(self.masked_lm_ids[index]) #[max_pred_length]
next_sentence_labels = torch.from_numpy(np.asarray(self.next_sentence_labels[index])) #[1]
masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
index = self.max_pred_length
# store number of masked tokens in index
if len((masked_lm_positions == 0).nonzero()) != 0:
index = (masked_lm_positions == 0).nonzero()[0].item()
masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels]
def main():
print("IN NEW MAIN XD\n")
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain .hdf5 files for the task.")
parser.add_argument("--config_file",
default="bert_config.json",
type=str,
required=False,
help="The BERT model config")
parser.add_argument("--ckpt_dir",
default=None,
type=str,
required=True,
help="The ckpt directory, e.g. /results")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--eval', dest='do_eval', action='store_true')
group.add_argument('--prediction', dest='do_eval', action='store_false')
## Other parameters
parser.add_argument("--bert_model", default="bert-large-uncased", type=str, required=False,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--max_seq_length",
default=512,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--max_predictions_per_seq",
default=80,
type=int,
help="The maximum total of masked tokens in input sequence")
parser.add_argument("--ckpt_step",
default=-1,
type=int,
required=False,
help="The model checkpoint iteration, e.g. 1000")
parser.add_argument("--eval_batch_size",
default=8,
type=int,
help="Total batch size for training.")
parser.add_argument("--max_steps",
default=-1,
type=int,
help="Total number of eval steps to perform, otherwise use full dataset")
parser.add_argument("--no_cuda",
default=False,
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--fp16',
default=False,
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl', init_method='env://')
n_gpu = torch.cuda.device_count()
if n_gpu > 1:
assert(args.local_rank != -1) # only use torch.distributed for multi-gpu
logger.info("device %s n_gpu %d distributed inference %r", device, n_gpu, bool(args.local_rank != -1))
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
# Prepare model
config = BertConfig.from_json_file(args.config_file)
model = BertForPreTraining(config)
if args.ckpt_step == -1:
#retrieve latest model
model_names = [f for f in os.listdir(args.ckpt_dir) if f.endswith(".model")]
args.ckpt_step = max([int(x.split('.model')[0].split('_')[1].strip()) for x in model_names])
print("load model saved at iteraton", args.ckpt_step)
model_file = os.path.join(args.ckpt_dir, "ckpt_" + str(args.ckpt_step) + ".model")
state_dict = torch.load(model_file, map_location="cpu")
model.load_state_dict(state_dict, strict=False)
if args.fp16:
model.half() # all parameters and buffers are converted to half precision
model.to(device)
multi_gpu_training = args.local_rank != -1 and torch.distributed.is_initialized()
if multi_gpu_training:
model = DDP(model)
files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f))]
files.sort()
logger.info("***** Running evaluation *****")
logger.info(" Batch size = %d", args.eval_batch_size)
model.eval()
print("Evaluation. . .")
nb_instances = 0
max_steps = args.max_steps if args.max_steps > 0 else np.inf
global_step = 0
with torch.no_grad():
if args.do_eval:
final_loss = 0.0 #
for data_file in files:
logger.info("file %s" %( data_file))
dataset = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
if not multi_gpu_training:
train_sampler = RandomSampler(dataset)
datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)
else:
train_sampler = DistributedSampler(dataset)
datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)
for step, batch in enumerate(tqdm(datasetloader, desc="Iteration")):
if global_step > max_steps:
break
batch = [t.to(device) for t in batch]
input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels)
final_loss += loss
global_step += 1
torch.cuda.empty_cache()
if global_step > max_steps:
break
final_loss /= global_step
if multi_gpu_training:
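# divide by the world size and all-reduce (sum) so every rank ends up with the mean loss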
final_loss /= torch.distributed.get_world_size()
dist.all_reduce(final_loss)
if (not multi_gpu_training or (multi_gpu_training and torch.distributed.get_rank() == 0)):
logger.info("Finished: Final Loss = {}".format(final_loss))
else: # inference
# if multi_gpu_training:
# torch.distributed.barrier()
# start_t0 = time.time()
for data_file in files:
logger.info("file %s" %( data_file))
dataset = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
if not multi_gpu_training:
train_sampler = RandomSampler(dataset)
datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)
else:
train_sampler = DistributedSampler(dataset)
datasetloader = DataLoader(dataset, sampler=train_sampler, batch_size=args.eval_batch_size, num_workers=4, pin_memory=True)
for step, batch in enumerate(tqdm(datasetloader, desc="Iteration")):
if global_step > max_steps:
break
batch = [t.to(device) for t in batch]
input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
lm_logits, nsp_logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=None, next_sentence_label=None)
nb_instances += input_ids.size(0)
global_step += 1
torch.cuda.empty_cache()
if global_step > max_steps:
break
# if multi_gpu_training:
# torch.distributed.barrier()
if (not multi_gpu_training or (multi_gpu_training and torch.distributed.get_rank() == 0)):
logger.info("Finished")
if __name__ == "__main__":
main()

File diff suppressed because it is too large

View File

@ -0,0 +1,561 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
import argparse
import csv
import logging
import os
import random
import sys
from io import open
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from modeling import BertForMultipleChoice, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from optimization import BertAdam, warmup_linear
from tokenization import BertTokenizer
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logger = logging.getLogger(__name__)
class SwagExample(object):
"""A single training/test example for the SWAG dataset."""
def __init__(self,
swag_id,
context_sentence,
start_ending,
ending_0,
ending_1,
ending_2,
ending_3,
label = None):
self.swag_id = swag_id
self.context_sentence = context_sentence
self.start_ending = start_ending
self.endings = [
ending_0,
ending_1,
ending_2,
ending_3,
]
self.label = label
def __str__(self):
return self.__repr__()
def __repr__(self):
l = [
"swag_id: {}".format(self.swag_id),
"context_sentence: {}".format(self.context_sentence),
"start_ending: {}".format(self.start_ending),
"ending_0: {}".format(self.endings[0]),
"ending_1: {}".format(self.endings[1]),
"ending_2: {}".format(self.endings[2]),
"ending_3: {}".format(self.endings[3]),
]
if self.label is not None:
l.append("label: {}".format(self.label))
return ", ".join(l)
class InputFeatures(object):
def __init__(self,
example_id,
choices_features,
label
):
self.example_id = example_id
self.choices_features = [
{
'input_ids': input_ids,
'input_mask': input_mask,
'segment_ids': segment_ids
}
for _, input_ids, input_mask, segment_ids in choices_features
]
self.label = label
def read_swag_examples(input_file, is_training):
with open(input_file, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
if is_training and lines[0][-1] != 'label':
raise ValueError(
"For training, the input file must contain a label column."
)
examples = [
SwagExample(
swag_id = line[2],
context_sentence = line[4],
start_ending = line[5], # in the swag dataset, the
# common beginning of each
# choice is stored in "sent2".
ending_0 = line[7],
ending_1 = line[8],
ending_2 = line[9],
ending_3 = line[10],
label = int(line[11]) if is_training else None
) for line in lines[1:] # we skip the line with the column names
]
return examples
def convert_examples_to_features(examples, tokenizer, max_seq_length,
is_training):
"""Loads a data file into a list of `InputBatch`s."""
# Swag is a multiple choice task. To perform this task using Bert,
# we will use the formatting proposed in "Improving Language
# Understanding by Generative Pre-Training" and suggested by
# @jacobdevlin-google in this issue
# https://github.com/google-research/bert/issues/38.
#
# Each choice will correspond to a sample on which we run the
# inference. For a given Swag example, we will create the 4
# following inputs:
# - [CLS] context [SEP] choice_1 [SEP]
# - [CLS] context [SEP] choice_2 [SEP]
# - [CLS] context [SEP] choice_3 [SEP]
# - [CLS] context [SEP] choice_4 [SEP]
# The model will output a single value for each input. To get the
# final decision of the model, we will run a softmax over these 4
# outputs.
features = []
for example_index, example in enumerate(examples):
context_tokens = tokenizer.tokenize(example.context_sentence)
start_ending_tokens = tokenizer.tokenize(example.start_ending)
choices_features = []
for ending_index, ending in enumerate(example.endings):
# We create a copy of the context tokens in order to be
# able to shrink it according to ending_tokens
context_tokens_choice = context_tokens[:]
ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
# Modifies `context_tokens_choice` and `ending_tokens` in
# place so that the total length is less than the
# specified length. Account for [CLS], [SEP], [SEP] with
# "- 3"
_truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
padding = [0] * (max_seq_length - len(input_ids))
input_ids += padding
input_mask += padding
segment_ids += padding
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
choices_features.append((tokens, input_ids, input_mask, segment_ids))
label = example.label
if example_index < 5:
logger.info("*** Example ***")
logger.info("swag_id: {}".format(example.swag_id))
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
logger.info("choice: {}".format(choice_idx))
logger.info("tokens: {}".format(' '.join(tokens)))
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
if is_training:
logger.info("label: {}".format(label))
features.append(
InputFeatures(
example_id = example.swag_id,
choices_features = choices_features,
label = label
)
)
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
def select_field(features, field):
return [
[
choice[field]
for choice in feature.choices_features
]
for feature in features
]
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--data_dir",
default=None,
type=str,
required=True,
help="The input data dir. Should contain the .csv files (or other data files) for the task.")
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
"bert-base-multilingual-cased, bert-base-chinese.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
parser.add_argument("--init_checkpoint",
default=None,
type=str,
required=True,
help="The checkpoint file from pretraining")
## Other parameters
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--do_train",
action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval",
action='store_true',
help="Whether to run eval on the dev set.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Set this flag if you are using an uncased model.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
default=8,
type=int,
help="Total batch size for eval.")
parser.add_argument("--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1.0, type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion",
default=0.1,
type=float,
help="Proportion of training to perform linear learning rate warmup for. "
"E.g., 0.1 = 10%% of training.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument('--fp16',
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train and not args.do_eval:
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
print("WARNING: Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
train_examples = None
num_train_optimization_steps = None
if args.do_train:
train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True)
num_train_optimization_steps = int(
len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
model = BertForMultipleChoice.from_pretrained(args.bert_model,
cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
num_choices=4)
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
if args.fp16:
model.half()
model.to(device)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
# hack to remove the pooler, which is not used
# and would otherwise produce None grads that break apex
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=num_train_optimization_steps)
global_step = 0
if args.do_train:
train_features = convert_examples_to_features(
train_examples, tokenizer, args.max_seq_length, True)
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_examples))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
model.train()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
tr_loss = 0
nb_tr_examples, nb_tr_steps = 0, 0
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
# Terminate early for benchmarking
if args.max_steps > 0 and global_step > args.max_steps:
break
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, label_ids = batch
loss = model(input_ids, segment_ids, input_mask, label_ids)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.fp16 and args.loss_scale != 1.0:
# rescale loss for fp16 training
# see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
loss = loss * args.loss_scale
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
# modify learning rate with special warm up BERT uses
# if args.fp16 is False, BertAdam is used that handles this automatically
lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
for param_group in optimizer.param_groups:
param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
if args.do_train:
# Save a trained model and the associated configuration
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
with open(output_config_file, 'w') as f:
f.write(model_to_save.config.to_json_string())
# Load a trained model and config that you have fine-tuned
config = BertConfig(output_config_file)
model = BertForMultipleChoice(config, num_choices=4)
model.load_state_dict(torch.load(output_model_file))
else:
model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
model.to(device)
if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True)
eval_features = convert_examples_to_features(
eval_examples, tokenizer, args.max_seq_length, True)
logger.info("***** Running evaluation *****")
logger.info(" Num examples = %d", len(eval_examples))
logger.info(" Batch size = %d", args.eval_batch_size)
all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
input_ids = input_ids.to(device)
input_mask = input_mask.to(device)
segment_ids = segment_ids.to(device)
label_ids = label_ids.to(device)
with torch.no_grad():
tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
logits = model(input_ids, segment_ids, input_mask)
logits = logits.detach().cpu().numpy()
label_ids = label_ids.to('cpu').numpy()
tmp_eval_accuracy = accuracy(logits, label_ids)
eval_loss += tmp_eval_loss.mean().item()
eval_accuracy += tmp_eval_accuracy
nb_eval_examples += input_ids.size(0)
nb_eval_steps += 1
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_examples
result = {'eval_loss': eval_loss,
'eval_accuracy': eval_accuracy,
'global_step': global_step,
'loss': tr_loss/nb_tr_steps}
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
if __name__ == "__main__":
main()

View File

@ -0,0 +1,92 @@
import math
import torch
from torch.optim.optimizer import Optimizer
from apex.optimizers import FP16_Optimizer
from torch.optim.lr_scheduler import _LRScheduler
class LRScheduler(_LRScheduler):
def __init__(self, optimizer, last_epoch=-1):
# Check if using mixed precision training
self.mixed_training = False
base_optimizer = optimizer
if isinstance(optimizer, FP16_Optimizer):
self.mixed_training = True
self.fp16_optimizer = optimizer
base_optimizer = optimizer.optimizer
# Check that optimizer param is valid
elif not isinstance(optimizer, Optimizer):
raise TypeError('{} is not an Optimizer'.format(
type(optimizer).__name__))
super(LRScheduler, self).__init__(base_optimizer, last_epoch)
def step(self, epoch=None):
# Set the current training step
# ('epoch' is used to be consistent with _LRScheduler)
if self.mixed_training:
# The assumption is that the step will be constant
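# the wrapped optimizer's state tracks the number of updates actually applied,
# so the schedule is assumed to follow real optimizer steps (e.g. it does not
# advance when an fp16 overflow skips an update)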
state_dict = self.optimizer.state[self.optimizer.param_groups[0]['params'][0]]
if 'step' in state_dict:
self.last_epoch = state_dict['step'] + 1
else:
self.last_epoch = 1
else:
self.last_epoch = epoch if epoch is not None else self.last_epoch + 1
for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
param_group['lr'] = lr
class CosineWarmupScheduler(LRScheduler):
"""
Applies a linear warm up period to the learning rate, followed by cosine decay.
"""
def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
self.warmup = warmup
self.total_steps = total_steps
super(CosineWarmupScheduler, self).__init__(optimizer, last_epoch)
def get_lr(self):
progress = self.last_epoch / self.total_steps
if progress < self.warmup:
return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
else:
# half-cosine decay from base_lr toward 0 after the warmup phase (assumed intent)
return [base_lr * (0.5 * (1.0 + math.cos(math.pi * progress))) for base_lr in self.base_lrs]
class ConstantWarmupScheduler(LRScheduler):
"""
Applies a linear warm up period to the learning rate, followed by a constant learning rate.
"""
def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
self.warmup = warmup
self.total_steps = total_steps
super(ConstantWarmupScheduler, self).__init__(optimizer, last_epoch)
def get_lr(self):
progress = self.last_epoch / self.total_steps
if progress < self.warmup:
return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
else:
return self.base_lrs
class LinearWarmUpScheduler(LRScheduler):
"""
Applies a linear warm up period to the learning rate, followed by linear decay.
"""
def __init__(self, optimizer, warmup, total_steps, last_epoch=-1):
self.warmup = warmup
self.total_steps = total_steps
super(LinearWarmUpScheduler, self).__init__(optimizer, last_epoch)
def get_lr(self):
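# progress runs from 0 to 1 over total_steps: ramp the LR linearly up to base_lr
# during the warmup fraction, then decay it linearly back to 0 by the end of training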
progress = self.last_epoch / self.total_steps
if progress < self.warmup:
return [base_lr * progress / self.warmup for base_lr in self.base_lrs]
else:
return [base_lr * max(( progress - 1.0)/(self.warmup - 1.0), 0.) for base_lr in self.base_lrs]
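# Minimal usage sketch (illustrative only; `model` and `compute_loss` are placeholders):
#   optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
#   scheduler = LinearWarmUpScheduler(optimizer, warmup=0.01, total_steps=10000)
#   for batch in loader:
#       loss = compute_loss(batch)
#       loss.backward()
#       scheduler.step()
#       optimizer.step()
#       optimizer.zero_grad()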

View File

@ -0,0 +1,38 @@
#!/usr/bin/env bash
DATA_DIR=${1:-/workspace/bert/data}
# Check running from repository root
if [ ! -d .git ]; then
echo "Not running from repository root! Exiting."
exit 1
fi
# Download vocab files from pretrained model
cd vocab && python3 download_models.py && rm *.zip && rm ./*/*.ckpt.*
# Download SQUAD
cd $DATA_DIR/squad && . squad_download.sh
# Download SWAG
git clone https://github.com/rowanz/swagaf.git $DATA_DIR/swag
# Download GLUE
cd $DATA_DIR/glue && . download_mrpc.sh
# WIKI Download
cd $DATA_DIR/wikipedia_corpus && . download_wikipedia.sh
# Bookcorpus Download
cd $DATA_DIR/bookcorpus && . download_bookcorpus.sh
cd $DATA_DIR
# Create HDF5 files for WIKI
bash create_datasets_from_start.sh wikipedia_corpus ./wikipedia_corpus/wikipedia_corpus.txt \
&& rm -r ./wikipedia_corpus/final_* \
# Create HDF5 files for Bookcorpus
bash create_datasets_from_start.sh bookcorpus ./bookcorpus/bookcorpus.txt \
&& rm -r ./bookcorpus/final_* \
# Create HDF5 files for inter sequence-pair mixed Wikipedia and Bookcorpus
bash merge_datasets_after_creation.sh merged_wiki+books wikipedia_corpus/hdf5_shards,bookcorpus/hdf5_shards 1024

View File

@ -0,0 +1,9 @@
#!/bin/bash
# Check running from repository root
if [ ! -d .git ]; then
echo "Not running from repository root! Exiting."
exit 1
fi
docker build . --rm -t bert

View File

@ -0,0 +1,23 @@
#!/bin/bash
# Check running from repository root
if [ ! -d .git ]; then
echo "Not running from repository root! Exiting."
exit 1
fi
DATA_DIR=${1:-"/mnt/dldata/bert"}
VOCAB_DIR=${2:-"/mnt/dldata/bert/vocab"}
CHECKPOINT_DIR=${3:-"/mnt/dldata/bert/pretrained_models_nvidia_pytorch"}
docker run -it --rm \
--runtime=nvidia \
-p 8888:8888 \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-v $DATA_DIR:/workspace/bert/data \
-v $CHECKPOINT_DIR:/workspace/checkpoints \
-v $VOCAB_DIR:/workspace/bert/vocab \
-v $PWD/results:/results \
bert bash

View File

@ -0,0 +1,184 @@
#!/bin/bash
#SBATCH -p mlperf # partition
#SBATCH -N 1 # number of nodes
#SBATCH -t 12:00:00 # wall time
#SBATCH -J image_classification # job name
#SBATCH --exclusive # exclusive node access
#SBATCH --mem=0 # all mem avail
#SBATCH --mail-type=FAIL # only send email on failure
#SBATCH --ntasks-per-node=8 # n tasks per machine (one task per gpu)
#SBATCH --threads-per-core=2 # HT is on
#SBATCH --cores-per-socket=20 # 20 cores on each socket
#SBATCH --overcommit
hostname
#DGXIBDEVICES=$(eval ls /dev/infiniband/ | tr " " "\n" | awk '{printf "--device=/dev/infiniband/%s ",$1}' | sed s'/.$//')
printf "DGXIBDEVICES=%s\n" "$DGXIBDEVICES"
printf "VOLS=%s\n" "$VOLS"
printf "EXTRA_PARAMS=%s\n" "$EXTRA_PARAMS"
cd $CODEDIR
VOLS+=" -v $CHKPTDIR/$SLURM_JOB_ID:/checkpoints"
mkdir -p $CHKPTDIR/$SLURM_JOB_ID
## DO NOT CHANGE ANYTHING BELOW -- DL params are in run_and_time.sh and config_<system>.sh files
DEBUG=1 # 1 = Print verbose messages for debugging
## Pre-warming the containers ##
hosts=( `scontrol show hostname |tr "\n" " "` )
pids=(); for hostn in ${hosts[@]}; do
timeout -k 600s 600s \
srun -N 1 -n 1 -w $hostn \
docker pull $CONT &
pids+=($!); rets+=($?);
done
wait "${pids[@]}"
success=0; for s in ${rets[@]}; do ((success+=s)); done ; if [ $success -ne 0 ]; then echo "ERR: Container pull failed"; exit $success ; fi
IBDEVICES=${IBDEVICES:-$DGXIBDEVICES}
## Check whether we are running in a slurm env
INSLURM=1
if [[ -z "$SLURM_JOB_ID" ]]; then
INSLURM=0
export SLURM_JOB_ID="${DATESTAMP}"
export SLURM_NNODES=1
fi
if [[ -z "SLURM_JOB_ID" || $SLURM_NNODES -eq 1 ]]; then
# don't need IB if not multi-node
export IBDEVICES=""
fi
# Create results directory
LOGFILE_BASE="${LOGDIR}/${DATESTAMP}"
mkdir -p $(dirname "${LOGFILE_BASE}")
export CONTNAME="${SLURM_JOB_ID}"
export DOCKEREXEC="nvidia-docker run --rm --net=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined $IBDEVICES"
CMD="python -np $((SLURM_NNODES*DGXNGPU)) -x EXTRA_PARAMS=\"${EXTRA_PARAMS}\" -x NCCL_LL_THRESHOLD=0 -x NCCL_DEBUG=INFO -x NCCL_NET_GDR_READ=1 -x NCCL_SOCKET_IFNAME=^docker0,bond0,lo $BIND ./run_pretraining.sh"
echo $CMD
mkdir -m 777 -p $LOGDIR
echo $CMD | tee -a $LOGDIR/$DATESTAMP.log
echo "slurm job id" $SLURM_JOB_ID &> $LOGDIR/$DATESTAMP.log
MASTER_IP=`getent hosts \`hostname\` | cut -d ' ' -f1`
SSH=''
SRUN=''
if [[ $INSLURM -eq 0 ]]; then
export hosts=( `hostname` )
else
export hosts=( `scontrol show hostname |tr "\n" " "` )
SSH='ssh -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $hostn'
SRUN='srun -N 1 -n 1 -w $hostn'
fi
unique_hosts=( $(echo "${hosts[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' ' ) )
export MASTER_HOST=${hosts[0]}
VARS="-e OMPI_MCA_mca_base_param_files=/dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf -e EXTRA_PARAMS -e GPUS -e BATCHSIZE -e CONT -e DGXSYSTEM=$DGXSYSTEM -e MASTER_HOST -e MASTER_IP -e SLURM_JOB_NUM_NODES -e SLURM_NNODES -e SLURM_NTASKS_PER_NODE -w /workspace/bert"
RUNSLEEPCMD=""
[[ "${PULL}" -eq "1" ]] && docker pull $CONT
## Setting up MPI
# MPI support files - in /dev/shm/mpi/<jobid>
# 1. Copy user keys to /dev/shm/mpi/<jobid>
# 2. Create mca_params.conf
# 3. Create sshentry.sh to support lauching into containers on worker nodes
# 4. Create mpi_hosts file
# 5. Copy standard ssh
if [[ $SLURM_NNODES -ne "1" ]]; then
# Make keys and copy
echo
[[ $DEBUG == 1 ]] && echo "Setting up ssh keys and config"
mkdir -p ${HOME}/.ssh/sbatch/${SLURM_JOB_ID}
ssh-keygen -t rsa -b 2048 -N "" -f "${HOME}/.ssh/sbatch/${SLURM_JOB_ID}/sshkey.rsa" -C "mxnet_${SLURM_JOB_ID}_" &>/dev/null
echo command=no-port-forwarding,no-agent-forwarding,no-X11-forwarding $(cat ${HOME}/.ssh/sbatch/${SLURM_JOB_ID}/sshkey.rsa.pub) >> ${HOME}/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys
[[ $DEBUG == 1 ]] && echo "Copy keys: srun -n $SLURM_JOB_NUM_NODES && cp -R ${HOME}/.ssh/sbatch/${SLURM_JOB_ID} /dev/shm/mpi && chmod 700 /dev/shm/mpi/${SLURM_JOB_ID}"
srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 bash -c "mkdir -p /dev/shm/mpi/${SLURM_JOB_ID}; cp -R ${HOME}/.ssh/sbatch/${SLURM_JOB_ID} /dev/shm/mpi; chmod 700 /dev/shm/mpi/${SLURM_JOB_ID}"
sleep 2 # Making copy
[[ $DEBUG == 1 ]] && ls /dev/shm
# Create mpi config file
srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 tee /dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf <<EOF
plm_rsh_agent = /usr/bin/ssh
plm_rsh_args = -i /dev/shm/mpi/${SLURM_JOB_ID}/sshkey.rsa -oStrictHostKeyChecking=no -oUserKnownHostsFile=/dev/null -oLogLevel=ERROR -l ${USER}
orte_default_hostfile = /dev/shm/mpi/${SLURM_JOB_ID}/mpi_hosts
btl_openib_warn_default_gid_prefix = 0
mpi_warn_on_fork = 0
allow_run_as_root = 1
EOF
[[ $DEBUG == 1 ]] && echo "::mca_params.conf=" && cat /dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf
# Create ssh helper script that transfers an ssh into a compute node into the running container on that node
srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 tee /dev/shm/mpi/${SLURM_JOB_ID}/sshentry.sh <<EOF
#!/bin/bash
echo "::sshentry: entered \$(hostname)"
[[ -f $CONTNAME ]] && "::worker container not found error" && exit 1
echo "::sshentry: running \$SSH_ORIGINAL_COMMAND"
exec docker exec $CONTNAME /bin/bash -c "\$SSH_ORIGINAL_COMMAND"
EOF
[[ $DEBUG == 1 ]] && echo "::sshentry=" && cat /dev/shm/mpi/${SLURM_JOB_ID}/sshentry.sh
# Create mpi hostlist
for h in ${hosts[@]}; do
echo "$h slots=${SLURM_NTASKS_PER_NODE}" >> /dev/shm/mpi/${SLURM_JOB_ID}/mpi_hosts
done
[[ $DEBUG == 1 ]] && echo '::mpi-host file=' && cat /dev/shm/mpi/${SLURM_JOB_ID}/mpi_hosts
srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 bash -c "cp $(which ssh) /dev/shm/mpi/${SLURM_JOB_ID}/.; chmod 755 /dev/shm/mpi/${SLURM_JOB_ID}/mca_params.conf; chmod 755 /dev/shm/mpi/${SLURM_JOB_ID}/sshentry.sh"
# Check that ssh/mpi dir has correct number of files
[[ $(ls /dev/shm/mpi/${SLURM_JOB_ID} | wc -w) -lt 5 ]] && echo "ERR: /dev/shm/mpi/${SLURM_JOB_ID} doesn't exist or missing ssh/mpi files" && exit $?
fi
# Container launch
if [[ $INSLURM -eq 1 ]]; then
# Launch containers behind srun
[[ $DEBUG == 1 ]] && echo "" && echo ":Launch containers: srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 $DOCKEREXEC --name $CONTNAME $VOLS $VARS $CONT bash -c 'sleep infinity'"
srun -n $SLURM_JOB_NUM_NODES --ntasks-per-node=1 $DOCKEREXEC --name $CONTNAME $VOLS $VARS $CONT bash -c 'sleep infinity' & rv=$?
else
$DOCKEREXEC --name $CONTNAME $VOLS $VARS $CONT bash -c 'sleep infinity' & rv=$?
fi
[[ $rv -ne 0 ]] && echo "ERR: Launch sleep containers failed." && exit $rv
echo "sleep 60 while we pull our container, good golly!"
sleep 60
# Run benchmarks
echo "sleep again for 20"
sleep 20
export EXTRA_PARAMS
(
# Launching app
echo
echo "Launching user script on master node:"
hostn=$MASTER_HOST
$(eval echo $SSH) docker exec $VARS $CONTNAME $CMD ; rv=$?
[[ $rv -ne 0 ]] && echo "ERR: User script failed." && exit $rv
) |& tee ${LOGFILE_BASE}_$nrun.log
# Clean up (note: on SLURM we skip this, as the epilogue will take care of it)
if [[ $INSLURM -eq 0 ]]; then
docker rm -f $CONTNAME
fi

View File

@ -0,0 +1,63 @@
#!/bin/bash
MRPC_DIR=/workspace/bert/data/glue/MRPC
OUT_DIR=/results/MRPC
mkdir -p $OUT_DIR
echo "Container nvidia build = " $NVIDIA_BUILD_ID
init_checkpoint=${1}
mode=${2:-"train"}
max_steps=${3:-"-1.0"} # if < 0, has no effect
batch_size=${4:-"12"}
learning_rate=${5:-"5e-6"}
precision=${6:-"fp32"}
num_gpu=${7:-"8"}
epochs=${8:-"2"}
if [ "$mode" != "train" ] ; then
num_gpu=1
fi
use_fp16=""
if [ "$precision" = "fp16" ] ; then
echo "fp16 activated!"
use_fp16="--fp16"
fi
if [ "$num_gpu" = "1" ] ; then
mpi_command=""
else
mpi_command="torch.distributed.launch --nproc_per_node=$num_gpu"
fi
CMD="python -m $mpi_command run_glue.py "
CMD+="--task_name MRPC "
if [ "$mode" = "train" ] ; then
CMD+="--do_train "
CMD+="--train_batch_size=$batch_size "
else
CMD+="--do_eval "
CMD+="--eval_batch_size=$batch_size "
fi
CMD+="--do_lower_case "
CMD+="--data_dir $MRPC_DIR "
CMD+="--bert_model bert-large-uncased "
CMD+="--init_checkpoint $init_checkpoint "
CMD+="--max_seq_length 128 "
CMD+="--learning_rate $learning_rate "
CMD+="--num_train_epochs $epochs "
CMD+="--max_steps $max_steps "
CMD+="--output_dir $OUT_DIR "
CMD+="$use_fp16"
LOGFILE=$OUT_DIR/logfile
$CMD |& tee $LOGFILE
sed -r 's/ |(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`
echo "throughput: $throughput"

View File

@ -0,0 +1,152 @@
#!/bin/bash
echo "Container nvidia build = " $NVIDIA_BUILD_ID
DATASET=wikipedia_corpus # change this for other datasets
DATA_DIR=data/${DATASET}/hdf5_shards/
BERT_CONFIG=bert_config.json
RESULTS_DIR=/results
CHECKPOINTS_DIR=/results/checkpoints
mkdir -p $CHECKPOINTS_DIR
if [ ! -d "$DATA_DIR" ] ; then
echo "Warning! $DATA_DIR directory missing. Training cannot start"
fi
if [ ! -d "$RESULTS_DIR" ] ; then
echo "Error! $RESULTS_DIR directory missing."
exit -1
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
echo "Warning! $CHECKPOINTS_DIR directory missing."
echo "Checkpoints will be written to $RESULTS_DIR instead."
CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
echo "Error! BERT large configuration file not found at $BERT_CONFIG"
exit -1
fi
train_batch_size=${1:-14}
learning_rate=${2:-"0.4375e-4"}
precision=${3:-"fp16"}
num_gpus=${4:-8}
warmup_proportion=${5:-"0.01"}
train_steps=${6:-2285714}
save_checkpoint_steps=${7:-2000}
resume_training=${8:-"false"}
create_logfile=${9:-"true"}
checkpoint_activations=${10:-"false"}
seed=${11:-42}
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
CHECKPOINT_ACTIVATIONS=""
if [ "$checkpoint_activations" == "true" ] ; then
CHECKPOINT_ACTIVATIONS="--checkpoint_activations"
fi
CHECKPOINT=""
if [ "$resume_training" == "true" ] ; then
CHECKPOINT="--resume_from_checkpoint"
fi
echo $DATA_DIR
INPUT_DIR=$DATA_DIR
CMD=" /workspace/bert/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --do_train"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$train_steps"
CMD+=" --warmup_proportion=$warmup_proportion"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $CHECKPOINT_ACTIVATIONS"
CMD+=" $CHECKPOINT"
if [ "$num_gpus" -gt 1 ] ; then
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
CMD="python3 $CMD"
fi
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size \* $num_gpus)
printf -v TAG "pyt_bert_pretraining_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi
set -x
if [ -z "$LOGFILE" ] ; then
$CMD
else
(
$CMD
) |& tee $LOGFILE
fi
set +x
echo "finished pretraining, starting benchmarking"
target_loss=15
THROUGHPUT=10
THRESHOLD=0.9
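# parse the last tqdm "Iteration" line for the reported s/it timing and pull the
# latest average/final loss values logged by run_pretraining.py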
throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F's/it' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
echo "throughput: $throughput s/it"
echo "average loss: $loss"
echo "final loss: $final_loss"
ACCURACY_TEST_RESULT=$(awk 'BEGIN {print ('${loss}' <= '${target_loss}')}')
if [ $ACCURACY_TEST_RESULT == 1 ];
then
echo "&&&& ACCURACY TEST PASSED"
else
echo "&&&& ACCURACY TEST FAILED"
fi
PERFORMANCE_TEST_RESULT=$(awk 'BEGIN {print ('${throughput}' <= ('${THROUGHPUT}' * '${THRESHOLD}'))}')
if [ $PERFORMANCE_TEST_RESULT == 1 ];
then
echo "&&&& PERFORMANCE TEST PASSED"
else
echo "&&&& PERFORMANCE TEST FAILED"
fi
if [ $ACCURACY_TEST_RESULT == 1 -a $PERFORMANCE_TEST_RESULT == 1 ];
then
echo "&&&& PASSED"
exit 0
else
echo "&&&& FAILED"
exit 1
fi

View File

@ -0,0 +1,146 @@
#!/bin/bash
echo "Container nvidia build = " $NVIDIA_BUILD_ID
DATASET=wikipedia_corpus # change this for other datasets
DATA_DIR=data/${DATASET}/hdf5_shards/
BERT_CONFIG=bert_config.json
RESULTS_DIR=/results
CHECKPOINTS_DIR=/results/checkpoints
if [ ! -d "$DATA_DIR" ] ; then
echo "Warning! $DATA_DIR directory missing. Inference cannot start"
fi
if [ ! -d "$RESULTS_DIR" ] ; then
echo "Error! $RESULTS_DIR directory missing."
exit -1
fi
if [ ! -d "$CHECKPOINTS_DIR" ] ; then
echo "Warning! $CHECKPOINTS_DIR directory missing."
echo "Checkpoints will be loaded from $RESULTS_DIR instead."
CHECKPOINTS_DIR=$RESULTS_DIR
fi
if [ ! -f "$BERT_CONFIG" ] ; then
echo "Error! BERT large configuration file not found at $BERT_CONFIG"
exit -1
fi
eval_batch_size=${1:-14}
precision=${2:-"fp16"}
num_gpus=${3:-8}
inference_mode=${4:-"eval"}
model_checkpoint=${5:-"-1"}
inference_steps=${6:-"-1"}
create_logfile=${7:-"true"}
seed=${8:-42}
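# Example invocation (the script's file name and the values below are illustrative only;
# all eight positional arguments fall back to the defaults above when omitted):
#   bash scripts/run_pretraining_inference.sh 14 fp16 8 eval -1 -1 true 42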
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
MODE=""
if [ "$inference_mode" = "eval" ] ; then
MODE="--eval"
elif [ "$inference_mode" = "prediction" ] ; then
MODE="--prediction"
else
echo "Unknown <inference_mode> argument"
exit -2
fi
echo $DATA_DIR
CMD=" /workspace/bert/run_pretraining_inference.py"
CMD+=" --input_dir=$DATA_DIR"
CMD+=" --ckpt_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --eval_batch_size=$eval_batch_size"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$inference_steps"
CMD+=" --ckpt_step=$model_checkpoint"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $MODE"
if [ "$num_gpus" -gt 1 ] ; then
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
CMD="python3 $CMD"
fi
if [ "$create_logfile" = "true" ] ; then
export GBS=$((eval_batch_size * num_gpus))
printf -v TAG "pyt_bert_pretraining_inference_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi
set -x
if [ -z "$LOGFILE" ] ; then
$CMD
else
(
$CMD
) |& tee $LOGFILE
fi
set +x
target_loss=15
THROUGHPUT=1.0
THRESHOLD=0.9
throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
echo "throughput: $throughput it/s"
PERFORMANCE_TEST_RESULT=$(awk 'BEGIN {print ('${throughput}' >= \
('${THROUGHPUT}' * '${THRESHOLD}'))}')
if [ $PERFORMANCE_TEST_RESULT == 1 ];
then
echo "&&&& PERFORMANCE TEST PASSED"
else
echo "&&&& PERFORMANCE TEST FAILED"
fi
if [ "$inference_mode" = "eval" ] ; then
loss=`cat $LOGFILE | grep Finished | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
echo "final loss: $loss"
ACCURACY_TEST_RESULT=$(awk 'BEGIN {print ('${loss}' <= '${target_loss}')}')
if [ $ACCURACY_TEST_RESULT == 1 ];
then
echo "&&&& ACCURACY TEST PASSED"
else
echo "&&&& ACCURACY TEST FAILED"
fi
if [ $ACCURACY_TEST_RESULT == 1 -a $PERFORMANCE_TEST_RESULT == 1 ];
then
echo "&&&& PASSED"
exit 0
else
echo "&&&& FAILED"
exit 1
fi
fi

View File

@ -0,0 +1,88 @@
#!/usr/bin/env bash
#OUT_DIR=/results/SQuAD
echo "Container nvidia build = " $NVIDIA_BUILD_ID
init_checkpoint=${1:-"/workspace/checkpoints/bert_uncased.pt"}
epochs=${2:-"2.0"}
batch_size=${3:-"24"}
learning_rate=${4:-"3e-5"}
precision=${5:-"fp16"}
num_gpu=${6:-"8"}
seed=${7:-"42"}
squad_dir=${8:-"/workspace/bert/data/squad/v1.1"}
vocab_file=${9:-"/workspace/bert/vocab/vocab"}
OUT_DIR=${10:-"/results/SQuAD"}
mode=${11:-"train eval"}
CONFIG_FILE=${12:-"/workspace/bert/bert_config.json"}
max_steps=${13:-"-1"}
echo "out dir is $OUT_DIR"
mkdir -p $OUT_DIR
if [ ! -d "$OUT_DIR" ]; then
echo "ERROR: non existing $OUT_DIR"
exit 1
fi
use_fp16=""
if [ "$precision" = "fp16" ] ; then
echo "fp16 activated!"
use_fp16=" --fp16 "
fi
if [ "$num_gpu" = "1" ] ; then
export CUDA_VISIBLE_DEVICES=0
mpi_command=""
else
unset CUDA_VISIBLE_DEVICES
mpi_command=" -m torch.distributed.launch --nproc_per_node=$num_gpu"
fi
CMD="python $mpi_command run_squad.py "
CMD+="--init_checkpoint=$init_checkpoint "
if [ "$mode" = "train" ] ; then
CMD+="--do_train "
CMD+="--train_file=$squad_dir/train-v1.1.json "
CMD+="--train_batch_size=$batch_size "
elif [ "$mode" = "eval" ] ; then
CMD+="--do_predict "
CMD+="--predict_file=$squad_dir/dev-v1.1.json "
CMD+="--predict_batch_size=$batch_size "
else
CMD+=" --do_train "
CMD+=" --train_file=$squad_dir/train-v1.1.json "
CMD+=" --train_batch_size=$batch_size "
CMD+="--do_predict "
CMD+="--predict_file=$squad_dir/dev-v1.1.json "
CMD+="--predict_batch_size=$batch_size "
fi
CMD+=" --do_lower_case "
# CMD+=" --old "
# CMD+=" --loss_scale=128 "
CMD+=" --bert_model=bert-large-uncased "
CMD+=" --learning_rate=$learning_rate "
CMD+=" --seed=$seed "
CMD+=" --num_train_epochs=$epochs "
CMD+=" --max_seq_length=384 "
CMD+=" --doc_stride=128 "
CMD+=" --output_dir=$OUT_DIR "
CMD+=" --vocab_file=$vocab_file "
CMD+=" --config_file=$CONFIG_FILE "
CMD+=" --max_steps=$max_steps "
CMD+=" $use_fp16"
LOGFILE=$OUT_DIR/logfile.txt
echo "$CMD |& tee $LOGFILE"
time $CMD |& tee $LOGFILE
#sed -r 's/\r|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)' | head -1 | egrep -o '[0-9.]+'`
if [ "$mode" != "train" ]; then
python $squad_dir/evaluate-v1.1.py $squad_dir/dev-v1.1.json $OUT_DIR/predictions.json |& tee -a $LOGFILE
fi
echo "throughput: $throughput"

View File

@ -0,0 +1,62 @@
#!/bin/bash
SWAG_DIR=/workspace/bert/data/swag
OUT_DIR=/results/SWAG
mkdir -p $OUT_DIR
echo "Container nvidia build = " $NVIDIA_BUILD_ID
init_checkpoint=${1}
mode=${2:-"train"}
max_steps=${3:-"-1.0"} # if < 0, has no effect
batch_size=${4:-"12"}
learning_rate=${5:-"5e-6"}
precision=${6:-"fp32"}
num_gpu=${7:-"8"}
epochs=${8:-"2"}
if [ "$mode" != "train" ] ; then
num_gpu=1
fi
use_fp16=""
if [ "$precision" = "fp16" ] ; then
echo "fp16 activated!"
use_fp16="--fp16"
fi
if [ "$num_gpu" = "1" ] ; then
mpi_command=""
else
mpi_command="torch.distributed.launch --nproc_per_node=$num_gpu"
fi
CMD="python -m $mpi_command run_swag.py "
CMD+="--init_checkpoint=$init_checkpoint "
if [ "$mode" = "train" ] ; then
CMD+="--do_train "
CMD+="--train_batch_size=$batch_size "
else
CMD+="--do_eval "
CMD+="--eval_batch_size=$batch_size "
fi
CMD+="--do_lower_case "
CMD+="--data_dir $SWAG_DIR/data/ "
CMD+="--bert_model bert-large-uncased "
CMD+="--max_seq_length 128 "
CMD+="--learning_rate $learning_rate "
CMD+="--num_train_epochs $epochs "
CMD+="--max_steps $max_steps "
CMD+="--output_dir $OUT_DIR "
CMD+="$use_fp16"
LOGFILE=$OUT_DIR/logfile
$CMD |& tee $LOGFILE
sed -r 's/\r|(\[A)/\n/g' $LOGFILE > $LOGFILE.edit
throughput=`cat $LOGFILE.edit | grep -E 'Iteration.*[0-9.]+(s/it|it/s)' | tail -1 | egrep -o '[0-9.]+(s/it|it/s)'`
echo "throughput: $throughput"

View File

@ -0,0 +1,89 @@
#!/bin/bash
# purpose: for multinode training on slurm clusters
node_type=${1:-"dgx1"}
num_nodes=${2:-1}
partition=${3:-"default"}
wall_time=${4:-"12:00:00"}
job_name=${5:-"pyt_bert"}
root_dir=${6:-"$PWD"}
train_batch_size=${7:-4}
eval_batch_size=${8:-4}
train_steps=${9:-1000000}
warmup_proportion=${10:-0.01}
learning_rate=${11:-1e-4}
precision=${12:-"fp16"}
save_checkpoint_steps=${13:-5000}
results_dir=${14:-"$root_dir/results"}
checkpoints_dir=${15:-"$root_dir/checkpoints"}
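# Example invocation (the script's file name and the values below are illustrative only;
# results_dir and checkpoints_dir fall back to the defaults above when omitted):
#   bash scripts/run_pretraining_multinode.sh dgx2h 4 default 12:00:00 pyt_bert $PWD 4 4 1000000 0.01 1e-4 fp16 5000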
CONT=${CONT:-"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.02-py3-devel"}
BENCHMARK=${BENCHMARK:-"bert"}
BENCHMARK_NAME="bert"
if [ "$node_type" = "dgx1" ] ; then
echo "Running on dgx1 systems"
DGXSYSTEM="DGX1"
DGXNGPU=8
DGXSOCKETCORES=20
DGXNSOCKET=2
DGXHT=2
DGXIBDEVICES='--device=/dev/infiniband --device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/ucm0 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/uverbs0 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1 --device=/dev/infiniband/issm0 --device=/dev/infiniband/umad0'
elif [ "$node_type" = "dgx2h" ] ; then
echo "Running on dgx2h systems"
DGXSYSTEM="DGX2H"
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'
else
echo "Unknown <node_type>, must be either dgx1 or dgx2"
exit -1
fi
printf -v EXTRA_PARAMS "%d %d %e %s 1 %f %d %d false" $train_batch_size $eval_batch_size $learning_rate "$precision" $warmup_proportion $train_steps $save_checkpoint_steps
export ROOTDIR=$root_dir
export CODEDIR=${CODEDIR:-$root_dir}   # directory containing the BERT code and scripts/run.sub
export LOGDIR=${LOGDIR:-$results_dir}  # where the sbatch output log is written
export DATA_DIR=${DATA_DIR:-$CODEDIR/data/wikipedia_corpus/pyt_hdf5_shards}
VOLS="-v $ROOTDIR:/workspace/bert"
VOLS+=" -v $DATA_DIR:/workspace/bert/data/wikipedia_corpus/pyt_hdf5_shards"
# VOLS+=" -v $BOOKS_DIR:/workspace/bert/data/bookcorpus/final_tfrecord_sharded"
VOLS+=" -v $results_dir:/results"
VOLS+=" -v $checkpoints_dir:/checkpoints"
export VOLS
export CONT
export DGXSYSTEM
export DGXNGPU
export DGXIBDEVICES
export EXTRA_PARAMS
set -x
cd $CODEDIR
pwd
PART=""
if [ "$partition" != "default" ] ; then
printf -v PART "%s" "-p $partition"
fi
export GBS=$(expr $num_nodes \* $train_batch_size \* $DGXNGPU)
printf -v TAG "%s_%dn_%s_gbs%d" "$job_name" $num_nodes "$precision" $GBS
export DATESTAMP=`date +'%y%m%d%H%M%S'`
sbatch $PART \
-N $num_nodes \
-t $wall_time \
-J $job_name \
--exclusive \
--mem=0 \
--mail-type=FAIL \
--ntasks-per-node=$DGXNGPU \
--threads-per-core=$DGXHT \
--cores-per-socket=$DGXSOCKETCORES \
--output=$LOGDIR/$TAG.$DATESTAMP.log \
$CODEDIR/scripts/run.sub
set +x

View File

@ -0,0 +1,391 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import unicodedata
import six
from io import open
from file_utils import cached_path
logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
'bert-base-uncased': 512,
'bert-large-uncased': 512,
'bert-base-cased': 512,
'bert-large-cased': 512,
'bert-base-multilingual-uncased': 512,
'bert-base-multilingual-cased': 512,
'bert-base-chinese': 512,
}
VOCAB_NAME = 'vocab.txt'
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r", encoding="utf-8") as reader:
while True:
token = reader.readline()
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class BertTokenizer(object):
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""
def __init__(self, vocab_file, do_lower_case=True, max_len=None,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
never_split=never_split)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
ids.append(self.vocab[token])
if len(ids) > self.max_len:
raise ValueError(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this BERT model ({} > {}). Running this"
" sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids):
"""Converts a sequence of ids in wordpiece tokens using the vocab."""
tokens = []
for i in ids:
tokens.append(self.ids_to_tokens[i])
return tokens
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = pretrained_model_name_or_path
if os.path.isdir(vocab_file):
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
except EnvironmentError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
vocab_file))
return None
if resolved_vocab_file == vocab_file:
logger.info("loading vocabulary file {}".format(vocab_file))
else:
logger.info("loading vocabulary file {} from cache at {}".format(
vocab_file, resolved_vocab_file))
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
return tokenizer
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self,
do_lower_case=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
self.never_split = never_split
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in self.never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
if text in self.never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
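# Example usage (illustrative): the named vocabularies above are downloaded and cached on first use.
#   tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
#   tokens = tokenizer.tokenize("BERT tokenization is greedy longest-match-first.")
#   ids = tokenizer.convert_tokens_to_ids(tokens)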

View File

@ -0,0 +1,123 @@
# NVIDIA
import hashlib
import urllib.request
import zipfile
# Download urls
model_urls = {
'bert_base_uncased' : ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'),
'bert_large_uncased' : ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'),
'bert_base_cased' : ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'),
'bert_large_cased' : ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'),
'bert_base_multilingual_cased' : ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'),
'bert_large_multilingual_uncased' : ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'),
'bert_base_chinese' : ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip')
}
# SHA256sum verification for file download integrity (and checking for changes from the download source over time)
bert_base_uncased_sha = {
'bert_config.json' : '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc',
'bert_model.ckpt.data-00000-of-00001' : '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84',
'bert_model.ckpt.index' : '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b',
'bert_model.ckpt.meta' : 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e',
'vocab.txt' : '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}
bert_large_uncased_sha = {
'bert_config.json' : 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb',
'bert_model.ckpt.data-00000-of-00001' : 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1',
'bert_model.ckpt.index' : '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093',
'bert_model.ckpt.meta' : '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1',
'vocab.txt' : '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}
bert_base_cased_sha = {
'bert_config.json' : 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc',
'bert_model.ckpt.data-00000-of-00001' : '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea',
'bert_model.ckpt.index' : '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1',
'bert_model.ckpt.meta' : '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98',
'vocab.txt' : 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}
bert_large_cased_sha = {
'bert_config.json' : '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57',
'bert_model.ckpt.data-00000-of-00001' : '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0',
'bert_model.ckpt.index' : 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf',
'bert_model.ckpt.meta' : 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1',
'vocab.txt' : 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}
bert_base_multilingual_cased_sha = {
'bert_config.json' : 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0',
'bert_model.ckpt.data-00000-of-00001' : '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5',
'bert_model.ckpt.index' : '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37',
'bert_model.ckpt.meta' : '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa',
'vocab.txt' : 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c',
}
bert_large_multilingual_uncased_sha = {
'bert_config.json' : '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624',
'bert_model.ckpt.data-00000-of-00001' : '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429',
'bert_model.ckpt.index' : '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7',
'bert_model.ckpt.meta' : '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29',
'vocab.txt' : '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f',
}
bert_base_chinese_sha = {
'bert_config.json' : '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015',
'bert_model.ckpt.data-00000-of-00001' : '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba',
'bert_model.ckpt.index' : '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e',
'bert_model.ckpt.meta' : 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047',
'vocab.txt' : '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c',
}
# Relate SHA to urls for loop below
model_sha = {
'bert_base_uncased' : bert_base_uncased_sha,
'bert_large_uncased' : bert_large_uncased_sha,
'bert_base_cased' : bert_base_cased_sha,
'bert_large_cased' : bert_large_cased_sha,
'bert_base_multilingual_cased' : bert_base_multilingual_cased_sha,
'bert_large_multilingual_uncased' : bert_large_multilingual_uncased_sha,
'bert_base_chinese' : bert_base_chinese_sha
}
# Helper to get sha256sum of a file
def sha256sum(filename):
h = hashlib.sha256()
b = bytearray(128*1024)
mv = memoryview(b)
with open(filename, 'rb', buffering=0) as f:
for n in iter(lambda : f.readinto(mv), 0):
h.update(mv[:n])
return h.hexdigest()
# Iterate over urls: download, unzip, verify sha256sum
found_mismatch_sha = False
for model in model_urls:
url = model_urls[model][0]
file = model_urls[model][1]
print("Downloading", url)
response = urllib.request.urlopen(url)
with open(file, "wb") as handle:
handle.write(response.read())
print("Unzipping", file)
zip = zipfile.ZipFile(file, 'r')
zip.extractall()
zip.close()
sha_dict = model_sha[model]
for extracted_file in sha_dict:
sha = sha_dict[extracted_file]
if sha != sha256sum(file[:-4] + "/" + extracted_file):
found_mismatch_sha = True
print("SHA256sum does not match on file:", extracted_file, "from download url:", url)
else:
print(file[:-4] + "/" + extracted_file, "\t", "verified")
if not found_mismatch_sha:
print("All downloads pass sha256sum verification.")

File diff suppressed because it is too large

View File

@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvcr.io/nvidia/pytorch:18.12.1-py3
FROM nvcr.io/nvidia/pytorch:19.05-py3
RUN apt-get update && \
apt-get install -y unzip

View File

@ -1,6 +1,52 @@
# Neural Collaborative Filtering (NCF)
# Neural Collaborative Filtering (NCF) for PyTorch
This repository provides a script and recipe to train the Neural Collaborative Filtering (NCF)
model to achieve state of the art accuracy, and is tested and maintained by NVIDIA.
Table of Contents
=================
* [The model](#the-model)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Setup](#setup)
* [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Details](#details)
* [Scripts and sample code](#scripts-and-sample-code)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [ML-1m](#ml-1m)
* [Training process](#training-process)
* [Inference process](#inference-process)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g)
* [Training stability test](#training-stability-test)
* [Training performance results](#training-performance-results)
* [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
* [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g-1)
* [NVIDIA DGX-2 (16x V100 32G)](#nvidia-dgx-2-16x-v100-32g)
* [Inference performance results](#inference-performance-results)
* [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g-1)
* [NVIDIA DGX-1 (8x V100 32G)](#nvidia-dgx-1-8x-v100-32g-2)
* [NVIDIA DGX-2 (16x V100 32G)](#nvidia-dgx-2-16x-v100-32g-1)
* [Changelog](#changelog)
* [Known issues](#known-issues)
* [Scaling beyond 8 GPUs](#scaling-beyond-8-gpus)
* [Memory usage](#memory-usage)
## The model
The NCF model focuses on providing recommendations, also known as collaborative filtering, with implicit feedback. The training data for this model should contain binary information about whether a user interacted with a specific item.
NCF was first described by Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu and Tat-Seng Chua in the [Neural Collaborative Filtering paper](https://arxiv.org/abs/1708.05031).
@ -8,6 +54,23 @@ The implementation in this repository focuses on the NeuMF instantiation of the
We modified it to use dropout in the FullyConnected layers. This reduces overfitting and increases the final accuracy.
Training the other two instantiations of NCF (GMF and MLP) is not supported.
Contrary to the original paper, we benchmark the model on the larger [ML-20m dataset](https://grouplens.org/datasets/movielens/20m/)
instead of using the smaller [ML-1m](https://grouplens.org/datasets/movielens/1m/) dataset, as we think this is more representative of production-type environments.
However, using the ML-1m dataset is also supported.
This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. Multi-GPU training is also supported. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Model architecture
This model is based mainly on Embedding and FullyConnected layers. The control flow is divided into two branches:
* Multi Layer Perceptron (MLP) branch, which transforms the input through FullyConnected layers with ReLU activations and dropout.
* Matrix Factorization (MF) branch, which performs collaborative filtering factorization.
Each user and each item has two embedding vectors associated with it -- one for the MLP branch and the other for the MF branch.
The outputs from those branches are concatenated and fed to the final FullyConnected layer with sigmoid activation.
This can be interpreted as a probability of a user interacting with a given item.
<p align="center">
<img width="70%" src="./img/ncf_diagram.png" />
@ -16,252 +79,483 @@ Figure 1. The architecture of a Neural Collaborative Filtering model. Taken from
</p>
Contrary to the original paper, we benchmark the model on the larger [ml-20m dataset](https://grouplens.org/datasets/movielens/20m/)
instead of using the smaller [ml-1m](https://grouplens.org/datasets/movielens/1m/) dataset, as we think this is more representative of production-type environments.
However, using the ml-1m dataset is also supported.
### Default configuration
## Requirements
The following features were implemented in this model:
* Automatic Mixed Precision (AMP)
* Data-parallel multi-GPU training and evaluation
* Dropout
* Gradient accumulation
The easiest way to train the model is to use a Docker container. This would require:
* [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 18.12.1-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch) or newer
For more information about how to get started with NGC containers, see the
following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning
Frameworks Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
The following performance optimizations were implemented in this model:
* FusedAdam optimizer
* Approximate train negative sampling
* Caching all the positive training samples in the device memory
## Training using mixed precision with Tensor Cores
### Supported hardware
Before you can train using mixed precision with Tensor Cores, ensure that you have an
NVIDIA Volta based GPU. Other platforms may work, however, are not officially
supported.
### Software changes
For detailed information about how to train using mixed precision, see the [Mixed
Precision Training paper](https://arxiv.org/abs/1710.03740)
and [Training With Mixed Precision documentation](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html).
### Feature support matrix
The following features are supported by this model:
Another option for adding mixed-precision support is available from NVIDIA's
[APEX](https://github.com/NVIDIA/apex), a PyTorch extension, that contains utility libraries, such as AMP, which require minimal network code changes to leverage Tensor Core performance.
| **Feature** | **NCF PyTorch** |
|:---:|:--------:|
| Automatic Mixed Precision (AMP) | Yes |
| Multi-GPU training with Distributed Data Parallel (DDP) | Yes |
| Fused Adam | Yes |
This implementation of the NCF model uses a custom FP16 optimizer to implement mixed precision with static loss scaling.
The custom FP16 Optimizer was used to take advantage of the performance gains provided by the FusedOptimizer.
#### Features
* Automatic Mixed Precision - This implementation of NCF uses AMP to implement mixed precision training.
It allows us to use FP16 training with FP32 master weights by modifying just 3 lines of code.
* Multi-GPU training with Distributed Data Parallel - uses Apex's DDP to implement efficient multi-GPU training with NCCL.
* Fused Adam - We use the fused implementation of the Adam optimizer provided by the Apex package. It fuses several elementwise operations of the weight update into a single kernel for faster updates.
Since NCF is a relatively lightweight model with a large number of parameters, we've observed significant performance improvements from using FusedAdam; a minimal usage sketch follows this list.
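This sketch shows what plugging in Apex's `FusedAdam` can look like; the model here is only a stand-in (not this repository's NeuMF implementation), and the hyperparameter values are the defaults quoted later in this README.
```python
import torch
from apex.optimizers import FusedAdam

model = torch.nn.Linear(128, 1).cuda()  # stand-in for the NeuMF model
optimizer = FusedAdam(model.parameters(), lr=0.0045, betas=(0.25, 0.5), eps=1e-8)
```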
## Quick start guide
## Setup
The following section lists the requirements in order to start training the Neural Collaborative Filtering model.
### 1. Build and launch an NCF PyTorch Docker container
### Requirements
This repository contains a Dockerfile which extends the PyTorch NGC container and encapsulates some dependencies.
Aside from these dependencies, ensure you have the following components:
* NVIDIA Docker
* PyTorch 19.05-py3 NGC container
* NVIDIA Volta or Turing based GPU
After Docker is correctly set up, you can build the NCF image with:
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned NVIDIA Container Support Matrix.
### Quick Start Guide
1. Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/Recommendation/NCF
```
2. Build an NCF PyTorch Docker container.
After Docker is set up, you can build the NCF image with:
```bash
docker build . -t nvidia_ncf
```
After that the NVIDIA NCF container can be launched with:
3. Start an interactive session in the NGC container to run preprocessing/training and inference.
The NCF PyTorch container can be launched with:
```bash
mkdir data
docker run --runtime=nvidia -it --rm --ipc=host -v ${PWD}/data:/data nvidia_ncf bash
```
This will launch the container and mount the ./data directory as a volume to the /data directory inside the container.
Any datasets and experiment results (logs, checkpoints etc.) saved to /data will be accessible
in the './data' directory on the host.
This will launch the container and mount the `./data` directory as a volume to the `./data` directory inside the container.
Any datasets and experiment results (logs, checkpoints etc.) saved to `./data` will be accessible
in the `./data` directory on the host.
### 2. Data preparation
4. Download and preprocess the data.
Preprocessing consists of downloading the data, filtering out users that have less than 20 ratings (by default), sorting the data and dropping the duplicates.
The preprocessed train and test data is then saved in PyTorch binary format to be loaded just before training.
Note: Preprocessing requires PyTorch and should therefore be run inside the Docker container.
No data augmentation techniques are used.
To download and preprocess the ml-20m dataset you can run:
To download and preprocess the ML-20m dataset you can run:
```bash
./prepare_dataset.sh
```
Please note that this command will return immediately without downloading anything if the data is already present in the /data directory.
Note: This command will return immediately without downloading anything if the data is already present in the `./data` directory.
#### Other datasets
This will store the preprocessed training and evaluation data in the `./data` directory so that it can be later
used to train the model (by passing the appropriate `--data` argument to the `ncf.py` script).
This implementation is tuned for the ml-20m and ml-1m datasets.
Using other datasets might require tuning some hyperparameters (e.g., learning rate, beta1, beta2)
5. Start training.
If you'd like to use your custom dataset you can do it by adding support for it in the prepare_dataset.sh and download_dataset.sh scripts.
The required format of the data is a CSV file in which the first column contains the userID and the second column contains
the itemID.
The performance of the model depends on the dataset size.
Generally, the model should scale better for datasets containing more data points.
For a smaller dataset you might experience slower performance.
##### ml-1m
To download and preprocess the ml-1m dataset run:
```bash
./prepare_dataset.sh ml-1m
```
This will store the preprocessed training and evaluation data in the /data directory so that it can be later
used to train the model (by passing the appropriate --data argument to the ncf.py script).
### 3. Run the training
After the docker container is launched, the training with the [default hyperparameters](#5-hyperparameters) can be started with:
After the Docker container is launched, the training with the default hyperparameters can be started with:
```bash
./prepare_dataset.sh
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m
```
This will result in a checkpoint file being written to /data/checkpoints/model.pth.
This will result in a checkpoint file being written to `/data/checkpoints/model.pth`.
### 4. Test a trained model
6. Start validation/evaluation.
The trained model can be evaluated by passing the --mode test flag to the ncf.py script:
The trained model can be evaluated by passing the `--mode test` flag to the `ncf.py` script:
```bash
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m --mode test --checkpoint-path /data/checkpoints/model.pth
```
### 5. Hyperparameters and command line arguments
The default hyperparameters used are:
## Details
* learning rate: 0.0045
* beta1: 0.25
* beta2: 0.5
* training batch size: 1048576
* epsilon: 1e-8
* loss scale: 8192
* negatives sampled for training: 4
* use mixed precision training: Yes
* number of GPUs used: 8
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
The `ncf.py` script contains most of the training and validation logic. Data loading and preprocessing code is located in `dataloading.py`.
The model architecture is defined in `neumf.py`. Some initial data preprocessing is located in `convert.py`.
The logger directory contains simple bookkeeping utilities for storing training results.
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example:
`python ncf.py --help`
The following example output is printed when running the sample:
```
usage: ncf.py [-h] [--data DATA] [-e EPOCHS] [-b BATCH_SIZE]
[--valid_batch_size VALID_BATCH_SIZE] [-f FACTORS]
[--layers LAYERS [LAYERS ...]] [-n NEGATIVE_SAMPLES]
[-l LEARNING_RATE] [-k TOPK] [--seed SEED]
[--threshold THRESHOLD] [--valid_negative VALID_NEGATIVE]
[--beta1 BETA1] [--beta2 BETA2] [--eps EPS] [--dropout DROPOUT]
[--checkpoint_dir CHECKPOINT_DIR] [--mode {train,test}]
[--grads_accumulated GRADS_ACCUMULATED] [--opt_level {O0,O2}]
[--local_rank LOCAL_RANK]
Train a Neural Collaborative Filtering model:
optional arguments:
-h, --help show this help message and exit
--data DATA Path to test and training data files
-e EPOCHS, --epochs EPOCHS
Number of epochs for training
-b BATCH_SIZE, --batch_size BATCH_SIZE
Number of examples for each iteration
--valid_batch_size VALID_BATCH_SIZE
Number of examples in each validation chunk
-f FACTORS, --factors FACTORS
Number of predictive factors
--layers LAYERS [LAYERS ...]
Sizes of hidden layers for MLP
-n NEGATIVE_SAMPLES, --negative_samples NEGATIVE_SAMPLES
Number of negative examples per interaction
-l LEARNING_RATE, --learning_rate LEARNING_RATE
Learning rate for optimizer
-k TOPK, --topk TOPK Rank for test examples to be considered a hit
--seed SEED, -s SEED Manually set random seed for torch
--threshold THRESHOLD, -t THRESHOLD
Stop training early at threshold
--valid_negative VALID_NEGATIVE
Number of negative samples for each positive test
example
--beta1 BETA1, -b1 BETA1
Beta1 for Adam
--beta2 BETA2, -b2 BETA2
Beta2 for Adam
--eps EPS Epsilon for Adam
--dropout DROPOUT Dropout probability, if equal to 0 will not use
dropout at all
--checkpoint_dir CHECKPOINT_DIR
Path to the directory storing the checkpoint file
--mode {train,test} Passing "test" will only run a single evaluation,
otherwise full training will be performed
--grads_accumulated GRADS_ACCUMULATED
Number of gradients to accumulate before performing an
optimization step
--opt_level {O0,O2} Optimization level for Automatic Mixed Precision
--local_rank LOCAL_RANK
Necessary for multi-GPU training
All these parameters can be controlled by passing command line arguments to the ncf.py script.
To get a complete list of all command line arguments with descriptions and default values you can run:
```bash
python ncf.py --help
```
### Getting the data
## Training accuracy results
The NCF model was trained on the ML-20m dataset.
For each user, the interaction with the latest timestamp was included in the test set and the rest of the examples are used as the training data.
This repository contains the `./prepare_dataset.sh` script which will automatically download and preprocess the training and validation datasets.
By default, data will be downloaded to the `/data` directory. The preprocessed data will be placed in `/data/cache`.
#### Dataset guidelines
The required format of the data is a CSV file with three columns: `user_id`, `item_id` and `timestamp`. This CSV should contain only the positive examples, in other words,
the ones for which an interaction between a user and an item occurred. The negatives will be sampled during the training and validation.
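For illustration only, a tiny input file in this format could be created as follows (the file name and values are hypothetical):
```python
import pandas as pd

# Every row is one observed, i.e. positive, user-item interaction.
interactions = pd.DataFrame({
    "user_id":   [0, 0, 1],
    "item_id":   [12, 7, 12],
    "timestamp": [1112484027, 1112484580, 1094785740],
})
interactions.to_csv("interactions.csv", index=False)
```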
#### Multi-dataset
This implementation is tuned for the ML-20m and ML-1m datasets.
Using other datasets might require tuning some hyperparameters (for example, learning rate, beta1 and beta2).
If you'd like to use your custom dataset you can do it by adding support for it in the `prepare_dataset.sh` and `download_dataset.sh` scripts.
The performance of the model depends on the dataset size.
Generally, the model should scale better for datasets containing more data points.
For a smaller dataset you might experience slower performance.
#### ML-1m
To download, preprocess and train on the ML-1m dataset run:
```bash
./prepare_dataset.sh ml-1m
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-1m
```
### Training process
The name of the training script is `ncf.py`. Because of the multi-GPU support, it should always be run with the torch distributed launcher like this:
```bash
python -m torch.distributed.launch --nproc_per_node=<number_of_gpus> ncf.py --data <path_to_dataset> [other_parameters]
```
The main results of the training are the checkpoints stored by default in `/data/checkpoints/`. This location can be controlled
by the `--checkpoint_dir` command-line argument.
The validation metric is Hit Rate at 10 (HR@10) with 100 test negative samples. This means that for each positive sample in
the test set 100 negatives are sampled. All resulting 101 samples are then scored by the model. If the true positive sample is
among the 10 samples with highest scores we have a "hit" and the metric is equal to 1, otherwise it's equal to 0.
The HR@10 metric is the number of hits in the entire test set divided by the number of samples in the test set.
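As a rough sketch (not this repository's implementation), the per-sample hit computation behind HR@10 can be written as:
```python
import torch

def hit_at_10(scores: torch.Tensor, positive_index: int) -> float:
    """scores holds model outputs for 1 positive and 100 sampled negatives (101 values);
    the sample is a hit if the positive item is among the 10 highest-scoring candidates."""
    top10 = torch.topk(scores, k=10).indices
    return float(positive_index in top10)

# HR@10 over the test set is then the mean of these per-user hits.
```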
### Inference process
Inference can be launched with the same script used for training by passing the `--mode test` flag:
```bash
python -m torch.distributed.launch --nproc_per_node=<number_of_gpus> ncf.py --data <path_to_dataset> --mode test [other_parameters]
```
The script will then:
* Load the checkpoint from the directory specified by the `--checkpoint_dir` argument
* Run inference on the test dataset
* Compute and print the validation metric
## Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [tensor cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
### Enabling mixed precision
Using the Automatic Mixed Precision (AMP) package requires two modifications in the source code.
The first one is to initialize the model and the optimizer using the `amp.initialize` function:
```python
model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
keep_batchnorm_fp32=False, loss_scale='dynamic')
```
The second one is to use the AMP's loss scaling context manager:
```python
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
```
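Putting the two snippets together, a single training step might look like the following sketch; the data loader and loss function are placeholders, and `model` and `optimizer` are the objects returned by the `amp.initialize` call above.
```python
import torch

criterion = torch.nn.BCEWithLogitsLoss()       # placeholder loss for implicit feedback
for users, items, labels in train_loader:      # placeholder data loader
    optimizer.zero_grad()
    loss = criterion(model(users, items), labels)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
```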
## Benchmarking
### Training performance benchmark
NCF training on NVIDIA DGX systems is very fast; therefore, in order to measure training and validation throughput, you can simply run the full training job with:
```bash
./prepare_dataset.sh
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m --epochs 5
```
At the end of the script, a line reporting the best train throughput is printed.
### Inference performance benchmark
Validation throughput can be measured by running the full training job with:
```bash
./prepare_dataset.sh
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m --epochs 5
```
The best validation throughput is reported to the standard output.
## Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
### Training accuracy results
#### NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs.
The following table lists the best hit rate at 10 for DGX-1 with 8 V100 32G GPUs:
| **Number of GPUs** | **Full precision HR@10** | **Mixed precision HR@10** |
| **Number of GPUs** | **Single precision HR@10** | **Mixed precision HR@10** |
|:---:|:--------:|:-------:|
|1| 0.959015 |0.959485|
|4| 0.959389 |0.959274|
|8| 0.959015 |0.96|
|1| 0.95847 | 0.95845 |
|4| 0.95887 | 0.95841 |
|8| 0.95850 | 0.95885 |
Here's an example validation accuracy curve for mixed precision vs full precision on DGX-1 with 8 V100 32G GPUs:
Here's an example validation accuracy curve for mixed precision vs single precision on DGX-1 with 8 V100 32G GPUs:
![ValidationAccuracy](./img/dgx1v_32_curve.png)
To reproduce this result, start the NCF Docker container interactively and run:
```bash
./prepare_dataset.sh
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m
```
Training accuracy results on a DGX-1 with 8 V100 16G GPUs and on DGX-2 should be the same.
#### Training stability test
The histogram below shows the best HR@10 achieved
for 400 experiments using mixed precision and 400 experiments using single precision.
Mean HR@10 for mixed precision was equal to 0.95917 and for single precision it was equal to
0.95915.
Mean HR@10 for mixed precision was equal to 0.95868 and for single precision it was equal to
0.95867.
![hr_histogram](./img/hr_histogram.png)
## Training performance results
### Training performance results
This example is based on [our submission for the MLPerf v0.5 benchmark](https://github.com/mlperf/results/tree/master/v0.5.0/nvidia/submission/code/recommendation/pytorch). Please note that we've introduced some improvements to this version that make time-to-train not directly comparable between it and our MLPerf submission:
- This version uses a more efficient multi-GPU sharding algorithm
- We added dropout operations here to achieve better accuracy
- This version uses 100 negatives by default during the evaluation phase, as was done in the original NCF paper; the MLPerf version used 999
- We save the model checkpoints in this version. This might make the training a few seconds slower depending on the speed of your storage
### NVIDIA DGX-1 with 8 V100 16G GPUs
#### NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
The following table shows the best training throughput:
| **Number of GPUs (samples/sec)** | **Mixed precision (samples/sec)** | **Full precision (samples/sec)** | **Speedup** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 20,027,840 | 9,529,271 | 2.10 |
| 4 | 62,633,260| 32,719,700 | 1.91 |
| 8 | 99,332,230| 55,004,590 | 1.81 |
| **Number of GPUs** | **Batch size per GPU**| **Mixed precision throughput (samples/sec)** | **Single precision throughput (samples/sec)** | **Speed-up with mixed precision** | **Multi-GPU strong scaling with mixed precision** | **Multi-GPU strong scaling with FP32** |
|:---:|:--------:|:-----:|:-----------:|:-----:|:----:|:---|
| 1 |1048576| 20,459,365| 9,777,551 | 2.09 | 1 | 1 |
| 4 |262144 | 61,782,125| 32,583,924 | 1.90 | 3.02 |3.33|
| 8 |131072 | 98,464,084| 55,365,147 | 1.78 |4.81 |5.66|
The following table shows the average time to reach HR@10 of 0.9562 across 5 random seeds. The training time was measured excluding data downloading, preprocessing, validation data generation and library initialization times.
The following table shows mean time to reach HR@10 of 0.9562 across 5 random seeds. The training time was measured excluding data downloading, preprocessing and library initialization times.
| **Number of GPUs** | **Batch size per GPU** | **Mixed precision (seconds)** | **Single precision (seconds)** | **Speed-up with mixed precision** |
|:---:|:----:|:---------:|:-----------:|:-----:|
| 1 | 1048576| 67.03 | 142.31 | 2.12 |
| 4 | 262144| 23.92 | 47.57 | 1.99 |
| 8 | 131072| 18.82 | 31.48 | 1.67 |
| **Number of GPUs (samples/sec)** | **Mixed precision (seconds)** | **Full precision (seconds)** | **Speedup** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 78.73 | 153.90 | 1.95 |
| 4 | 25.80 | 49.41 | 1.92 |
| 8 | 20.42 | 32.68 | 1.60 |
### NVIDIA DGX-1 with 8 V100 32G GPUs
#### NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs.
The following table shows the best training throughput:
| **Number of GPUs (samples/sec)** | **Mixed precision (samples/sec)** | **Full precision (samples/sec)** | **Speedup** |
| **Number of GPUs** | **Batch size per GPU** | **Mixed precision throughput (samples/sec)** | **Single precision throughput (samples/sec)** | **Speed-up with mixed precision** | **Multi-GPU strong scaling with mixed precision** | **Multi-GPU strong scaling with FP32** |
|:---:|:----:|:---------:|:-----------:|:-----:|:---:|:---:|
| 1 | 1048576| 19,314,944 | 9,464,431 | 2.04 | 1 | 1 |
| 4 | 262144| 58,579,745 |31,577,085 | 1.86 | 3.03 | 3.34 |
| 8 | 131072| 92,964,306 | 53,972,811 | 1.72 | 4.81 | 5.70 |
The following table shows the average time to reach HR@10 of 0.9562 across 5 random seeds. The training time was measured excluding data downloading, preprocessing, validation data generation and library initialization times.
| **Number of GPUs** | **Mixed precision (seconds)** | **Single precision (seconds)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 70.49 | 146.68 | 2.08 |
| 4 | 24.61 | 49.01 | 1.99 |
| 8 | 19.72 | 32.25 | 1.64 |
#### NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G GPUs.
The following table shows the best training throughput:
| **Number of GPUs** | **Batch size per GPU** | **Mixed precision throughput (samples/sec)** | **Single precision throughput (samples/sec)** | **Speed-up with mixed precision** | **Multi-GPU strong scaling with mixed precision** | **Multi-GPU strong scaling with FP32** |
|:---:|:-----:|:-------:|:-----------:|:-----:|:---:|:---:|
| 1 | 1048576| 20,645,544 | 10,145,873 | 2.03 | 1 | 1 |
| 4 | 262144 | 63,608,950 | 34,758,369 | 1.83 | 3.08 | 3.43 |
| 8 | 131072| 98,887,103 | 57,251,418 | 1.73 | 4.79 | 5.64 |
| 16 | 65536| 128,976,394 | 82,932,545 | 1.56 | 6.25 | 8.17 |
The following table shows the average time to reach HR@10 of 0.9562 across 5 random seeds. The training time was measured excluding data downloading, preprocessing, validation data generation and library initialization times.
| **Number of GPUs** | **Mixed precision (seconds)** | **Single precision (seconds)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 65.99 |134.93 |2.04|
| 4 | 26.21 |41.12 |1.57|
| 8 | 21.96 |29.71 |1.35|
| 16| 22.15 |28.99 |1.31|
### Inference performance results
#### NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
The following table shows the best inference throughput:
| **Number of GPUs** | **Mixed precision (samples/sec)** | **Single precision (samples/sec)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 57,163,273 | 28,877,257 | 1.98 |
#### NVIDIA DGX-1 (8x V100 32G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 32G GPUs.
The following table shows the best inference throughput:
| **Number of GPUs** | **Mixed precision (samples/sec)** | **Single precision (samples/sec)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 54,570,476 | 28,085,521 | 1.94 |
#### NVIDIA DGX-2 (16x V100 32G)
Our results were obtained by following the steps in the Quick Start Guide in the PyTorch 19.05-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G GPUs.
The following table shows the best inference throughput:
| **Number of GPUs** | **Mixed precision (samples/sec)** | **Single precision (samples/sec)** | **Speed-up with mixed precision** |
|:---:|:-------------:|:-----------:|:-----:|
| 1 | 58,383,216 | 30,018,043 | 1.94 |
## Changelog
1. January 22, 2018
* Initial release
2. May 2019
* Lower memory consumption (down from about 18 GB to 10 GB for batch size 1M on a single NVIDIA Tesla V100), achieved by using an approximate method for generating negatives for training (see the sketch after this changelog).
* Automatic Mixed Precision (AMP) with dynamic loss scaling instead of a custom mixed-precision optimizer.
* Performance numbers for NVIDIA DGX-2.
* Data loading code cleanup.
* Default container updated to PyTorch 19.05-py3.
* Updated README.md.
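The approximate negative generation mentioned above mirrors `prepare_epoch_train_data` in the updated `dataloading.py`: negatives are drawn uniformly at random from the whole item range without being checked against the set of known positives. A minimal sketch (the helper name and shapes are illustrative, not the repository's exact code):
```python
import torch

def sample_negatives_approx(train_users, nb_items, negative_samples):
    # repeat every positive interaction's user id `negative_samples` times ...
    neg_users = train_users.repeat(negative_samples)
    # ... and pair each copy with a uniformly random item id in [0, nb_items);
    # a small fraction of these "negatives" may actually be positives, which is
    # the approximation that avoids building a large user x item exclusion mask
    neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_(0, nb_items)
    return neg_users, neg_items
```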
## Known issues
### Scaling beyond 8 GPUs
Neural Collaborative Filtering is a relatively lightweight model that trains quickly on the relatively small ML-20m dataset.
Because of that, the high ratio of communication to computation makes it difficult to
efficiently use more than 8 GPUs. Typically, this is not an issue because when using 8
GPUs with FP16 precision, the training is sufficiently fast. However, if you'd like to
scale the training to 16 GPUs and beyond, you might try modifying the model so that
the communication-computation ratio facilitates better scaling. This could be done, for example,
by finding hyperparameters that enable using a larger batch size or by reducing the
number of trainable parameters.
### Memory usage
Training on a single GPU with less than 16 GB of memory or switching off FP16 mode might result in out-of-memory errors. To reduce memory usage, you can use a smaller batch size.
However, since we're using the Adam optimizer, this might require changing hyperparameters such as the learning rate, beta1, and beta2.
To circumvent this, you can use gradient accumulation to combine multiple gradients computed from smaller batches into a single weight update.
This keeps the “effective” batch size the same as the original and enables using the default hyperparameters with much lower memory usage:
```bash
python -m torch.distributed.launch --nproc_per_node=8 ncf.py --data /data/cache/ml-20m --grads_accumulated 2 --batch_size 524288
```
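For reference, a minimal sketch of what gradient accumulation amounts to inside the training loop (generic PyTorch with assumed names, not the exact `ncf.py` code): gradients from several smaller sub-batches are summed in `.grad` before a single optimizer step, so the effective batch size, and therefore the tuned Adam hyperparameters, stay unchanged.
```python
import torch

def step_with_accumulation(model, optimizer, criterion, sub_batches):
    # `sub_batches` is a list of (users, items, labels) chunks that together
    # add up to the original (effective) batch size
    optimizer.zero_grad()
    for users, items, labels in sub_batches:
        loss = criterion(model(users, items), labels.view(-1, 1)).mean()
        loss.backward()        # gradients accumulate across sub-batches
    optimizer.step()           # one weight update for the whole effective batch
```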
In the default settings, the additional memory beyond 16 GB may not be fully utilized.
This is because we set the default batch size for the ML-20m dataset to 1M,
which is too small to completely fill up multiple 32 GB GPUs.
1M is the batch size for which we experienced the best convergence on the ML-20m dataset.
However, on other datasets, even better performance may be possible with hyperparameters that work well for larger batches and leverage the additional GPU memory.

View File

@ -0,0 +1,158 @@
# Copyright (c) 2018, deepakn94, codyaustun, robieta. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# -----------------------------------------------------------------------
#
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import torch
import tqdm
class _TestNegSampler:
def __init__(self, train_ratings, nb_neg):
self.nb_neg = nb_neg
self.nb_users = int(train_ratings[:, 0].max()) + 1
self.nb_items = int(train_ratings[:, 1].max()) + 1
# compute unique ids so a hash set can be built quickly for fast lookup
ids = (train_ratings[:, 0] * self.nb_items) + train_ratings[:, 1]
self.set = set(ids)
def generate(self, batch_size=128*1024):
users = torch.arange(0, self.nb_users).reshape([1, -1]).repeat([self.nb_neg, 1]).transpose(0, 1).reshape(-1)
items = [-1] * len(users)
random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
print('Generating validation negatives...')
for idx, u in enumerate(tqdm.tqdm(users.tolist())):
if not random_items:
random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
j = random_items.pop()
while u * self.nb_items + j in self.set:
if not random_items:
random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
j = random_items.pop()
items[idx] = j
items = torch.LongTensor(items)
return items
def create_test_data(train_ratings, test_ratings, args):
test_users = test_ratings[:,0]
test_pos = test_ratings[:,1].reshape(-1,1)
begin = time.time()
sampler = _TestNegSampler(train_ratings.cpu().numpy(), args.valid_negative)
test_negs = sampler.generate().cuda()
end = time.time()
print('Generating validation negatives took: ', end - begin)
del train_ratings
# create items with real sample at last position
test_users = test_users.reshape(-1,1).repeat(1, 1 + args.valid_negative)
test_items = torch.cat((test_negs.reshape(-1, args.valid_negative), test_pos), dim=1)
del test_ratings, test_negs
# generate dup mask and real indices to match the reference implementation's handling of duplicates exactly
# here we need a sort that is stable (keeps the order of duplicates)
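# torch.sort alone is not guaranteed to be stable, so the lines below add a
# fractional tie-breaker (original index / row length) to each sorted value and
# sort that sum, which recovers a stable ordering of duplicate items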
sorted_items, indices = torch.sort(test_items) # [1,1,1,2], [3,1,0,2]
sum_item_indices = sorted_items.float()+indices.float()/len(indices[0]) #[1.75,1.25,1.0,2.5]
indices_order = torch.sort(sum_item_indices)[1] #[2,1,0,3]
stable_indices = torch.gather(indices, 1, indices_order) #[0,1,3,2]
# produce -1 mask
dup_mask = (sorted_items[:,0:-1] == sorted_items[:,1:])
dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask),dim=1)
dup_mask = torch.gather(dup_mask,1,stable_indices.sort()[1])
# produce real sample indices to later check in topk
sorted_items, indices = (test_items != test_pos).sort()
sum_item_indices = sorted_items.float()+indices.float()/len(indices[0])
indices_order = torch.sort(sum_item_indices)[1]
stable_indices = torch.gather(indices, 1, indices_order)
real_indices = stable_indices[:,0]
if args.distributed:
test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank]
test_users = test_users.view(-1).split(args.valid_batch_size)
test_items = test_items.view(-1).split(args.valid_batch_size)
return test_users, test_items, dup_mask, real_indices
def prepare_epoch_train_data(train_ratings, nb_items, args):
# create label
train_label = torch.ones_like(train_ratings[:,0], dtype=torch.float32)
neg_label = torch.zeros_like(train_label, dtype=torch.float32)
neg_label = neg_label.repeat(args.negative_samples)
train_label = torch.cat((train_label,neg_label))
del neg_label
train_users = train_ratings[:,0]
train_items = train_ratings[:,1]
train_users_per_worker = len(train_label) / args.world_size
train_users_begin = int(train_users_per_worker * args.local_rank)
train_users_end = int(train_users_per_worker * (args.local_rank + 1))
# prepare data for epoch
neg_users = train_users.repeat(args.negative_samples)
neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_(0, nb_items)
epoch_users = torch.cat((train_users, neg_users))
epoch_items = torch.cat((train_items, neg_items))
del neg_users, neg_items
# shuffle prepared data and split into batches
epoch_indices = torch.randperm(train_users_end - train_users_begin, device='cuda:{}'.format(args.local_rank))
epoch_indices += train_users_begin
epoch_users = epoch_users[epoch_indices]
epoch_items = epoch_items[epoch_indices]
epoch_label = train_label[epoch_indices]
if args.distributed:
local_batch = args.batch_size // args.world_size
else:
local_batch = args.batch_size
epoch_users = epoch_users.split(local_batch)
epoch_items = epoch_items.split(local_batch)
epoch_label = epoch_label.split(local_batch)
# the last batch will almost certainly be smaller, drop it
epoch_users = epoch_users[:-1]
epoch_items = epoch_items[:-1]
epoch_label = epoch_label[:-1]
return epoch_users, epoch_items, epoch_label

View File

@ -3,16 +3,19 @@ RAW_DATADIR=$2
function download_20m {
echo "Download ml-20m"
cd ${RAW_DATADIR}
curl -O http://files.grouplens.org/datasets/movielens/ml-20m.zip
mv ml-20m.zip ${RAW_DATADIR}
cd -
}
function download_1m {
echo "Downloading ml-1m"
cd ${RAW_DATADIR}
curl -O http://files.grouplens.org/datasets/movielens/ml-1m.zip
mv ml-1m.zip ${RAW_DATADIR}
cd -
}
if [[ ${DATASET_NAME} == "ml-1m" ]]
then
download_1m

Binary file not shown.


Binary file not shown.


View File

@ -60,12 +60,11 @@ def collect_by_scope(loglines):
# gather eval_accuracy
eval_accuracy_dup = [l.value for l in loglines if l.tag == tags.EVAL_ACCURACY]
eval_accuracy = list({l['value']:l for l in eval_accuracy_dup})
eval_accuracy = [l['value'] for l in eval_accuracy_dup]
epoch_stats['eval_accuracy'] = eval_accuracy
# gather it_per_sec
eval_it_per_sec = [l.value for l in loglines if l.tag == tags.PERF_IT_PER_SEC]
#eval_it_per_sec = list({l['value']:l for l in eval_it_per_sec_dup})
epoch_stats['it_per_sec'] = eval_it_per_sec

View File

@ -35,23 +35,21 @@ import os
import sys
import math
import time
from datetime import datetime
from collections import OrderedDict
from argparse import ArgumentParser
import torch
import torch.nn as nn
import utils
import dataloading
from neumf import NeuMF
from logger.logger import LOGGER, timed_block, timed_function
from logger import tags
from logger.autologging import log_hardware, log_args
from fp_optimizers import Fp16Optimizer
from apex.parallel import DistributedDataParallel as DDP
from apex import amp
LOGGER.model = 'ncf'
@ -60,30 +58,28 @@ def parse_args():
" Filtering model")
parser.add_argument('--data', type=str,
help='Path to test and training data files')
parser.add_argument('-e', '--epochs', type=int, default=40,
parser.add_argument('-e', '--epochs', type=int, default=30,
help='Number of epochs for training')
parser.add_argument('-b', '--batch-size', type=int, default=1048576,
parser.add_argument('-b', '--batch_size', type=int, default=2**20,
help='Number of examples for each iteration')
parser.add_argument('--valid-batch-size', type=int, default=2**20,
parser.add_argument('--valid_batch_size', type=int, default=2**20,
help='Number of examples in each validation chunk')
parser.add_argument('-f', '--factors', type=int, default=64,
help='Number of predictive factors')
parser.add_argument('--layers', nargs='+', type=int,
default=[256, 256, 128, 64],
help='Sizes of hidden layers for MLP')
parser.add_argument('-n', '--negative-samples', type=int, default=4,
parser.add_argument('-n', '--negative_samples', type=int, default=4,
help='Number of negative examples per interaction')
parser.add_argument('-l', '--learning-rate', type=float, default=0.0045,
parser.add_argument('-l', '--learning_rate', type=float, default=0.0045,
help='Learning rate for optimizer')
parser.add_argument('-k', '--topk', type=int, default=10,
help='Rank for test examples to be considered a hit')
parser.add_argument('--seed', '-s', type=int, default=0,
parser.add_argument('--seed', '-s', type=int, default=1,
help='Manually set random seed for torch')
parser.add_argument('--threshold', '-t', type=float, default=1.0,
help='Stop training early at threshold')
parser.add_argument('--no-fp16', action='store_false', dest='fp16',
help='Do not use fp16')
parser.add_argument('--valid-negative', type=int, default=100,
parser.add_argument('--valid_negative', type=int, default=100,
help='Number of negative samples for each positive test example')
parser.add_argument('--beta1', '-b1', type=float, default=0.25,
help='Beta1 for Adam')
@ -93,14 +89,15 @@ def parse_args():
help='Epsilon for Adam')
parser.add_argument('--dropout', type=float, default=0.5,
help='Dropout probability, if equal to 0 will not use dropout at all')
parser.add_argument('--loss-scale', default=8192, type=int,
help='Loss scale to use for mixed precision training')
parser.add_argument('--checkpoint-dir', default='/data/checkpoints/', type=str,
parser.add_argument('--checkpoint_dir', default='/data/checkpoints/', type=str,
help='Path to the directory storing the checkpoint file')
parser.add_argument('--mode', choices=['train', 'test'], default='train', type=str,
help='Passing "test" will only run a single evaluation, otherwise full training will be performed')
parser.add_argument('--grads_accumulated', default=1, type=int,
help='Number of gradients to accumulate before performing an optimization step')
parser.add_argument('--opt_level', default='O2', type=str,
help='Optimization level for Automatic Mixed Precision',
choices=['O0', 'O2'])
parser.add_argument('--local_rank', default=0, type=int, help='Necessary for multi-GPU training')
return parser.parse_args()
@ -133,12 +130,8 @@ def init_distributed(local_rank=0):
return distributed, int(os.environ['WORLD_SIZE'])
def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user, output=None,
def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user,
epoch=None, distributed=False):
start = datetime.now()
log_2 = math.log(2)
model.eval()
with torch.no_grad():
@ -146,80 +139,36 @@ def val_epoch(model, x, y, dup_mask, real_indices, K, samples_per_user, num_user
for u,n in zip(x,y):
p.append(model(u, n, sigmoid=True).detach())
del x
del y
temp = torch.cat(p).view(-1,samples_per_user)
del p
del x, y, p
# set duplicate results for the same item to -1 before topk
temp[dup_mask] = -1
out = torch.topk(temp,K)[1]
# topk in pytorch is stable(if not sort)
# key(item):value(predicetion) pairs are ordered as original key(item) order
# key(item):value(prediction) pairs are ordered as original key(item) order
# so we need the first position of real item(stored in real_indices) to check if it is in topk
ifzero = (out == real_indices.view(-1,1))
hits = ifzero.sum()
ndcg = (log_2 / (torch.nonzero(ifzero)[:,1].view(-1).to(torch.float)+2).log_()).sum()
ndcg = (math.log(2) / (torch.nonzero(ifzero)[:,1].view(-1).to(torch.float)+2).log_()).sum()
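# for a single relevant item, DCG@K reduces to 1 / log2(rank + 2) with 0-based ranks,
# i.e. log(2) / log(rank + 2), which is exactly what the expression above sums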
LOGGER.log(key=tags.EVAL_SIZE, value={"epoch": epoch, "value": num_user * samples_per_user})
LOGGER.log(key=tags.EVAL_HP_NUM_USERS, value=num_user)
LOGGER.log(key=tags.EVAL_HP_NUM_NEG, value=samples_per_user - 1)
end = datetime.now()
if distributed:
torch.distributed.all_reduce(hits, op=torch.distributed.reduce_op.SUM)
torch.distributed.all_reduce(ndcg, op=torch.distributed.reduce_op.SUM)
hits = hits.item()
ndcg = ndcg.item()
if output is not None:
result = OrderedDict()
result['timestamp'] = datetime.now()
result['duration'] = end - start
result['epoch'] = epoch
result['K'] = K
result['hit_rate'] = hits/num_user
result['NDCG'] = ndcg/num_user
utils.save_result(result, output)
hr = hits.item() / num_user
ndcg = ndcg.item() / num_user
model.train()
return hits/num_user, ndcg/num_user
def generate_neg(users, true_mat, item_range, num_neg, sort=False):
# assuming 1-d tensor input
# for each user in 'users', generate 'num_neg' negative samples in [0, item_range)
# also make sure negative sample is not in true sample set with mask
# true_mat store a mask matrix where true_mat(user, item) = 0 for true sample
# return (neg_user, neg_item)
# list to append iterations of result
neg_u = []
neg_i = []
neg_users = users.repeat(num_neg)
while len(neg_users) > 0: # generate then filter loop
neg_items = torch.empty_like(neg_users, dtype=torch.int64).random_(0, item_range)
neg_mask = true_mat[neg_users, neg_items]
neg_u.append(neg_users.masked_select(neg_mask))
neg_i.append(neg_items.masked_select(neg_mask))
neg_users = neg_users.masked_select(1-neg_mask)
neg_users = torch.cat(neg_u)
neg_items = torch.cat(neg_i)
if sort == False:
return neg_users, neg_items
sorted_users, sort_indices = torch.sort(neg_users)
return sorted_users, neg_items[sort_indices]
return hr, ndcg
def main():
log_hardware()
args = parse_args()
args.distributed, args.world_size = init_distributed(args.local_rank)
log_args(args)
@ -229,90 +178,35 @@ def main():
if args.seed is not None:
torch.manual_seed(args.seed)
# Save configuration to file
print("Saving results to {}".format(args.checkpoint_dir))
if not os.path.exists(args.checkpoint_dir) and args.checkpoint_dir != '':
os.makedirs(args.checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(args.checkpoint_dir, 'model.pth')
# more like load trigger timer now
LOGGER.log(key=tags.PREPROC_HP_NUM_EVAL, value=args.valid_negative)
# The default of np.random.choice is replace=True, so does pytorch random_()
LOGGER.log(key=tags.PREPROC_HP_SAMPLE_EVAL_REPLACEMENT, value=True)
LOGGER.log(key=tags.INPUT_HP_SAMPLE_TRAIN_REPLACEMENT, value=True)
LOGGER.log(key=tags.INPUT_STEP_EVAL_NEG_GEN)
# sync worker before timing.
# sync workers before timing
if args.distributed:
torch.distributed.broadcast(torch.tensor([1], device="cuda"), 0)
torch.cuda.synchronize()
LOGGER.log(key=tags.RUN_START)
run_start_time = time.time()
# load not-converted data, just a separate one for test
train_ratings = torch.load(args.data+'/train_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
test_ratings = torch.load(args.data+'/test_ratings.pt', map_location=torch.device('cuda:{}'.format(args.local_rank)))
# get input data
# get dims
nb_maxs = torch.max(train_ratings, 0)[0]
nb_users = nb_maxs[0].item()+1
nb_items = nb_maxs[1].item()+1
train_users = train_ratings[:,0]
train_items = train_ratings[:,1]
del nb_maxs, train_ratings
LOGGER.log(key=tags.INPUT_SIZE, value=len(train_users))
# produce things not change between epoch
# mask for filtering duplicates with real sample
# note: test data is removed before create mask, same as reference
mat = torch.cuda.ByteTensor(nb_users, nb_items).fill_(1)
mat[train_users, train_items] = 0
# create label
train_label = torch.ones_like(train_users, dtype=torch.float32)
neg_label = torch.zeros_like(train_label, dtype=torch.float32)
neg_label = neg_label.repeat(args.negative_samples)
train_label = torch.cat((train_label,neg_label))
del neg_label
if args.fp16:
train_label = train_label.half()
nb_users = nb_maxs[0].item() + 1
nb_items = nb_maxs[1].item() + 1
LOGGER.log(key=tags.INPUT_SIZE, value=len(train_ratings))
# produce validation negative sample on GPU
all_test_users = test_ratings.shape[0]
test_users = test_ratings[:,0]
test_pos = test_ratings[:,1].reshape(-1,1)
test_negs = generate_neg(test_users, mat, nb_items, args.valid_negative, True)[1]
# create items with real sample at last position
test_users = test_users.reshape(-1,1).repeat(1,1+args.valid_negative)
test_items = torch.cat((test_negs.reshape(-1,args.valid_negative), test_pos), dim=1)
del test_ratings, test_negs
# generate dup mask and real indice for exact same behavior on duplication compare to reference
# here we need a sort that is stable(keep order of duplicates)
# this is a version works on integer
sorted_items, indices = torch.sort(test_items) # [1,1,1,2], [3,1,0,2]
sum_item_indices = sorted_items.float()+indices.float()/len(indices[0]) #[1.75,1.25,1.0,2.5]
indices_order = torch.sort(sum_item_indices)[1] #[2,1,0,3]
stable_indices = torch.gather(indices, 1, indices_order) #[0,1,3,2]
# produce -1 mask
dup_mask = (sorted_items[:,0:-1] == sorted_items[:,1:])
dup_mask = torch.cat((torch.zeros_like(test_pos, dtype=torch.uint8), dup_mask),dim=1)
dup_mask = torch.gather(dup_mask,1,stable_indices.sort()[1])
# produce real sample indices to later check in topk
sorted_items, indices = (test_items != test_pos).sort()
sum_item_indices = sorted_items.float()+indices.float()/len(indices[0])
indices_order = torch.sort(sum_item_indices)[1]
stable_indices = torch.gather(indices, 1, indices_order)
real_indices = stable_indices[:,0]
del sorted_items, indices, sum_item_indices, indices_order, stable_indices, test_pos
if args.distributed:
test_users = torch.chunk(test_users, args.world_size)[args.local_rank]
test_items = torch.chunk(test_items, args.world_size)[args.local_rank]
dup_mask = torch.chunk(dup_mask, args.world_size)[args.local_rank]
real_indices = torch.chunk(real_indices, args.world_size)[args.local_rank]
test_users, test_items, dup_mask, real_indices = dataloading.create_test_data(train_ratings, test_ratings, args)
# make pytorch memory behavior more consistent later
torch.cuda.empty_cache()
@ -320,36 +214,33 @@ def main():
LOGGER.log(key=tags.INPUT_BATCH_SIZE, value=args.batch_size)
LOGGER.log(key=tags.INPUT_ORDER) # we shuffled later with randperm
print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d'
% (time.time()-run_start_time, nb_users, nb_items, len(train_users),
nb_users))
# Create model
model = NeuMF(nb_users, nb_items,
mf_dim=args.factors, mf_reg=0.,
mf_dim=args.factors,
mlp_layer_sizes=args.layers,
mlp_layer_regs=[0. for i in args.layers],
dropout=args.dropout)
if args.fp16:
model = model.half()
optimizer = FusedAdam(model.parameters(), lr=args.learning_rate,
betas=(args.beta1, args.beta2), eps=args.eps, eps_inside_sqrt=False)
criterion = nn.BCEWithLogitsLoss(reduction='none') # use torch.mean() with dim later to avoid copy to host
# Move model and loss to GPU
model = model.cuda()
criterion = criterion.cuda()
if args.opt_level == "O2":
model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
keep_batchnorm_fp32=False, loss_scale='dynamic')
if args.distributed:
model = DDP(model)
local_batch = args.batch_size // args.world_size
traced_criterion = torch.jit.trace(criterion.forward,
(torch.rand(local_batch,1),torch.rand(local_batch,1)))
print(model)
print("{} parameters".format(utils.count_parameters(model)))
# Save model text description
with open(os.path.join(args.checkpoint_dir, 'model.txt'), 'w') as file:
file.write(str(model))
# Add optimizer and loss to graph
if args.fp16:
fp_optimizer = Fp16Optimizer(model, args.loss_scale)
params = fp_optimizer.fp32_params
else:
params = model.parameters()
optimizer = FusedAdam(params, lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps, eps_inside_sqrt=False)
criterion = nn.BCEWithLogitsLoss(reduction='none') # use torch.mean() with dim later to avoid copy to host
LOGGER.log(key=tags.OPT_LR, value=args.learning_rate)
LOGGER.log(key=tags.OPT_NAME, value="Adam")
LOGGER.log(key=tags.OPT_HP_ADAM_BETA1, value=args.beta1)
@ -357,53 +248,22 @@ def main():
LOGGER.log(key=tags.OPT_HP_ADAM_EPSILON, value=args.eps)
LOGGER.log(key=tags.MODEL_HP_LOSS_FN, value=tags.VALUE_BCE)
# Move model and loss to GPU
model = model.cuda()
criterion = criterion.cuda()
if args.distributed:
model = DDP(model)
local_batch = args.batch_size // int(os.environ['WORLD_SIZE'])
else:
local_batch = args.batch_size
traced_criterion = torch.jit.trace(criterion.forward, (torch.rand(local_batch,1),torch.rand(local_batch,1)))
train_users_per_worker = len(train_label) / int(os.environ['WORLD_SIZE'])
train_users_begin = int(train_users_per_worker * args.local_rank)
train_users_end = int(train_users_per_worker * (args.local_rank + 1))
# Create files for tracking training
valid_results_file = os.path.join(args.checkpoint_dir, 'valid_results.csv')
# Calculate initial Hit Ratio and NDCG
test_x = test_users.view(-1).split(args.valid_batch_size)
test_y = test_items.view(-1).split(args.valid_batch_size)
if args.mode == 'test':
state_dict = torch.load(checkpoint_path)
model.load_state_dict(state_dict)
begin = time.time()
LOGGER.log(key=tags.EVAL_START, value=-1)
hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk, samples_per_user=test_items.size(1),
num_user=all_test_users, distributed=args.distributed)
val_time = time.time() - begin
print('Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, valid_time: {val_time:.4f}'
.format(K=args.topk, hit_rate=hr, ndcg=ndcg, val_time=val_time))
LOGGER.log(key=tags.EVAL_ACCURACY, value={"epoch": -1, "value": hr})
LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
LOGGER.log(key=tags.EVAL_STOP, value=-1)
if args.mode == 'test':
hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
samples_per_user=args.valid_negative + 1,
num_user=all_test_users, distributed=args.distributed)
print('HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}'
.format(K=args.topk, hit_rate=hr, ndcg=ndcg))
return
success = False
max_hr = 0
LOGGER.log(key=tags.TRAIN_LOOP)
train_throughputs = []
eval_throughputs = []
train_throughputs, eval_throughputs = [], []
LOGGER.log(key=tags.TRAIN_LOOP)
for epoch in range(args.epochs):
LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch)
@ -412,68 +272,43 @@ def main():
begin = time.time()
# prepare data for epoch
neg_users, neg_items = generate_neg(train_users, mat, nb_items, args.negative_samples)
epoch_users = torch.cat((train_users,neg_users))
epoch_items = torch.cat((train_items,neg_items))
del neg_users, neg_items
# shuffle prepared data and split into batches
epoch_indices = torch.randperm(train_users_end - train_users_begin, device='cuda:{}'.format(args.local_rank))
epoch_indices += train_users_begin
epoch_users = epoch_users[epoch_indices]
epoch_items = epoch_items[epoch_indices]
epoch_label = train_label[epoch_indices]
epoch_users_list = epoch_users.split(local_batch)
epoch_items_list = epoch_items.split(local_batch)
epoch_label_list = epoch_label.split(local_batch)
# only print progress bar on rank 0
num_batches = len(epoch_users_list)
# handle extremely rare case where last batch size < number of worker
if len(epoch_users) % args.batch_size < args.world_size:
print("epoch_size % batch_size < number of worker!")
exit(1)
epoch_users, epoch_items, epoch_label = dataloading.prepare_epoch_train_data(train_ratings, nb_items, args)
num_batches = len(epoch_users)
for i in range(num_batches // args.grads_accumulated):
for j in range(args.grads_accumulated):
batch_idx = (args.grads_accumulated * i) + j
user = epoch_users_list[batch_idx]
item = epoch_items_list[batch_idx]
label = epoch_label_list[batch_idx].view(-1,1)
user = epoch_users[batch_idx]
item = epoch_items[batch_idx]
label = epoch_label[batch_idx].view(-1,1)
outputs = model(user, item)
loss = traced_criterion(outputs, label).float()
loss = torch.mean(loss.view(-1), 0)
if args.fp16:
fp_optimizer.backward(loss)
if args.opt_level == "O2":
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
if args.fp16:
fp_optimizer.step(optimizer)
else:
optimizer.step()
optimizer.step()
for p in model.parameters():
p.grad = None
p.grad = None
del epoch_users, epoch_items, epoch_label, epoch_users_list, epoch_items_list, epoch_label_list, user, item, label
del epoch_users, epoch_items, epoch_label
train_time = time.time() - begin
begin = time.time()
epoch_samples = len(train_users) * (args.negative_samples + 1)
epoch_samples = len(train_ratings) * (args.negative_samples + 1)
train_throughput = epoch_samples / train_time
train_throughputs.append(train_throughput)
LOGGER.log(key='train_throughput', value=train_throughput)
LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch)
LOGGER.log(key=tags.EVAL_START, value=epoch)
hr, ndcg = val_epoch(model, test_x, test_y, dup_mask, real_indices, args.topk, samples_per_user=test_items.size(1),
num_user=all_test_users, output=valid_results_file, epoch=epoch, distributed=args.distributed)
hr, ndcg = val_epoch(model, test_users, test_items, dup_mask, real_indices, args.topk,
samples_per_user=args.valid_negative + 1,
num_user=all_test_users, epoch=epoch, distributed=args.distributed)
val_time = time.time() - begin
print('Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f},'
@ -486,7 +321,7 @@ def main():
LOGGER.log(key=tags.EVAL_TARGET, value=args.threshold)
LOGGER.log(key=tags.EVAL_STOP, value=epoch)
eval_size = all_test_users * test_items.size(1)
eval_size = all_test_users * (args.valid_negative + 1)
eval_throughput = eval_size / val_time
eval_throughputs.append(eval_throughput)
LOGGER.log(key='eval_throughput', value=eval_throughput)

View File

@ -34,8 +34,8 @@ import torch.nn as nn
import sys
from os.path import abspath, join, dirname
# enabling modules discovery from global entrypoint
sys.path.append(abspath(dirname(__file__)+'/'))
# enabling modules discovery from the global entrypoint
sys.path.append(abspath(dirname(__file__) + '/'))
from logger.logger import LOGGER
from logger import tags
@ -44,12 +44,8 @@ LOGGER.model = 'ncf'
class NeuMF(nn.Module):
def __init__(self, nb_users, nb_items,
mf_dim, mf_reg,
mlp_layer_sizes, mlp_layer_regs,
dropout=0):
mf_dim, mlp_layer_sizes, dropout=0):
if len(mlp_layer_sizes) != len(mlp_layer_regs):
raise RuntimeError('u dummy, layer_sizes != layer_regs!')
if mlp_layer_sizes[0] % 2 != 0:
raise RuntimeError('u dummy, mlp_layer_sizes[0] % 2 != 0')
super(NeuMF, self).__init__()

View File

@ -31,10 +31,11 @@
#!/bin/bash
set -e
set -x
DATASET_NAME=${1:-'ml-20m'}
RAW_DATADIR='/data'
CACHED_DATADIR='/data/cache/'${DATASET_NAME}
RAW_DATADIR=${2:-'/data'}
CACHED_DATADIR="${RAW_DATADIR}/cache/${DATASET_NAME}"
# you can add another option to this case in order to support other datasets
case ${DATASET_NAME} in
@ -51,9 +52,17 @@ case ${DATASET_NAME} in
exit 1
esac
mkdir -p ${RAW_DATADIR}
mkdir -p ${CACHED_DATADIR}
rm -f log
if [ ! -d ${RAW_DATADIR} ]; then
mkdir -p ${RAW_DATADIR}
fi
if [ ! -d ${CACHED_DATADIR} ]; then
mkdir -p ${CACHED_DATADIR}
fi
if [ -f log ]; then
rm -f log
fi
if [ ! -f ${ZIP_PATH} ]; then
echo 'Dataset not found, downloading...'
@ -76,6 +85,6 @@ else
fi
echo "Dataset $DATASET_NAME successfully prepared at: $CACHED_DATADIR\n"
echo 'You can now run the training with: python -m torch.distributed.launch --nproc_per_node=<number_of_GPUs> ncf.py --data /data/cache/ml-20m'
echo "You can now run the training with: python -m torch.distributed.launch --nproc_per_node=<number_of_GPUs> ncf.py --data ${CACHED_DATADIR}"

View File

@ -1 +1,2 @@
pandas
tqdm

View File

@ -10,6 +10,12 @@ import torch.distributed as dist
from maskrcnn_benchmark.utils.comm import get_world_size
from maskrcnn_benchmark.utils.metric_logger import MetricLogger
try:
from apex import amp
use_amp = True
except ImportError:
print('Install APEX (apex.amp) to enable mixed-precision training')
use_amp = False
def reduce_loss_dict(loss_dict):
"""
@ -80,7 +86,7 @@ def do_train(
# Note: If mixed precision is not used, this ends up doing nothing
# Otherwise apply loss scaling for mixed-precision recipe
if use_amp:
with optimizer.scale_loss(losses) as scaled_losses:
with amp.scale_loss(losses, optimizer) as scaled_losses:
scaled_losses.backward()
else:
losses.backward()

View File

@ -2,9 +2,14 @@
import os
import sys
from torch.utils.model_zoo import _download_url_to_file
from torch.utils.model_zoo import urlparse
from torch.utils.model_zoo import HASH_REGEX
try:
from torch.utils.model_zoo import _download_url_to_file
from torch.utils.model_zoo import urlparse
from torch.utils.model_zoo import HASH_REGEX
except ImportError:
from torch.hub import _download_url_to_file
from torch.hub import urlparse
from torch.hub import HASH_REGEX
from maskrcnn_benchmark.utils.comm import is_main_process
from maskrcnn_benchmark.utils.comm import synchronize

View File

@ -97,14 +97,9 @@ def train(cfg, local_rank, distributed):
if use_amp:
# Initialize mixed-precision training
use_mixed_precision = cfg.DTYPE == "float16"
amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE)
# wrap the optimizer for mixed precision
if cfg.SOLVER.ACCUMULATE_GRAD:
# also specify number of steps to accumulate over
optimizer = amp_handle.wrap_optimizer(optimizer, num_loss=cfg.SOLVER.ACCUMULATE_STEPS)
else:
optimizer = amp_handle.wrap_optimizer(optimizer)
amp_opt_level = 'O1' if use_mixed_precision else 'O0'
model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)
if distributed:
if use_apex_ddp:

View File

@ -1,5 +1,10 @@
FROM nvcr.io/nvidia/pytorch:18.12.1-py3
FROM nvcr.io/nvidia/pytorch:19.03-py3
ADD . /workspace/tacotron2
WORKDIR /workspace/tacotron2
RUN pip install -r requirements.txt
RUN cd /workspace; \
git clone https://github.com/NVIDIA/apex.git; \
cd /workspace/apex; \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
WORKDIR /workspace/tacotron2

View File

@ -1,79 +1,157 @@
# Tacotron 2 And WaveGlow v1.5 For PyTorch
This repository provides a script and recipe to train Tacotron 2 and WaveGlow
v1.5 models to achieve state of the art accuracy, and is tested and maintained by
NVIDIA.
## Table Of Contents
* [The model](#the-model)
    * [Model architecture](#model-architecture)
    * [Default configuration](#default-configuration)
    * [Feature support matrix](#feature-support-matrix)
        * [Features](#features)
* [Setup](#setup)
    * [Requirements](#requirements)
* [Quick Start Guide](#quick-start-guide)
* [Details](#details)
    * [Scripts and sample code](#scripts-and-sample-code)
    * [Parameters](#parameters)
        * [Shared parameters](#shared-parameters)
        * [Shared audio/STFT parameters](#shared-audiostft-parameters)
        * [Tacotron 2 parameters](#tacotron-2-parameters)
        * [WaveGlow parameters](#waveglow-parameters)
    * [Command-line options](#command-line-options)
    * [Getting the data](#getting-the-data)
        * [Dataset guidelines](#dataset-guidelines)
        * [Multi-dataset](#multi-dataset)
    * [Training process](#training-process)
    * [Inference process](#inference-process)
* [Mixed precision training](#mixed-precision-training)
    * [Enabling mixed precision](#enabling-mixed-precision)
* [Benchmarking](#benchmarking)
    * [Training performance benchmark](#training-performance-benchmark)
    * [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
    * [Training accuracy results](#training-accuracy-results)
        * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
    * [Training performance results](#training-performance-results)
        * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
        * [Expected training time](#expected-training-time)
    * [Inference performance results](#inference-performance-results)
        * [NVIDIA DGX-1 (8x V100 16G)](#nvidia-dgx-1-8x-v100-16g)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## The model
This text-to-speech (TTS) system is a combination of two neural network models:
* a modified Tacotron 2 model from the [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)
paper and
* a flow-based neural network model from the [WaveGlow: A Flow-based Generative Network for Speech Synthesis](https://arxiv.org/abs/1811.00002) paper.
The Tacotron 2 and WaveGlow models form a text-to-speech system that enables
users to synthesize natural sounding speech from raw transcripts without
any additional information such as patterns and/or rhythms of speech.
Our implementation of the Tacotron 2 model differs from the model described in the
paper. Our implementation uses Dropout instead of Zoneout to regularize the
LSTM layers. Also, the original text-to-speech system proposed in the paper
uses the [WaveNet](https://arxiv.org/abs/1609.03499) model to synthesize
waveforms. In our implementation, we use the WaveGlow model for this purpose.
Both models are based on implementations of NVIDIA GitHub repositories
[Tacotron 2](https://github.com/NVIDIA/tacotron2) and
[WaveGlow](https://github.com/NVIDIA/waveglow), and are trained on a publicly
available [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/).
The Tacotron 2 and WaveGlow models enable you to efficiently synthesize high
quality speech from text.
Both models are trained with mixed precision using Tensor Cores on NVIDIA
Volta and Turing GPUs. Therefore, researchers can get results 1.5x faster for Tacotron 2
and 2.2x faster for WaveGlow than training without Tensor Cores, while
experiencing the benefits of mixed precision training. The models are tested
against each NGC monthly container release to ensure consistent accuracy and
performance over time.
### Model architecture
The Tacotron 2 model is a recurrent sequence-to-sequence model with attention that
predicts mel-spectrograms from text. The encoder (blue blocks in the figure
below) transforms the whole text into a fixed-size hidden feature
representation. This feature representation is then consumed by the
autoregressive decoder (orange blocks) that produces one spectrogram frame at
a time. In our implementation, the autoregressive WaveNet (green block) is
replaced by the flow-based generative WaveGlow.
![](./img/tacotron2_arch.png "Tacotron 2 architecture")
Figure 1. Architecture of the Tacotron 2 model. Taken from the
[Tacotron 2](https://arxiv.org/abs/1712.05884) paper.
The WaveGlow model is a flow-based generative model that generates audio
samples from a Gaussian distribution using mel-spectrogram conditioning (Figure
2). During training, the model learns to transform the dataset distribution
into a spherical Gaussian distribution through a series of flows. One step of a
flow consists of an invertible convolution, followed by a modified WaveNet
architecture that serves as an affine coupling layer. During inference, the
network is inverted and audio samples are generated from the Gaussian
distribution.
![](./img/waveglow_arch.png "WaveGlow architecture")
Figure 2. Architecture of the WaveGlow model. Taken from the
[WaveGlow](https://arxiv.org/abs/1811.00002) paper.
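To make the flow step concrete, below is a minimal, illustrative PyTorch sketch of the pattern described above (an invertible 1x1 convolution followed by an affine coupling layer conditioned on the mel-spectrogram). All class and parameter names are assumptions for illustration; the repository's WaveGlow implementation is more elaborate (weight-normalized WaveNet blocks, early outputs, and full log-determinant bookkeeping).
```python
import torch
import torch.nn as nn

class FlowStep(nn.Module):
    """One WaveGlow-style flow step: invertible 1x1 conv + affine coupling (sketch only)."""
    def __init__(self, n_channels, n_mel_channels, hidden=256):
        super().__init__()
        # channel-mixing 1x1 convolution; a real implementation initializes this with an
        # orthogonal (hence invertible) matrix and tracks its log-determinant
        self.mix = nn.Conv1d(n_channels, n_channels, kernel_size=1, bias=False)
        # small network standing in for the modified WaveNet of the paper; it sees the
        # untransformed half of the channels plus the mel conditioning
        self.transform = nn.Sequential(
            nn.Conv1d(n_channels // 2 + n_mel_channels, hidden, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(hidden, n_channels, kernel_size=3, padding=1),
        )

    def forward(self, x, mel):
        x = self.mix(x)
        x_a, x_b = x.chunk(2, dim=1)                       # split channels in half
        log_s, t = self.transform(torch.cat([x_a, mel], dim=1)).chunk(2, dim=1)
        x_b = x_b * torch.exp(log_s) + t                   # affine-transform the second half only
        return torch.cat([x_a, x_b], dim=1), log_s         # log_s feeds the flow's likelihood term

# quick shape check with dummy data
step = FlowStep(n_channels=8, n_mel_channels=80)
audio = torch.randn(2, 8, 100)   # (batch, grouped audio samples as channels, time)
mel = torch.randn(2, 80, 100)    # mel conditioning upsampled to the same time resolution
z, log_s = step(audio, mel)
```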
### Default configuration
Both models support multi-GPU and mixed precision training with dynamic loss
scaling (see Apex code
[here](https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/loss_scaler.py)),
as well as mixed precision inference. To speed up Tacotron 2 training,
reference mel-spectrograms are generated during a preprocessing step and read
directly from disk during training, instead of being generated during training.
The following features were implemented in this model:
* data-parallel multi-GPU training
* dynamic loss scaling with backoff for Tensor Cores (mixed precision) training.
### Feature support matrix
The following features are supported by this model.
| Feature | Tacotron 2 | WaveGlow |
|:-------|---------:|-----------:|
|[AMP](https://nvidia.github.io/apex/amp.html) | Yes | Yes |
|[Apex DistributedDataParallel](https://nvidia.github.io/apex/parallel.html) | Yes | Yes |
#### Features
AMP - a tool that enables Tensor Core-accelerated training. Please refer to section [Enabling mixed precision](#enabling-mixed-precision) for more details.
Apex DistributedDataParallel - a module wrapper that enables easy multiprocess distributed data parallel training, similar to `torch.nn.parallel.DistributedDataParallel`. `DistributedDataParallel` is optimized for use with NCCL. It achieves high performance by overlapping communication with computation during backward() and bucketing smaller gradient transfers to reduce the total number of transfers required.
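As a minimal illustration only (a generic linear layer standing in for Tacotron 2 / WaveGlow, and assuming the script was launched with `torch.distributed.launch` so that an NCCL process group can be initialized), enabling both features typically looks like this:
```python
import torch
import torch.nn as nn
from apex import amp
from apex.parallel import DistributedDataParallel as DDP

# relies on the environment variables set by `python -m torch.distributed.launch ...`
torch.distributed.init_process_group(backend="nccl")

model = nn.Linear(80, 80).cuda()          # stand-in for the real model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# AMP inserts automatic FP16/FP32 casts; dynamic loss scaling guards against underflow
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", loss_scale="dynamic")
model = DDP(model)                        # Apex DDP overlaps gradient all-reduce with backward()

x = torch.randn(16, 80, device="cuda")
loss = model(x).pow(2).mean()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
```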
## Setup
The following section lists the requirements in order to start training the
Tacotron 2 and WaveGlow models.
### Requirements
This repository contains Dockerfile which extends the PyTorch NGC container
and encapsulates some dependencies. Aside from these dependencies, ensure you
have the following components:
* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
* [PyTorch 19.04-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
or newer
* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the
@ -84,35 +162,49 @@ Documentation:
* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
For those unable to use the PyTorch NGC container, to set up the required
environment or create your own container, see the versioned
[NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html).
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or using FP32,
perform the following steps using the default parameters of the Tacotron 2
and WaveGlow model on the [LJ Speech](https://keithito.com/LJ-Speech-Dataset/)
dataset.
1. Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2
```
2. Download and preprocess the dataset.
Use the `./scripts/prepare_dataset.sh` download script to automatically
download and preprocess the training, validation and test datasets. To run
this script, issue:
```bash
bash scripts/prepare_dataset.sh
```
To preprocess the datasets for Tacotron 2 training, use the
`./scripts/prepare_mels.sh` script:
```bash
bash scripts/prepare_mels.sh
```
Data is downloaded to the `./LJSpeech-1.1` directory (on the host). The
`./LJSpeech-1.1` directory is mounted to the `/workspace/tacotron2/LJSpeech-1.1`
location in the NGC container. The preprocessed mel-spectrograms are stored in the
`./LJSpeech-1.1/mels` directory.
3. Build the Tacotron 2 and WaveGlow PyTorch NGC container.
```bash
bash scripts/docker/build.sh
```
4. Start an interactive session in the NGC container to run training/inference.
After you build the container image, you can start an interactive CLI session with:
```bash
bash scripts/docker/interactive.sh
@ -121,210 +213,260 @@ bash scripts/docker/interactive.sh
The `interactive.sh` script requires that the location of the dataset is specified.
For example, `LJSpeech-1.1`.
5. Start training.
To start Tacotron 2 training, run:
```bash
bash scripts/train_tacotron2.sh
```
To start WaveGlow training, run:
```bash
bash scripts/train_waveglow.sh
```
6. Start validation/evaluation.
Ensure your loss values are comparable to those listed in the table in the
[Results](#results) section. For both models, the loss values are stored in the
`./output/nvlog.json` log file.
After you have trained the Tacotron 2 model for 1500 epochs and the
WaveGlow model for 800 epochs, you should get audio results similar to the
samples in the `./audio` folder. For details about generating audio, see the
[Inference process](#inference-process) section below.
The training scripts automatically run the validation after each training
epoch. The results from the validation are printed to the standard output
(`stdout`) and saved to the log files.
7. Start inference.
After you have trained the Tacotron 2 and WaveGlow models, you can perform
inference using the respective checkpoints that are passed as `--tacotron2`
and `--waveglow` arguments.
To run inference issue:
```bash
python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i text.txt --amp-run
```
The speech is generated from a text file that is passed with `-i` argument. To run
inference in mixed precision, use the `--amp-run` flag. The output audio will
be stored in the path specified by the `-o` argument.
## Details
The following sections provide greater details of the dataset, running training
and inference, and the training results.
### Training process
The Tacotron 2 and WaveGlow models are trained separately and independently.
Both models obtain mel spectrograms from short time Fourier transform (STFT)
during training. These mel spectrograms are used for loss computation in case
of Tacotron 2 and as conditioning input to the network in case of WaveGlow.
The training loss is averaged over an entire training epoch, whereas the
validation loss is averaged over the validation dataset. Performance is
reported in total input tokens per second for the Tacotron 2 model, and
in total output samples per second for the WaveGlow model. Both measures are
recorded as `train_iter_items/sec` (after each iteration) and `train_epoch_items/sec`
(averaged over epoch) in the output log. The result is averaged over an
entire training epoch and summed over all GPUs that were included in the training.
Even though the training script uses all available GPUs, you can change
this behavior by setting the `CUDA_VISIBLE_DEVICES` variable in your
environment or by setting the `NV_GPU` variable at the Docker container launch
([see section "GPU isolation"](https://github.com/NVIDIA/nvidia-docker/wiki/nvidia-docker#gpu-isolation)).
### Scripts and sample code
The sample code for Tacotron 2 and WaveGlow has scripts specific to a
particular model, located in directories `./tacotron2` and `./waveglow`, as well as scripts common to both
models, located in the `./common` directory. The model-specific scripts are as follows:
* `<model_name>/model.py` - the model architecture, definition of forward and
inference functions
* `<model_name>/arg_parser.py` - argument parser for parameters specific to a
given model
* `<model_name>/data_function.py` - data loading functions
* `<model_name>/loss_function.py` - loss function for the model
The common scripts contain layer definitions common to both models
(`common/layers.py`), some utility scripts (`common/utils.py`) and scripts
for audio processing (`common/audio_processing.py` and `common/stft.py`). In
the root directory `./` of this repository, the `./run.py` script is used for
training while inference can be executed with the `./inference.py` script. The
scripts `./models.py`, `./data_functions.py` and `./loss_functions.py` call
the respective scripts in the `<model_name>` directory, depending on what
model is trained using the `run.py` script.
### Parameters
In this section, we list the most important hyperparameters and command-line arguments,
together with their default values that are used to train Tacotron 2 and
WaveGlow models.
#### Shared parameters
* `--epochs` - number of epochs (Tacotron 2: 1500, WaveGlow: 1000)
* `--learning-rate` - learning rate (Tacotron 2: 1e-3, WaveGlow: 1e-4)
* `--batch-size` - batch size (Tacotron 2 FP16/FP32: 80/48, WaveGlow FP16/FP32: 8/4)
* `--amp-run` - use mixed precision training
#### Shared audio/STFT parameters
* `--sampling-rate` - sampling rate in Hz of input and output audio (22050)
* `--filter-length` - length of the STFT filter, i.e., the FFT size in samples (1024)
* `--hop-length` - hop length for FFT, i.e., sample stride between consecutive FFTs (256)
* `--win-length` - window size for FFT (1024)
* `--mel-fmin` - lowest frequency in Hz (0.0)
* `--mel-fmax` - highest frequency in Hz (8000.0)
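With these defaults, consecutive STFT frames are 256 / 22050 ≈ 11.6 ms apart, so roughly 86 mel spectrogram frames are produced per second of audio, and each frame is computed over a window of 1024 samples (about 46 ms).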
#### Tacotron 2 parameters
* `--anneal-steps` - epochs at which to anneal the learning rate (500 1000 1500)
* `--anneal-factor` - factor by which to anneal the learning rate (FP16/FP32: 0.3/0.1)
#### WaveGlow parameters
* `--segment-length` - segment length of input audio processed by the neural network (8000)
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example:
```bash
python train.py --help
```
### Getting the data
The Tacotron 2 and WaveGlow models were trained on the LJSpeech-1.1 dataset.
This repository contains the `./scripts/prepare_dataset.sh` script which will automatically download and extract the whole dataset. By default, data will be extracted to the `./LJSpeech-1.1` directory. The dataset directory contains a `README` file, a `wavs` directory with all audio samples, and a file `metadata.csv` that contains audio file names and the corresponding transcripts.
#### Dataset guidelines
The LJSpeech dataset has 13,100 clips that amount to about 24 hours of speech. Since the original dataset keeps all transcripts in the `metadata.csv` file, this repository provides file lists in the `./filelists` directory that define the training and validation subsets: `ljs_audio_text_train_filelist.txt` lists the clips used for training and `ljs_audio_text_val_filelist.txt` lists the clips used for validation.
#### Multi-dataset
To use a dataset different from the default LJSpeech dataset:
1. Prepare a directory with all audio files and pass it to the `--dataset-path` command-line option.
2. Add two text files containing file lists: one for the training subset (`--training-files`) and one for the validation subset (`--validation-files`).
The structure of the filelists should be as follows:
```bash
<audio file path>|<transcript>
```
The `<audio file path>` is relative to the path provided by the `--dataset-path` option.
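For example, assuming the default LJSpeech layout with a `wavs` subdirectory inside the dataset path, entries could look like the following (the file names and transcripts are purely illustrative):
```bash
wavs/LJ001-0001.wav|A first example transcript.
wavs/LJ001-0002.wav|A second example transcript.
```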
### Training process
The Tacotron 2 and WaveGlow models are trained separately and independently.
Both models obtain mel spectrograms from short-time Fourier transform (STFT)
during training. These mel spectrograms are used for loss computation in the case
of Tacotron 2 and as conditioning input to the network in the case of WaveGlow.
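To illustrate this preprocessing step, the sketch below computes a mel spectrogram with the shared audio/STFT defaults listed above (22050 Hz sampling rate, 1024-point filter, hop length 256, window length 1024, 0 to 8000 Hz). It uses `librosa` and an assumed mel-band count of 80, so it only approximates the repository's own STFT code in `common/stft.py` and `common/audio_processing.py`:
```python
import librosa
import numpy as np

# Load a clip at the sampling rate used by both models (22050 Hz);
# the file path is illustrative.
audio, sr = librosa.load("LJSpeech-1.1/wavs/LJ001-0001.wav", sr=22050)

# Mirror the shared audio/STFT defaults listed above; n_mels=80 is an
# assumption and not one of the documented defaults.
mel = librosa.feature.melspectrogram(
    y=audio,
    sr=sr,
    n_fft=1024,        # --filter-length
    hop_length=256,    # --hop-length
    win_length=1024,   # --win-length
    fmin=0.0,          # --mel-fmin
    fmax=8000.0,       # --mel-fmax
    n_mels=80,
)

# Log-compress the mel spectrogram, as is common for TTS conditioning features.
log_mel = np.log(np.clip(mel, 1e-5, None))
print(log_mel.shape)  # (n_mels, number of frames)
```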
The training loss is averaged over an entire training epoch, whereas the
validation loss is averaged over the validation dataset. Performance is
reported in total input tokens per second for the Tacotron 2 model and
in total output samples per second for the WaveGlow model. Both measures are
recorded as `train_iter_items/sec` (after each iteration) and
`train_epoch_items/sec` (averaged over epoch) in the output log file `./output/nvlog.json`. The result is
averaged over an entire training epoch and summed over all GPUs that were
included in the training.
Even though the training script uses all available GPUs, you can change
this behavior by setting the `CUDA_VISIBLE_DEVICES` variable in your
environment or by setting the `NV_GPU` variable at the Docker container launch
([see section "GPU isolation"](https://github.com/NVIDIA/nvidia-docker/wiki/nvidia-docker#gpu-isolation)).
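For example (the device indices, placeholder options and image name below are illustrative), you could restrict training to the first two GPUs either from inside the container or at launch time:
```bash
# Use only GPUs 0 and 1 for an already-running training session.
CUDA_VISIBLE_DEVICES=0,1 python -m multiproc train.py <training options>

# Or expose only GPUs 0 and 1 to the container when launching it.
NV_GPU=0,1 nvidia-docker run --rm -it <image-name> bash
```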
### Inference process
You can run inference using the `./inference.py` script. This script takes
text as input and runs Tacotron 2 and then WaveGlow inference to produce an
audio file. It requires pre-trained checkpoints from Tacotron 2 and WaveGlow
models and input text as a text file, with one phrase per line.
To run inference, issue:
```bash
python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i text.txt --amp-run
```
Here, `Tacotron2_checkpoint` and `WaveGlow_checkpoint` are pre-trained
checkpoints for the respective models, and `text.txt` contains input phrases.
Audio will be saved in the output folder.
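For reference, the input file is plain text with one phrase per line; a minimal, purely illustrative `text.txt` could contain:
```
The quick brown fox jumps over the lazy dog.
Speech synthesis with Tacotron 2 and WaveGlow sounds quite natural.
```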
You can find all the available options by calling `python inference.py --help`.
## Mixed precision training
*Mixed precision* is the combined use of different numerical precisions in a
computational method. [Mixed precision](https://arxiv.org/abs/1710.03740)
training offers significant computational speedup by performing operations in
half-precision format, while storing minimal information in single-precision
to retain as much information as possible in critical parts of the network.
Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores)
in the Volta and Turing architectures, significant training speedups are
experienced by switching to mixed precision -- up to 3x overall speedup on
the most arithmetically intense model architectures. Using mixed precision
training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
The ability to train deep learning networks with lower precision was
introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
For information about:
* How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740)
paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
documentation.
* Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/)
blog.
* How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp)
from the TensorFlow User Guide.
* APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
### Enabling mixed precision
Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision
(AMP) library from [APEX](https://github.com/NVIDIA/apex) that casts variables
to half-precision upon retrieval, while storing variables in single-precision
format. Furthermore, to preserve small gradient magnitudes in backpropagation,
a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling)
step must be included when applying gradients. In PyTorch, loss scaling can be
easily applied by using the `scale_loss()` method provided by AMP. The scaling value
to be used can be [dynamic](https://nvidia.github.io/apex/fp16_utils.html#apex.fp16_utils.DynamicLossScaler) or fixed.
By default, the `train_tacotron2.sh` and `train_waveglow.sh` scripts will
launch mixed precision training with Tensor Cores. You can change this
behaviour by removing the `--amp-run` flag from the `train.py` script.
For an in-depth walkthrough on AMP, check out the sample usage [here](https://github.com/NVIDIA/apex/tree/master/apex/amp#usage-and-getting-started).
[APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains
utility libraries, such as AMP, which require minimal network code changes to
leverage Tensor Core performance.
To enable mixed precision, the following steps were performed in the Tacotron 2 and
WaveGlow models:
* Import AMP from APEX:
```bash
from apex import amp
amp.lists.functional_overrides.FP32_FUNCS.remove('softmax')
amp.lists.functional_overrides.FP16_FUNCS.append('softmax')
```
* Initialize AMP:
```bash
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
```
* If running on multi-GPU, wrap the model with `DistributedDataParallel`:
```bash
from apex.parallel import DistributedDataParallel as DDP
model = DDP(model)
```
* Scale loss before backpropagation (assuming the loss is stored in a variable called `losses`):
* Default backpropagate for FP32:
```bash
losses.backward()
```
* Scale loss and backpropagate with AMP:
```bash
with amp.scale_loss(losses, optimizer) as scaled_losses:
    scaled_losses.backward()
```
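Putting the pieces above together, the following is a minimal single-GPU training-loop sketch. The model, optimizer, data and loss are stand-in placeholders rather than the classes used in `train.py`, and the `opt_level` and `loss_scale` values are illustrative:
```python
import torch
from apex import amp

# Placeholder model, optimizer and loss; the real training code lives in train.py.
model = torch.nn.Linear(80, 80).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.MSELoss()

# Patch the model and optimizer for mixed precision; loss_scale may be
# "dynamic" or a fixed value.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", loss_scale="dynamic")

for step in range(10):
    inputs = torch.randn(16, 80, device="cuda")
    targets = torch.randn(16, 80, device="cuda")

    optimizer.zero_grad()
    losses = criterion(model(inputs), targets)

    # Scale the loss so that small gradients survive the FP16 backward pass.
    with amp.scale_loss(losses, optimizer) as scaled_losses:
        scaled_losses.backward()

    optimizer.step()
```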
## Benchmarking
The following section shows how to run benchmarks measuring the model
performance in training and inference mode.
### Training performance benchmark
To benchmark the training performance on a specific batch size, run:
**Tacotron 2**
* FP16
```bash
python train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_2500_filelist.txt --dataset-path <dataset-path> --amp-run
```
* For multiple GPUs
* FP16
```bash
python -m multiproc train.py -m Tacotron2 -o <output_dir> -lr 1e-3 --epochs 10 -bs <batch_size> --weight-decay 1e-6 --grad-clip-thresh 1.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_2500_filelist.txt --dataset-path <dataset-path> --amp-run
```
**WaveGlow**
* FP16
```bash
python train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path> --amp-run
```
* For multiple GPUs
* FP16
```bash
python -m multiproc train.py -m WaveGlow -o <output_dir> -lr 1e-4 --epochs 10 -bs <batch_size> --segment-length 8000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark=True --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path <dataset-path> --amp-run
```
Each of these scripts runs for 10 epochs and for each epoch measures the
average number of items per second. The performance results can be read from
the `nvlog.json` files produced by the commands.
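As a convenience, a small script along the lines below can pull the epoch throughput out of such a log. It assumes `nvlog.json` stores one JSON record per line with the metric name appearing as a key somewhere inside each record; this is an assumption about the log layout, so adjust the key and the parsing to the file you actually get:
```python
import json

# Metric name as reported in the output log (see the Training process section).
METRIC = "train_epoch_items/sec"

def find_metric(obj, key):
    """Recursively yield numeric values stored under `key` in nested dicts/lists."""
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key and isinstance(v, (int, float)):
                yield v
            else:
                yield from find_metric(v, key)
    elif isinstance(obj, list):
        for item in obj:
            yield from find_metric(item, key)

values = []
with open("output/nvlog.json") as log:
    for line in log:
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except ValueError:
            continue  # skip any non-JSON header or footer lines
        values.extend(find_metric(record, METRIC))

if values:
    print(f"{METRIC}: last={values[-1]:.2f}, mean={sum(values) / len(values):.2f}")
```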
### Inference performance benchmark
To benchmark the inference performance on a batch size=1, run:
* For FP32
```bash
python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i text.txt --log-file=output/nvlog_fp32.json
```
* For FP16
```bash
python inference.py --tacotron2 <Tacotron2_checkpoint> --waveglow <WaveGlow_checkpoint> -o output/ -i text.txt --amp-run --log-file=output/nvlog_fp16.json
```
The log files contain performance numbers for the Tacotron 2 model
(number of input tokens per second, reported as `tacotron2_items_per_sec`)
and for WaveGlow (number of output samples per second, reported as
`waveglow_items_per_sec`).
## Results
The following sections provide details on how we achieved our performance
and accuracy in training and inference.
### Training accuracy results
Our results were obtained by running the `./platform/train_{tacotron2,waveglow}_{FP16,FP32}_DGX1_16GB_8GPU.sh`
training script in the PyTorch-19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
All of the results were produced using the `train.py` script as described in the
[Training process](#training-process) section of this document.
| Loss (Model/Epoch) | 1 | 250 | 500 | 750 | 1000 |
| :----------------: | ------: | ------: | ------: | ------: | ------: |
| Tacotron 2 FP16 | 13.0732 | 0.5736 | 0.4408 | 0.3923 | 0.3735 |
| Tacotron 2 FP32 | 8.5776 | 0.4807 | 0.3875 | 0.3421 | 0.3308 |
| WaveGlow FP16 | -2.2054 | -5.7602 | -5.901 | -5.9706 | -6.0258 |
| WaveGlow FP32 | -3.0327 | -5.858 | -6.0056 | -6.0613 | -6.1087 |
Tacotron 2 FP16 loss - batch size 80 (mean and std over 16 runs)
![](./img/tacotron2_amp_loss.png "Tacotron 2 FP16 loss")
Tacotron 2 FP32 loss - batch size 48 (mean and std over 16 runs)
![](./img/tacotron2_fp32_loss.png "Tacotron 2 FP32 loss")
WaveGlow FP16 loss - batch size 8 (mean and std over 16 runs)
![](./img/waveglow_fp16_loss.png "WaveGlow FP16 loss")
WaveGlow FP32 loss - batch size 4 (mean and std over 16 runs)
![](./img/waveglow_fp32_loss.png "WaveGlow FP32 loss")
### Training performance results
Our results were obtained by running the `./platform/train_{tacotron2,waveglow}_{FP16,FP32}_DGX1_16GB_8GPU.sh`
training script in the PyTorch-19.05-py3 NGC container on NVIDIA DGX-1 with
8x V100 16G GPUs. Performance numbers (in input tokens per second for
Tacotron 2 and output samples per second for WaveGlow) were averaged over
an entire training epoch.
This table shows the results for Tacotron 2:
|Number of GPUs|Batch size per GPU|Mixed precision tokens/sec|FP32 tokens/sec|Speed-up with mixed precision|Multi-GPU weak scaling with mixed precision|Multi-GPU weak scaling with FP32|
|---:|---:|---:|---:|---:|---:|---:|
|1|128@FP16, 64@FP32 | 3,746 | 2,087 | 1.79 | 1.00 | 1.00 |
|4|128@FP16, 64@FP32 | 13,264 | 8,052 | 1.65 | 3.54 | 3.86 |
|8|128@FP16, 64@FP32 | 25,056 | 15,863 | 1.58 | 6.69 | 7.60 |
The following table shows the results for WaveGlow:
|Number of GPUs|Batch size per GPU|Mixed precision samples/sec|FP32 samples/sec|Speed-up with mixed precision|Multi-GPU weak scaling with mixed precision|Multi-GPU weak scaling with FP32|
|---:|---:|---:|---:|---:|---:|---:|
|1| 10@FP16, 4@FP32 | 79248.87426 | 35695.56774 | 2.22 | 1.00 | 1.00 |
|4| 10@FP16, 4@FP32 | 275310.0262 | 126497.6265 | 2.18 | 3.47 | 3.54 |
|8| 10@FP16, 4@FP32 | 576709.4935 | 255155.1798 | 2.26 | 7.28 | 7.15 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Expected training time
The following table shows the expected training time for convergence for Tacotron 2 (1500 epochs):
|Number of GPUs|Batch size per GPU|Time to train with mixed precision (Hrs)|Time to train with FP32 (Hrs)|Speed-up with mixed precision|
|---:|---:|---:|---:|---:|
|1| 128@FP16, 64@FP32 | 137.33 | 227.66 | 1.66 |
|4| 128@FP16, 64@FP32 | 40.68 | 63.99 | 1.57 |
|8| 128@FP16, 64@FP32 | 20.74 | 32.47 | 1.57 |
The following table shows the expected training time for convergence for WaveGlow (1000 epochs):
|Number of GPUs|Batch size per GPU|Time to train with mixed precision (Hrs)|Time to train with FP32 (Hrs)|Speed-up with mixed precision|
|---:|---:|---:|---:|---:|
|1| 10@FP16, 4@FP32 | 358.00 | 793.97 | 2.22 |
|4| 10@FP16, 4@FP32 | 103.10 | 223.59 | 2.17 |
|8| 10@FP16, 4@FP32 | 50.40 | 109.45 | 2.17 |
### Inference performance results
Our results were obtained by running the `./inference.py` inference script in the
PyTorch-19.05-py3 NGC container on NVIDIA DGX-1 with 8x V100 16G GPUs.
Performance numbers (in input tokens per second for Tacotron 2 and output
samples per second for WaveGlow) were averaged over 16 runs.
The following table shows the inference performance results for Tacotron 2.
Results are measured in the number of input tokens per second.
|Number of GPUs|Mixed precision tokens/sec|FP32 tokens/sec|Speed-up with mixed precision|
|---:|---:|---:|---:|
|1|168|173|0.97|
The following table shows the inference performance results for WaveGlow.
Results are measured in the number of output audio samples per second.<sup>1</sup>
|Number of GPUs|Mixed precision samples/sec|FP32 samples/sec|Speed-up with mixed precision|
|---:|---:|---:|---:|
|1|583318|553380|1.05|
<sup>1</sup>With sampling rate equal to 22050, one second of audio is generated from 22050 samples.
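For example, at the measured mixed precision rate of 583,318 samples per second, WaveGlow produces roughly 26 seconds of audio per second of GPU time (583,318 / 22,050 ≈ 26.5).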
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
## Changelog
March 2019
* Initial release
June 2019
* AMP support
* Data preprocessing for Tacotron 2 training
* Fixed dropouts on LSTMCells
## Known issues
There are no known issues in this release.