# Source: DeepLearningExamples/PyTorch/Detection/SSD/qa/qa_perf_main.py
# (file-viewer metadata: 200 lines, 5.8 KiB, Python)

# core imports
import os
import numpy as np
import json
from pprint import pprint
import time
# pytorch imports
import torch
import torch.utils.data.distributed
from torch.autograd import Variable
# Apex imports
try:
from apex.parallel.LARC import LARC
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
except ImportError:
raise ImportError("Please install APEX from https://github.com/nvidia/apex")
# project imports
from main import train, make_parser
from src.logger import BenchLogger
# from src.train import benchmark_inference_loop, benchmark_train_loop
from SSD import _C as C
# Benchmark outcome shared across this module: assigned by
# benchmark_train_loop (on local rank 0) and benchmark_inference_loop, then
# written by main() into the results file under 'images_per_second'.
# Stays None until one of the benchmark loops has finished.
RESULT = None
def add_benchmark_args(parser):
    """Register the benchmark-specific command-line options on ``parser``.

    Args:
        parser: an ``argparse``-style parser exposing ``add_argument``.

    Returns:
        The same ``parser`` object, so the call can be chained.
    """
    # The option used to combine ``default='inference'`` with
    # ``required=True``, which made the default unreachable. It is optional
    # now so the declared default is honoured; passing the flag explicitly
    # behaves exactly as before.
    parser.add_argument('--benchmark-mode', type=str, choices=['training', 'inference'],
                        default='inference',
                        help='which benchmark loop to run (default: inference)')
    parser.add_argument('--results-file', default='experiment_raport.json', type=str,
                        help='file in which to store JSON experiment raport')
    parser.add_argument('--benchmark-file', type=str, default=None, metavar='FILE',
                        help='path to the file with baselines')
    return parser
def benchmark_train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
    """Run a fixed number of timed training iterations and publish throughput.

    The training data is cycled endlessly through ``loop``. Iterations with
    index below ``args.benchmark_warmup`` run untimed; the next
    ``args.benchmark_iterations`` iterations are timed and reported to
    ``logger.update``. On the iteration after those, the value returned by
    ``logger.print_result()`` is reduced onto rank 0 with
    ``torch.distributed.reduce`` (only when ``args.N_gpu > 1``), stored in the
    module-level ``RESULT`` by the process with ``args.local_rank == 0``, and
    the function returns.

    Args:
        model: network called as ``model(img)``, returning ``(ploc, plabel)``.
        loss_func: callable invoked as ``loss_func(ploc, plabel, gloc, glabel)``.
        epoch: unused here.
        optim: optimizer; must provide ``step``/``zero_grad`` and, depending
            on ``args.fp16``/``args.amp``, ``scale_loss`` or ``backward``.
        train_dataloader: iterable of batches; image, boxes, labels and box
            offsets are read from ``data[0][k][0]`` for ``k`` in 0..3.
        val_dataloader: unused here.
        encoder: object whose ``dboxes`` tensor is passed to ``C.box_encoder``.
        iteration: counter incremented locally once per processed batch.
        logger: object providing ``update(batch_size, seconds)`` and
            ``print_result()``.
        args: namespace providing ``benchmark_warmup``,
            ``benchmark_iterations``, ``no_cuda``, ``fp16``, ``amp``,
            ``N_gpu``, ``local_rank`` and ``batch_size``.
        mean, std: values applied in place as ``img.sub_(mean).div_(std)``.

    Returns:
        None. The outcome is delivered through the module-level ``RESULT``.
    """
    global RESULT

    start_time = None
    # tensor for results
    result = torch.zeros((1,)).cuda()

    for i, data in enumerate(loop(train_dataloader)):
        if i >= args.benchmark_warmup:
            # CUDA work is queued asynchronously, so wait for the device
            # before reading the clock (benchmark_inference_loop does the
            # same); otherwise leftover work leaks into this measurement.
            torch.cuda.synchronize()
            start_time = time.time()

        img = data[0][0][0]
        bbox = data[0][1][0]
        label = data[0][2][0]
        label = label.type(torch.cuda.LongTensor)
        bbox_offsets = data[0][3][0]

        # handle random flipping outside of DALI for now
        bbox_offsets = bbox_offsets.cuda()
        img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False)

        if not args.no_cuda:
            img = img.cuda()
            bbox = bbox.cuda()
            label = label.cuda()
            bbox_offsets = bbox_offsets.cuda()
        img.sub_(mean).div_(std)

        N = img.shape[0]
        if bbox_offsets[-1].item() == 0:
            # Skipped batches are neither trained on nor reported to the logger.
            print("No labels in batch")
            continue
        bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5)

        # Reshape the flat encoder output to one row of M entries per image.
        M = bbox.shape[0] // N
        bbox = bbox.view(N, M, 4)
        label = label.view(N, M)

        ploc, plabel = model(img)
        ploc, plabel = ploc.float(), plabel.float()

        trans_bbox = bbox.transpose(1, 2).contiguous().cuda()

        if not args.no_cuda:
            label = label.cuda()
        gloc = Variable(trans_bbox, requires_grad=False)
        glabel = Variable(label, requires_grad=False)

        loss = loss_func(ploc, plabel, gloc, glabel)

        # loss scaling
        if args.fp16:
            if args.amp:
                with optim.scale_loss(loss) as scale_loss:
                    scale_loss.backward()
            else:
                optim.backward(loss)
        else:
            loss.backward()

        optim.step()
        optim.zero_grad()
        iteration += 1

        # reduce all results from every gpu
        if i >= args.benchmark_warmup + args.benchmark_iterations:
            result.data[0] = logger.print_result()
            if args.N_gpu > 1:
                torch.distributed.reduce(result, 0)
            if args.local_rank == 0:
                RESULT = float(result.data[0])
            return

        if i >= args.benchmark_warmup:
            # Make sure this iteration's kernels have finished before the
            # elapsed time is computed.
            torch.cuda.synchronize()
            logger.update(args.batch_size, time.time() - start_time)
def loop(dataloader):
    """Yield items from ``dataloader`` forever, restarting it when exhausted.

    Args:
        dataloader: a re-iterable object; it is iterated again from the
            start each time a pass over it finishes.

    Yields:
        The items of ``dataloader`` in order, pass after pass.

    Raises:
        ValueError: if a complete pass over ``dataloader`` produces no items.
            Without this check the generator would spin forever without ever
            yielding, hanging the consumer.
    """
    while True:
        produced = False
        for data in dataloader:
            produced = True
            yield data
        if not produced:
            raise ValueError("dataloader yielded no batches; cannot loop over it")
def benchmark_inference_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
    """Time forward passes over the validation data and publish throughput.

    Validation batches are cycled endlessly through ``loop``. Steps with index
    below ``args.benchmark_warmup`` run untimed; the next
    ``args.benchmark_iterations`` steps are timed (batch fetch included) and
    reported to ``logger.update``. On the step after those, the value returned
    by ``logger.print_result()`` is stored in the module-level ``RESULT`` and
    the function returns.

    Only ``model``, ``val_dataloader``, ``logger``, ``args``, ``mean`` and
    ``std`` are used; the remaining parameters are accepted so the signature
    matches ``benchmark_train_loop``.

    Raises:
        AssertionError: if ``args.N_gpu`` is not 1.
    """
    global RESULT

    assert args.N_gpu == 1, 'Inference benchmark only on 1 gpu'

    model.eval()
    batches = loop(val_dataloader)
    step = -1
    tic = None

    with torch.no_grad():
        while True:
            step += 1

            # Drain pending GPU work so the timestamp marks a clean start.
            torch.cuda.synchronize()
            if step >= args.benchmark_warmup:
                tic = time.time()

            batch = next(batches)
            img = batch[0]
            if not args.no_cuda:
                img = img.cuda()
            if args.fp16:
                img = img.half()

            img.sub_(mean).div_(std)
            img = Variable(img, requires_grad=False)
            _ = model(img)
            # Wait for the forward pass to finish before reading the clock.
            torch.cuda.synchronize()

            if step >= args.benchmark_warmup + args.benchmark_iterations:
                RESULT = logger.print_result()
                return

            if step >= args.benchmark_warmup:
                logger.update(args.batch_size, time.time() - tic)
def _store_result(results_file, n_gpu, batch_size, images_per_second):
    """Merge one throughput figure into the JSON report at ``results_file``."""
    try:
        with open(results_file) as f:
            results = json.load(f)
    except FileNotFoundError:
        # Start a fresh report instead of discarding a finished benchmark run.
        results = {}

    # Create missing nesting levels so a GPU count or batch size that was not
    # pre-seeded in the report does not raise KeyError.
    per_gpu = results.setdefault('metrics', {}).setdefault(str(n_gpu), {})
    per_gpu[str(batch_size)] = {'images_per_second': images_per_second}
    pprint(results)

    with open(results_file, 'w') as f:
        json.dump(results, f)


def main(args):
    """Run the selected benchmark and record its result in the JSON report.

    Creates ``./models`` on local rank 0, seeds torch and numpy when
    ``args.seed`` is given, enables cuDNN benchmark mode, and picks the
    training or inference benchmark loop from ``args.benchmark_mode``. It then
    forces ``args.epochs`` to 1 and hands the loop to ``train``. Afterwards
    the process with ``args.local_rank == 0`` stores the module-level
    ``RESULT`` in ``args.results_file`` under
    ``metrics -> <N_gpu> -> <batch_size> -> images_per_second``.

    Args:
        args: parsed command-line namespace; ``epochs`` is overwritten.
    """
    if args.local_rank == 0:
        os.makedirs('./models', exist_ok=True)

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    torch.backends.cudnn.benchmark = True

    if args.benchmark_mode == 'training':
        train_loop_func = benchmark_train_loop
        logger = BenchLogger('Training benchmark')
    else:
        train_loop_func = benchmark_inference_loop
        logger = BenchLogger('Inference benchmark')

    args.epochs = 1

    train(train_loop_func, logger, args)

    if args.local_rank == 0:
        _store_result(args.results_file, args.N_gpu, args.batch_size, RESULT)
if __name__ == "__main__":
    # Extend the project's base parser with the benchmark-only options,
    # echo the parsed configuration, then run the benchmark.
    parser = add_benchmark_args(make_parser())
    args = parser.parse_args()
    print(args)
    main(args)