200 lines
5.8 KiB
Python
200 lines
5.8 KiB
Python
# core imports
|
|
import os
|
|
import numpy as np
|
|
import json
|
|
from pprint import pprint
|
|
import time
|
|
|
|
# pytorch imports
|
|
import torch
|
|
import torch.utils.data.distributed
|
|
from torch.autograd import Variable
|
|
|
|
|
|
# Apex imports
|
|
try:
|
|
from apex.parallel.LARC import LARC
|
|
from apex.parallel import DistributedDataParallel as DDP
|
|
from apex.fp16_utils import *
|
|
except ImportError:
|
|
raise ImportError("Please install APEX from https://github.com/nvidia/apex")
|
|
|
|
# project imports
|
|
from main import train, make_parser
|
|
from src.logger import BenchLogger
|
|
# from src.train import benchmark_inference_loop, benchmark_train_loop
|
|
|
|
from SSD import _C as C
|
|
|
|
# Benchmark throughput shared between functions: the benchmark loops assign it
# from logger.print_result(), and main() reads it when writing the
# 'images_per_second' entry of the results file.
RESULT = None
|
|
|
|
|
|
def add_benchmark_args(parser):
    """Register the benchmark-specific command line options on ``parser``.

    Args:
        parser: an ``argparse.ArgumentParser`` (or any object exposing a
            compatible ``add_argument``) to extend in place.

    Returns:
        The same ``parser`` object, so calls can be chained.
    """
    # The mode is mandatory, so no default is declared: argparse never applies
    # a default to an option marked ``required=True``.
    parser.add_argument('--benchmark-mode', type=str, choices=['training', 'inference'],
                        required=True,
                        help='which benchmark loop to run')
    # The default file name keeps its historical spelling so existing result
    # files and launch scripts keep working; only the help text is corrected.
    parser.add_argument('--results-file', default='experiment_raport.json', type=str,
                        help='file in which to store JSON experiment report')
    parser.add_argument('--benchmark-file', type=str, default=None, metavar='FILE',
                        help='path to the file with baselines')
    return parser
|
|
|
|
def benchmark_train_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
    """Time training iterations and publish the measured throughput.

    Runs forward / backward / optimizer steps over ``train_dataloader``, which
    is restarted whenever it is exhausted.  Iterations with an index below
    ``args.benchmark_warmup`` are not timed; the following
    ``args.benchmark_iterations`` iterations are reported to ``logger``.  On
    the iteration after that, ``logger.print_result()`` is stored in a CUDA
    tensor, reduced onto rank 0 when ``args.N_gpu > 1``, and written to the
    module-level ``RESULT`` on the process with ``args.local_rank == 0``.

    Args:
        model: network called as ``model(img)``, returning ``(ploc, plabel)``.
        loss_func: callable invoked as ``loss_func(ploc, plabel, gloc, glabel)``.
        epoch: not used by this function.
        optim: optimizer. Must provide ``step()`` and ``zero_grad()``, plus
            ``scale_loss(loss)`` (when ``args.fp16 and args.amp``) or
            ``backward(loss)`` (when ``args.fp16`` without ``args.amp``).
        train_dataloader: re-iterable source of training batches.
        val_dataloader: not used by this function.
        encoder: object whose ``dboxes`` tensor is passed to ``C.box_encoder``.
        iteration: iteration counter; it is incremented locally only.
        logger: object providing ``update(batch_size, seconds)`` and
            ``print_result()``.
        args: parsed options; reads ``benchmark_warmup``,
            ``benchmark_iterations``, ``no_cuda``, ``fp16``, ``amp``,
            ``N_gpu``, ``local_rank`` and ``batch_size``.
        mean: value subtracted in place from the image batch.
        std: value the image batch is divided by in place.

    Returns:
        None.
    """
    global RESULT

    start_time = None
    # tensor for results
    result = torch.zeros((1,)).cuda()
    for i, data in enumerate(loop(train_dataloader)):
        if i >= args.benchmark_warmup:
            # CUDA work is queued asynchronously: drain whatever is pending so
            # the timer starts from an idle device (same approach as
            # benchmark_inference_loop).
            torch.cuda.synchronize()
            start_time = time.time()

        # NOTE(review): the nested indexing presumably follows the DALI
        # pipeline output layout -- confirm against the data loader.
        img = data[0][0][0]
        bbox = data[0][1][0]
        label = data[0][2][0]
        label = label.type(torch.cuda.LongTensor)
        bbox_offsets = data[0][3][0]
        # handle random flipping outside of DALI for now
        bbox_offsets = bbox_offsets.cuda()
        img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, False)

        if not args.no_cuda:
            img = img.cuda()
            bbox = bbox.cuda()
            label = label.cuda()
            bbox_offsets = bbox_offsets.cuda()
        img.sub_(mean).div_(std)

        N = img.shape[0]
        if bbox_offsets[-1].item() == 0:
            # Nothing to encode; the iteration is skipped and not reported.
            print("No labels in batch")
            continue
        bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5)

        # Split the flat encoder output back into one row per image.
        M = bbox.shape[0] // N
        bbox = bbox.view(N, M, 4)
        label = label.view(N, M)

        ploc, plabel = model(img)
        ploc, plabel = ploc.float(), plabel.float()

        # (N, M, 4) -> (N, 4, M)
        trans_bbox = bbox.transpose(1, 2).contiguous().cuda()

        if not args.no_cuda:
            label = label.cuda()
        gloc = Variable(trans_bbox, requires_grad=False)
        glabel = Variable(label, requires_grad=False)

        loss = loss_func(ploc, plabel, gloc, glabel)

        # loss scaling
        if args.fp16:
            if args.amp:
                with optim.scale_loss(loss) as scale_loss:
                    scale_loss.backward()
            else:
                optim.backward(loss)
        else:
            loss.backward()

        optim.step()
        optim.zero_grad()
        # Rebinds the local name only; nothing is returned to the caller.
        iteration += 1

        # reduce all results from every gpu
        if i >= args.benchmark_warmup + args.benchmark_iterations:
            result.data[0] = logger.print_result()
            if args.N_gpu > 1:
                torch.distributed.reduce(result, 0)
            if args.local_rank == 0:
                RESULT = float(result.data[0])
            return

        if i >= args.benchmark_warmup:
            # Wait for the queued kernels of this step to finish; otherwise
            # the elapsed time would not include the GPU work just launched.
            torch.cuda.synchronize()
            logger.update(args.batch_size, time.time() - start_time)
|
|
|
|
def loop(dataloader):
    """Yield items from ``dataloader`` endlessly, restarting it when exhausted.

    Args:
        dataloader: an iterable that can be iterated repeatedly.

    Yields:
        The items of ``dataloader``, pass after pass.

    Raises:
        ValueError: if a complete pass over ``dataloader`` produces no item.
            Without this check an empty (or one-shot, already consumed)
            iterable would make the generator spin forever without yielding.
    """
    while True:
        produced = False
        for data in dataloader:
            produced = True
            yield data
        if not produced:
            raise ValueError('dataloader yielded no data')
|
|
|
|
def benchmark_inference_loop(model, loss_func, epoch, optim, train_dataloader, val_dataloader, encoder, iteration, logger, args, mean, std):
    """Time forward passes over ``val_dataloader`` and store the throughput.

    The model is put in eval mode and run under ``torch.no_grad()`` on batches
    drawn endlessly from ``val_dataloader``.  Steps with an index below
    ``args.benchmark_warmup`` are not timed; the next
    ``args.benchmark_iterations`` steps are reported to ``logger``.  On the
    step after that, ``logger.print_result()`` is stored in the module-level
    ``RESULT`` and the function returns.

    ``loss_func``, ``epoch``, ``optim``, ``train_dataloader``, ``encoder`` and
    ``iteration`` are accepted but not used.

    Raises:
        AssertionError: if ``args.N_gpu`` is not 1.
    """
    global RESULT

    assert args.N_gpu == 1, 'Inference benchmark only on 1 gpu'
    model.eval()

    batches = loop(val_dataloader)
    tick = None
    step = 0
    with torch.no_grad():
        while True:
            # Drain pending GPU work so the timer brackets only this step.
            torch.cuda.synchronize()
            timed = step >= args.benchmark_warmup
            if timed:
                tick = time.time()

            # Fetching the batch happens after the timer starts, so it is
            # part of the measured interval.
            img = next(batches)[0]
            if not args.no_cuda:
                img = img.cuda()
            if args.fp16:
                img = img.half()

            img.sub_(mean).div_(std)
            model(Variable(img, requires_grad=False))
            torch.cuda.synchronize()

            if step >= args.benchmark_warmup + args.benchmark_iterations:
                RESULT = logger.print_result()
                return

            if timed:
                logger.update(args.batch_size, time.time() - tick)
            step += 1
|
|
|
|
|
|
def main(args):
    """Run the selected benchmark and record its throughput in the report.

    Seeds torch and numpy when ``args.seed`` is set, enables cuDNN
    autotuning, picks the training or inference benchmark loop according to
    ``args.benchmark_mode``, forces a single epoch and hands control to
    ``train``.  Afterwards the process with ``args.local_rank == 0`` merges
    the module-level ``RESULT`` into the JSON file ``args.results_file``.

    Args:
        args: parsed command line options; reads ``local_rank``, ``seed``,
            ``benchmark_mode``, ``results_file``, ``N_gpu`` and
            ``batch_size`` and overwrites ``epochs`` with 1.
    """
    if args.local_rank == 0:
        os.makedirs('./models', exist_ok=True)

    if args.seed is not None:
        print("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    torch.backends.cudnn.benchmark = True

    if args.benchmark_mode == 'training':
        train_loop_func = benchmark_train_loop
        logger = BenchLogger('Training benchmark')
    else:
        train_loop_func = benchmark_inference_loop
        logger = BenchLogger('Inference benchmark')

    args.epochs = 1

    train(train_loop_func, logger, args)

    if args.local_rank == 0:
        _store_result(args.results_file, args.N_gpu, args.batch_size, RESULT)


def _store_result(results_file, n_gpu, batch_size, images_per_second):
    """Merge one throughput measurement into the JSON report and print it.

    The value is stored under
    ``results['metrics'][str(n_gpu)][str(batch_size)]['images_per_second']``.
    A missing report file, or missing ``'metrics'`` / GPU-count levels, are
    created instead of aborting after the benchmark has already run.

    Returns:
        The updated report dictionary.
    """
    try:
        with open(results_file) as f:
            results = json.load(f)
    except FileNotFoundError:
        # First run against this path: start a fresh report.
        results = {}

    per_gpu = results.setdefault('metrics', {}).setdefault(str(n_gpu), {})
    per_gpu[str(batch_size)] = {'images_per_second': images_per_second}
    pprint(results)

    with open(results_file, 'w') as f:
        json.dump(results, f)
    return results
|
|
|
|
|
|
if __name__ == "__main__":
    # Extend the project's parser with the benchmark options, echo the parsed
    # configuration, then run the benchmark.
    args = add_benchmark_args(make_parser()).parse_args()
    print(args)
    main(args)
|