177 lines
7 KiB
Python
177 lines
7 KiB
Python
# *****************************************************************************
|
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
# * Neither the name of the NVIDIA CORPORATION nor the
|
|
# names of its contributors may be used to endorse or promote products
|
|
# derived from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#
|
|
# *****************************************************************************
|
|
|
|
from tacotron2.text import text_to_sequence
|
|
import models
|
|
import torch
|
|
import argparse
|
|
import numpy as np
|
|
from scipy.io.wavfile import write
|
|
import json
|
|
import time
|
|
|
|
from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model
|
|
|
|
from dllogger.logger import LOGGER
|
|
import dllogger.logger as dllg
|
|
from dllogger import tags
|
|
from dllogger.autologging import log_hardware, log_args
|
|
|
|
def parse_args(parser):
    """
    Register the inference-benchmark command-line options on ``parser``.

    PARAMS
    ------
    parser: an ``argparse.ArgumentParser`` (or compatible object exposing
        ``add_argument``) that receives the options.

    RETURNS
    -------
    The same ``parser`` object, so the call can be chained.
    """
    # ``required=True`` means a default would never be used, so none is given.
    parser.add_argument('-m', '--model-name', type=str, required=True,
                        help='Model to benchmark (Tacotron2 or WaveGlow)')
    parser.add_argument('--input-text', type=str, default=None,
                        help='Path to tensor containing text (when running Tacotron2)')
    parser.add_argument('--input-mels', type=str, default=None,
                        help='Path to tensor containing mels (when running WaveGlow)')
    parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
                        help='Sampling rate')
    parser.add_argument('--fp16-run', action='store_true',
                        help='inference in fp16')
    parser.add_argument('-bs', '--batch-size', type=int, default=1,
                        help='Number of input items per inference batch')
    parser.add_argument('--create-benchmark', action='store_true',
                        help='Run the longer benchmark and write the log '
                             'under qa/baselines/')

    return parser
|
|
|
|
|
|
def collate_text(batch):
    """Collate a batch of encoded text sequences into one zero-padded tensor.

    Sequences are ordered by decreasing length and right-padded with zeros
    to the length of the longest one.

    PARAMS
    ------
    batch: [text_normalized]
        Sequence of 1-D integer tensors, one per utterance.

    RETURNS
    -------
    text_padded: LongTensor of shape (len(batch), max_len); row ``i`` holds
        the i-th longest sequence.
    input_lengths: LongTensor of shape (len(batch),) with the matching
        (unpadded) lengths in decreasing order.
    """
    # An empty batch has no "longest" sequence; return empty tensors instead
    # of failing on input_lengths[0].
    if len(batch) == 0:
        return (torch.zeros(0, 0, dtype=torch.long),
                torch.zeros(0, dtype=torch.long))

    input_lengths, ids_sorted_decreasing = torch.sort(
        torch.LongTensor([len(x) for x in batch]),
        dim=0, descending=True)
    # Convert to plain ints so sizes and list indices are not 0-dim tensors.
    max_input_len = int(input_lengths[0])

    # Right zero-pad every sequence to the max input length.
    text_padded = torch.zeros(len(batch), max_input_len, dtype=torch.long)
    for row, source_idx in enumerate(ids_sorted_decreasing.tolist()):
        text = batch[source_idx]
        text_padded[row, :text.size(0)] = text

    return text_padded, input_lengths
|
|
|
|
|
|
def _benchmark_log_path(args):
    """Return the JSON log file name that encodes this run's configuration."""
    precision = "16" if args.fp16_run else "32"
    if args.create_benchmark:
        stem = "qa/baselines/" + args.model_name + "_inferbench"
    else:
        stem = args.model_name + "_infer"
    return "{}_BS{}_FP{}_DGX1_16GB_1GPU_single.json".format(
        stem, args.batch_size, precision)


def _load_inputs(args):
    """Load the input batch for the selected model and move it to the GPU."""
    if args.model_name == 'Tacotron2':
        text_padded = torch.load(args.input_text)
        return text_padded[:args.batch_size].cuda().long()

    mel_padded = torch.load(args.input_mels)
    # Tile the stored mels along dim 0 until there are enough rows for the
    # requested batch size, then trim to exactly that size.
    available = max(1, mel_padded.size(0))
    repeats = max(1, -(-args.batch_size // available))  # ceiling division
    mel_padded = torch.cat([mel_padded] * repeats)[:args.batch_size].cuda()
    if args.fp16_run:
        mel_padded = mel_padded.half()
    return mel_padded


def _timed_inference(model, model_name, inputs):
    """Run one inference pass and return ``(elapsed_seconds, num_items)``."""
    # CUDA kernels are launched asynchronously: synchronize before reading
    # the clock on both sides so the interval covers the actual GPU work.
    torch.cuda.synchronize()
    start = time.perf_counter()
    with torch.no_grad():
        if model_name == 'Tacotron2':
            # ## to generate new padded mels, add here:
            # torch.save(mels, "qa/mel_padded.pt")
            _, mels, _, _ = model.infer(inputs)
            outputs = inputs
        else:
            outputs = model.infer(inputs).float()
    torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    # Tacotron2 throughput is counted over the input tensor, WaveGlow
    # throughput over the generated audio tensor.
    num_items = outputs.size(0) * outputs.size(1)
    return elapsed, num_items


def main():
    """
    Launches inference benchmark.
    Inference is executed on a single GPU.

    Parses the command line, configures the dllogger backends, loads the
    requested model and logs ``items_per_sec`` for every measured iteration.
    The first ``dry_runs`` iterations are warm-up and are not logged.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    # Unknown options are tolerated here; the parser is handed on to
    # load_and_setup_model below (presumably it registers model-specific
    # options there -- confirm against inference.py).
    args, _ = parser.parse_known_args()

    # Fail early with a usage message instead of a NameError after the
    # warm-up iterations or an opaque error inside torch.load(None).
    if args.model_name not in ('Tacotron2', 'WaveGlow'):
        parser.error("unsupported --model-name '{}': expected 'Tacotron2' "
                     "or 'WaveGlow'".format(args.model_name))
    if args.model_name == 'Tacotron2' and args.input_text is None:
        parser.error("--input-text is required when running Tacotron2")
    if args.model_name == 'WaveGlow' and args.input_mels is None:
        parser.error("--input-mels is required when running WaveGlow")

    log_file = _benchmark_log_path(args)

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])
    LOGGER.register_metric("items_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)

    log_hardware()
    log_args(args)

    # ## uncomment to generate new padded text
    # texts = []
    # f = open('qa/ljs_text_train_subset_2500.txt', 'r')
    # texts = f.readlines()
    # sequence = []
    # for i, text in enumerate(texts):
    #     sequence.append(torch.IntTensor(text_to_sequence(text, ['english_cleaners'])))
    #
    # text_padded, input_lengths = collate_text(sequence)
    # text_padded = torch.autograd.Variable(text_padded).cuda().long()
    # torch.save(text_padded, "qa/text_padded.pt")
    # torch.save(input_lengths, "qa/input_lengths.pt")

    model = load_and_setup_model(args.model_name, parser, None, args.fp16_run)

    # The inputs are identical for every iteration, so read them from disk
    # and copy them to the GPU once rather than inside the loop.
    inputs = _load_inputs(args)

    dry_runs = 3
    num_iters = (16 + dry_runs) if args.create_benchmark else (1 + dry_runs)

    for i in range(num_iters):
        # Skip logging for the first inferences, which are slower.
        measured = i >= dry_runs
        if measured:
            LOGGER.iteration_start()

        inference_time, num_items = _timed_inference(
            model, args.model_name, inputs)

        if measured:
            LOGGER.log(key="items_per_sec", value=(num_items/inference_time))
            LOGGER.iteration_stop()

    LOGGER.finish()
|
|
|
|
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|