213 lines
9.7 KiB
Python
213 lines
9.7 KiB
Python
# *****************************************************************************
|
|
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
# * Neither the name of the NVIDIA CORPORATION nor the
|
|
# names of its contributors may be used to endorse or promote products
|
|
# derived from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#
|
|
# *****************************************************************************
|
|
|
|
from tacotron2.text import text_to_sequence
|
|
import models
|
|
import torch
|
|
import argparse
|
|
import numpy as np
|
|
from scipy.io.wavfile import write
|
|
|
|
import sys
|
|
|
|
from inference import checkpoint_from_distributed, unwrap_distributed, MeasureTime, prepare_input_sequence, load_and_setup_model
|
|
|
|
import time
|
|
import dllogger as DLLogger
|
|
from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
|
|
|
|
from apex import amp
|
|
|
|
from waveglow.denoiser import Denoiser
|
|
|
|
def parse_args(parser):
|
|
"""
|
|
Parse commandline arguments.
|
|
"""
|
|
parser.add_argument('--tacotron2', type=str,
|
|
help='full path to the Tacotron2 model checkpoint file')
|
|
parser.add_argument('--waveglow', type=str,
|
|
help='full path to the WaveGlow model checkpoint file')
|
|
parser.add_argument('-s', '--sigma-infer', default=0.6, type=float)
|
|
parser.add_argument('-d', '--denoising-strength', default=0.01, type=float)
|
|
parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
|
|
help='Sampling rate')
|
|
|
|
run_mode = parser.add_mutually_exclusive_group()
|
|
run_mode.add_argument('--fp16', action='store_true',
|
|
help='Run inference with FP16')
|
|
run_mode.add_argument('--cpu', action='store_true',
|
|
help='Run inference on CPU')
|
|
|
|
parser.add_argument('--log-file', type=str, default='nvlog.json',
|
|
help='Filename for logging')
|
|
parser.add_argument('--stft-hop-length', type=int, default=256,
|
|
help='STFT hop length for estimating audio length from mel size')
|
|
parser.add_argument('--num-iters', type=int, default=10,
|
|
help='Number of iterations')
|
|
parser.add_argument('-il', '--input-length', type=int, default=64,
|
|
help='Input length')
|
|
parser.add_argument('-bs', '--batch-size', type=int, default=1,
|
|
help='Batch size')
|
|
return parser
|
|
|
|
|
|
def print_stats(measurements_all):
|
|
|
|
throughput = measurements_all['throughput']
|
|
preprocessing = measurements_all['pre_processing']
|
|
type_conversion = measurements_all['type_conversion']
|
|
storage = measurements_all['storage']
|
|
data_transfer = measurements_all['data_transfer']
|
|
postprocessing = [sum(p) for p in zip(type_conversion,storage,data_transfer)]
|
|
latency = measurements_all['latency']
|
|
waveglow_latency = measurements_all['waveglow_latency']
|
|
tacotron2_latency = measurements_all['tacotron2_latency']
|
|
denoiser_latency = measurements_all['denoiser_latency']
|
|
num_mels_per_audio = measurements_all['num_mels_per_audio']
|
|
|
|
latency.sort()
|
|
|
|
cf_50 = max(latency[:int(len(latency)*0.50)])
|
|
cf_90 = max(latency[:int(len(latency)*0.90)])
|
|
cf_95 = max(latency[:int(len(latency)*0.95)])
|
|
cf_99 = max(latency[:int(len(latency)*0.99)])
|
|
cf_100 = max(latency[:int(len(latency)*1.0)])
|
|
|
|
print("Throughput average (samples/sec) = {:.0f}".format(np.mean(throughput)))
|
|
print("Preprocessing average (seconds) = {:.4f}".format(np.mean(preprocessing)))
|
|
print("Postprocessing average (seconds) = {:.4f}".format(np.mean(postprocessing)))
|
|
print("Number of mels per audio average = {:.0f}".format(np.mean(num_mels_per_audio)))
|
|
print("Tacotron2 latency average (seconds) = {:.2f}".format(np.mean(tacotron2_latency)))
|
|
print("WaveGlow latency average (seconds) = {:.2f}".format(np.mean(waveglow_latency)))
|
|
print("Denoiser latency average (seconds) = {:.4f}".format(np.mean(denoiser_latency)))
|
|
print("Latency average (seconds) = {:.2f}".format(np.mean(latency)))
|
|
print("Latency std (seconds) = {:.2f}".format(np.std(latency)))
|
|
print("Latency cl 50 (seconds) = {:.2f}".format(cf_50))
|
|
print("Latency cl 90 (seconds) = {:.2f}".format(cf_90))
|
|
print("Latency cl 95 (seconds) = {:.2f}".format(cf_95))
|
|
print("Latency cl 99 (seconds) = {:.2f}".format(cf_99))
|
|
print("Latency cl 100 (seconds) = {:.2f}".format(cf_100))
|
|
|
|
|
|
def main():
|
|
"""
|
|
Launches text to speech (inference).
|
|
Inference is executed on a single GPU or CPU.
|
|
"""
|
|
parser = argparse.ArgumentParser(
|
|
description='PyTorch Tacotron 2 Inference')
|
|
parser = parse_args(parser)
|
|
args, unknown_args = parser.parse_known_args()
|
|
|
|
DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
|
|
StdOutBackend(Verbosity.VERBOSE)])
|
|
for k,v in vars(args).items():
|
|
DLLogger.log(step="PARAMETER", data={k:v})
|
|
DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
|
|
|
|
measurements_all = {"pre_processing": [],
|
|
"tacotron2_latency": [],
|
|
"waveglow_latency": [],
|
|
"denoiser_latency": [],
|
|
"latency": [],
|
|
"type_conversion": [],
|
|
"data_transfer": [],
|
|
"storage": [],
|
|
"tacotron2_items_per_sec": [],
|
|
"waveglow_items_per_sec": [],
|
|
"num_mels_per_audio": [],
|
|
"throughput": []}
|
|
|
|
print("args:", args, unknown_args)
|
|
|
|
tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
|
|
args.fp16, args.cpu, forward_is_infer=True)
|
|
waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
|
|
args.fp16, args.cpu, forward_is_infer=True)
|
|
denoiser = Denoiser(waveglow)
|
|
if not args.cpu:
|
|
denoiser.cuda()
|
|
|
|
texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
|
|
texts = [texts[0][:args.input_length]]
|
|
texts = texts*args.batch_size
|
|
|
|
warmup_iters = 3
|
|
|
|
for iter in range(args.num_iters):
|
|
|
|
measurements = {}
|
|
|
|
with MeasureTime(measurements, "pre_processing", args.cpu):
|
|
sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)
|
|
|
|
with torch.no_grad():
|
|
with MeasureTime(measurements, "latency", args.cpu):
|
|
with MeasureTime(measurements, "tacotron2_latency", args.cpu):
|
|
mel, mel_lengths, _ = tacotron2.infer(sequences_padded, input_lengths)
|
|
|
|
with MeasureTime(measurements, "waveglow_latency", args.cpu):
|
|
audios = waveglow.infer(mel, sigma=args.sigma_infer)
|
|
|
|
num_mels = mel.size(0)*mel.size(2)
|
|
num_samples = audios.size(0)*audios.size(1)
|
|
|
|
|
|
with MeasureTime(measurements, "type_conversion", args.cpu):
|
|
audios = audios.float()
|
|
|
|
with torch.no_grad(), MeasureTime(measurements, "denoiser_latency", args.cpu):
|
|
audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)
|
|
|
|
with MeasureTime(measurements, "data_transfer", args.cpu):
|
|
audios = audios.cpu()
|
|
|
|
with MeasureTime(measurements, "storage", args.cpu):
|
|
audios = audios.numpy()
|
|
for i, audio in enumerate(audios):
|
|
audio_path = "audio_"+str(i)+".wav"
|
|
write(audio_path, args.sampling_rate,
|
|
audio[:mel_lengths[i]*args.stft_hop_length])
|
|
|
|
measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency']
|
|
measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency']
|
|
measurements['num_mels_per_audio'] = mel.size(2)
|
|
measurements['throughput'] = num_samples/measurements['latency']
|
|
|
|
if iter >= warmup_iters:
|
|
for k,v in measurements.items():
|
|
measurements_all[k].append(v)
|
|
DLLogger.log(step=(iter-warmup_iters), data={k: v})
|
|
|
|
DLLogger.flush()
|
|
|
|
print_stats(measurements_all)
|
|
|
|
if __name__ == '__main__':
|
|
main()
|