DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2/test_infer.py

# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
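"""Inference benchmark for the Tacotron 2 + WaveGlow text-to-speech pipeline.

Synthesizes audio from a fixed input text for a number of iterations on a
single GPU or CPU and reports per-stage latencies, approximate latency
percentiles, and throughput.
"""
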
from tacotron2.text import text_to_sequence
import models
import torch
import argparse
import numpy as np
from scipy.io.wavfile import write
import sys
from inference import checkpoint_from_distributed, unwrap_distributed, MeasureTime, prepare_input_sequence, load_and_setup_model
import time
import dllogger as DLLogger
from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
from apex import amp
from waveglow.denoiser import Denoiser


def parse_args(parser):
    """
    Parse commandline arguments.
    """
    parser.add_argument('--tacotron2', type=str,
                        help='full path to the Tacotron2 model checkpoint file')
    parser.add_argument('--waveglow', type=str,
                        help='full path to the WaveGlow model checkpoint file')
    parser.add_argument('-s', '--sigma-infer', default=0.6, type=float)
    parser.add_argument('-d', '--denoising-strength', default=0.01, type=float)
    parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
                        help='Sampling rate')

    run_mode = parser.add_mutually_exclusive_group()
    run_mode.add_argument('--fp16', action='store_true',
                          help='Run inference with FP16')
    run_mode.add_argument('--cpu', action='store_true',
                          help='Run inference on CPU')

    parser.add_argument('--log-file', type=str, default='nvlog.json',
                        help='Filename for logging')
    parser.add_argument('--stft-hop-length', type=int, default=256,
                        help='STFT hop length for estimating audio length from mel size')
    parser.add_argument('--num-iters', type=int, default=10,
                        help='Number of iterations')
    parser.add_argument('-il', '--input-length', type=int, default=64,
                        help='Input length')
    parser.add_argument('-bs', '--batch-size', type=int, default=1,
                        help='Batch size')

    return parser
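

# Example invocation (the checkpoint paths are placeholders, not files shipped
# with the repository):
#   python test_infer.py --tacotron2 <tacotron2_checkpoint> \
#       --waveglow <waveglow_checkpoint> --fp16 --num-iters 10 --batch-size 1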


def print_stats(measurements_all):
    throughput = measurements_all['throughput']
    preprocessing = measurements_all['pre_processing']
    type_conversion = measurements_all['type_conversion']
    storage = measurements_all['storage']
    data_transfer = measurements_all['data_transfer']
    postprocessing = [sum(p) for p in zip(type_conversion, storage, data_transfer)]
    latency = measurements_all['latency']
    waveglow_latency = measurements_all['waveglow_latency']
    tacotron2_latency = measurements_all['tacotron2_latency']
    denoiser_latency = measurements_all['denoiser_latency']
    num_mels_per_audio = measurements_all['num_mels_per_audio']

    latency.sort()
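    # Approximate latency percentiles: the max of the lowest p% of the sorted
    # per-iteration latencies.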
    cf_50 = max(latency[:int(len(latency)*0.50)])
    cf_90 = max(latency[:int(len(latency)*0.90)])
    cf_95 = max(latency[:int(len(latency)*0.95)])
    cf_99 = max(latency[:int(len(latency)*0.99)])
    cf_100 = max(latency[:int(len(latency)*1.0)])

    print("Throughput average (samples/sec) = {:.0f}".format(np.mean(throughput)))
    print("Preprocessing average (seconds) = {:.4f}".format(np.mean(preprocessing)))
    print("Postprocessing average (seconds) = {:.4f}".format(np.mean(postprocessing)))
    print("Number of mels per audio average = {:.0f}".format(np.mean(num_mels_per_audio)))
    print("Tacotron2 latency average (seconds) = {:.2f}".format(np.mean(tacotron2_latency)))
    print("WaveGlow latency average (seconds) = {:.2f}".format(np.mean(waveglow_latency)))
    print("Denoiser latency average (seconds) = {:.4f}".format(np.mean(denoiser_latency)))
    print("Latency average (seconds) = {:.2f}".format(np.mean(latency)))
    print("Latency std (seconds) = {:.2f}".format(np.std(latency)))
    print("Latency cl 50 (seconds) = {:.2f}".format(cf_50))
    print("Latency cl 90 (seconds) = {:.2f}".format(cf_90))
    print("Latency cl 95 (seconds) = {:.2f}".format(cf_95))
    print("Latency cl 99 (seconds) = {:.2f}".format(cf_99))
    print("Latency cl 100 (seconds) = {:.2f}".format(cf_100))


def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU or CPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    measurements_all = {"pre_processing": [],
                        "tacotron2_latency": [],
                        "waveglow_latency": [],
                        "denoiser_latency": [],
                        "latency": [],
                        "type_conversion": [],
                        "data_transfer": [],
                        "storage": [],
                        "tacotron2_items_per_sec": [],
                        "waveglow_items_per_sec": [],
                        "num_mels_per_audio": [],
                        "throughput": []}

    print("args:", args, unknown_args)

    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.fp16, args.cpu, forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.fp16, args.cpu, forward_is_infer=True)
    denoiser = Denoiser(waveglow)
    if not args.cpu:
        denoiser.cuda()

    texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
    texts = [texts[0][:args.input_length]]
    texts = texts*args.batch_size

    warmup_iters = 3
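
    # The first `warmup_iters` iterations are warm-up and are excluded from the
    # reported statistics. Each MeasureTime block below (context manager imported
    # from inference.py) records the wall-clock time of its body into
    # `measurements` under the given key.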
    for iter in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing", args.cpu):
            sequences_padded, input_lengths = prepare_input_sequence(texts, args.cpu)

        with torch.no_grad():
            with MeasureTime(measurements, "latency", args.cpu):
                with MeasureTime(measurements, "tacotron2_latency", args.cpu):
                    mel, mel_lengths, _ = tacotron2.infer(sequences_padded, input_lengths)

                with MeasureTime(measurements, "waveglow_latency", args.cpu):
                    audios = waveglow.infer(mel, sigma=args.sigma_infer)

        num_mels = mel.size(0)*mel.size(2)
        num_samples = audios.size(0)*audios.size(1)

        with MeasureTime(measurements, "type_conversion", args.cpu):
            audios = audios.float()

        with torch.no_grad(), MeasureTime(measurements, "denoiser_latency", args.cpu):
            audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)

        with MeasureTime(measurements, "data_transfer", args.cpu):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage", args.cpu):
            audios = audios.numpy()
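            # Each mel frame corresponds to stft_hop_length waveform samples, so
            # the write below trims each audio to its unpadded length.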
            for i, audio in enumerate(audios):
                audio_path = "audio_"+str(i)+".wav"
                write(audio_path, args.sampling_rate,
                      audio[:mel_lengths[i]*args.stft_hop_length])
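
        # Derived per-iteration metrics: mel frames/sec for Tacotron 2, audio
        # samples/sec for WaveGlow, and end-to-end throughput in samples/sec.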
        measurements['tacotron2_items_per_sec'] = num_mels/measurements['tacotron2_latency']
        measurements['waveglow_items_per_sec'] = num_samples/measurements['waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples/measurements['latency']

        if iter >= warmup_iters:
            for k, v in measurements.items():
                measurements_all[k].append(v)
                DLLogger.log(step=(iter-warmup_iters), data={k: v})

    DLLogger.flush()

    print_stats(measurements_all)


if __name__ == '__main__':
    main()