Merge pull request #677 from GrzegorzKarchNV/convai-update

updated convai
This commit is contained in:
nv-kkudrynski 2020-10-07 13:03:48 +02:00 committed by GitHub
commit b1ce24a54f
40 changed files with 1154 additions and 423 deletions

View file

@ -11,14 +11,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvcr.io/nvidia/tritonserver:20.03-py3-clientsdk AS trt
FROM nvcr.io/nvidia/tritonserver:20.06-py3-clientsdk AS triton
FROM continuumio/miniconda3
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract mc iputils-ping wget
WORKDIR /workspace/speech_ai_demo_TTS/
# Copy the perf_client over
COPY --from=trt /workspace/install/ /workspace/install/
COPY --from=triton /workspace/install/ /workspace/install/
ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH}
# set up env variables
@ -27,14 +27,14 @@ RUN cd /workspace/speech_ai_demo_TTS/
# jupyter lab extensions
RUN conda install -c conda-forge jupyterlab ipywidgets nodejs python-sounddevice librosa unidecode inflect
RUN pip install /workspace/install/python/tensorrtserver*.whl
RUN pip install /workspace/install/python/triton*.whl
# Copy the python wheel and install with pip
COPY --from=trt /workspace/install/python/tensorrtserver*.whl /tmp/
RUN pip install /tmp/tensorrtserver*.whl && rm /tmp/tensorrtserver*.whl
COPY --from=triton /workspace/install/python/triton*.whl /tmp/
RUN pip install /tmp/triton*.whl && rm /tmp/triton*.whl
RUN cd /workspace/speech_ai_demo_TTS/
COPY ./notebooks/trtis/ .
COPY ./notebooks/triton/ .
RUN mkdir /workspace/speech_ai_demo_TTS/tacotron2/
COPY ./tacotron2/text /workspace/speech_ai_demo_TTS/tacotron2/text
RUN chmod a+x /workspace/speech_ai_demo_TTS/run_this.sh

View file

@ -30,6 +30,22 @@ from scipy.io.wavfile import read
import torch
import os
import argparse
import json
class ParseFromConfigFile(argparse.Action):
def __init__(self, option_strings, type, dest, help=None, required=False):
super(ParseFromConfigFile, self).__init__(option_strings=option_strings, type=type, dest=dest, help=help, required=required)
def __call__(self, parser, namespace, values, option_string):
with open(values, 'r') as f:
data = json.load(f)
for group in data.keys():
for k,v in data[group].items():
underscore_k = k.replace('-', '_')
setattr(namespace, underscore_k, v)
def get_mask_from_lengths(lengths):
max_len = torch.max(lengths).item()
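For context, a minimal sketch of how this new `ParseFromConfigFile` action plugs into an argparse parser, assuming the class lives in `common/utils.py` next to `get_mask_from_lengths` and that a `config.json` like the one added below sits in the working directory; note how each hyphenated JSON key becomes an underscore attribute on the namespace:

```python
import argparse

# Assumption: the action defined above is importable from common.utils.
from common.utils import ParseFromConfigFile

parser = argparse.ArgumentParser(description='config-file demo')
# One value (the JSON path) is consumed; __call__ then copies every
# "group" -> {"key-name": value} pair onto the namespace as key_name.
parser.add_argument('--config-file', type=str, action=ParseFromConfigFile,
                    help='Path to a JSON file with grouped parameters')

# Assumes config.json (added below in this PR) is in the working directory.
args = parser.parse_args(['--config-file', 'config.json'])
print(args.sampling_rate, args.hop_length, args.mel_fmax)  # from the "audio" group
```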

View file

@ -0,0 +1,11 @@
{
"audio": {
"max-wav-value": 32768.0,
"sampling-rate": 22050,
"filter-length": 1024,
"hop-length": 256,
"win-length": 1024,
"mel-fmin": 0.0,
"mel-fmax": 7000.0
}
}
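As a quick orientation on these numbers (and on the trimming done later in the inference scripts, where audio length is computed as `mel_length * stft_hop_length`), a small worked example in plain Python; the 602-frame figure is the average reported in the TensorRT benchmark table further down in this PR:

```python
# Values from config.json above.
sampling_rate = 22050   # audio samples per second
hop_length = 256        # audio samples covered by one mel-spectrogram frame

# Each generated mel frame corresponds to hop_length samples of waveform,
# which is exactly how the scripts trim WaveGlow output to its true length.
mel_frames = 602                        # average mel count from the benchmark table
audio_samples = mel_frames * hop_length
print(f"{audio_samples / sampling_rate:.2f} s of speech")   # ~6.99 s
```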

View file

@ -33,7 +33,7 @@ from tacotron2.data_function import batch_to_gpu as batch_to_gpu_tacotron2
from waveglow.data_function import batch_to_gpu as batch_to_gpu_waveglow
def get_collate_function(model_name, n_frames_per_step):
def get_collate_function(model_name, n_frames_per_step=1):
if model_name == 'Tacotron2':
collate_fn = TextMelCollate(n_frames_per_step)
elif model_name == 'WaveGlow':

View file

@ -29,6 +29,7 @@ from tacotron2.text import text_to_sequence
import models
import torch
import argparse
import os
import numpy as np
from scipy.io.wavfile import write
import matplotlib
@ -106,8 +107,9 @@ def unwrap_distributed(state_dict):
def load_and_setup_model(model_name, parser, checkpoint, fp16_run, cpu_run, forward_is_infer=False):
model_parser = models.parse_model_args(model_name, parser, add_help=False)
model_parser = models.model_parser(model_name, parser, add_help=False)
model_args, _ = model_parser.parse_known_args()
model_config = models.get_model_config(model_name, model_args)
model = models.get_model(model_name, model_config, cpu_run=cpu_run,
forward_is_infer=forward_is_infer)
@ -195,8 +197,8 @@ def main():
parser = parse_args(parser)
args, _ = parser.parse_known_args()
DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
args.output+'/'+args.log_file),
log_file = os.path.join(args.output, args.log_file)
DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file),
StdOutBackend(Verbosity.VERBOSE)])
for k,v in vars(args).items():
DLLogger.log(step="PARAMETER", data={k:v})
@ -245,8 +247,7 @@ def main():
audios = denoiser(audios, strength=args.denoising_strength).squeeze(1)
print("Stopping after",mel.size(2),"decoder steps")
tacotron2_infer_perf = mel.size(0)*mel.size(2)/measurements['tacotron2_time']
tacotron2_infer_perf = mel.size(0)*mel.size(2)/measurements['tacotron2_time']
waveglow_infer_perf = audios.size(0)*audios.size(1)/measurements['waveglow_time']
DLLogger.log(step=0, data={"tacotron2_items_per_sec": tacotron2_infer_perf})
@ -259,12 +260,12 @@ def main():
for i, audio in enumerate(audios):
plt.imshow(alignments[i].float().data.cpu().numpy().T, aspect="auto", origin="lower")
figure_path = args.output+"alignment_"+str(i)+"_"+args.suffix+".png"
figure_path = os.path.join(args.output,"alignment_"+str(i)+args.suffix+".png")
plt.savefig(figure_path)
audio = audio[:mel_lengths[i]*args.stft_hop_length]
audio = audio/torch.max(torch.abs(audio))
audio_path = args.output+"audio_"+str(i)+"_"+args.suffix+".wav"
audio_path = os.path.join(args.output,"audio_"+str(i)+args.suffix+".wav")
write(audio_path, args.sampling_rate, audio.cpu().numpy())
DLLogger.flush()

View file

@ -31,33 +31,69 @@ import argparse
import numpy as np
import json
import time
import os
import sys
from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model, MeasureTime
from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model, MeasureTime, prepare_input_sequence
import dllogger as DLLogger
from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
from apex import amp
def parse_args(parser):
"""
Parse commandline arguments.
"""
parser.add_argument('-m', '--model-name', type=str, default='', required=True,
help='Model to train')
parser.add_argument('-m', '--model-name', type=str, default='',
required=True, help='Model to train')
parser.add_argument('--model', type=str, default='',
help='Full path to the model checkpoint file')
parser.add_argument('-sr', '--sampling-rate', default=22050, type=int,
help='Sampling rate')
parser.add_argument('--amp-run', action='store_true',
parser.add_argument('--fp16', action='store_true',
help='inference with AMP')
parser.add_argument('-bs', '--batch-size', type=int, default=1)
parser.add_argument('-o', '--output', type=str, required=True,
help='Directory to save results')
parser.add_argument('--log-file', type=str, default='nvlog.json',
help='Filename for logging')
parser.add_argument('--synth-data', action='store_true',
help='Test with synthetic data')
return parser
def gen_text(use_synthetic_data):
batch_size = 1
text_len = 140
if use_synthetic_data:
text_padded = torch.randint(low=0, high=148,
size=(batch_size, text_len),
dtype=torch.long).cuda()
input_lengths = torch.IntTensor([text_padded.size(1)]*
batch_size).cuda().long()
else:
texts = ['The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves.']
texts = texts[:][:text_len]
text_padded, input_lengths = prepare_input_sequence(texts)
return (text_padded, input_lengths)
def gen_mel(use_synthetic_data, n_mel_channels, fp16):
if use_synthetic_data:
batch_size = 1
num_mels = 895
mel_padded = torch.zeros(batch_size, n_mel_channels,
num_mels).normal_(-5.62, 1.98).cuda()
else:
mel_padded = torch.load("data/mel.pt")
if fp16:
mel_padded = mel_padded.half()
return mel_padded
def main():
"""
Launches inference benchmark.
@ -68,17 +104,24 @@ def main():
parser = parse_args(parser)
args, _ = parser.parse_known_args()
log_file = args.log_file
log_file = os.path.join(args.output, args.log_file)
DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
args.output+'/'+args.log_file),
DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file),
StdOutBackend(Verbosity.VERBOSE)])
for k,v in vars(args).items():
DLLogger.log(step="PARAMETER", data={k:v})
DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
model = load_and_setup_model(args.model_name, parser, None, args.amp_run,
forward_is_infer=True)
if args.synth_data:
model = load_and_setup_model(args.model_name, parser, None, args.fp16,
cpu_run=False, forward_is_infer=True)
else:
if not os.path.isfile(args.model):
print(f"File {args.model} does not exist!")
sys.exit(1)
model = load_and_setup_model(args.model_name, parser, args.model,
args.fp16, cpu_run=False,
forward_is_infer=True)
if args.model_name == "Tacotron2":
model = torch.jit.script(model)
@ -91,20 +134,16 @@ def main():
measurements = {}
if args.model_name == 'Tacotron2':
text_padded = torch.randint(low=0, high=148, size=(args.batch_size, 140),
dtype=torch.long).cuda()
input_lengths = torch.IntTensor([text_padded.size(1)]*args.batch_size).cuda().long()
text_padded, input_lengths = gen_text(args.synth_data)
with torch.no_grad(), MeasureTime(measurements, "inference_time"):
mels, _, _ = model(text_padded, input_lengths)
num_items = mels.size(0)*mels.size(2)
if args.model_name == 'WaveGlow':
n_mel_channels = model.upsample.in_channels
num_mels = 895
mel_padded = torch.zeros(args.batch_size, n_mel_channels,
num_mels).normal_(-5.62, 1.98).cuda()
if args.amp_run:
mel_padded = mel_padded.half()
mel_padded = gen_mel(args.synth_data, n_mel_channels, args.fp16)
with torch.no_grad(), MeasureTime(measurements, "inference_time"):
audios = model(mel_padded)
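The items-per-second numbers logged by this benchmark come from a simple wall-clock measurement; below is a minimal stand-in sketch of the pattern, assuming `MeasureTime` behaves roughly as shown (the real helper imported from `inference.py` may additionally synchronize CUDA before stopping the clock):

```python
import time
import torch


class MeasureTime:
    """Toy stand-in: store the elapsed wall-clock time of the block
    into a shared measurements dict under the given key."""
    def __init__(self, measurements, key):
        self.measurements, self.key = measurements, key

    def __enter__(self):
        self.t0 = time.perf_counter()

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.measurements[self.key] = time.perf_counter() - self.t0


measurements = {}
with torch.no_grad(), MeasureTime(measurements, "inference_time"):
    # Placeholder for `mels, _, _ = model(text_padded, input_lengths)`.
    mels = torch.randn(1, 80, 600)

# Throughput as logged above: batch size * decoder steps / measured seconds.
num_items = mels.size(0) * mels.size(2)
print(num_items / measurements["inference_time"], "items/sec")
```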

View file

@ -34,13 +34,13 @@ from waveglow.model import WaveGlow
import torch
def parse_model_args(model_name, parser, add_help=False):
def model_parser(model_name, parser, add_help=False):
if model_name == 'Tacotron2':
from tacotron2.arg_parser import parse_tacotron2_args
return parse_tacotron2_args(parser, add_help)
from tacotron2.arg_parser import tacotron2_parser
return tacotron2_parser(parser, add_help)
if model_name == 'WaveGlow':
from waveglow.arg_parser import parse_waveglow_args
return parse_waveglow_args(parser, add_help)
from waveglow.arg_parser import waveglow_parser
return waveglow_parser(parser, add_help)
else:
raise NotImplementedError(model_name)
@ -88,7 +88,7 @@ def get_model(model_name, model_config, cpu_run,
if uniform_initialize_bn_weight:
init_bn(model)
if cpu_run==False:
if not cpu_run:
model = model.cuda()
return model

View file

@ -1,4 +1,3 @@
This Readme accompanies the GTC 2020 talk: "PyTorch from Research to Production" available [here](https://developer.nvidia.com/gtc/2020/video/s21928).
## Model Preparation
@ -32,15 +31,15 @@ wget https://api.ngc.nvidia.com/v2/models/nvidia/bert_large_pyt_amp_ckpt_squad_q
```
* [Tacotron 2](https://ngc.nvidia.com/models/nvidia:tacotron2pyt_fp16/files?version=2)
* [Tacotron 2](https://ngc.nvidia.com/catalog/models/nvidia:tacotron2_pyt_ckpt_amp/files?version=19.12.0)
```bash
wget https://api.ngc.nvidia.com/v2/models/nvidia/tacotron2pyt_fp16/versions/2/files/nvidia_tacotron2pyt_fp16_20190427
wget https://api.ngc.nvidia.com/v2/models/nvidia/tacotron2_pyt_ckpt_amp/versions/19.12.0/files/nvidia_tacotron2pyt_fp16.pt
```
* [WaveGlow](https://ngc.nvidia.com/models/nvidia:waveglow256pyt_fp16/files)
* [WaveGlow](https://ngc.nvidia.com/catalog/models/nvidia:waveglow_ckpt_amp_256/files?version=20.01.0)
```bash
wget https://api.ngc.nvidia.com/v2/models/nvidia/waveglow256pyt_fp16/versions/1/files/nvidia_waveglow256pyt_fp16
wget https://api.ngc.nvidia.com/v2/models/nvidia/waveglow_ckpt_amp_256/versions/20.01.0/files/nvidia_waveglow256pyt_fp16.pt
```
@ -48,14 +47,13 @@ Move the downloaded checkpoints to `models` directory:
```bash
cd DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai
mv bert_large_qa.pt nvidia_tacotron2pyt_fp16_20190427 nvidia_waveglow256pyt_fp16 models/
```
### Prepare Jasper
First, let's generate a TensorRT engine for Jasper using TensorRT version 7.
Download the Jasper checkpoint from [NGC](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16/files)
Download the Jasper checkpoint from [NGC](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16/files)
and move it to the `Jasper/checkpoints/` directory:
```bash
@ -65,8 +63,8 @@ mv jasper_fp16.pt DeepLearningExamples/PyTorch/SpeechRecognition/Jasper/checkpoi
Apply a patch to enable support of TensorRT 7:
```bash
cd DeepLearningExamples/
```bash
cd DeepLearningExamples/
git apply --ignore-space-change --reject --whitespace=fix ../patch_jasper_trt7
```
@ -74,7 +72,7 @@ Now, build a container for Jasper:
```bash
cd DeepLearningExamples/PyTorch/SpeechRecognition/Jasper/
bash trt/scripts/docker/build.sh
bash tensorrt/scripts/docker/build.sh
```
To run the container, type:
@ -87,15 +85,16 @@ export CHECKPOINT_DIR=$JASPER_DIR/checkpoints/
export RESULT_DIR=$JASPER_DIR/results/
cd $JASPER_DIR
mkdir -p $DATA_DIR $CHECKPOINT_DIR $RESULT_DIR
bash trt/scripts/docker/launch.sh $DATA_DIR $CHECKPOINT_DIR $RESULT_DIR
bash tensorrt/scripts/docker/launch.sh $DATA_DIR $CHECKPOINT_DIR $RESULT_DIR
```
Inside the container, export the Jasper TensorRT engine by executing:
```bash
pip install --upgrade onnx
mkdir -p /results/onnxs/ /results/engines/
cd /jasper
python trt/perf.py --batch_size 1 --engine_batch_size 1 --model_toml configs/jasper10x5dr_nomask.toml --ckpt_path /checkpoints/jasper_fp16.pt --trt_fp16 --pyt_fp16 --engine_path /results/engines/fp16_DYNAMIC.engine --onnx_path /results/onnxs/fp32_DYNAMIC.onnx --seq_len 3600 --make_onnx
python tensorrt/perf.py --batch_size 1 --engine_batch_size 1 --model_toml configs/jasper10x5dr_nomask.toml --ckpt_path /checkpoints/jasper_fp16.pt --trt_fp16 --pyt_fp16 --engine_path /results/engines/jasper_fp16.engine --onnx_path /results/onnxs/fp32_DYNAMIC.onnx --seq_len 3600 --make_onnx
```
After successful export, copy the engine to model_repo:
@ -103,7 +102,7 @@ After successful export, copy the engine to model_repo:
```bash
cd DeepLearningExamples/Pytorch
mkdir -p SpeechSynthesis/Tacotron2/notebooks/conversationalai/model_repo/jasper-trt/1
cp SpeechRecognition/Jasper/results/engines/fp16_DYNAMIC.engine SpeechSynthesis/Tacotron2/notebooks/conversationalai/model_repo/jasper-trt/1/jasper_fp16.engine
cp SpeechRecognition/Jasper/results/engines/jasper_fp16.engine SpeechSynthesis/Tacotron2/notebooks/conversationalai/model_repo/jasper-trt/1/
```
You will also need the Jasper feature extractor and decoder. Download them from [NGC](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_jit_fp16/files) and move them to the model_repo:
@ -121,12 +120,12 @@ wget -P jasper-feature-extractor/1/ https://api.ngc.nvidia.com/v2/models/nvidia/
With the generated Jasper model, we can proceed to BERT.
Download the BERT checkpoint from [NGC](https://ngc.nvidia.com/catalog/models/nvidia:bert_large_pyt_amp_ckpt_squad_qa1_1/files)
Download the BERT checkpoint from [NGC](https://ngc.nvidia.com/catalog/models/nvidia:bert_large_pyt_amp_ckpt_squad_qa1_1/files)
and move it to the `BERT/checkpoints/` directory:
```bash
mkdir -p DeepLearningExamples/PyTorch/LanguageModeling/BERT/checkpoints/
mv bert_large_qa.pt DeepLearningExamples/PyTorch/LanguageModeling/BERT/checkpoints/
mv bert_large_qa.pt DeepLearningExamples/PyTorch/LanguageModeling/BERT/checkpoints/bert_qa.pt
```
Now, build a container for BERT:
@ -146,14 +145,14 @@ The model will be saved in `results/triton_models/bertQA-onnx`, together with Tr
```bash
cd DeepLearningExamples
cp -r PyTorch/LanguageModeling/BERT/results/triton_models/bertQA-onnx DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai/model_repo/
cp -r PyTorch/LanguageModeling/BERT/results/triton_models/bertQA-ts-script DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai/model_repo/
```
### Prepare Tacotron 2 and WaveGlow
Now to the final part: the TTS system.
Download the [Tacotron 2](https://ngc.nvidia.com/models/nvidia:tacotron2pyt_fp16/files?version=2) and [WaveGlow](https://ngc.nvidia.com/models/nvidia:waveglow256pyt_fp16/files) checkpoints from [NGC](https://ngc.nvidia.com/catalog/models/)
Download the [Tacotron 2](https://ngc.nvidia.com/models/nvidia:tacotron2pyt_fp16/files?version=2) and [WaveGlow](https://ngc.nvidia.com/models/nvidia:waveglow256pyt_fp16/files) checkpoints from [NGC](https://ngc.nvidia.com/catalog/models/)
and move them to the `Tacotron2/checkpoints/` directory:
```bash
@ -178,29 +177,20 @@ Export Tacotron 2 to TorchScript:
```bash
cd /workspace/tacotron2/
mkdir -p output
python exports/export_tacotron2_ts.py --tacotron2 checkpoints/nvidia_tacotron2pyt_fp16_20190427 -o output/model.pt --amp
```
To export WaveGlow to TensorRT 7, install ONNX-TRT
```bash
cd /workspace && git clone https://github.com/onnx/onnx-tensorrt.git
cd /workspace/onnx-tensorrt/ && git submodule update --init --recursive
cd /workspace/onnx-tensorrt && mkdir -p build
cd /workspace/onnx-tensorrt/build && cmake .. -DCMAKE_CXX_FLAGS=-isystem\\ /usr/local/cuda/include && make -j12 && make install
cd /workspace/tacotron2
python notebooks/conversationalai/export_tacotron2_ts.py --tacotron2 notebooks/conversationalai/nvidia_tacotron2pyt_fp16.pt -o output/tacotron2_fp16.pt --fp16
```
Export WaveGlow to ONNX intermediate representation:
```bash
python exports/export_waveglow_onnx.py --waveglow checkpoints/nvidia_waveglow256pyt_fp16 --wn-channels 256 --fp16 -o output/
python tensorrt/convert_waveglow2onnx.py --waveglow notebooks/conversationalai/nvidia_waveglow256pyt_fp16.pt --wn-channels 256 --fp16 -o output/ --config-file config.json
```
Use the exported ONNX IR to generate TensorRT engine:
```bash
python trt/export_onnx2trt.py --waveglow output/waveglow.onnx -o output/ --fp16
pip install pycuda
python tensorrt/convert_onnx2trt.py --waveglow output/waveglow.onnx -o output/ --fp16
```
After successful export, exit the container and copy the Tacotron 2 model and the WaveGlow engine to `model_repo`:
@ -208,8 +198,8 @@ After successful export, exit the container and copy the Tacotron 2 model and th
```bash
cd DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2/
mkdir -p notebooks/conversationalai/model_repo/tacotron2/1/ notebooks/conversationalai/model_repo/waveglow-trt/1/
cp output/model.pt notebooks/conversationalai/model_repo/tacotron2/1/
cp output/waveglow_fp16.engine mnotebooks/conversationalai/odel_repo/waveglow-trt/1/
cp output/tacotron2_fp16.pt notebooks/conversationalai/model_repo/tacotron2/1/
cp output/waveglow_fp16.engine notebooks/conversationalai/model_repo/waveglow-trt/1/
```
## Deployment
@ -223,12 +213,13 @@ docker build -f Dockerfile --network=host -t speech_ai_client:demo .
From a terminal, start the Triton server:
```bash
NV_GPU=1 nvidia-docker run --ipc=host --network=host --rm -p8000:8000 -p8001:8001 \\
-v /home/gkarch/dev/gtc2020/speechai/model_repo/:/models nvcr.io/nvidia/tensorrtserver:20.01-py3 trtserver --model-store=/models --log-verbose 1
cd DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai
NV_GPU=1 nvidia-docker run --ipc=host --network=host --rm -p8000:8000 -p8001:8001 -v ${PWD}/model_repo/:/models nvcr.io/nvidia/tritonserver:20.06-v1-py3 tritonserver --model-store=/models --log-verbose 1
```
In another terminal, run the client:
```bash
docker run -it --rm --network=host --device /dev/snd:/dev/snd --device /dev/usb:/dev/usb speech_ai_client:demo bash /workspace/speech_ai_demo/start_jupyter.sh
docker run -it --rm --network=host --device /dev/snd:/dev/snd speech_ai_client:demo bash /workspace/speech_ai_demo/start_jupyter.sh
```

View file

@ -11,7 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvcr.io/nvidia/tritonserver:20.03-py3-clientsdk AS trtserver
FROM nvcr.io/nvidia/tritonserver:20.06-v1-py3-clientsdk AS trtserver
FROM continuumio/miniconda3
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract mc iputils-ping wget
@ -26,7 +26,7 @@ ENV PATH="$PATH:/opt/conda/bin"
RUN cd /workspace/speech_ai_demo/
# jupyter lab extensions
RUN conda install -c conda-forge jupyterlab=1.0 ipywidgets=7.5 nodejs python-sounddevice librosa unidecode inflect
RUN conda install -c conda-forge jupyterlab=1.0 ipywidgets=7.5 nodejs=10.13 python-sounddevice librosa unidecode inflect
RUN jupyter labextension install @jupyter-widgets/jupyterlab-manager
RUN pip install /workspace/install/python/tensorrtserver*.whl

View file

@ -25,6 +25,7 @@
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import time\n",
"import numpy as np\n",
@ -85,7 +86,7 @@
"\n",
"\n",
"# create the inference context for the models\n",
"infer_ctx_bert = InferContext(args.url, args.protocol, 'bertQA-onnx', -1)\n",
"infer_ctx_bert = InferContext(args.url, args.protocol, 'bertQA-ts-script', -1)\n",
"infer_ctx_tacotron2 = InferContext(args.url, args.protocol, 'tacotron2', -1)\n",
"infer_ctx_waveglow = InferContext(args.url, args.protocol, 'waveglow-trt', -1)\n",
"infer_jasper = SpeechClient(args.url, args.protocol, 'jasper-trt-ensemble', -1, \n",
@ -211,24 +212,20 @@
" ::mel_lengths:: original length of mel spectrogram\n",
" ::returns:: waveform\n",
" '''\n",
" # padding/trimming mel to dimension 620\n",
" mel = mel[:,:,None]\n",
" # prepare input/output\n",
" mel = np.expand_dims(mel, axis=0)\n",
" input_dict = {}\n",
" input_dict['mel'] = (mel,)\n",
" stride = 256\n",
" kernel_size = 1024\n",
" n_group = 8\n",
" z_size = (mel.shape[1]-1)*stride + (kernel_size-1) + 1 - (kernel_size-stride)\n",
" z_size = z_size//n_group\n",
" shape = (n_group,z_size,1)\n",
" z_size = mel.shape[2]*stride//n_group\n",
" shape = (1,n_group,z_size)\n",
" input_dict['z'] = np.random.normal(0.0, 1.0, shape).astype(mel.dtype)\n",
" input_dict['z'] = (input_dict['z'],)\n",
" output_dict = {}\n",
" output_dict['audio'] = InferContext.ResultFormat.RAW\n",
" batch_size = 1\n",
" # call waveglow\n",
" result = infer_ctx_waveglow.run(input_dict, output_dict, batch_size)\n",
" result = infer_ctx_waveglow.run(input_dict, output_dict)\n",
" # get the results\n",
" signal = result['audio'][0] # take only the first instance in the output batch\n",
" # postprocessing of waveglow: trimming signal to its actual size\n",
@ -319,7 +316,6 @@
" # \n",
" result = infer_ctx_bert.run(input_dict, output_dict, batch_size)\n",
" # \n",
" print(\"BANGLA\")\n",
" start_logits = [float(x) for x in result[\"output__0\"][0].flat]\n",
" end_logits = [float(x) for x in result[\"output__1\"][0].flat]\n",
" return start_logits, end_logits\n",
@ -453,13 +449,6 @@
"question_text.observe(question_text_change, names='value')\n",
"context.observe(context_change, names='value')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@ -478,7 +467,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.7.6"
}
},
"nbformat": 4,

View file

@ -14,8 +14,10 @@
# limitations under the License.
import math
import json
import numpy as np
import collections
from utils.bert.tokenization import BasicTokenizer
from utils.bert.tokenization import (BasicTokenizer, BertTokenizer, whitespace_tokenize)
class SquadExample(object):
@ -143,6 +145,8 @@ def convert_example_to_feature(example, tokenizer, max_seq_length,
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
tok_start_position = None
tok_end_position = None
# The -3 accounts for [CLS], [SEP] and [SEP]
max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

View file

@ -28,10 +28,15 @@
import librosa
import soundfile as sf
import math
from os import system
import numpy as np
from tensorrtserver.api import *
import tensorrtserver.api.model_config_pb2 as model_config
import grpc
from tensorrtserver.api import api_pb2
from tensorrtserver.api import grpc_service_pb2
from tensorrtserver.api import grpc_service_pb2_grpc
WINDOWS_FNS = {"hanning": np.hanning, "hamming": np.hamming, "none": None}

View file

@ -0,0 +1,68 @@
# *****************************************************************************
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import torch
import argparse
import sys
sys.path.append('./')
from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model
def parse_args(parser):
"""
Parse commandline arguments.
"""
parser.add_argument('--tacotron2', type=str, required=True,
help='full path to the Tacotron2 model checkpoint file')
parser.add_argument('-o', '--output', type=str, default="trtis_repo/tacotron/1/model.pt",
help='filename for the Tacotron 2 TorchScript model')
parser.add_argument('--fp16', action='store_true',
help='inference with mixed precision')
return parser
def main():
parser = argparse.ArgumentParser(
description='PyTorch Tacotron 2 Inference')
parser = parse_args(parser)
args = parser.parse_args()
tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
fp16_run=args.fp16, cpu_run=False,
forward_is_infer=True)
jitted_tacotron2 = torch.jit.script(tacotron2)
torch.jit.save(jitted_tacotron2, args.output)
if __name__ == '__main__':
main()
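Once this script has produced the TorchScript file, it can be smoke-tested without any of the training code; a hypothetical check (the output path, dummy token ids and the three-tensor return signature follow the inference scripts in this PR, but treat it as a sketch rather than a supported entry point):

```python
import torch

# Load the TorchScript model written by export_tacotron2_ts.py (path assumed).
tacotron2 = torch.jit.load('output/tacotron2_fp16.pt').cuda().eval()

# Dummy batch of one sequence: token ids from the 148-symbol set plus its length,
# mirroring the synthetic inputs used in inference_perf.py.
sequence = torch.randint(low=0, high=148, size=(1, 140), dtype=torch.long).cuda()
lengths = torch.IntTensor([sequence.size(1)]).cuda().long()

with torch.no_grad():
    mel, mel_lengths, alignments = tacotron2(sequence, lengths)
print(mel.shape)  # expected (1, 80, n_decoder_steps)
```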

View file

@ -1,26 +1,17 @@
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/Dockerfile b/PyTorch/SpeechRecognition/Jasper/trt/Dockerfile
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/Dockerfile b/PyTorch/SpeechRecognition/Jasper/tensorrt/Dockerfile
index e598a67..562be83 100644
--- a/PyTorch/SpeechRecognition/Jasper/trt/Dockerfile
+++ b/PyTorch/SpeechRecognition/Jasper/trt/Dockerfile
--- a/PyTorch/SpeechRecognition/Jasper/tensorrt/Dockerfile
+++ b/PyTorch/SpeechRecognition/Jasper/tensorrt/Dockerfile
@@ -1,4 +1,4 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.10-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.01-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.08-py3
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && apt-get install -y python3
@@ -6,7 +6,7 @@ RUN apt-get update && apt-get install -y python3
WORKDIR /tmp/onnx-trt
COPY trt/onnx-trt.patch .
RUN git clone https://github.com/onnx/onnx-tensorrt.git && cd onnx-tensorrt && git submodule update --init --recursive && \
- patch -f < ../onnx-trt.patch && mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DGPU_ARCHS="60 70 75" && make -j16 && make install && mv -f /usr/lib/libnvonnx* /usr/lib/x86_64-linux-gnu/ && ldconfig
+ mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DGPU_ARCHS="60 70 75" && make -j16 && make install && mv -f /usr/lib/libnvonnx* /usr/lib/x86_64-linux-gnu/ && ldconfig
# Here's a good place to install pip reqs from JoC repo.
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/perf.py b/PyTorch/SpeechRecognition/Jasper/trt/perf.py
diff --git a/PyTorch/SpeechRecognition/Jasper/tensorrt/perf.py b/PyTorch/SpeechRecognition/Jasper/tensorrt/perf.py
index 426ee66..5917a1f 100755
--- a/PyTorch/SpeechRecognition/Jasper/trt/perf.py
+++ b/PyTorch/SpeechRecognition/Jasper/trt/perf.py
--- a/PyTorch/SpeechRecognition/Jasper/tensorrt/perf.py
+++ b/PyTorch/SpeechRecognition/Jasper/tensorrt/perf.py
@@ -64,6 +64,9 @@ def main(args):
print("TRANSCRIPT: ", hypotheses)
return
@ -31,32 +22,32 @@ index 426ee66..5917a1f 100755
wer, preds, times = perfprocedures.compare_times_trt_pyt_exhaustive(engine,
pyt_components,
args)
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/build.sh b/PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/build.sh
diff --git a/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/build.sh b/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/build.sh
index 0e44c7f..62e7446 100755
--- a/PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/build.sh
+++ b/PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/build.sh
--- a/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/build.sh
+++ b/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/build.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Constructs a docker image containing dependencies for execution of JASPER through TRT
-echo "docker build . -f ./trt/Dockerfile -t jasper:trt6"
-docker build . -f ./trt/Dockerfile -t jasper:trt6
+echo "docker build . -f ./trt/Dockerfile -t jasper:trt7"
+docker build . -f ./trt/Dockerfile -t jasper:trt7
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/launch.sh b/PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/launch.sh
-echo "docker build . -f ./tensorrt/Dockerfile -t jasper:trt6"
-docker build . -f ./tensorrt/Dockerfile -t jasper:trt6
+echo "docker build . -f ./tensorrt/Dockerfile -t jasper:trt7"
+docker build . -f ./tensorrt/Dockerfile -t jasper:trt7
diff --git a/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/launch.sh b/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/launch.sh
index 9959062..ed5e711 100755
--- a/PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/launch.sh
+++ b/PyTorch/SpeechRecognition/Jasper/trt/scripts/docker/launch.sh
--- a/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/launch.sh
+++ b/PyTorch/SpeechRecognition/Jasper/tensorrt/scripts/docker/launch.sh
@@ -40,4 +40,4 @@ nvidia-docker run -it --rm \
-v $RESULT_DIR:/results/ \
-v ${JASPER_REPO}:/jasper \
${EXTRA_JASPER_ENV} \
- jasper:trt6 bash $PROGRAM_PATH
+ jasper:trt7 bash $PROGRAM_PATH
diff --git a/PyTorch/SpeechRecognition/Jasper/trt/trtutils.py b/PyTorch/SpeechRecognition/Jasper/trt/trtutils.py
diff --git a/PyTorch/SpeechRecognition/Jasper/tensorrt/trtutils.py b/PyTorch/SpeechRecognition/Jasper/tensorrt/trtutils.py
index 92460b2..01c8b6a 100644
--- a/PyTorch/SpeechRecognition/Jasper/trt/trtutils.py
+++ b/PyTorch/SpeechRecognition/Jasper/trt/trtutils.py
--- a/PyTorch/SpeechRecognition/Jasper/tensorrt/trtutils.py
+++ b/PyTorch/SpeechRecognition/Jasper/tensorrt/trtutils.py
@@ -40,7 +40,7 @@ def build_engine_from_parser(args):
'''
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if args.verbose else trt.Logger(trt.Logger.WARNING)

View file

@ -0,0 +1,44 @@
name: "bertQA-ts-script"
platform: "pytorch_libtorch"
max_batch_size: 8
input [
{
name: "input__0"
data_type: TYPE_INT64
dims: [384]
},
{
name: "input__1"
data_type: TYPE_INT64
dims: [384]
},
{
name: "input__2"
data_type: TYPE_INT64
dims: [384]
}
]
output [
{
name: "output__0"
data_type: TYPE_FP16
dims: [384]
},
{
name: "output__1"
data_type: TYPE_FP16
dims: [384]
}
]
optimization {
cuda {
graphs: 0
}
}
instance_group [
{
count: 1
kind: KIND_GPU
gpus: [ 0 ]
}
]
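For reference, a hedged sketch of how the conversational AI notebook drives this config with the legacy `tensorrtserver` `InferContext` API; the zero/one-filled arrays stand in for the values that the SQuAD-style preprocessing in `utils/bert` would normally produce, and which of the three input slots carries token ids, segment ids and attention mask follows the usual BERT convention (an assumption here):

```python
import numpy as np
from tensorrtserver.api import InferContext

# protocol 0 = HTTP, 1 = gRPC, as in the notebook defaults.
infer_ctx_bert = InferContext('localhost:8000', 0, 'bertQA-ts-script', -1)

seq_len, batch_size = 384, 1
# Placeholder inputs; real values come from the BERT tokenizer.
input_dict = {
    'input__0': (np.zeros(seq_len, dtype=np.int64),),   # token ids
    'input__1': (np.zeros(seq_len, dtype=np.int64),),   # segment ids
    'input__2': (np.ones(seq_len, dtype=np.int64),),    # attention mask
}
output_dict = {
    'output__0': InferContext.ResultFormat.RAW,          # start logits
    'output__1': InferContext.ResultFormat.RAW,          # end logits
}
result = infer_ctx_bert.run(input_dict, output_dict, batch_size)
start_logits = [float(x) for x in result['output__0'][0].flat]
end_logits = [float(x) for x in result['output__1'][0].flat]
```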

View file

@ -1,10 +1,10 @@
name: "jasper-trt-ensemble"
platform: "ensemble"
max_batch_size: 1
max_batch_size: 1#MAX_BATCH
input {
name: "AUDIO_SIGNAL"
data_type: TYPE_FP32
dims: -1
dims: -1#AUDIO_LENGTH
}
input {
name: "NUM_SAMPLES"
@ -58,3 +58,4 @@ ensemble_scheduling {
}
}
}

View file

@ -2,20 +2,20 @@ name: "waveglow-trt"
platform: "tensorrt_plan"
default_model_filename: "waveglow_fp16.engine"
max_batch_size: 1
max_batch_size: 0
input {
name: "mel"
data_type: TYPE_FP16
dims: [80, -1, 1]
dims: [1, 80, -1]
}
input {
name: "z"
data_type: TYPE_FP16
dims: [8, -1, 1]
dims: [1, 8, -1]
}
output {
name: "audio"
data_type: TYPE_FP16
dims: [-1]
dims: [1, -1]
}
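The reshaped `mel` and `z` dimensions above line up with the noise-sizing logic in the updated client notebook (`z_size = mel_frames * stride // n_group`); a small sketch with the stride and group count used there:

```python
import numpy as np

stride, n_group = 256, 8          # WaveGlow upsampling stride and group size

# A mel batch shaped like the "mel" input: [1, 80, n_frames].
mel = np.random.randn(1, 80, 620).astype(np.float16)

# One noise vector per n_group output samples: [1, 8, n_frames*stride//n_group].
z_size = mel.shape[2] * stride // n_group
z = np.random.normal(0.0, 1.0, (1, n_group, z_size)).astype(mel.dtype)

# The "audio" output (dims [1, -1]) then holds n_frames * stride samples
# before the client trims it to the true length.
print(z.shape, mel.shape[2] * stride)
```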

View file

@ -0,0 +1,146 @@
# Tacotron 2 and WaveGlow inference on Triton Inference Server
## Setup
### Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2
```
### Obtain models to be loaded in Triton Inference Server.
We have prepared Tacotron 2 and WaveGlow models that are ready to be loaded in
Triton Inference Server, so you don't need to train and export the models yourself.
The instructions below explain how to train and export them, or how to simply
download the pretrained models.
### Obtain Tacotron 2 and WaveGlow checkpoints.
You can either download the pretrained checkpoints or train the models yourself.
#### (Option 1) Download pretrained checkpoints.
If you want to use pretrained checkpoints, download them from [NGC](https://ngc.nvidia.com/catalog/models):
- [Tacotron2 checkpoint](https://ngc.nvidia.com/models/nvidia:tacotron2pyt_fp16)
- [WaveGlow checkpoint](https://ngc.nvidia.com/models/nvidia:waveglow256pyt_fp16)
#### (Option 2) Train Tacotron 2 and WaveGlow models.
In order to train the models, follow the QuickStart section in the `Tacotron2/README.md`
file by executing points 1-5. You have to train WaveGlow in a different way than described there. Use
the following command instead of the one given in QuickStart at point 5:
```bash
python -m multiproc train.py -m WaveGlow -o output/ --amp -lr 1e-4 --epochs 2001 --wn-channels 256 -bs 12 --segment-length 16000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file output/nvlog.json
```
This will train the WaveGlow model with a smaller number of residual connections
in the coupling layer networks and larger segment length. Training should take
about 100 hours on DGX-1 (8x V100 16G).
### Setup Tacotron 2 TorchScript.
There are two ways to proceed.
#### (Option 1) Download the Tacotron 2 TorchScript model.
Download the Tacotron 2 TorchScript model from:
- [Tacotron2 TorchScript](https://ngc.nvidia.com/models/nvidia:tacotron2pyt_jit_fp16)
Next, save it to `triton_models/tacotron2-ts-script/1/` and rename as `model.pt`:
```bash
wget https://api.ngc.nvidia.com/v2/models/nvidia/tacotron2pyt_jit_fp16/versions/1/files/nvidia_tacotron2pyt_jit_fp16
mkdir -p triton_models/tacotron2-ts-script/1/
mv nvidia_tacotron2pyt_jit_fp16 triton_models/tacotron2-ts-script/1/model.pt
```
Copy the Triton config file for the Tacotron 2 model to the model directory:
```bash
cp notebooks/triton/tacotron2_ts-script_config.pbtxt triton_models/tacotron2-ts-script/config.pbtxt
```
#### (Option 2) Export the Tacotron 2 model using TorchScript.
To export the Tacotron 2 model using TorchScript, type:
```bash
python exports/export_tacotron2.py --triton-model-name tacotron2-ts-script --export ts-script -- --checkpoint <Tacotron 2 checkpoint> --config-file config.json
```
This will create the model file `model.pt` and save it in the folder `triton_models/tacotron2-ts-script/1/`.
The command will also generate the Triton configuration file `config.pbtxt` for the Tacotron 2 model.
You can change the folder names using the flags `--triton-models-dir` (default `triton_models`), `--triton-model-name` (default `""`) and `--triton-model-version` (default `1`).
You can also change the model file name with the flag `--export-name <filename>`.
### Setup WaveGlow TensorRT engine.
There are two ways to proceed.
#### (Option 1) Download the WaveGlow TensorRT engine.
Download the WaveGlow TensorRT engine from:
- [WaveGlow TensorRT engine](https://ngc.nvidia.com/models/nvidia:waveglow256pyt_trt_fp16)
Next, save it to `triton_models/waveglow-tensorrt/1/` and rename as `model.plan`:
```bash
wget https://api.ngc.nvidia.com/v2/models/nvidia/waveglow256pyt_trt_fp16/versions/1/files/nvidia_waveglow256pyt_trt_fp16
mkdir -p triton_models/waveglow-tensorrt/1/
mv nvidia_waveglow256pyt_trt_fp16 triton_models/waveglow-tensorrt/1/model.plan
```
Copy the Triton config file for the WaveGlow model to the model directory:
```bash
cp notebooks/triton/waveglow_tensorrt_config.pbtxt triton_models/waveglow-tensorrt/config.pbtxt
```
#### (Option 2) Export the WaveGlow model to TensorRT.
In order to export the model into the TensorRT engine, type:
```bash
python exports/export_waveglow.py --triton-model-name waveglow-tensorrt --export tensorrt --tensorrt-fp16 -- --checkpoint <waveglow_checkpoint> --config-file config.json --wn-channels 256
```
This will create the model file `model.plan` and save it in the folder `triton_models/waveglow-tensorrt/1/`.
The command will also generate the Triton configuration file `config.pbtxt` for the WaveGlow model.
You can change the folder names using the flags `--triton-models-dir` (default `triton_models`), `--triton-model-name` (default `""`) and `--triton-model-version` (default `1`).
You can also change the model file name with the flag `--export-name <filename>`.
### Setup the Triton Inference Server.
Download the Triton Inference Server container by typing:
```bash
docker pull nvcr.io/nvidia/tritonserver:20.06-py3
docker tag nvcr.io/nvidia/tritonserver:20.06-py3 tritonserver:20.06
```
### Setup the Triton notebook client.
Now go to the root directory of the Tacotron 2 repo, and type:
```bash
docker build -f Dockerfile_triton_client --network=host -t speech_ai_tts_only:demo .
```
### Run the Triton Inference Server.
To run the server, type in the root directory of the Tacotron 2 repo:
```bash
NV_GPU=1 nvidia-docker run -ti --ipc=host --network=host --rm -p8000:8000 -p8001:8001 -v $PWD/triton_models/:/models tritonserver:20.06 tritonserver --model-store=/models --log-verbose 1
```
The flag `NV_GPU` selects the GPU the server is going to see. If you want it to see all available GPUs, run the above command without this flag.
By default, the model repository will be in `triton_models/`.
### Run the Triton notebook client.
Leave the server running. In another terminal, type:
```bash
docker run -it --rm --network=host --device /dev/snd:/dev/snd speech_ai_tts_only:demo bash ./run_this.sh
```
Open the URL in a browser, open `notebook.ipynb`, click play, and enjoy.
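For readers who prefer a script over the widget-driven notebook, the client logic boils down to a few `tritonhttpclient` calls; a condensed, hedged sketch that mirrors the notebook cells (the `english_cleaners` argument to `text_to_sequence` and the server address are assumptions, and error handling is omitted):

```python
import numpy as np
import tritonhttpclient as thc
from tacotron2.text import text_to_sequence

client = thc.InferenceServerClient('localhost:8000')

# --- Tacotron 2 (TorchScript): token ids + length -> mel, mel length, alignments.
sequence = np.array([text_to_sequence("Hello world.", ['english_cleaners'])], dtype=np.int64)
lengths = np.array([[sequence.shape[1]]], dtype=np.int64)

t2_inputs = [thc.InferInput('input__0', sequence.shape, 'INT64'),
             thc.InferInput('input__1', lengths.shape, 'INT64')]
t2_inputs[0].set_data_from_numpy(sequence, binary_data=True)
t2_inputs[1].set_data_from_numpy(lengths, binary_data=True)
t2_outputs = [thc.InferRequestedOutput(name, binary_data=True)
              for name in ('output__0', 'output__1', 'output__2')]
result = client.infer(model_name='tacotron2-ts-script', inputs=t2_inputs, outputs=t2_outputs)
mel = result.as_numpy('output__0')                       # [1, 80, n_frames], FP16

# --- WaveGlow (TensorRT): mel + noise -> audio.
stride, n_group = 256, 8
mel4 = mel[:, :, :, None]                                # [1, 80, n_frames, 1]
z = np.random.normal(0.0, 1.0,
                     (1, n_group, mel.shape[2] * stride // n_group, 1)).astype(np.float16)
wg_inputs = [thc.InferInput('mel', mel4.shape, 'FP16'),
             thc.InferInput('z', z.shape, 'FP16')]
wg_inputs[0].set_data_from_numpy(mel4, binary_data=True)
wg_inputs[1].set_data_from_numpy(z, binary_data=True)
audio = client.infer(model_name='waveglow-tensorrt', inputs=wg_inputs,
                     outputs=[thc.InferRequestedOutput('audio', binary_data=True)]).as_numpy('audio')
print(audio.shape)                                       # [1, n_frames * stride]
```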

View file

@ -14,15 +14,14 @@
"import matplotlib.pyplot as plt\n",
"from matplotlib import cm as cm\n",
"from IPython.display import Audio, display, clear_output, Markdown, Image\n",
"import librosa\n",
"import librosa.display\n",
"#import librosa\n",
"#import librosa.display\n",
"import ipywidgets as widgets\n",
"# \n",
"from tacotron2.text import text_to_sequence as text_to_sequence_internal\n",
"from tacotron2.text.symbols import symbols\n",
"# \n",
"from tensorrtserver.api import *\n",
"\n",
"import tritonhttpclient as thc\n",
"\n",
"defaults = {\n",
" # settings\n",
@ -30,7 +29,6 @@
" 'sampling_rate': 22050, # don't touch this\n",
" 'stft_hop_length': 256, # don't touch this\n",
" 'url': 'localhost:8000', # don't touch this\n",
" 'protocol': 0, # 0: http, 1: grpc \n",
" 'autoplay': True, # autoplay\n",
" 'character_limit_min': 4, # don't touch this\n",
" 'character_limit_max': 340 # don't touch this\n",
@ -42,33 +40,9 @@
" def __init__(self, **entries):\n",
" self.__dict__.update(entries)\n",
"\n",
"\n",
"args = Struct(**defaults)\n",
"\n",
"\n",
"# create the inference context for the models\n",
"infer_ctx_tacotron2 = InferContext(args.url, args.protocol, 'tacotron2', -1)\n",
"infer_ctx_waveglow = InferContext(args.url, args.protocol, 'waveglow', -1)\n",
"\n",
"\n",
"def display_heatmap(sequence, title='preprocessed text'):\n",
" ''' displays sequence as a heatmap '''\n",
" clear_output(wait=True)\n",
" sequence = sequence[None, :]\n",
" plt.figure(figsize=(10, 2.5))\n",
" plt.title(title)\n",
" plt.tick_params(\n",
" axis='both',\n",
" which='both',\n",
" bottom=False,\n",
" top=False,\n",
" left=False,\n",
" right=False,\n",
" labelbottom=False,\n",
" labelleft=False)\n",
" plt.imshow(sequence, cmap='BrBG_r', interpolation='nearest')\n",
" plt.show()\n",
"\n",
"triton_client = thc.InferenceServerClient(args.url)\n",
"\n",
"def display_sound(signal, title, color):\n",
" ''' displays signal '''\n",
@ -84,7 +58,13 @@
" right=False,\n",
" labelbottom=True,\n",
" labelleft=False)\n",
" librosa.display.waveplot(signal, color=color)\n",
" # librosa.display.waveplot(signal, color=color)\n",
" sig = signal[0]\n",
" hop = args.stft_hop_length\n",
" smoothed = []\n",
" for i in range(0, len(sig), hop):\n",
" smoothed.append(np.average(sig[i:i+hop]))\n",
" plt.plot(smoothed, color=color)\n",
" plt.show()\n",
"\n",
"\n",
@ -105,7 +85,7 @@
" labelleft=False)\n",
" plt.xlabel('Time')\n",
" cmap = cm.get_cmap('jet', 30)\n",
" cax = ax.imshow(mel.astype(np.float32), interpolation=\"nearest\", cmap=cmap)\n",
" cax = ax.imshow(mel[0].astype(np.float32), interpolation=\"nearest\", cmap=cmap)\n",
" ax.grid(True)\n",
" plt.show()\n",
"\n",
@ -128,23 +108,24 @@
" mel_lengths contains the length of the unpadded mel, np.array\n",
" alignments contains attention weigths, np.array\n",
" '''\n",
" input_lengths = [len(sequence)]\n",
" input_lengths = np.array(input_lengths, dtype=np.int64)\n",
" sequence = np.reshape(sequence, (1, -1))\n",
" input_lengths = np.array([[len(sequence[0])]], dtype=np.int64)\n",
" # prepare input/output\n",
" input_dict = {}\n",
" input_dict['sequence__0'] = (sequence,)\n",
" input_dict['input_lengths__1'] = (input_lengths,)\n",
" output_dict = {}\n",
" output_dict['mel_outputs_postnet__0'] = InferContext.ResultFormat.RAW\n",
" output_dict['mel_lengths__1'] = InferContext.ResultFormat.RAW\n",
" output_dict['alignments__2'] = InferContext.ResultFormat.RAW\n",
" batch_size = 1\n",
" inputs = []\n",
" inputs.append(thc.InferInput('input__0', sequence.shape, 'INT64'))\n",
" inputs.append(thc.InferInput('input__1', input_lengths.shape, 'INT64'))\n",
" inputs[0].set_data_from_numpy(sequence, binary_data=True)\n",
" inputs[1].set_data_from_numpy(input_lengths, binary_data=True)\n",
" outputs = []\n",
" outputs.append(thc.InferRequestedOutput('output__0', binary_data=True))\n",
" outputs.append(thc.InferRequestedOutput('output__1', binary_data=True))\n",
" outputs.append(thc.InferRequestedOutput('output__2', binary_data=True))\n",
" # call tacotron2\n",
" result = infer_ctx_tacotron2.run(input_dict, output_dict, batch_size)\n",
" result = triton_client.infer(model_name=\"tacotron2-ts-script\", inputs=inputs, outputs=outputs)\n",
" # get results\n",
" mel = result['mel_outputs_postnet__0'][0] # take only the first instance in the output batch\n",
" mel_lengths = result['mel_lengths__1'][0] # take only the first instance in the output batch\n",
" alignments = result['alignments__2'][0] # take only the first instance in the output batch\n",
" mel = result.as_numpy('output__0')\n",
" mel_lengths = result.as_numpy('output__1')\n",
" alignments = result.as_numpy('output__2')\n",
" return mel, mel_lengths, alignments\n",
"\n",
"\n",
@ -154,27 +135,27 @@
" ::mel_lengths:: original length of mel spectrogram\n",
" ::returns:: waveform\n",
" '''\n",
" mel = mel[:,:,None]\n",
" # prepare input/output\n",
" input_dict = {}\n",
" input_dict['mel'] = (mel,)\n",
" mel = mel[:,:,:,None]\n",
" stride = 256\n",
" kernel_size = 1024\n",
" n_group = 8\n",
" z_size = (mel.shape[1]-1)*stride + (kernel_size-1) + 1 - (kernel_size-stride)\n",
" z_size = z_size//n_group\n",
" shape = (n_group,z_size,1)\n",
" input_dict['z'] = np.random.normal(0.0, 1.0, shape).astype(mel.dtype)\n",
" input_dict['z'] = (input_dict['z'],)\n",
" output_dict = {}\n",
" output_dict['audio'] = InferContext.ResultFormat.RAW\n",
" batch_size = 1\n",
" z_size = mel.shape[2]*stride//n_group\n",
" shape = (1, n_group, z_size, 1)\n",
" z = np.random.normal(0.0, 1.0, shape).astype(mel.dtype)\n",
" \n",
" inputs = []\n",
" inputs.append(thc.InferInput('mel', mel.shape, 'FP16'))\n",
" inputs.append(thc.InferInput('z', z.shape, 'FP16'))\n",
" inputs[0].set_data_from_numpy(mel, binary_data=True)\n",
" inputs[1].set_data_from_numpy(z, binary_data=True)\n",
" outputs = []\n",
" outputs.append(thc.InferRequestedOutput('audio', binary_data=True))\n",
" # call waveglow\n",
" result = infer_ctx_waveglow.run(input_dict, output_dict, batch_size)\n",
" result = triton_client.infer(model_name=\"waveglow-tensorrt\", inputs=inputs, outputs=outputs)\n",
" # get the results\n",
" signal = result['audio'][0] # take only the first instance in the output batch\n",
" signal = result.as_numpy('audio')\n",
" # postprocessing of waveglow: trimming signal to its actual size\n",
" trimmed_length = mel_lengths[0] * args.stft_hop_length\n",
" trimmed_length = mel.shape[2]*args.stft_hop_length\n",
" signal = signal[:trimmed_length] # trim\n",
" signal = signal.astype(np.float32)\n",
" return signal\n",
@ -201,7 +182,6 @@
")\n",
"\n",
"\n",
"plot_text_area_preprocessed = get_output_widget(width='10in',height='1in')\n",
"plot_spectrogram = get_output_widget(width='10in',height='2.1in')\n",
"plot_signal = get_output_widget(width='10in',height='2.1in')\n",
"plot_play = get_output_widget(width='10in',height='1in')\n",
@ -219,8 +199,6 @@
" return\n",
" # preprocess tacotron2\n",
" sequence = text_to_sequence(text)\n",
" with plot_text_area_preprocessed:\n",
" display_heatmap(sequence)\n",
" # run tacotron2\n",
" mel, mel_lengths, alignments = sequence_to_mel(sequence)\n",
" with plot_spectrogram:\n",
@ -241,7 +219,6 @@
"# decorative widgets\n",
"empty = widgets.VBox([], layout=widgets.Layout(height='1in'))\n",
"markdown_4 = Markdown('**tacotron2 input**')\n",
"markdown_5 = Markdown('**tacotron2 preprocessing**')\n",
"markdown_6 = Markdown('**tacotron2 output / waveglow input**')\n",
"markdown_7 = Markdown('**waveglow output**')\n",
"markdown_8 = Markdown('**play**')\n",
@ -250,7 +227,6 @@
"display(\n",
" empty, \n",
" markdown_4, text_area, \n",
"# markdown_5, plot_text_area_preprocessed, \n",
" markdown_6, plot_spectrogram, \n",
" markdown_7, plot_signal, \n",
" markdown_8, plot_play, \n",
@ -285,7 +261,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.7.6"
}
},
"nbformat": 4,

View file

@ -0,0 +1,39 @@
name: "tacotron2-ts-script"
platform: "pytorch_libtorch"
max_batch_size: 1
input [
{
name: "input__0"
data_type: TYPE_INT64
dims: [-1]
},
{
name: "input__1"
data_type: TYPE_INT64
dims: [1]
reshape: { shape: [ ] }
}
]
output [
{
name: "output__0"
data_type: TYPE_FP16
dims: [80, -1]
},
{
name: "output__1"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
},
{
name: "output__2"
data_type: TYPE_FP16
dims: [-1, -1]
}
]
optimization {
cuda {
graphs: 1
}
}

View file

@ -0,0 +1,22 @@
name: "waveglow-tensorrt"
platform: "tensorrt_plan"
default_model_filename: "model.plan"
max_batch_size: 0
input {
name: "mel"
data_type: TYPE_FP16
dims: [1, 80, -1, 1]
}
input {
name: "z"
data_type: TYPE_FP16
dims: [1, 8, -1, 1]
}
output {
name: "audio"
data_type: TYPE_FP16
dims: [1, -1]
}

View file

@ -1,153 +0,0 @@
# Tacotron 2 and WaveGlow inference on TRTIS
## Setup
### Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/PyTorch/SpeechSynthesis/Tacotron2
```
### Obtain models to be loaded in TRTIS.
We have prepared Tacotron 2 and WaveGlow models that are ready to be loaded in TRTIS,
so you don't need to train and export the models. Please follow the instructions
below to learn how to train, export --- or simply download the pretrained models.
### Obtain Tacotron 2 and WaveGlow checkpoints.
You can either download the pretrained checkpoints or train the models yourself.
#### (Option 1) Download pretrained checkpoints.
If you want to use a pretrained checkpoints, download them from [NGC](https://ngc.nvidia.com/catalog/models):
- [Tacotron2 checkpoint](https://ngc.nvidia.com/models/nvidia:tacotron2pyt_fp16)
- [WaveGlow checkpoint](https://ngc.nvidia.com/models/nvidia:waveglow256pyt_fp16)
#### (Option 2) Train Tacotron 2 and WaveGlow models.
In order to train the models, follow the QuickStart section in the `Tacotron2/README.md`
file by executing points 1-5. You have to train WaveGlow in a different way than described there. Use
the following command instead of the one given in QuickStart at point 5:
```bash
python -m multiproc train.py -m WaveGlow -o output/ --amp -lr 1e-4 --epochs 2001 --wn-channels 256 -bs 12 --segment-length 16000 --weight-decay 0 --grad-clip-thresh 65504.0 --cudnn-benchmark --cudnn-enabled --log-file output/nvlog.json
```
This will train the WaveGlow model with a smaller number of residual connections
in the coupling layer networks and larger segment length. Training should take
about 100 hours on DGX-1 (8x V100 16G).
### Setup Tacotron 2 TorchScript.
First, you need to create a folder structure for the model to be loaded in TRTIS server.
Follow the Tacotron 2 Quick Start Guide (points 1-4) to start the container.
Inside the container, type:
```bash
cd /workspace/tacotron2/
python exports/export_tacotron2_ts_config.py --fp16
```
This will export the folder structure of the TRTIS repository and the config file of Tacotron 2.
By default, it will be found in the `trtis_repo/tacotron2` folder.
Now there are two ways to proceed.
#### (Option 1) Download the Tacotron 2 TorchScript model.
Download the Tacotron 2 TorchScript model from:
- [Tacotron2 TorchScript](https://ngc.nvidia.com/models/nvidia:tacotron2pyt_jit_fp16)
Move the downloaded model to `trtis_repo/tacotron2/1/model.pt`
#### (Option 2) Export the Tacotron 2 model using TorchScript.
To export the Tacotron 2 model using TorchScript, type:
```bash
python exports/export_tacotron2_ts.py --tacotron2 <tacotron2_checkpoint> -o trtis_repo/tacotron2/1/model.pt --fp16
```
This will save the model as ``trtis_repo/tacotron2/1/model.pt``.
### Setup WaveGlow TRT engine.
For WaveGlow, we also need to create the folder structure that will be used by the TRTIS server.
Inside the container, type:
```bash
cd /workspace/tacotron2/
python exports/export_waveglow_trt_config.py --fp16
```
This will export the folder structure of the TRTIS repository and the config file of Waveglow.
By default, it will be found in the `trtis_repo/waveglow` folder.
There are two ways to proceed.
#### (Option 1) Download the WaveGlow TRT engine.
Download the WaveGlow TRT engine from:
- [WaveGlow TRT engine](https://ngc.nvidia.com/models/nvidia:waveglow256pyt_trt_fp16)
Move the downloaded model to `trtis_repo/waveglow/1/model.plan`
#### (Option 2) Export the WaveGlow model to TRT.
Before exporting the model, you need to install onnx-tensorrt by typing:
```bash
cd /workspace && git clone https://github.com/onnx/onnx-tensorrt.git
cd /workspace/onnx-tensorrt/ && git submodule update --init --recursive
cd /workspace/onnx-tensorrt && mkdir -p build
cd /workspace/onnx-tensorrt/build && cmake .. -DCMAKE_CXX_FLAGS=-isystem\ /usr/local/cuda/include && make -j12 && make install
```
In order to export the model into the ONNX intermediate representation, type:
```bash
python exports/export_waveglow_onnx.py --waveglow <waveglow_checkpoint> --wn-channels 256 --fp16 --output ./output
```
This will save the model as `waveglow.onnx` (you can change its name with the flag `--output <filename>`).
With the model exported to ONNX, type the following to obtain a TRT engine and save it as `trtis_repo/waveglow/1/model.plan`:
```bash
python trt/export_onnx2trt.py --waveglow <exported_waveglow_onnx> -o trtis_repo/waveglow/1/ --fp16
```
### Setup the TRTIS server.
Download the TRTIS container by typing:
```bash
docker pull nvcr.io/nvidia/tritonserver:20.03-py3
docker tag nvcr.io/nvidia/tritonserver:20.03-py3 tritonserver:20.03
```
### Setup the TRTIS notebook client.
Now go to the root directory of the Tacotron 2 repo, and type:
```bash
docker build -f Dockerfile_trtis_client --network=host -t speech_ai_tts_only:demo .
```
### Run the TRTIS server.
To run the server, type in the root directory of the Tacotron 2 repo:
```bash
NV_GPU=1 nvidia-docker run -ti --ipc=host --network=host --rm -p8000:8000 -p8001:8001 -v $PWD/trtis_repo/:/models tritonserver:20.03 trtserver --model-store=/models --log-verbose 1
```
The flag `NV_GPU` selects the GPU the server is going to see. If we want it to see all the available GPUs, then run the above command without this flag.
By default, the model repository will be in `trtis_repo/`.
### Run the TRTIS notebook client.
Leave the server running. In another terminal, type:
```bash
docker run -it --rm --network=host --device /dev/snd:/dev/snd --device /dev/usb:/dev/usb speech_ai_tts_only:demo bash ./run_this.sh
```
Open the URL in a browser, open `notebook.ipynb`, click play, and enjoy.

View file

@ -30,7 +30,7 @@ import argparse
from tacotron2.text import symbols
def parse_tacotron2_args(parent, add_help=False):
def tacotron2_parser(parent, add_help=False):
"""
Parse commandline arguments.
"""

View file

@ -47,27 +47,27 @@ NVIDIA TensorRT is a platform for high-performance deep learning inference. It i
dpkg -l | grep TensorRT
```
6. Export the models to ONNX intermediate representation (ONNX IR).
Export Tacotron 2 to three ONNX parts: Encoder, Decoder, and Postnet:
6. Convert the models to ONNX intermediate representation (ONNX IR).
Convert Tacotron 2 to three ONNX parts: Encoder, Decoder, and Postnet:
```bash
mkdir -p output
python exports/export_tacotron2_onnx.py --tacotron2 ./checkpoints/nvidia_tacotron2pyt_fp16_20190427 -o output/ --fp16
python tensorrt/convert_tacotron22onnx.py --tacotron2 ./checkpoints/nvidia_tacotron2pyt_fp16_20190427 -o output/ --fp16
```
Export WaveGlow to ONNX IR:
Convert WaveGlow to ONNX IR:
```bash
python exports/export_waveglow_onnx.py --waveglow ./checkpoints/nvidia_waveglow256pyt_fp16 --wn-channels 256 -o output/ --fp16
```
python tensorrt/convert_waveglow2onnx.py --waveglow ./checkpoints/nvidia_waveglow256pyt_fp16 --config-file config.json --wn-channels 256 -o output/ --fp16
```
After running the above commands, there should be four new ONNX files in `./output/` directory:
`encoder.onnx`, `decoder_iter.onnx`, `postnet.onnx`, and `waveglow.onnx`.
7. Export the ONNX IRs to TensorRT engines with fp16 mode enabled:
7. Convert the ONNX IRs to TensorRT engines with fp16 mode enabled:
```bash
python trt/export_onnx2trt.py --encoder output/encoder.onnx --decoder output/decoder_iter.onnx --postnet output/postnet.onnx --waveglow output/waveglow.onnx -o output/ --fp16
python tensorrt/convert_onnx2trt.py --encoder output/encoder.onnx --decoder output/decoder_iter.onnx --postnet output/postnet.onnx --waveglow output/waveglow.onnx -o output/ --fp16
```
After running the command, there should be four new engine files in `./output/` directory:
@ -76,14 +76,14 @@ NVIDIA TensorRT is a platform for high-performance deep learning inference. It i
8. Run TTS inference pipeline with fp16:
```bash
python trt/inference_trt.py -i phrases/phrase.txt --encoder output/encoder_fp16.engine --decoder output/decoder_iter_fp16.engine --postnet output/postnet_fp16.engine --waveglow output/waveglow_fp16.engine -o output/ --fp16
python tensorrt/inference_trt.py -i phrases/phrase.txt --encoder output/encoder_fp16.engine --decoder output/decoder_iter_fp16.engine --postnet output/postnet_fp16.engine --waveglow output/waveglow_fp16.engine -o output/ --fp16
```
## Inference performance: NVIDIA T4
Our results were obtained by running the `./trt/run_latency_tests_trt.sh` script in the PyTorch-19.11-py3 NGC container. Please note that to reproduce the results, you need to provide pretrained checkpoints for Tacotron 2 and WaveGlow. Please edit the script to provide your checkpoint filenames. For all tests in this table, we used WaveGlow with 256 residual channels.
Our results were obtained by running the `./tensorrt/run_latency_tests_trt.sh` script in the PyTorch-19.11-py3 NGC container. Please note that to reproduce the results, you need to provide pretrained checkpoints for Tacotron 2 and WaveGlow. Please edit the script to provide your checkpoint filenames. For all tests in this table, we used WaveGlow with 256 residual channels.
|Framework|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)|Latency confidence interval 90% (s)|Latency confidence interval 95% (s)|Latency confidence interval 99% (s)|Throughput (samples/sec)|Speed-up PyT+TRT/TRT|Avg mels generated (81 mels=1 sec of speech)|Avg audio length (s)|Avg RTF|
|Framework|Batch size|Input length|Precision|Avg latency (s)|Latency std (s)|Latency confidence interval 90% (s)|Latency confidence interval 95% (s)|Latency confidence interval 99% (s)|Throughput (samples/sec)|Speed-up PyTorch+TensorRT / TensorRT|Avg mels generated (81 mels=1 sec of speech)|Avg audio length (s)|Avg RTF|
|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|
|PyT+TRT|1| 128| FP16| 1.02| 0.05| 1.09| 1.10| 1.14| 150,439| 1.59| 602| 6.99| 6.86|
|PyT |1| 128| FP16| 1.63| 0.07| 1.71| 1.73| 1.81| 94,758| 1.00| 601| 6.98| 4.30|
|PyTorch+TensorRT|1| 128| FP16| 1.02| 0.05| 1.09| 1.10| 1.14| 150,439| 1.59| 602| 6.99| 6.86|
|PyTorch |1| 128| FP16| 1.63| 0.07| 1.71| 1.73| 1.81| 94,758| 1.00| 601| 6.98| 4.30|
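For reference, the `Speed-up` and `Avg RTF` columns follow from the other columns; a quick sanity check, using the rounded values reported in the rows above:
```python
# RTF (real-time factor) = seconds of audio generated per second of wall-clock latency
avg_audio_length_s = 6.99   # "Avg audio length (s)", PyTorch+TensorRT row
avg_latency_s = 1.02        # "Avg latency (s)", PyTorch+TensorRT row
print(avg_audio_length_s / avg_latency_s)   # ~6.85, consistent with the reported Avg RTF of 6.86

# Speed-up of the PyTorch+TensorRT pipeline over the pure PyTorch baseline
print(1.63 / 1.02)                          # ~1.60, consistent with the reported speed-up of 1.59
```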

View file

@ -27,14 +27,15 @@
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
import onnx
import argparse
import tensorrt as trt
import os
import sys
sys.path.append('./')
from trt.trt_utils import build_engine
from trt_utils import build_engine
def parse_args(parser):
"""
@ -119,7 +120,8 @@ def main():
print("Building WaveGlow ...")
waveglow_engine = build_engine(args.waveglow, shapes=shapes, fp16=args.fp16)
if waveglow_engine is not None:
with open(args.output+"/"+"waveglow"+engine_prec+".engine", 'wb') as f:
engine_path = os.path.join(args.output, "waveglow"+engine_prec+".engine")
with open(engine_path, 'wb') as f:
f.write(waveglow_engine.serialize())
else:
print("Failed to build engine from", args.waveglow)

View file

@ -0,0 +1,405 @@
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import torch
from torch import nn
from torch.nn import functional as F
import argparse
import sys
sys.path.append('./')
import models
from inference import checkpoint_from_distributed, unwrap_distributed, load_and_setup_model, prepare_input_sequence
from common.utils import to_gpu, get_mask_from_lengths
def parse_args(parser):
"""
Parse commandline arguments.
"""
parser.add_argument('--tacotron2', type=str,
help='full path to the Tacotron2 model checkpoint file')
parser.add_argument('-o', '--output', type=str, required=True,
help='Directory for the exported Tacotron 2 ONNX model')
parser.add_argument('--fp16', action='store_true',
help='Export with half precision to ONNX')
return parser
def encoder_infer(self, x, input_lengths):
device = x.device
for conv in self.convolutions:
x = F.dropout(F.relu(conv(x.to(device))), 0.5, False)
x = x.transpose(1, 2)
x = nn.utils.rnn.pack_padded_sequence(
x, input_lengths, batch_first=True)
outputs, _ = self.lstm(x)
outputs, _ = nn.utils.rnn.pad_packed_sequence(
outputs, batch_first=True)
lens = input_lengths*2
return outputs, lens
class Encoder(torch.nn.Module):
def __init__(self, tacotron2):
super(Encoder, self).__init__()
self.tacotron2 = tacotron2
self.tacotron2.encoder.lstm.flatten_parameters()
self.infer = encoder_infer
def forward(self, sequence, sequence_lengths):
embedded_inputs = self.tacotron2.embedding(sequence).transpose(1, 2)
memory, lens = self.infer(self.tacotron2.encoder, embedded_inputs, sequence_lengths)
processed_memory = self.tacotron2.decoder.attention_layer.memory_layer(memory)
return memory, processed_memory, lens
class Postnet(torch.nn.Module):
def __init__(self, tacotron2):
super(Postnet, self).__init__()
self.tacotron2 = tacotron2
def forward(self, mel_outputs):
mel_outputs_postnet = self.tacotron2.postnet(mel_outputs)
return mel_outputs + mel_outputs_postnet
def lstmcell2lstm_params(lstm_mod, lstmcell_mod):
lstm_mod.weight_ih_l0 = torch.nn.Parameter(lstmcell_mod.weight_ih)
lstm_mod.weight_hh_l0 = torch.nn.Parameter(lstmcell_mod.weight_hh)
lstm_mod.bias_ih_l0 = torch.nn.Parameter(lstmcell_mod.bias_ih)
lstm_mod.bias_hh_l0 = torch.nn.Parameter(lstmcell_mod.bias_hh)
def prenet_infer(self, x):
x1 = x[:]
for linear in self.layers:
x1 = F.relu(linear(x1))
x0 = x1[0].unsqueeze(0)
mask = torch.le(torch.rand(256, device='cuda').to(x.dtype), 0.5).to(x.dtype)
mask = mask.expand(x1.size(0), x1.size(1))
x1 = x1*mask*2.0
return x1
class DecoderIter(torch.nn.Module):
def __init__(self, tacotron2):
super(DecoderIter, self).__init__()
self.tacotron2 = tacotron2
dec = tacotron2.decoder
self.p_attention_dropout = dec.p_attention_dropout
self.p_decoder_dropout = dec.p_decoder_dropout
self.prenet = dec.prenet
self.prenet.infer = prenet_infer
self.attention_rnn = nn.LSTM(dec.prenet_dim + dec.encoder_embedding_dim,
dec.attention_rnn_dim, 1)
lstmcell2lstm_params(self.attention_rnn, dec.attention_rnn)
self.attention_rnn.flatten_parameters()
self.attention_layer = dec.attention_layer
self.decoder_rnn = nn.LSTM(dec.attention_rnn_dim + dec.encoder_embedding_dim,
dec.decoder_rnn_dim, 1)
lstmcell2lstm_params(self.decoder_rnn, dec.decoder_rnn)
self.decoder_rnn.flatten_parameters()
self.linear_projection = dec.linear_projection
self.gate_layer = dec.gate_layer
def decode(self, decoder_input, in_attention_hidden, in_attention_cell,
in_decoder_hidden, in_decoder_cell, in_attention_weights,
in_attention_weights_cum, in_attention_context, memory,
processed_memory, mask):
cell_input = torch.cat((decoder_input, in_attention_context), -1)
_, (out_attention_hidden, out_attention_cell) = self.attention_rnn(
cell_input.unsqueeze(0), (in_attention_hidden.unsqueeze(0),
in_attention_cell.unsqueeze(0)))
out_attention_hidden = out_attention_hidden.squeeze(0)
out_attention_cell = out_attention_cell.squeeze(0)
out_attention_hidden = F.dropout(
out_attention_hidden, self.p_attention_dropout, False)
attention_weights_cat = torch.cat(
(in_attention_weights.unsqueeze(1),
in_attention_weights_cum.unsqueeze(1)), dim=1)
out_attention_context, out_attention_weights = self.attention_layer(
out_attention_hidden, memory, processed_memory,
attention_weights_cat, mask)
out_attention_weights_cum = in_attention_weights_cum + out_attention_weights
decoder_input_tmp = torch.cat(
(out_attention_hidden, out_attention_context), -1)
_, (out_decoder_hidden, out_decoder_cell) = self.decoder_rnn(
decoder_input_tmp.unsqueeze(0), (in_decoder_hidden.unsqueeze(0),
in_decoder_cell.unsqueeze(0)))
out_decoder_hidden = out_decoder_hidden.squeeze(0)
out_decoder_cell = out_decoder_cell.squeeze(0)
out_decoder_hidden = F.dropout(
out_decoder_hidden, self.p_decoder_dropout, False)
decoder_hidden_attention_context = torch.cat(
(out_decoder_hidden, out_attention_context), 1)
decoder_output = self.linear_projection(
decoder_hidden_attention_context)
gate_prediction = self.gate_layer(decoder_hidden_attention_context)
return (decoder_output, gate_prediction, out_attention_hidden,
out_attention_cell, out_decoder_hidden, out_decoder_cell,
out_attention_weights, out_attention_weights_cum, out_attention_context)
# @torch.jit.script
def forward(self,
decoder_input,
attention_hidden,
attention_cell,
decoder_hidden,
decoder_cell,
attention_weights,
attention_weights_cum,
attention_context,
memory,
processed_memory,
mask):
decoder_input1 = self.prenet.infer(self.prenet, decoder_input)
outputs = self.decode(decoder_input1,
attention_hidden,
attention_cell,
decoder_hidden,
decoder_cell,
attention_weights,
attention_weights_cum,
attention_context,
memory,
processed_memory,
mask)
return outputs
def test_inference(encoder, decoder_iter, postnet):
encoder.eval()
decoder_iter.eval()
postnet.eval()
sys.path.append('./tensorrt')
from inference_trt import init_decoder_inputs
texts = ["Hello World, good day."]
sequences, sequence_lengths = prepare_input_sequence(texts)
measurements = {}
print("Running Tacotron2 Encoder")
with torch.no_grad():
memory, processed_memory, lens = encoder(sequences, sequence_lengths)
print("Running Tacotron2 Decoder")
device = memory.device
dtype = memory.dtype
mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32, device = device)
not_finished = torch.ones([memory.size(0)], dtype=torch.int32, device = device)
mel_outputs, gate_outputs, alignments = (torch.zeros(1), torch.zeros(1), torch.zeros(1))
gate_threshold = 0.6
max_decoder_steps = 1000
first_iter = True
(decoder_input, attention_hidden, attention_cell, decoder_hidden,
decoder_cell, attention_weights, attention_weights_cum,
attention_context, memory, processed_memory,
mask) = init_decoder_inputs(memory, processed_memory, sequence_lengths)
while True:
with torch.no_grad():
(mel_output, gate_output,
attention_hidden, attention_cell,
decoder_hidden, decoder_cell,
attention_weights, attention_weights_cum,
attention_context) = decoder_iter(decoder_input, attention_hidden, attention_cell, decoder_hidden,
decoder_cell, attention_weights, attention_weights_cum,
attention_context, memory, processed_memory, mask)
if first_iter:
mel_outputs = torch.unsqueeze(mel_output, 2)
gate_outputs = torch.unsqueeze(gate_output, 2)
alignments = torch.unsqueeze(attention_weights, 2)
first_iter = False
else:
mel_outputs = torch.cat((mel_outputs, torch.unsqueeze(mel_output, 2)), 2)
gate_outputs = torch.cat((gate_outputs, torch.unsqueeze(gate_output, 2)), 2)
alignments = torch.cat((alignments, torch.unsqueeze(attention_weights, 2)), 2)
dec = torch.le(torch.sigmoid(gate_output), gate_threshold).to(torch.int32).squeeze(1)
not_finished = not_finished*dec
mel_lengths += not_finished
if torch.sum(not_finished) == 0:
print("Stopping after ",mel_outputs.size(2)," decoder steps")
break
if mel_outputs.size(2) == max_decoder_steps:
print("Warning! Reached max decoder steps")
break
decoder_input = mel_output
print("Running Tacotron2 PostNet")
with torch.no_grad():
mel_outputs_postnet = postnet(mel_outputs)
return mel_outputs_postnet
def main():
parser = argparse.ArgumentParser(
description='PyTorch Tacotron 2 export to TRT')
parser = parse_args(parser)
args, _ = parser.parse_known_args()
tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
fp16_run=args.fp16, cpu_run=False)
opset_version = 10
sequences = torch.randint(low=0, high=148, size=(1,50),
dtype=torch.long).cuda()
sequence_lengths = torch.IntTensor([sequences.size(1)]).cuda().long()
dummy_input = (sequences, sequence_lengths)
encoder = Encoder(tacotron2)
encoder.eval()
with torch.no_grad():
encoder(*dummy_input)
torch.onnx.export(encoder, dummy_input, args.output+"/"+"encoder.onnx",
opset_version=opset_version,
do_constant_folding=True,
input_names=["sequences", "sequence_lengths"],
output_names=["memory", "processed_memory", "lens"],
dynamic_axes={"sequences": {1: "text_seq"},
"memory": {1: "mem_seq"},
"processed_memory": {1: "mem_seq"}
})
decoder_iter = DecoderIter(tacotron2)
memory = torch.randn((1,sequence_lengths[0],512)).cuda() #encoder_outputs
if args.fp16:
memory = memory.half()
memory_lengths = sequence_lengths
# initialize decoder states for dummy_input
decoder_input = tacotron2.decoder.get_go_frame(memory)
mask = get_mask_from_lengths(memory_lengths)
(attention_hidden,
attention_cell,
decoder_hidden,
decoder_cell,
attention_weights,
attention_weights_cum,
attention_context,
processed_memory) = tacotron2.decoder.initialize_decoder_states(memory)
dummy_input = (decoder_input,
attention_hidden,
attention_cell,
decoder_hidden,
decoder_cell,
attention_weights,
attention_weights_cum,
attention_context,
memory,
processed_memory,
mask)
decoder_iter = DecoderIter(tacotron2)
decoder_iter.eval()
with torch.no_grad():
decoder_iter(*dummy_input)
torch.onnx.export(decoder_iter, dummy_input, args.output+"/"+"decoder_iter.onnx",
opset_version=opset_version,
do_constant_folding=True,
input_names=["decoder_input",
"attention_hidden",
"attention_cell",
"decoder_hidden",
"decoder_cell",
"attention_weights",
"attention_weights_cum",
"attention_context",
"memory",
"processed_memory",
"mask"],
output_names=["decoder_output",
"gate_prediction",
"out_attention_hidden",
"out_attention_cell",
"out_decoder_hidden",
"out_decoder_cell",
"out_attention_weights",
"out_attention_weights_cum",
"out_attention_context"],
dynamic_axes={"attention_weights" : {1: "seq_len"},
"attention_weights_cum" : {1: "seq_len"},
"memory" : {1: "seq_len"},
"processed_memory" : {1: "seq_len"},
"mask" : {1: "seq_len"},
"out_attention_weights" : {1: "seq_len"},
"out_attention_weights_cum" : {1: "seq_len"}
})
postnet = Postnet(tacotron2)
dummy_input = torch.randn((1,80,620)).cuda()
if args.fp16:
dummy_input = dummy_input.half()
torch.onnx.export(postnet, dummy_input, args.output+"/"+"postnet.onnx",
opset_version=opset_version,
do_constant_folding=True,
input_names=["mel_outputs"],
output_names=["mel_outputs_postnet"],
dynamic_axes={"mel_outputs": {2: "mel_seq"},
"mel_outputs_postnet": {2: "mel_seq"}})
mel = test_inference(encoder, decoder_iter, postnet)
torch.save(mel, "mel.pt")
if __name__ == '__main__':
main()

View file

@ -0,0 +1,104 @@
# *****************************************************************************
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
import torch
import argparse
import os
import sys
sys.path.append('./')
from common.utils import ParseFromConfigFile
from inference import load_and_setup_model
def parse_args(parser):
"""
Parse commandline arguments.
"""
parser.add_argument('--waveglow', type=str, required=True,
help='full path to the WaveGlow model checkpoint file')
parser.add_argument('-o', '--output', type=str, required=True,
help='Directory for the exported WaveGlow ONNX model')
parser.add_argument('--fp16', action='store_true',
help='inference with AMP')
parser.add_argument('-s', '--sigma-infer', default=0.6, type=float)
parser.add_argument('--config-file', action=ParseFromConfigFile,
type=str, help='Path to configuration file')
return parser
def export_onnx(parser, args):
waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
fp16_run=args.fp16, cpu_run=False,
forward_is_infer=False)
# 80 mel channels, 620 mel spectrograms ~ 7 seconds of speech
mel = torch.randn(1, 80, 620).cuda()
stride = 256 # value from waveglow upsample
n_group = 8
z_size2 = (mel.size(2)*stride)//n_group
z = torch.randn(1, n_group, z_size2).cuda()
if args.fp16:
mel = mel.half()
z = z.half()
with torch.no_grad():
# run inference to force calculation of inverses
waveglow.infer(mel, sigma=args.sigma_infer)
# export to ONNX
if args.fp16:
waveglow = waveglow.half()
waveglow.forward = waveglow.infer_onnx
opset_version = 12
output_path = os.path.join(args.output, "waveglow.onnx")
torch.onnx.export(waveglow, (mel, z), output_path,
opset_version=opset_version,
do_constant_folding=True,
input_names=["mel", "z"],
output_names=["audio"],
dynamic_axes={"mel": {0: "batch_size", 2: "mel_seq"},
"z": {0: "batch_size", 2: "z_seq"},
"audio": {0: "batch_size", 1: "audio_seq"}})
def main():
parser = argparse.ArgumentParser(
description='PyTorch Tacotron 2 Inference')
parser = parse_args(parser)
args, _ = parser.parse_known_args()
export_onnx(parser, args)
if __name__ == '__main__':
main()

View file

@ -31,8 +31,8 @@ from scipy.io.wavfile import write
import time
import torch
import argparse
import sys
import sys
sys.path.append('./')
from common.utils import to_gpu, get_mask_from_lengths
@ -40,7 +40,7 @@ from tacotron2.text import text_to_sequence
from inference import MeasureTime, prepare_input_sequence, load_and_setup_model
import dllogger as DLLogger
from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
from trt.trt_utils import load_engine, run_trt_engine
from trt_utils import load_engine, run_trt_engine
from waveglow.denoiser import Denoiser
@ -284,9 +284,9 @@ def infer_waveglow_trt(waveglow, waveglow_context, mel, measurements, fp16):
waveglow_tensors = {
"inputs" :
{'mel': mel, 'z': z},
{'input__0': mel, 'input__1': z},
"outputs" :
{'audio': audios}
{'output__0': audios}
}
print("Running WaveGlow")
with MeasureTime(measurements, "waveglow_time"):
@ -343,6 +343,7 @@ def main():
sequences, sequence_lengths = prepare_input_sequence(texts)
sequences = sequences.to(torch.int32)
sequence_lengths = sequence_lengths.to(torch.int32)
with MeasureTime(measurements, "latency"):
mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
encoder_context, decoder_context, postnet_context,

View file

@ -0,0 +1 @@
bash test_infer.sh --test tensorrt/test_infer_trt.py -bs 1 -il 128 --fp16 --num-iters 1003 --encoder ./output/encoder_fp16.engine --decoder ./output/decoder_iter_fp16.engine --postnet ./output/postnet_fp16.engine --waveglow ./output/waveglow_fp16.engine

View file

@ -34,10 +34,10 @@ import argparse
import numpy as np
from scipy.io.wavfile import write
from inference import checkpoint_from_distributed, unwrap_distributed, MeasureTime, prepare_input_sequence
from inference import checkpoint_from_distributed, unwrap_distributed, MeasureTime, prepare_input_sequence, load_and_setup_model
from inference_trt import infer_tacotron2_trt, infer_waveglow_trt
from trt.trt_utils import load_engine
from trt_utils import load_engine
import tensorrt as trt
import time
@ -79,34 +79,6 @@ def parse_args(parser):
return parser
def load_and_setup_model(model_name, parser, checkpoint, amp_run, to_cuda=True):
model_parser = models.parse_model_args(model_name, parser, add_help=False)
model_args, _ = model_parser.parse_known_args()
model_config = models.get_model_config(model_name, model_args)
model = models.get_model(model_name, model_config, to_cuda=to_cuda)
if checkpoint is not None:
if to_cuda:
state_dict = torch.load(checkpoint)['state_dict']
else:
state_dict = torch.load(checkpoint,map_location='cpu')['state_dict']
if checkpoint_from_distributed(state_dict):
state_dict = unwrap_distributed(state_dict)
model.load_state_dict(state_dict)
if model_name == "WaveGlow":
model = model.remove_weightnorm(model)
model.eval()
if amp_run:
model, _ = amp.initialize(model, [], opt_level="O3")
return model
def print_stats(measurements_all):
print(np.mean(measurements_all['latency'][1:]),
@ -137,7 +109,7 @@ def print_stats(measurements_all):
print("Throughput average (samples/sec) = {:.4f}".format(np.mean(throughput)))
print("Preprocessing average (seconds) = {:.4f}".format(np.mean(preprocessing)))
print("Postprocessing average (seconds) = {:.4f}".format(np.mean(postprocessing)))
print("Number of mels per audio average = {}".format(np.mean(num_mels_per_audio)))
print("Number of mels per audio average = {}".format(np.mean(num_mels_per_audio))) #
print("Latency average (seconds) = {:.4f}".format(np.mean(latency)))
print("Latency std (seconds) = {:.4f}".format(np.std(latency)))
print("Latency cl 50 (seconds) = {:.4f}".format(cf_50))
@ -190,8 +162,11 @@ def main():
if args.waveglow_ckpt != "":
# setup denoiser using WaveGlow PyTorch checkpoint
waveglow_ckpt = load_and_setup_model('WaveGlow', parser, args.waveglow_ckpt,
True, forward_is_infer=True)
waveglow_ckpt = load_and_setup_model('WaveGlow', parser,
args.waveglow_ckpt,
fp16_run=args.fp16,
cpu_run=False,
forward_is_infer=True)
denoiser = Denoiser(waveglow_ckpt).cuda()
# after initialization, we don't need WaveGlow PyTorch checkpoint
# anymore - deleting

View file

@ -73,6 +73,8 @@ def parse_args(parser):
help='Input length')
parser.add_argument('-bs', '--batch-size', type=int, default=1,
help='Batch size')
return parser
@ -177,7 +179,6 @@ def main():
num_mels = mel.size(0)*mel.size(2)
num_samples = audios.size(0)*audios.size(1)
with MeasureTime(measurements, "type_conversion", args.cpu):
audios = audios.float()

View file

@ -78,7 +78,7 @@ TMP_LOGFILE=tmp_log_${LOG_SUFFIX}.log
LOGFILE=log_${LOG_SUFFIX}.log
if [ "$TEST_PROGRAM" = "trt/test_infer_trt.py" ]
if [ "$TEST_PROGRAM" = "tensorrt/test_infer_trt.py" ]
then
TACOTRON2_PARAMS="--encoder $ENCODER_CKPT --decoder $DECODER_CKPT --postnet $POSTNET_CKPT"
else

View file

@ -44,6 +44,7 @@ from apex.parallel import DistributedDataParallel as DDP
import models
import loss_functions
import data_functions
from common.utils import ParseFromConfigFile
import dllogger as DLLogger
from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
@ -73,6 +74,9 @@ def parse_args(parser):
parser.add_argument('--anneal-factor', type=float, choices=[0.1, 0.3], default=0.1,
help='Factor for annealing learning rate')
parser.add_argument('--config-file', action=ParseFromConfigFile,
type=str, help='Path to configuration file')
# training
training = parser.add_argument_group('training setup')
training.add_argument('--epochs', type=int, required=True,
@ -162,7 +166,10 @@ def parse_args(parser):
def reduce_tensor(tensor, num_gpus):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.reduce_op.SUM)
rt /= num_gpus
if rt.is_floating_point():
rt = rt/num_gpus
else:
rt = rt//num_gpus
return rt
@ -211,8 +218,7 @@ def save_checkpoint(model, optimizer, epoch, config, amp_run, output_dir, model_
checkpoint['amp'] = amp.state_dict()
checkpoint_filename = "checkpoint_{}_{}.pt".format(model_name, epoch)
checkpoint_path = os.path.join(
output_dir, checkpoint_filename)
checkpoint_path = os.path.join(output_dir, checkpoint_filename)
print("Saving model and optimizer state at epoch {} to {}".format(
epoch, checkpoint_path))
torch.save(checkpoint, checkpoint_path)
@ -221,7 +227,7 @@ def save_checkpoint(model, optimizer, epoch, config, amp_run, output_dir, model_
symlink_dst = os.path.join(
output_dir, "checkpoint_{}_last.pt".format(model_name))
if os.path.exists(symlink_dst) and os.path.islink(symlink_dst):
print("|||| Updating symlink", symlink_dst, "to point to", symlink_src)
print("Updating symlink", symlink_dst, "to point to", symlink_src)
os.remove(symlink_dst)
os.symlink(symlink_src, symlink_dst)
@ -230,10 +236,10 @@ def save_checkpoint(model, optimizer, epoch, config, amp_run, output_dir, model_
def get_last_checkpoint_filename(output_dir, model_name):
symlink = os.path.join(output_dir, "checkpoint_{}_last.pt".format(model_name))
if os.path.exists(symlink):
print("|||| Loading checkpoint from symlink", symlink)
print("Loading checkpoint from symlink", symlink)
return os.path.join(output_dir, os.readlink(symlink))
else:
print("|||| No last checkpoint available - starting from epoch 0 ")
print("No last checkpoint available - starting from epoch 0 ")
return ""
@ -311,7 +317,7 @@ def validate(model, criterion, valset, epoch, batch_iter, batch_size,
DLLogger.log(step=(epoch,), data={'val_items_per_sec':
(val_items_per_sec/num_iters if num_iters > 0 else 0.0)})
return val_loss
return val_loss, val_items_per_sec
def adjust_learning_rate(iteration, epoch, optimizer, learning_rate,
anneal_steps, anneal_factor, rank):
@ -350,8 +356,8 @@ def main():
distributed_run = world_size > 1
if local_rank == 0:
DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT,
args.output+'/'+args.log_file),
log_file = os.path.join(args.output, args.log_file)
DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_file),
StdOutBackend(Verbosity.VERBOSE)])
else:
DLLogger.init(backends=[])
@ -361,7 +367,7 @@ def main():
DLLogger.log(step="PARAMETER", data={'model_name':'Tacotron2_PyT'})
model_name = args.model_name
parser = models.parse_model_args(model_name, parser)
parser = models.model_parser(model_name, parser)
args, _ = parser.parse_known_args()
torch.backends.cudnn.enabled = args.cudnn_enabled
@ -519,9 +525,11 @@ def main():
DLLogger.log(step=(epoch,), data={'train_loss': reduced_loss})
DLLogger.log(step=(epoch,), data={'train_epoch_time': epoch_time})
val_loss = validate(model, criterion, valset, epoch, iteration,
args.batch_size, world_size, collate_fn,
distributed_run, local_rank, batch_to_gpu)
val_loss, val_items_per_sec = validate(model, criterion, valset, epoch,
iteration, args.batch_size,
world_size, collate_fn,
distributed_run, local_rank,
batch_to_gpu)
if (epoch % args.epochs_per_checkpoint == 0) and args.bench_class == "":
save_checkpoint(model, optimizer, epoch, model_config,
@ -537,6 +545,7 @@ def main():
DLLogger.log(step=tuple(), data={'val_loss': val_loss})
DLLogger.log(step=tuple(), data={'train_items_per_sec':
(train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)})
DLLogger.log(step=tuple(), data={'val_items_per_sec': val_items_per_sec})
if local_rank == 0:
DLLogger.flush()

View file

@ -1 +0,0 @@
bash test_infer.sh --test trt/test_infer_trt.py -bs 1 -il 128 -p fp16 --num-iters 1003 --encoder ./output/encoder_fp16.engine --decoder ./output/decoder_iter_fp16.engine --postnet ./output/postnet_fp16.engine --waveglow ./output/waveglow_fp16.engine

View file

@ -27,7 +27,7 @@
import argparse
def parse_waveglow_args(parent, add_help=False):
def waveglow_parser(parent, add_help=False):
"""
Parse commandline arguments.
"""

View file

@ -34,7 +34,7 @@ from common.layers import STFT
class Denoiser(torch.nn.Module):
""" Removes model bias from audio produced with waveglow """
def __init__(self, waveglow, cpu_run=False, filter_length=1024, n_overlap=4,
def __init__(self, waveglow, filter_length=1024, n_overlap=4,
win_length=1024, mode='zeros'):
super(Denoiser, self).__init__()
device = waveglow.upsample.weight.device

View file

@ -58,6 +58,7 @@ class Invertible1x1Conv(torch.nn.Module):
if torch.det(W) < 0:
W[:, 0] = -1 * W[:, 0]
W = W.view(c, c, 1)
W = W.contiguous()
self.conv.weight.data = W
def forward(self, z):
@ -279,6 +280,49 @@ class WaveGlow(torch.nn.Module):
return audio
def infer_onnx(self, spect, z, sigma=0.9):
spect = self.upsample(spect)
# trim conv artifacts. maybe pad spec to kernel multiple
time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0]
spect = spect[:, :, :-time_cutoff]
length_spect_group = spect.size(2)//8
mel_dim = 80
batch_size = spect.size(0)
spect = spect.view((batch_size, mel_dim, length_spect_group, self.n_group))
spect = spect.permute(0, 2, 1, 3)
spect = spect.contiguous()
spect = spect.view((batch_size, length_spect_group, self.n_group*mel_dim))
spect = spect.permute(0, 2, 1)
spect = spect.contiguous()
audio = z[:, :self.n_remaining_channels, :]
z = z[:, self.n_remaining_channels:self.n_group, :]
audio = sigma*audio
for k in reversed(range(self.n_flows)):
n_half = int(audio.size(1) // 2)
audio_0 = audio[:, :n_half, :]
audio_1 = audio[:, n_half:(n_half+n_half), :]
output = self.WN[k]((audio_0, spect))
s = output[:, n_half:(n_half+n_half), :]
b = output[:, :n_half, :]
audio_1 = (audio_1 - b) / torch.exp(s)
audio = torch.cat([audio_0, audio_1], 1)
audio = self.convinv[k].infer(audio)
if k % self.n_early_every == 0 and k > 0:
audio = torch.cat((z[:, :self.n_early_size, :], audio), 1)
z = z[:, self.n_early_size:self.n_group, :]
audio = audio.permute(0,2,1).contiguous().view(batch_size, (length_spect_group * self.n_group))
return audio
@staticmethod
def remove_weightnorm(model):
waveglow = model