[KaldiASR] Adding KaldiASR custom backend for TRTIS

Przemek Strzelczyk 2020-01-15 17:04:32 +01:00
parent 784eb0d8ca
commit b9b03e1446
28 changed files with 2291 additions and 0 deletions


@ -0,0 +1,3 @@
.git/
data/
kaldi/

Kaldi/SpeechRecognition/.gitignore vendored Normal file

@ -0,0 +1,4 @@
data/*
!data/README.md
.*.swp
.*.swo

Kaldi/SpeechRecognition/.gitmodules vendored Normal file

@ -0,0 +1,55 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvcr.io/nvidia/kaldi:19.12-online-beta-py3 as kb
FROM nvcr.io/nvidia/tensorrtserver:19.12-py3
ENV DEBIAN_FRONTEND=noninteractive
ARG PYVER=3.6
# Kaldi dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
automake \
autoconf \
cmake \
flac \
gawk \
libatlas3-base \
libtool \
python$PYVER \
python$PYVER-dev \
sox \
subversion \
unzip \
bc \
libatlas-base-dev \
zlib1g-dev
RUN mkdir /opt/trtis-kaldi && mkdir -p /workspace/model-repo/kaldi_online/1 && mkdir -p /mnt/model-repo
# Copying static files
COPY scripts /workspace/scripts
# Moving Kaldi to container
COPY --from=kb /opt/kaldi /opt/kaldi
ENV LD_LIBRARY_PATH /opt/kaldi/src/lib/:$LD_LIBRARY_PATH
# Building the custom backend
COPY trtis-kaldi-backend /workspace/trtis-kaldi-backend
#COPY --from=cbe /workspace/install/custom-backend-sdk /workspace/trtis-kaldi-backend/custom-backend-sdk
RUN cd /workspace/trtis-kaldi-backend && wget https://github.com/NVIDIA/tensorrt-inference-server/releases/download/v1.9.0/v1.9.0_ubuntu1804.custombackend.tar.gz -O custom-backend-sdk.tar.gz && tar -xzf custom-backend-sdk.tar.gz
RUN cd /workspace/trtis-kaldi-backend/ && make && cp libkaldi-trtisbackend.so /workspace/model-repo/kaldi_online/1/ && cd - && rm -r /workspace/trtis-kaldi-backend
COPY scripts/nvidia_kaldi_trtis_entrypoint.sh /opt/trtis-kaldi
ENTRYPOINT ["/opt/trtis-kaldi/nvidia_kaldi_trtis_entrypoint.sh"]


@ -0,0 +1,41 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvcr.io/nvidia/kaldi:19.12-online-beta-py3 as kb
FROM nvcr.io/nvidia/tensorrtserver:19.12-py3-clientsdk
ARG PYVER=3.6
# Kaldi dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
automake \
autoconf \
cmake \
flac \
gawk \
libatlas3-base \
libtool \
python$PYVER \
python$PYVER-dev \
sox \
subversion \
unzip \
bc \
libatlas-base-dev \
zlib1g-dev
# Moving Kaldi to container
COPY --from=kb /opt/kaldi /opt/kaldi
ENV LD_LIBRARY_PATH /opt/kaldi/src/lib/:$LD_LIBRARY_PATH
COPY kaldi-asr-client /workspace/src/clients/c++/kaldi-asr-client
RUN echo "add_subdirectory(kaldi-asr-client)" >> "/workspace/src/clients/c++/CMakeLists.txt"
RUN cd /workspace/build/ && make -j16 trtis-clients


@ -0,0 +1,203 @@
Except where otherwise noted, the following license applies to all files in this repo.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 NVIDIA Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@ -0,0 +1,241 @@
# Kaldi ASR Integration With TensorRT Inference Server
This repository provides a Kaldi ASR custom backend for the NVIDIA TensorRT Inference Server (TRTIS). It can be used to demonstrate high-performance online inference on Kaldi ASR models. This includes handling the gRPC communication between the TensorRT Inference Server and clients, and the dynamic batching of inference requests. This repository is tested and maintained by NVIDIA.
## Table Of Contents
- [Table Of Contents](#table-of-contents)
- [Solution overview](#solution-overview)
* [Reference model](#reference-model)
* [Default configuration](#default-configuration)
- [Setup](#setup)
* [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
* [Parameters](#parameters)
* [Model path](#model-path)
* [Model configuration](#model-configuration)
* [Inference engine configuration](#inference-engine-configuration)
* [Inference process](#inference-process)
* [Client command-line parameters](#client-command-line-parameters)
* [Input/Output](#inputoutput)
* [Input](#input)
* [Output](#output)
* [Using a custom Kaldi ASR model](#using-a-custom-kaldi-asr-model)
- [Performance](#performance)
* [Metrics](#metrics)
* [Results](#results)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Solution overview
This repository provides a wrapper around the online GPU-accelerated ASR pipeline from the paper [GPU-Accelerated Viterbi Exact Lattice Decoder for Batched Online and Offline Speech Recognition](https://arxiv.org/abs/1910.10032). That work includes a high-performance implementation of a GPU HMM Decoder, a low-latency Neural Net driver, fast Feature Extraction for preprocessing, and new ASR pipelines tailored for GPUs. These different modules have been integrated into the Kaldi ASR framework.
This repository contains a TensorRT Inference Server custom backend for the Kaldi ASR framework. This custom backend calls the high-performance online GPU pipeline from the Kaldi ASR framework. This TensorRT Inference Server integration provides ease-of-use to Kaldi ASR inference: gRPC streaming server, dynamic sequence batching, and multi-instances support. A client connects to the gRPC server, streams audio by sending chunks to the server, and gets back the inferred text as an answer (see [Input/Output](#input-output)). More information about the TensorRT Inference Server can be found [here](https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/).
This TensorRT Inference Server integration is meant to be used with the LibriSpeech model for demonstration purposes. We include a pre-trained version of this model to allow you to easily test this work (see [Quick Start Guide](#quick-start-guide)). Both the TensorRT Inference Server integration and the underlying Kaldi ASR online GPU pipeline are a work in progress and will support more functionality in the future. For example, online iVectors are not yet supported in the Kaldi ASR GPU online pipeline and are currently replaced by a zero vector (see [Known issues](#known-issues)). Support for a custom Kaldi model is experimental (see [Using a custom Kaldi ASR model](#using-a-custom-kaldi-asr-model)).
### Reference model
A reference model is used by all test scripts and benchmarks presented in this repository to illustrate this solution. We are using the Kaldi ASR `LibriSpeech` recipe, available [here](https://github.com/kaldi-asr/kaldi/blob/master/egs/librispeech/s5). It was trained by NVIDIA and is delivered as a pre-trained model.
### Default configuration
Details about parameters can be found in the [Parameters](#parameters) section.
* `model path`: Configured to use the pretrained LibriSpeech model.
* `beam`: 10
* `lattice_beam`: 7
* `max_active`: 10,000
* `frame_subsampling_factor`: 3
* `acoustic_scale`: 1.0
* `num_worker_threads`: 20
* `max_execution_batch_size`: 256
* `max_batch_size`: 4096
* `instance_group.count`: 2
## Setup
### Requirements
This repository contains Dockerfiles that extend the Kaldi and TensorRT Inference Server NVIDIA GPU Cloud (NGC) containers and encapsulate some dependencies. Aside from these dependencies, ensure you have [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) installed.
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
## Quick Start Guide
1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/Kaldi/SpeechRecognition
```
2. Build the client and server containers.
`scripts/docker/build.sh`
3. Download and set up the pre-trained model and eval dataset.
`scripts/docker/launch_download.sh`
The model and dataset are downloaded to the `data/` folder.
4. Start the server.
`scripts/docker/launch_server.sh`
Once you see the line `Starting Metrics Service at 0.0.0.0:8002`, the server is ready to be used. You can then start the client.
Currently, multi-GPU is not supported. By default, GPU 0 is used. You can select a specific GPU by setting `NVIDIA_VISIBLE_DEVICES`:
`NVIDIA_VISIBLE_DEVICES=<GPUID> scripts/docker/launch_server.sh`
5. Start the client.
The following command streams 1000 parallel audio channels to the server. The `-p` option prints the inferred `TEXT` sent back from the server.
`scripts/docker/launch_client.sh -p`
## Advanced
### Parameters
The configuration is done through the `config.pbtxt` file available in the `model-repo/kaldi_online/` directory. It allows you to specify the following:
#### Model path
The following parameters can be modified if you want to use your own Kaldi model.
* `mfcc_filename`
* `ivector_filename`
* `nnet3_rxfilename`
* `fst_rxfilename`
* `word_syms_rxfilename`
#### Model configuration
The model configuration parameters are passed to the model and have an impact on both accuracy and performance. They are standard Kaldi ASR parameters, meaning you can reuse the values that are currently used in your CPU Kaldi ASR pipeline.
* `beam`
* `lattice_beam`
* `max_active`
* `frame_subsampling_factor`
* `acoustic_scale`
#### Inference engine configuration
The inference engine configuration parameters control the behavior of the inference engine. They impact performance, but not accuracy. A small tuning example follows the list.
* `max_batch_size`: The maximum number of inference channels opened at a given time. If set to `4096`, then one instance will handle at most 4096 concurrent users.
* `num_worker_threads`: The number of CPU threads for the postprocessing CPU tasks, such as lattice determinization and text generation from the lattice.
* `max_execution_batch_size`: The size of one execution batch on the GPU. This parameter should be set as large as necessary to saturate the GPU, but no larger. Larger batches lead to higher throughput, smaller batches to lower latency.
* `input.WAV_DATA.dims`: The maximum number of samples per chunk. The value must be a multiple of `frame_subsampling_factor * chunks_per_frame`.
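If the default execution batch size is too large for your GPU, you can lower it in the model configuration before starting the server. A minimal sketch, assuming the shipped `model-repo/kaldi_online/config.pbtxt`, which sets `max_execution_batch_size` to `512`:
```
# Lower the GPU execution batch size of the Kaldi custom backend
# (from the shipped value of 512 down to 256).
sed -i 's/string_value:"512"/string_value:"256"/' model-repo/kaldi_online/config.pbtxt
```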
### Inference process
Inference is done by simulating concurrent users. Each user is assigned one utterance from the LibriSpeech dataset, streams that utterance by cutting it into chunks, and receives the final `TEXT` output once the last chunk has been sent. A parameter sets the number of active users being simulated in parallel.
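For example, to simulate 800 concurrent online users, each streaming its utterance at real-time speed, you could run the client launcher described in the next section as follows:
```
# 800 parallel audio channels (-c), each fed at real-time speed (-o)
scripts/docker/launch_client.sh -c 800 -o
```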
### Client command-line parameters
The client can be configured through a set of parameters that define its behavior. To see the full list of available options and their descriptions, use the `-h` command-line option. The parameters are:
```
-v
-i <Number of iterations on the dataset>
-c <Number of parallel audio channels>
-a <Path to the scp dataset file>
-l <Maximum number of samples per chunk. Must correspond to the server config>
-u <URL for inference service and its gRPC port>
-o : Only feed each channel at realtime speed. Simulates online clients.
-p : Print text outputs
```
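As an example, the following invocation (a sketch, to be adjusted to your setup) runs two passes over the default LibriSpeech evaluation set with 1000 parallel channels, prints the recognized text, and targets the gRPC endpoint on the local host:
```
# -u: gRPC endpoint, -a: eval dataset (scp rspecifier), -i: iterations,
# -c: parallel audio channels, -p: print the inferred text
scripts/docker/launch_client.sh \
    -u localhost:8001 \
    -a scp:/data/datasets/LibriSpeech/test_clean/wav_conv.scp \
    -i 2 \
    -c 1000 \
    -p
```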
### Input/Output
The API is currently experimental.
#### Input
The server expects chunks of audio, each containing up to `input.WAV_DATA.dims` samples. By default, this corresponds to 510 ms of audio per chunk. The last chunk of a sequence can contain fewer samples than this maximum value.
Each chunk is sent as a float array in the `WAV_DATA` input, with the `WAV_DATA_DIM` input containing the number of samples in that chunk. Flags can be set to declare a chunk as the first or the last chunk of a sequence. Finally, each chunk from a given sequence is associated with a `CorrelationID`, and every chunk belonging to the same sequence must carry the same `CorrelationID`.
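As a sanity check on the numbers above: the provided model configuration uses `input.WAV_DATA.dims = 8160` samples and 16 kHz audio, so a full chunk covers `8160 / 16000 = 0.51` seconds, which is where the 510 ms figure comes from.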
#### Output
Once the server receives the final chunk for a sequence (with the `END` flag set), it generates the output associated with that sequence and sends it back to the client. The end-of-sequence procedure is as follows:
1. Process the last chunk.
2. Flush and process the Neural Net context.
3. Generate the full lattice for the sequence.
4. Determinize the lattice.
5. Find the best path in the lattice.
6. Generate the text output for that best path.
7. Send the text back to the client.
Even though only the best path is used, we still generate a full lattice for benchmarking purposes. Partial results (generated after each timestep) are currently not available, but will be added in a future release.
### Using a custom Kaldi ASR model
Support for Kaldi ASR models that are different from the provided LibriSpeech model is experimental. However, it is possible to modify the [Model Path](#model-path) section of the config file `model-repo/kaldi_online/config.pbtxt` to set up your own model.
The models and Kaldi allocators are currently not shared between instances. This means that if your model is large, you may not have enough GPU memory to store two different instances. If that is the case, you can set `count` to `1` in the `instance_group` section of the config file.
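For example, assuming your model files live under `/data/models/MyModel/` (a placeholder path used only for illustration) and follow the same file layout as the LibriSpeech model, you could point the backend at them by rewriting the directory component of the model path parameters:
```
# Rewrite the model path parameters in config.pbtxt
# (/data/models/MyModel is a placeholder for your own model directory).
sed -i 's@/data/models/LibriSpeech@/data/models/MyModel@g' model-repo/kaldi_online/config.pbtxt
```
Note that this only changes the directory: the individual file names (`mfcc.conf`, `ivector_extractor.conf`, `final.mdl`, `HCLG.fst`, `words.txt`) still need to match your model, so you may prefer to edit the five parameters listed in [Model path](#model-path) by hand.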
## Performance
### Metrics
Throughput is measured using the RTFX metric, defined as `RTFX = (number of seconds of audio inferred) / (compute time in seconds)`. It is the inverse of the RTF (Real Time Factor) metric: `RTFX = 1/RTF`.
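For instance, if the server processes 3,600 seconds of audio in 2 seconds of compute time, the throughput is `RTFX = 3600 / 2 = 1800`.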
Latency is defined as the delay between the availability of the last chunk of audio and the reception of the inferred text. More precisely, it is measured as follows:
1. *Client:* Last audio chunk available
2. ***t0** <- Current time*
3. *Client:* Send last audio chunk
4. *Server:* Compute inference of last chunk
5. *Server:* Generate the raw lattice for the full utterance
6. *Server:* Determinize the raw lattice
7. *Server:* Generate the text output associated with the best path in the determinized lattice
8. *Client:* Receive text output
9. *Client:* Call callback with output
10. ***t1** <- Current time*
The latency is then defined as `latency = t1 - t0`.
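For instance, if the last chunk of an utterance becomes available at `t0 = 100.00 s` on the client clock and the corresponding text output is received at `t1 = 100.35 s`, the reported latency for that utterance is `0.35 s`.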
### Results
Our results were obtained by:
1. Building and starting the server as described in [Quick Start Guide](#quick-start-guide).
2. Running `scripts/run_inference_all_v100.sh` and `scripts/run_inference_all_t4.sh`
| GPU | Realtime I/O | Number of parallel audio channels | Throughput (RTFX) | Latency 90% (s) | Latency 95% (s) | Latency 99% (s) | Latency Avg (s) |
| ------ | ------ | ------ | ------ | ------ | ------ | ------ | ------ |
| V100 | No | 2000 | 1769.8 | N/A | N/A | N/A | N/A |
| V100 | Yes | 1500 | 1220 | 0.424 | 0.473 | 0.758 | 0.345 |
| V100 | Yes | 1000 | 867.4 | 0.358 | 0.405 | 0.707 | 0.276 |
| V100 | Yes | 800 | 647.8 | 0.304 | 0.325 | 0.517 | 0.238 |
| T4 | No | 1000 | 906.7 | N/A | N/A | N/A| N/A |
| T4 | Yes | 700 | 629.6 | 0.629 | 0.782 | 1.01 | 0.463 |
| T4 | Yes | 400 | 373.7 | 0.417 | 0.441 | 0.690 | 0.349 |
## Release notes
### Changelog
January 2020
* Initial release
### Known issues
Only mfcc features are supported at this time. The reference model used in the benchmark scripts requires both mfcc and iVector features to deliver the best accuracy. Support for iVector features will be added in a future release.


@ -0,0 +1,77 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required (VERSION 3.5)
add_executable(kaldi_asr_parallel_client kaldi_asr_parallel_client.cc asr_client_imp.cc)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE request
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE protobuf::libprotobuf
)
target_include_directories(
kaldi_asr_parallel_client
PRIVATE
/opt/kaldi/src/
)
target_include_directories(
kaldi_asr_parallel_client
PRIVATE
/opt/kaldi/tools/openfst-1.6.7/include/
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE /opt/kaldi/src/lib/libkaldi-feat.so
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE /opt/kaldi/src/lib/libkaldi-util.so
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE /opt/kaldi/src/lib/libkaldi-matrix.so
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE /opt/kaldi/src/lib/libkaldi-base.so
)
install(
TARGETS kaldi_asr_parallel_client
RUNTIME DESTINATION bin
)


@ -0,0 +1,177 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "asr_client_imp.h"
#include <unistd.h>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <map>
#include <numeric>
#define FAIL_IF_ERR(X, MSG) \
{ \
nic::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
void TRTISASRClient::CreateClientContext() {
contextes_.emplace_back();
ClientContext& client = contextes_.back();
FAIL_IF_ERR(nic::InferGrpcStreamContext::Create(
&client.trtis_context, /*corr_id*/ -1, url_, model_name_,
/*model_version*/ -1,
/*verbose*/ false),
"unable to create context");
}
void TRTISASRClient::SendChunk(ni::CorrelationID corr_id,
bool start_of_sequence, bool end_of_sequence,
float* chunk, int chunk_byte_size) {
ClientContext* client = &contextes_[corr_id % ncontextes_];
nic::InferContext& context = *client->trtis_context;
if (start_of_sequence) n_in_flight_.fetch_add(1, std::memory_order_consume);
// Setting options
std::unique_ptr<nic::InferContext::Options> options;
FAIL_IF_ERR(nic::InferContext::Options::Create(&options),
"unable to create inference options");
options->SetBatchSize(1);
options->SetFlags(0);
options->SetCorrelationId(corr_id);
if (start_of_sequence)
options->SetFlag(ni::InferRequestHeader::FLAG_SEQUENCE_START,
start_of_sequence);
if (end_of_sequence) {
options->SetFlag(ni::InferRequestHeader::FLAG_SEQUENCE_END,
end_of_sequence);
for (const auto& output : context.Outputs()) {
options->AddRawResult(output);
}
}
FAIL_IF_ERR(context.SetRunOptions(*options), "unable to set context options");
std::shared_ptr<nic::InferContext::Input> in_wave_data, in_wave_data_dim;
FAIL_IF_ERR(context.GetInput("WAV_DATA", &in_wave_data),
"unable to get WAV_DATA");
FAIL_IF_ERR(context.GetInput("WAV_DATA_DIM", &in_wave_data_dim),
"unable to get WAV_DATA_DIM");
// Wave data input
FAIL_IF_ERR(in_wave_data->Reset(), "unable to reset WAV_DATA");
uint8_t* wave_data = reinterpret_cast<uint8_t*>(chunk);
if (chunk_byte_size < max_chunk_byte_size_) {
std::memcpy(&chunk_buf_[0], chunk, chunk_byte_size);
wave_data = &chunk_buf_[0];
}
FAIL_IF_ERR(in_wave_data->SetRaw(wave_data, max_chunk_byte_size_),
"unable to set data for WAV_DATA");
// Dim
FAIL_IF_ERR(in_wave_data_dim->Reset(), "unable to reset WAV_DATA_DIM");
int nsamples = chunk_byte_size / sizeof(float);
FAIL_IF_ERR(in_wave_data_dim->SetRaw(reinterpret_cast<uint8_t*>(&nsamples),
sizeof(int32_t)),
"unable to set data for WAV_DATA_DIM");
total_audio_ += (static_cast<double>(nsamples) / 16000.); // TODO freq
double start = gettime_monotonic();
FAIL_IF_ERR(context.AsyncRun([corr_id, end_of_sequence, start, this](
nic::InferContext* ctx,
const std::shared_ptr<nic::InferContext::Request>& request) {
if (end_of_sequence) {
double elapsed = gettime_monotonic() - start;
std::string out;
std::map<std::string, std::unique_ptr<nic::InferContext::Result>> results;
ctx->GetAsyncRunResults(request, &results);
if (results.size() != 1) {
std::cerr << "Warning: Could not read output for corr_id " << corr_id
<< std::endl;
} else {
FAIL_IF_ERR(results["TEXT"]->GetRawAtCursor(0, &out),
"unable to get TEXT output");
if (print_results_) {
std::lock_guard<std::mutex> lk(stdout_m_);
std::cout << "CORR_ID " << corr_id << "\t\t" << out << std::endl;
}
{
std::lock_guard<std::mutex> lk(results_m_);
results_.insert({corr_id, {std::move(out), elapsed}});
}
}
n_in_flight_.fetch_sub(1, std::memory_order_relaxed);
}
}),
"unable to run model");
}
void TRTISASRClient::WaitForCallbacks() {
int n;
while ((n = n_in_flight_.load(std::memory_order_consume))) {
usleep(1000);
}
}
void TRTISASRClient::PrintStats() {
double now = gettime_monotonic();
double diff = now - started_at_;
double rtf = total_audio_ / diff;
std::cout << "Throughput:\t" << rtf << " RTFX" << std::endl;
std::vector<double> latencies;
{
std::lock_guard<std::mutex> lk(results_m_);
latencies.reserve(results_.size());
for (auto& result : results_) latencies.push_back(result.second.latency);
}
std::sort(latencies.begin(), latencies.end());
double nresultsf = static_cast<double>(latencies.size());
size_t per90i = static_cast<size_t>(std::floor(90. * nresultsf / 100.));
size_t per95i = static_cast<size_t>(std::floor(95. * nresultsf / 100.));
size_t per99i = static_cast<size_t>(std::floor(99. * nresultsf / 100.));
double lat_90 = latencies[per90i];
double lat_95 = latencies[per95i];
double lat_99 = latencies[per99i];
double avg = std::accumulate(latencies.begin(), latencies.end(), 0.0) /
latencies.size();
std::cout << std::setprecision(3);
std::cout << "Latencies:\t90\t\t95\t\t99\t\tAvg\n";
std::cout << "\t\t" << lat_90 << "\t\t" << lat_95 << "\t\t" << lat_99
<< "\t\t" << avg << std::endl;
}
TRTISASRClient::TRTISASRClient(const std::string& url,
const std::string& model_name,
const int ncontextes, bool print_results)
: url_(url),
model_name_(model_name),
ncontextes_(ncontextes),
print_results_(print_results) {
ncontextes_ = std::max(ncontextes_, 1);
for (int i = 0; i < ncontextes_; ++i) CreateClientContext();
std::shared_ptr<nic::InferContext::Input> in_wave_data;
FAIL_IF_ERR(contextes_[0].trtis_context->GetInput("WAV_DATA", &in_wave_data),
"unable to get WAV_DATA");
max_chunk_byte_size_ = in_wave_data->ByteSize();
chunk_buf_.resize(max_chunk_byte_size_);
shape_ = {max_chunk_byte_size_};
n_in_flight_.store(0);
started_at_ = gettime_monotonic();
total_audio_ = 0;
}


@ -0,0 +1,73 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <atomic>
#include <cstdint>
#include <memory>
#include <mutex>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>
#include "request_grpc.h"
#ifndef TRTIS_KALDI_ASR_CLIENT_H_
#define TRTIS_KALDI_ASR_CLIENT_H_
namespace ni = nvidia::inferenceserver;
namespace nic = nvidia::inferenceserver::client;
// time with arbitrary reference
double inline gettime_monotonic() {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
double time = ts.tv_sec;
time += (double)(ts.tv_nsec) / 1e9;
return time;
}
class TRTISASRClient {
struct ClientContext {
std::unique_ptr<nic::InferContext> trtis_context;
};
std::string url_;
std::string model_name_;
std::vector<ClientContext> contextes_;
int ncontextes_;
std::vector<uint8_t> chunk_buf_;
std::vector<int64_t> shape_;
int max_chunk_byte_size_;
std::atomic<int> n_in_flight_;
double started_at_;
double total_audio_;
bool print_results_;
std::mutex stdout_m_;
struct Result {
std::string text;
double latency;
};
std::unordered_map<ni::CorrelationID, Result> results_;
std::mutex results_m_;
public:
void CreateClientContext();
void SendChunk(uint64_t corr_id, bool start_of_sequence, bool end_of_sequence,
float* chunk, int chunk_byte_size);
void WaitForCallbacks();
void PrintStats();
TRTISASRClient(const std::string& url, const std::string& model_name,
const int ncontextes, bool print_results);
};
#endif // TRTIS_KALDI_ASR_CLIENT_H_


@ -0,0 +1,218 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
#include <atomic>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "asr_client_imp.h"
#include "feat/wave-reader.h" // to read the wav.scp
#include "util/kaldi-table.h"
using kaldi::BaseFloat;
void Usage(char** argv, const std::string& msg = std::string()) {
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: scripts/docker/launch_client.sh [options]" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-i <Number of iterations on the dataset>" << std::endl;
std::cerr << "\t-c <Number of parallel audio channels>" << std::endl;
std::cerr << "\t-a <Path to the scp dataset file>" << std::endl;
std::cerr << "\t-l <Maximum number of samples per chunk. Must correspond to "
"the server config>"
<< std::endl;
std::cerr << "\t-u <URL for inference service and its gRPC port>"
<< std::endl;
std::cerr << "\t-o : Only feed each channel at realtime speed. Simulates "
"online clients."
<< std::endl;
std::cerr << "\t-p : Print text outputs" << std::endl;
std::cerr << std::endl;
exit(1);
}
int main(int argc, char** argv) {
std::cout << "\n";
std::cout << "==================================================\n"
<< "============= TRTIS Kaldi ASR Client =============\n"
<< "==================================================\n"
<< std::endl;
// kaldi namespace TODO
using namespace kaldi;
typedef kaldi::int32 int32;
std::string url = "localhost:8001";
std::string model_name = "kaldi_online";
std::string wav_rspecifier =
"scp:/data/datasets/LibriSpeech/test_clean/wav_conv.scp";
int chunk_length = 8160;
size_t nchannels = 1000;
int niterations = 5;
bool verbose = false;
float samp_freq = 16000;
int ncontextes = 10;
bool online = false;
bool print_results = false;
// Parse commandline...
int opt;
while ((opt = getopt(argc, argv, "va:u:i:c:ophl:")) != -1) {
switch (opt) {
case 'i':
niterations = std::atoi(optarg);
break;
case 'c':
nchannels = std::atoi(optarg);
break;
case 'a':
wav_rspecifier = optarg;
break;
case 'u':
url = optarg;
break;
case 'v':
verbose = true;
break;
case 'o':
online = true;
break;
case 'p':
print_results = true;
break;
case 'l':
chunk_length = std::atoi(optarg);
break;
case 'h':
case '?':
Usage(argv);
break;
}
}
if (niterations <= 0) Usage(argv, "number of iterations must be > 0");
if (nchannels <= 0) Usage(argv, "number of audio channels must be > 0");
if (chunk_length <= 0) Usage(argv, "chunk length must be > 0");
std::cout << "Configuration:" << std::endl;
std::cout << std::endl;
std::cout << "Number of iterations\t\t: " << niterations << std::endl;
std::cout << "Number of parallel channels\t: " << nchannels << std::endl;
std::cout << "Server URL\t\t\t: " << url << std::endl;
std::cout << "Print results\t\t\t: " << (print_results ? "Yes" : "No")
<< std::endl;
std::cout << "Online - Realtime I/O\t\t: " << (online ? "Yes" : "No")
<< std::endl;
std::cout << std::endl;
float chunk_seconds = (double)chunk_length / samp_freq;
// need to read wav files
SequentialTableReader<WaveHolder> wav_reader(wav_rspecifier);
std::atomic<uint64_t> correlation_id;
correlation_id.store(1); // 0 = no correlation
double total_audio = 0;
// pre-loading data
// we don't want to measure I/O
std::vector<std::shared_ptr<WaveData>> all_wav;
{
std::cout << "Loading eval dataset..." << std::flush;
for (; !wav_reader.Done(); wav_reader.Next()) {
std::string utt = wav_reader.Key();
std::shared_ptr<WaveData> wave_data = std::make_shared<WaveData>();
wave_data->Swap(&wav_reader.Value());
all_wav.push_back(wave_data);
total_audio += wave_data->Duration();
}
std::cout << "done" << std::endl;
}
struct Stream {
std::shared_ptr<WaveData> wav;
ni::CorrelationID corr_id;
int offset;
float send_next_chunk_at;
std::atomic<bool> received_output;
Stream(const std::shared_ptr<WaveData>& _wav, ni::CorrelationID _corr_id)
: wav(_wav), corr_id(_corr_id), offset(0), received_output(true) {
send_next_chunk_at = gettime_monotonic();
}
};
std::cout << "Opening GRPC contextes..." << std::flush;
TRTISASRClient asr_client(url, model_name, ncontextes, print_results);
std::cout << "done" << std::endl;
std::cout << "Streaming utterances..." << std::flush;
std::vector<std::unique_ptr<Stream>> curr_tasks, next_tasks;
curr_tasks.reserve(nchannels);
next_tasks.reserve(nchannels);
size_t all_wav_i = 0;
size_t all_wav_max = all_wav.size() * niterations;
while (true) {
while (curr_tasks.size() < nchannels && all_wav_i < all_wav_max) {
// Creating new tasks
uint64_t corr_id = correlation_id.fetch_add(1);
std::unique_ptr<Stream> ptr(new Stream(all_wav[all_wav_i%(all_wav.size())], corr_id));
curr_tasks.emplace_back(std::move(ptr));
++all_wav_i;
}
// If still empty, done
if (curr_tasks.empty()) break;
for (size_t itask = 0; itask < curr_tasks.size(); ++itask) {
Stream& task = *(curr_tasks[itask]);
SubVector<BaseFloat> data(task.wav->Data(), 0);
int32 samp_offset = task.offset;
int32 nsamp = data.Dim();
int32 samp_remaining = nsamp - samp_offset;
int32 num_samp =
chunk_length < samp_remaining ? chunk_length : samp_remaining;
bool is_last_chunk = (chunk_length >= samp_remaining);
SubVector<BaseFloat> wave_part(data, samp_offset, num_samp);
bool is_first_chunk = (samp_offset == 0);
if (online) {
double now = gettime_monotonic();
double wait_for = task.send_next_chunk_at - now;
if (wait_for > 0) usleep(wait_for * 1e6);
}
asr_client.SendChunk(task.corr_id, is_first_chunk, is_last_chunk,
wave_part.Data(), wave_part.SizeInBytes());
task.send_next_chunk_at += chunk_seconds;
if (verbose)
std::cout << "Sending correlation_id=" << task.corr_id
<< " chunk offset=" << num_samp << std::endl;
task.offset += num_samp;
if (!is_last_chunk) next_tasks.push_back(std::move(curr_tasks[itask]));
}
curr_tasks.swap(next_tasks);
next_tasks.clear();
// Showing activity if necessary
if (!print_results && !verbose) std::cout << "." << std::flush;
}
std::cout << "done" << std::endl;
std::cout << "Waiting for all results..." << std::flush;
asr_client.WaitForCallbacks();
std::cout << "done" << std::endl;
asr_client.PrintStats();
return 0;
}


@ -0,0 +1,149 @@
name: "kaldi_online"
platform: "custom"
default_model_filename: "libkaldi-trtisbackend.so"
max_batch_size: 2200
parameters: {
key: "mfcc_filename"
value: {
string_value:"/data/models/LibriSpeech/conf/mfcc.conf"
}
}
parameters: {
key: "ivector_filename"
value: {
string_value:"/data/models/LibriSpeech/conf/ivector_extractor.conf"
}
}
parameters: {
key: "nnet3_rxfilename"
value: {
string_value: "/data/models/LibriSpeech/final.mdl"
}
}
parameters: {
key: "fst_rxfilename"
value: {
string_value: "/data/models/LibriSpeech/HCLG.fst"
}
}
parameters: {
key: "word_syms_rxfilename"
value: {
string_value:"/data/models/LibriSpeech/words.txt"
}
}
parameters: [{
key: "beam"
value: {
string_value:"10"
}
},{
key: "num_worker_threads"
value: {
string_value:"40"
}
},
{
key: "max_execution_batch_size"
value: {
string_value:"512"
}
}]
parameters: {
key: "lattice_beam"
value: {
string_value:"7"
}
}
parameters: {
key: "max_active"
value: {
string_value:"10000"
}
}
parameters: {
key: "frame_subsampling_factor"
value: {
string_value:"3"
}
}
parameters: {
key: "acoustic_scale"
value: {
string_value:"1.0"
}
}
sequence_batching {
max_sequence_idle_microseconds:5000000
control_input [
{
name: "START"
control [
{
kind: CONTROL_SEQUENCE_START
int32_false_true: [ 0, 1 ]
}
]
},
{
name: "READY"
control [
{
kind: CONTROL_SEQUENCE_READY
int32_false_true: [ 0, 1 ]
}
]
},
{
name: "END"
control [
{
kind: CONTROL_SEQUENCE_END
int32_false_true: [ 0, 1 ]
}
]
},
{
name: "CORRID"
control [
{
kind: CONTROL_SEQUENCE_CORRID
data_type: TYPE_UINT64
}
]
}
]
oldest {
max_candidate_sequences:2200
preferred_batch_size:[256,512]
max_queue_delay_microseconds:1000
}
},
input [
{
name: "WAV_DATA"
data_type: TYPE_FP32
dims: [ 8160 ]
},
{
name: "WAV_DATA_DIM"
data_type: TYPE_INT32
dims: [ 1 ]
}
]
output [
{
name: "TEXT"
data_type: TYPE_STRING
dims: [ 1 ]
}
]
instance_group [
{
count: 2
kind: KIND_GPU
}
]


@ -0,0 +1,17 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker build . -f Dockerfile --rm -t trtis_kaldi_server
docker build . -f Dockerfile.client --rm -t trtis_kaldi_client


@ -0,0 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
/workspace/scripts/docker/prepare_data.sh
chown -R $1:$2 /data/
mv /data/* /mnt/data/


@ -0,0 +1,22 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker run --rm -it \
--net=host \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-v $PWD/data:/data \
trtis_kaldi_client install/bin/kaldi_asr_parallel_client $@


@ -0,0 +1,24 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Start TRTIS server container for download - need some kaldi tools
nvidia-docker run --rm \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-v $PWD/data:/mnt/data \
trtis_kaldi_server /workspace/scripts/docker/dataset_setup.sh $(id -u) $(id -g)
# --user $(id -u):$(id -g) \


@ -0,0 +1,30 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
NV_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-"0"}
# Start TRTIS server
nvidia-docker run --rm -it \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-p8000:8000 \
-p8001:8001 \
-p8002:8002 \
--name trt_server_asr \
-e NVIDIA_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES \
-v $PWD/data:/data \
-v $PWD/model-repo:/mnt/model-repo \
trtis_kaldi_server trtserver --model-repo=/workspace/model-repo/


@ -0,0 +1,89 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
WORKSPACE=/data/
KALDI_ROOT=/opt/kaldi/
model=LibriSpeech
data=${1:-$WORKSPACE/data/}
datasets=$WORKSPACE/datasets/
models=$WORKSPACE/models/
# base url for downloads.
data_url=www.openslr.org/resources/12
lm_url=www.openslr.org/resources/11
mfccdir=mfcc
mkdir -p $data/$model
mkdir -p $models/$model
mkdir -p $datasets/$model
pushd $KALDI_ROOT/egs/librispeech/s5
. ./cmd.sh
. ./path.sh
. parse_options.sh
# you might not want to do this for interactive shells.
set -e
if [[ "$SKIP_DATA_DOWNLOAD" -ne "1" ]]; then
echo ----------- Fetching dataset -----------
# download the data. Note: we're using the 100 hour setup for
# now; later in the script we'll download more and use it to train neural
# nets.
for part in test-clean test-other; do
local/download_and_untar.sh $data $data_url $part
done
fi
# format the data as Kaldi data directories
echo ----------- Preprocessing dataset -----------
for part in test-clean test-other; do
# use underscore-separated names in data directories.
local/data_prep.sh $data/$model/$part $datasets/$model/$(echo $part | sed s/-/_/g)
# convert the manifests
pushd $datasets/$model/$(echo $part | sed s/-/_/g)
#sed -i 's@workspace@'"${WORKSPACE}"'@' wav.scp
(cat wav.scp | awk '{print $1" "$6}' | sed 's/\.flac/\.wav/g' > wav_conv.scp)
popd
done
if [[ "$SKIP_FLAC2WAV" -ne "1" ]]; then
# Convert flac files to wavs
for flac in $(find $data/$model -name "*.flac"); do
wav=$(echo $flac | sed 's/flac/wav/g')
sox $flac -r 16000 -b 16 $wav
done
echo "Converted flac to wav."
fi
popd >&/dev/null
if [[ "$SKIP_MODEL_DOWNLOAD" -ne "1" ]]; then
echo ----------- Fetching trained model -----------
pushd $models >&/dev/null
wget https://github.com/ryanleary/kaldi-test/releases/download/v0.0/LibriSpeech-trained.tgz -O LibriSpeech-trained.tgz
tar -xzf LibriSpeech-trained.tgz -C $model
cd $model/conf/
find . -name "*.conf" -exec sed -i 's@workspace@'"${WORKSPACE}"'@' {} \;
popd >&/dev/null
fi


@ -0,0 +1,22 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
if [ -d "/mnt/model-repo/kaldi_online" ]; then
ln -s /mnt/model-repo/kaldi_online/config.pbtxt /workspace/model-repo/kaldi_online/
fi
/opt/tensorrtserver/nvidia_entrypoint.sh $@


@ -0,0 +1,30 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
if [[ "$(docker ps | grep trtis_kaldi_server | wc -l)" == "0" ]]; then
printf "\nThe TensorRT Inference Server is currently not running. Please run scripts/docker/launch_server.sh\n\n"
exit 1
fi
printf "\nOffline benchmarks:\n"
scripts/docker/launch_client.sh -i 5 -c 1000
printf "\nOnline benchmarks:\n"
scripts/docker/launch_client.sh -i 10 -c 700 -o
scripts/docker/launch_client.sh -i 10 -c 400 -o


@ -0,0 +1,31 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
if [[ "$(docker ps | grep trtis_kaldi_server | wc -l)" == "0" ]]; then
printf "\nThe TensorRT Inference Server is currently not running. Please run scripts/docker/launch_server.sh\n\n"
exit 1
fi
printf "\nOffline benchmarks:\n"
scripts/docker/launch_client.sh -i 5 -c 2000
printf "\nOnline benchmarks:\n"
scripts/docker/launch_client.sh -i 10 -c 1500 -o
scripts/docker/launch_client.sh -i 10 -c 1000 -o
scripts/docker/launch_client.sh -i 5 -c 800 -o


@ -0,0 +1,5 @@
.PHONY: all
all: kaldibackend
kaldibackend: kaldi-backend.cc kaldi-backend-utils.cc
g++ -fpic -shared -std=c++11 -o libkaldi-trtisbackend.so kaldi-backend.cc kaldi-backend-utils.cc -Icustom-backend-sdk/include custom-backend-sdk/lib/libcustombackend.a -I/opt/kaldi/src/ -I/usr/local/cuda/include -I/opt/kaldi/tools/openfst/include/ -L/opt/kaldi/src/lib/ -lkaldi-cudadecoder


@ -0,0 +1,155 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "kaldi-backend-utils.h"
namespace nvidia {
namespace inferenceserver {
namespace custom {
namespace kaldi_cbe {
int GetInputTensor(CustomGetNextInputFn_t input_fn, void* input_context,
const char* name, const size_t expected_byte_size,
std::vector<uint8_t>* input, const void** out) {
input->clear(); // reset buffer
// The values for an input tensor are not necessarily in one
// contiguous chunk, so we might copy the chunks into 'input' vector.
// If possible, we use the data in place
uint64_t total_content_byte_size = 0;
while (true) {
const void* content;
uint64_t content_byte_size = expected_byte_size - total_content_byte_size;
if (!input_fn(input_context, name, &content, &content_byte_size)) {
return kInputContents;
}
// If 'content' returns nullptr we have all the input.
if (content == nullptr) break;
// If the total amount of content received exceeds what we expect
// then something is wrong.
total_content_byte_size += content_byte_size;
if (total_content_byte_size > expected_byte_size)
return kInputSize;
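// Fast path: the whole tensor arrived as a single contiguous chunk, so it can
// be exposed in place without copying.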
if (content_byte_size == expected_byte_size) {
*out = content;
return kSuccess;
}
input->insert(input->end(), static_cast<const uint8_t*>(content),
static_cast<const uint8_t*>(content) + content_byte_size);
}
// Make sure we end up with exactly the amount of input we expect.
if (total_content_byte_size != expected_byte_size) {
return kInputSize;
}
*out = input->data();
return kSuccess;
}
void LatticeToString(fst::SymbolTable& word_syms,
const kaldi::CompactLattice& dlat, std::string* out_str) {
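// Reduce the lattice to its single best path, then map each word id to its
// text form using the symbol table.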
kaldi::CompactLattice best_path_clat;
kaldi::CompactLatticeShortestPath(dlat, &best_path_clat);
kaldi::Lattice best_path_lat;
fst::ConvertLattice(best_path_clat, &best_path_lat);
std::vector<int32> alignment;
std::vector<int32> words;
kaldi::LatticeWeight weight;
fst::GetLinearSymbolSequence(best_path_lat, &alignment, &words, &weight);
std::ostringstream oss;
for (size_t i = 0; i < words.size(); i++) {
std::string s = word_syms.Find(words[i]);
if (s == "") std::cerr << "Word-id " << words[i] << " not in symbol table.";
oss << s << " ";
}
*out_str = std::move(oss.str());
}
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
std::string* param) {
auto it = model_config_.parameters().find(key);
if (it == model_config_.parameters().end()) {
std::cerr << "Parameter \"" << key
<< "\" missing from config file. Exiting." << std::endl;
return kInvalidModelConfig;
}
*param = it->second.string_value();
return kSuccess;
}
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
int* param) {
std::string tmp;
int err = ReadParameter(model_config_, key, &tmp);
*param = std::stoi(tmp);
return err;
}
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
float* param) {
std::string tmp;
int err = ReadParameter(model_config_, key, &tmp);
*param = std::stof(tmp);
return err;
}
const char* CustomErrorString(int errcode) {
switch (errcode) {
case kSuccess:
return "success";
case kInvalidModelConfig:
return "invalid model configuration";
case kGpuNotSupported:
return "execution on GPU not supported";
case kSequenceBatcher:
return "model configuration must configure sequence batcher";
case kModelControl:
return "'START', 'END', 'CORRID' and 'READY' must be configured as the control inputs";
case kInputOutput:
return "model must have two 1-D inputs (WAV_DATA, WAV_DATA_DIM) and one 1-D output";
case kInputName:
return "model inputs must be named 'WAV_DATA' and 'WAV_DATA_DIM'";
case kOutputName:
return "model output must be named 'TEXT'";
case kInputOutputDataType:
return "model input or output has an unexpected data_type";
case kInputContents:
return "unable to get input tensor values";
case kInputSize:
return "unexpected size for input tensor";
case kOutputBuffer:
return "unable to get buffer for output tensor values";
case kBatchTooBig:
return "unable to execute batch larger than max-batch-size";
case kTimesteps:
return "unable to execute more than 1 timestep at a time";
case kChunkTooBig:
return "a chunk cannot contain more samples than the WAV_DATA dimension";
default:
break;
}
return "unknown error";
}
}  // namespace kaldi_cbe
}  // namespace custom
}  // namespace inferenceserver
}  // namespace nvidia

View file

@ -0,0 +1,66 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lat/lattice-functions.h"
#include "src/core/model_config.h"
#include "src/core/model_config.pb.h"
#include "src/custom/sdk/custom_instance.h"
namespace nvidia {
namespace inferenceserver {
namespace custom {
namespace kaldi_cbe {
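// Error codes returned by the backend; CustomErrorString() maps each one to a
// human-readable message.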
enum ErrorCodes {
kSuccess,
kUnknown,
kInvalidModelConfig,
kGpuNotSupported,
kSequenceBatcher,
kModelControl,
kInputOutput,
kInputName,
kOutputName,
kInputOutputDataType,
kInputContents,
kInputSize,
kOutputBuffer,
kBatchTooBig,
kTimesteps,
kChunkTooBig
};
int GetInputTensor(CustomGetNextInputFn_t input_fn, void* input_context,
const char* name, const size_t expected_byte_size,
std::vector<uint8_t>* input, const void** out);
void LatticeToString(fst::SymbolTable& word_syms,
const kaldi::CompactLattice& dlat, std::string* out_str);
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
std::string* param);
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
int* param);
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
float* param);
const char* CustomErrorString(int errcode);
}  // namespace kaldi_cbe
}  // namespace custom
}  // namespace inferenceserver
}  // namespace nvidia

View file

@ -0,0 +1,401 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "kaldi-backend.h"
#include "kaldi-backend-utils.h"
namespace nvidia {
namespace inferenceserver {
namespace custom {
namespace kaldi_cbe {
Context::Context(const std::string& instance_name,
const ModelConfig& model_config, const int gpu_device)
: instance_name_(instance_name),
model_config_(model_config),
gpu_device_(gpu_device),
// TRTIS's max_batch_size maps to Kaldi's number of decoder channels
num_channels_(model_config_.max_batch_size()),
int32_byte_size_(GetDataTypeByteSize(TYPE_INT32)),
int64_byte_size_(GetDataTypeByteSize(TYPE_INT64)) {}
Context::~Context() { delete word_syms_; }
int Context::ReadModelParameters() {
// Reading config
float beam, lattice_beam;
int max_active;
int frame_subsampling_factor;
float acoustic_scale;
int num_worker_threads;
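// Read every required parameter from the model configuration; the || chain
// short-circuits at the first lookup that fails, leaving err non-zero.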
int err =
ReadParameter(model_config_, "mfcc_filename",
&batched_decoder_config_.feature_opts.mfcc_config) ||
ReadParameter(
model_config_, "ivector_filename",
&batched_decoder_config_.feature_opts.ivector_extraction_config) ||
ReadParameter(model_config_, "beam", &beam) ||
ReadParameter(model_config_, "lattice_beam", &lattice_beam) ||
ReadParameter(model_config_, "max_active", &max_active) ||
ReadParameter(model_config_, "frame_subsampling_factor",
&frame_subsampling_factor) ||
ReadParameter(model_config_, "acoustic_scale", &acoustic_scale) ||
ReadParameter(model_config_, "nnet3_rxfilename", &nnet3_rxfilename_) ||
ReadParameter(model_config_, "fst_rxfilename", &fst_rxfilename_) ||
ReadParameter(model_config_, "word_syms_rxfilename",
&word_syms_rxfilename_) ||
ReadParameter(model_config_, "num_worker_threads", &num_worker_threads) ||
ReadParameter(model_config_, "max_execution_batch_size",
&max_batch_size_);
if (err) return err;
max_batch_size_ = std::max<int>(max_batch_size_, 1);
num_channels_ = std::max<int>(num_channels_, 1);
// Sanity checks
if (beam <= 0) return kInvalidModelConfig;
if (lattice_beam <= 0) return kInvalidModelConfig;
if (max_active <= 0) return kInvalidModelConfig;
if (acoustic_scale <= 0) return kInvalidModelConfig;
if (num_worker_threads <= 0) return kInvalidModelConfig;
if (num_channels_ <= max_batch_size_) return kInvalidModelConfig;
batched_decoder_config_.compute_opts.frame_subsampling_factor =
frame_subsampling_factor;
batched_decoder_config_.compute_opts.acoustic_scale = acoustic_scale;
batched_decoder_config_.decoder_opts.default_beam = beam;
batched_decoder_config_.decoder_opts.lattice_beam = lattice_beam;
batched_decoder_config_.decoder_opts.max_active = max_active;
batched_decoder_config_.num_worker_threads = num_worker_threads;
batched_decoder_config_.max_batch_size = max_batch_size_;
batched_decoder_config_.num_channels = num_channels_;
auto feature_config = batched_decoder_config_.feature_opts;
kaldi::OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
sample_freq_ = feature_info.mfcc_opts.frame_opts.samp_freq;
BaseFloat frame_shift = feature_info.FrameShiftInSeconds();
seconds_per_chunk_ = chunk_num_samps_ / sample_freq_;
int samp_per_frame = static_cast<int>(sample_freq_ * frame_shift);
float n_input_framesf = static_cast<float>(chunk_num_samps_) / samp_per_frame;
bool is_integer = (n_input_framesf == std::floor(n_input_framesf));
if (!is_integer) {
std::cerr << "WAVE_DATA dim must be a multiple fo samples per frame ("
<< samp_per_frame << ")" << std::endl;
return kInvalidModelConfig;
}
int n_input_frames = static_cast<int>(std::floor(n_input_framesf));
batched_decoder_config_.compute_opts.frames_per_chunk = n_input_frames;
return kSuccess;
}
int Context::InitializeKaldiPipeline() {
batch_corr_ids_.reserve(max_batch_size_);
batch_wave_samples_.reserve(max_batch_size_);
batch_is_last_chunk_.reserve(max_batch_size_);
wave_byte_buffers_.resize(max_batch_size_);
output_shape_ = {1, 1};
kaldi::CuDevice::Instantiate()
.SelectAndInitializeGpuIdWithExistingCudaContext(gpu_device_);
kaldi::CuDevice::Instantiate().AllowMultithreading();
// Loading models
{
bool binary;
kaldi::Input ki(nnet3_rxfilename_, &binary);
trans_model_.Read(ki.Stream(), binary);
am_nnet_.Read(ki.Stream(), binary);
kaldi::nnet3::SetBatchnormTestMode(true, &(am_nnet_.GetNnet()));
kaldi::nnet3::SetDropoutTestMode(true, &(am_nnet_.GetNnet()));
kaldi::nnet3::CollapseModel(kaldi::nnet3::CollapseModelConfig(),
&(am_nnet_.GetNnet()));
}
fst::Fst<fst::StdArc>* decode_fst = fst::ReadFstKaldiGeneric(fst_rxfilename_);
cuda_pipeline_.reset(
new kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipeline(
batched_decoder_config_, *decode_fst, am_nnet_, trans_model_));
delete decode_fst;
// Loading word syms for text output
if (word_syms_rxfilename_ != "") {
if (!(word_syms_ = fst::SymbolTable::ReadText(word_syms_rxfilename_))) {
std::cerr << "Could not read symbol table from file "
<< word_syms_rxfilename_;
return kInvalidModelConfig;
}
}
chunk_num_samps_ = cuda_pipeline_->GetNSampsPerChunk();
chunk_num_bytes_ = chunk_num_samps_ * sizeof(BaseFloat);
return kSuccess;
}
int Context::Init() {
return InputOutputSanityCheck() || ReadModelParameters() ||
InitializeKaldiPipeline();
}
bool Context::CheckPayloadError(const CustomPayload& payload) {
int err = payload.error_code;
if (err) std::cerr << "Error: " << CustomErrorString(err) << std::endl;
return (err != 0);
}
int Context::Execute(const uint32_t payload_cnt, CustomPayload* payloads,
CustomGetNextInputFn_t input_fn,
CustomGetOutputFn_t output_fn) {
// kaldi::Timer timer;
if (payload_cnt > num_channels_) return kBatchTooBig;
// Each payload carries one chunk of one sequence; the sequence batcher
// supplies the START/END/READY/CORRID control tensors along with the audio
for (uint32_t pidx = 0; pidx < payload_cnt; ++pidx) {
if (batch_corr_ids_.size() == max_batch_size_) FlushBatch();
CustomPayload& payload = payloads[pidx];
if (payload.batch_size != 1) payload.error_code = kTimesteps;
if (CheckPayloadError(payload)) continue;
// Get input tensors
int32_t start, dim, end, ready;
CorrelationID corr_id;
const BaseFloat* wave_buffer;
payload.error_code = GetSequenceInput(
input_fn, payload.input_context, &corr_id, &start, &ready, &dim, &end,
&wave_buffer, &wave_byte_buffers_[pidx]);
if (CheckPayloadError(payload)) continue;
if (!ready) continue;
if (dim > chunk_num_samps_) payload.error_code = kChunkTooBig;
if (CheckPayloadError(payload)) continue;
kaldi::SubVector<BaseFloat> wave_part(wave_buffer, dim);
// Initialize corr_id if first chunk
if (start) cuda_pipeline_->InitCorrID(corr_id);
// Add to batch
batch_corr_ids_.push_back(corr_id);
batch_wave_samples_.push_back(wave_part);
batch_is_last_chunk_.push_back(end);
if (end) {
// If last chunk, set the callback for that seq
cuda_pipeline_->SetLatticeCallback(
corr_id, [this, &output_fn, &payloads, pidx,
corr_id](kaldi::CompactLattice& clat) {
std::string output;
LatticeToString(*word_syms_, clat, &output);
SetOutputTensor(output, output_fn, payloads[pidx]);
});
}
}
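// Decode whatever remains in the current batch, then block until all lattice
// callbacks (and therefore all TEXT outputs) have completed.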
FlushBatch();
cuda_pipeline_->WaitForLatticeCallbacks();
return kSuccess;
}
int Context::FlushBatch() {
if (!batch_corr_ids_.empty()) {
cuda_pipeline_->DecodeBatch(batch_corr_ids_, batch_wave_samples_,
batch_is_last_chunk_);
batch_corr_ids_.clear();
batch_wave_samples_.clear();
batch_is_last_chunk_.clear();
}
return kSuccess;
}
int Context::InputOutputSanityCheck() {
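// Validate that config.pbtxt matches this backend: sequence batching with the
// START/END/CORRID/READY controls, WAV_DATA and WAV_DATA_DIM inputs, and a
// single TEXT output.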
if (!model_config_.has_sequence_batching()) {
return kSequenceBatcher;
}
auto& batcher = model_config_.sequence_batching();
if (batcher.control_input_size() != 4) {
return kModelControl;
}
std::set<std::string> control_input_names;
for (int i = 0; i < 4; ++i)
control_input_names.insert(batcher.control_input(i).name());
if (!(control_input_names.erase("START") &&
control_input_names.erase("END") &&
control_input_names.erase("CORRID") &&
control_input_names.erase("READY"))) {
return kModelControl;
}
if (model_config_.input_size() != 2) {
return kInputOutput;
}
if ((model_config_.input(0).dims().size() != 1) ||
(model_config_.input(0).dims(0) <= 0) ||
(model_config_.input(1).dims().size() != 1) ||
(model_config_.input(1).dims(0) != 1)) {
return kInputOutput;
}
chunk_num_samps_ = model_config_.input(0).dims(0);
chunk_num_bytes_ = chunk_num_samps_ * sizeof(float);
if ((model_config_.input(0).data_type() != DataType::TYPE_FP32) ||
(model_config_.input(1).data_type() != DataType::TYPE_INT32)) {
return kInputOutputDataType;
}
if ((model_config_.input(0).name() != "WAV_DATA") ||
(model_config_.input(1).name() != "WAV_DATA_DIM")) {
return kInputName;
}
if (model_config_.output_size() != 1) {
return kInputOutput;
}
if ((model_config_.output(0).dims().size() != 1) ||
(model_config_.output(0).dims(0) != 1)) {
return kInputOutput;
}
if (model_config_.output(0).data_type() != DataType::TYPE_STRING) {
return kInputOutputDataType;
}
if (model_config_.output(0).name() != "TEXT") {
return kOutputName;
}
return kSuccess;
}
int Context::GetSequenceInput(CustomGetNextInputFn_t& input_fn,
void* input_context, CorrelationID* corr_id,
int32_t* start, int32_t* ready, int32_t* dim,
int32_t* end, const BaseFloat** wave_buffer,
std::vector<uint8_t>* input_buffer) {
int err;
//&input_buffer[0]: char pointer -> alias with any types
// wave_data[0] will holds the struct
// Get start of sequence tensor
const void* out;
err = GetInputTensor(input_fn, input_context, "WAV_DATA_DIM",
int32_byte_size_, &byte_buffer_, &out);
if (err != kSuccess) return err;
*dim = *reinterpret_cast<const int32_t*>(out);
err = GetInputTensor(input_fn, input_context, "END", int32_byte_size_,
&byte_buffer_, &out);
if (err != kSuccess) return err;
*end = *reinterpret_cast<const int32_t*>(out);
err = GetInputTensor(input_fn, input_context, "START", int32_byte_size_,
&byte_buffer_, &out);
if (err != kSuccess) return err;
*start = *reinterpret_cast<const int32_t*>(out);
err = GetInputTensor(input_fn, input_context, "READY", int32_byte_size_,
&byte_buffer_, &out);
if (err != kSuccess) return err;
*ready = *reinterpret_cast<const int32_t*>(out);
err = GetInputTensor(input_fn, input_context, "CORRID", int64_byte_size_,
&byte_buffer_, &out);
if (err != kSuccess) return err;
*corr_id = *reinterpret_cast<const CorrelationID*>(out);
// Get pointer to speech tensor
err = GetInputTensor(input_fn, input_context, "WAV_DATA", chunk_num_bytes_,
input_buffer, &out);
if (err != kSuccess) return err;
*wave_buffer = reinterpret_cast<const BaseFloat*>(out);
return kSuccess;
}
int Context::SetOutputTensor(const std::string& output,
CustomGetOutputFn_t output_fn,
CustomPayload payload) {
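// The TEXT output is TYPE_STRING: the payload is serialized as a 4-byte length
// followed by the raw characters, exactly as written below.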
uint32_t byte_size_with_size_int = output.size() + sizeof(int32);
// std::cout << output << std::endl;
// copy output from best_path to output buffer
if ((payload.error_code == 0) && (payload.output_cnt > 0)) {
const char* output_name = payload.required_output_names[0];
// output buffer
void* obuffer;
if (!output_fn(payload.output_context, output_name, output_shape_.size(),
&output_shape_[0], byte_size_with_size_int, &obuffer)) {
payload.error_code = kOutputBuffer;
return payload.error_code;
}
// If no error but the 'obuffer' is returned as nullptr, then
// skip writing this output.
if (obuffer != nullptr) {
// std::cout << "writing " << output << std::endl;
int32* buffer_as_int = reinterpret_cast<int32*>(obuffer);
buffer_as_int[0] = output.size();
memcpy(&buffer_as_int[1], output.data(), output.size());
}
}
return kSuccess;
}
/////////////
extern "C" {
int CustomInitialize(const CustomInitializeData* data, void** custom_context) {
// Convert the serialized model config to a ModelConfig object.
ModelConfig model_config;
if (!model_config.ParseFromString(std::string(
data->serialized_model_config, data->serialized_model_config_size))) {
return kInvalidModelConfig;
}
// Create the context and validate that the model configuration is
// something that we can handle.
Context* context = new Context(std::string(data->instance_name), model_config,
data->gpu_device_id);
int err = context->Init();
if (err != kSuccess) {
return err;
}
*custom_context = static_cast<void*>(context);
return kSuccess;
}
int CustomFinalize(void* custom_context) {
if (custom_context != nullptr) {
Context* context = static_cast<Context*>(custom_context);
delete context;
}
return kSuccess;
}
const char* CustomErrorString(void* custom_context, int errcode) {
return CustomErrorString(errcode);
}
int CustomExecute(void* custom_context, const uint32_t payload_cnt,
CustomPayload* payloads, CustomGetNextInputFn_t input_fn,
CustomGetOutputFn_t output_fn) {
if (custom_context == nullptr) {
return kUnknown;
}
Context* context = static_cast<Context*>(custom_context);
return context->Execute(payload_cnt, payloads, input_fn, output_fn);
}
} // extern "C"
}  // namespace kaldi_cbe
}  // namespace custom
}  // namespace inferenceserver
}  // namespace nvidia

View file

@ -0,0 +1,119 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#define HAVE_CUDA 1 // Loading Kaldi headers with GPU
#include <cfloat>
#include <sstream>
#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h"
#include "fstext/fstext-lib.h"
#include "lat/lattice-functions.h"
#include "nnet3/am-nnet-simple.h"
#include "nnet3/nnet-utils.h"
#include "util/kaldi-thread.h"
#include "src/core/model_config.h"
#include "src/core/model_config.pb.h"
#include "src/custom/sdk/custom_instance.h"
using kaldi::BaseFloat;
namespace nvidia {
namespace inferenceserver {
namespace custom {
namespace kaldi_cbe {
// Context object. All state must be kept in this object.
class Context {
public:
Context(const std::string& instance_name, const ModelConfig& config,
const int gpu_device);
virtual ~Context();
// Initialize the context. Validate that the model configuration,
// etc. is something that we can handle.
int Init();
// Perform custom execution on the payloads.
int Execute(const uint32_t payload_cnt, CustomPayload* payloads,
CustomGetNextInputFn_t input_fn, CustomGetOutputFn_t output_fn);
private:
// init kaldi pipeline
int InitializeKaldiPipeline();
int InputOutputSanityCheck();
int ReadModelParameters();
int GetSequenceInput(CustomGetNextInputFn_t& input_fn, void* input_context,
CorrelationID* corr_id, int32_t* start, int32_t* ready,
int32_t* dim, int32_t* end,
const kaldi::BaseFloat** wave_buffer,
std::vector<uint8_t>* input_buffer);
int SetOutputTensor(const std::string& output, CustomGetOutputFn_t output_fn,
CustomPayload payload);
bool CheckPayloadError(const CustomPayload& payload);
int FlushBatch();
// The name of this instance of the backend.
const std::string instance_name_;
// The model configuration.
const ModelConfig model_config_;
// The GPU device ID to execute on or CUSTOM_NO_GPU_DEVICE if should
// execute on CPU.
const int gpu_device_;
// Models paths
std::string nnet3_rxfilename_, fst_rxfilename_;
std::string word_syms_rxfilename_;
// batch_size
int max_batch_size_;
int num_channels_;
int num_worker_threads_;
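// Batch currently being assembled; one entry per chunk queued for DecodeBatch()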
std::vector<CorrelationID> batch_corr_ids_;
std::vector<kaldi::SubVector<kaldi::BaseFloat>> batch_wave_samples_;
std::vector<bool> batch_is_last_chunk_;
BaseFloat sample_freq_, seconds_per_chunk_;
int chunk_num_bytes_, chunk_num_samps_;
// feature_config includes configuration for the iVector adaptation,
// as well as the basic features.
kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipelineConfig
batched_decoder_config_;
std::unique_ptr<kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipeline>
cuda_pipeline_;
// Maintain the state of some shared objects
kaldi::TransitionModel trans_model_;
kaldi::nnet3::AmNnetSimple am_nnet_;
fst::SymbolTable* word_syms_;
const uint64_t int32_byte_size_;
const uint64_t int64_byte_size_;
std::vector<int64_t> output_shape_;
std::vector<uint8_t> byte_buffer_;
std::vector<std::vector<uint8_t>> wave_byte_buffers_;
};
}  // namespace kaldi_cbe
}  // namespace custom
}  // namespace inferenceserver
}  // namespace nvidia

View file

@ -0,0 +1,21 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
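# Export only the TRTIS custom-backend entry points; all other symbols stay
# local to the shared library.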
{
global:
CustomErrorString;
CustomExecute;
CustomFinalize;
CustomInitialize;
local: *;
};