[KaldiASR] Adding KaldiASR custom backend for TRTIS

Przemek Strzelczyk 2020-01-15 17:04:32 +01:00
parent 784eb0d8ca
commit b9b03e1446
28 changed files with 2291 additions and 0 deletions


@ -0,0 +1,3 @@
.git/
data/
kaldi/

Kaldi/SpeechRecognition/.gitignore vendored Normal file

@ -0,0 +1,4 @@
data/*
!data/README.md
.*.swp
.*.swo

Kaldi/SpeechRecognition/.gitmodules vendored Normal file

@ -0,0 +1,55 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvcr.io/nvidia/kaldi:19.12-online-beta-py3 as kb
FROM nvcr.io/nvidia/tensorrtserver:19.12-py3
ENV DEBIAN_FRONTEND=noninteractive
ARG PYVER=3.6
# Kaldi dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
automake \
autoconf \
cmake \
flac \
gawk \
libatlas3-base \
libtool \
python$PYVER \
python$PYVER-dev \
sox \
subversion \
unzip \
bc \
libatlas-base-dev \
zlib1g-dev
RUN mkdir /opt/trtis-kaldi && mkdir -p /workspace/model-repo/kaldi_online/1 && mkdir -p /mnt/model-repo
# Copying static files
COPY scripts /workspace/scripts
# Moving Kaldi to container
COPY --from=kb /opt/kaldi /opt/kaldi
ENV LD_LIBRARY_PATH /opt/kaldi/src/lib/:$LD_LIBRARY_PATH
# Building the custom backend
COPY trtis-kaldi-backend /workspace/trtis-kaldi-backend
#COPY --from=cbe /workspace/install/custom-backend-sdk /workspace/trtis-kaldi-backend/custom-backend-sdk
RUN cd /workspace/trtis-kaldi-backend && wget https://github.com/NVIDIA/tensorrt-inference-server/releases/download/v1.9.0/v1.9.0_ubuntu1804.custombackend.tar.gz -O custom-backend-sdk.tar.gz && tar -xzf custom-backend-sdk.tar.gz
RUN cd /workspace/trtis-kaldi-backend/ && make && cp libkaldi-trtisbackend.so /workspace/model-repo/kaldi_online/1/ && cd - && rm -r /workspace/trtis-kaldi-backend
COPY scripts/nvidia_kaldi_trtis_entrypoint.sh /opt/trtis-kaldi
ENTRYPOINT ["/opt/trtis-kaldi/nvidia_kaldi_trtis_entrypoint.sh"]


@ -0,0 +1,41 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvcr.io/nvidia/kaldi:19.12-online-beta-py3 as kb
FROM nvcr.io/nvidia/tensorrtserver:19.12-py3-clientsdk
ARG PYVER=3.6
# Kaldi dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
automake \
autoconf \
cmake \
flac \
gawk \
libatlas3-base \
libtool \
python$PYVER \
python$PYVER-dev \
sox \
subversion \
unzip \
bc \
libatlas-base-dev \
zlib1g-dev
# Moving Kaldi to container
COPY --from=kb /opt/kaldi /opt/kaldi
ENV LD_LIBRARY_PATH /opt/kaldi/src/lib/:$LD_LIBRARY_PATH
COPY kaldi-asr-client /workspace/src/clients/c++/kaldi-asr-client
RUN echo "add_subdirectory(kaldi-asr-client)" >> "/workspace/src/clients/c++/CMakeLists.txt"
RUN cd /workspace/build/ && make -j16 trtis-clients


@ -0,0 +1,203 @@
Except where otherwise noted, the following license applies to all files in this repo.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 NVIDIA Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@ -0,0 +1,241 @@
# Kaldi ASR Integration With TensorRT Inference Server
This repository provides a Kaldi ASR custom backend for the NVIDIA TensorRT Inference Server (TRTIS). It can be used to demonstrate high-performance online inference on Kaldi ASR models. This includes handling the gRPC communication between the TensorRT Inference Server and clients, and the dynamic batching of inference requests. This repository is tested and maintained by NVIDIA.
## Table Of Contents
- [Table Of Contents](#table-of-contents)
- [Solution overview](#solution-overview)
* [Reference model](#reference-model)
* [Default configuration](#default-configuration)
- [Setup](#setup)
* [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
* [Parameters](#parameters)
* [Model path](#model-path)
* [Model configuration](#model-configuration)
* [Inference engine configuration](#inference-engine-configuration)
* [Inference process](#inference-process)
* [Client command-line parameters](#client-command-line-parameters)
* [Input/Output](#inputoutput)
* [Input](#input)
* [Output](#output)
* [Using a custom Kaldi ASR model](#using-a-custom-kaldi-asr-model)
- [Performance](#performance)
* [Metrics](#metrics)
* [Results](#results)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Solution overview
This repository provides a wrapper around the online GPU-accelerated ASR pipeline from the paper [GPU-Accelerated Viterbi Exact Lattice Decoder for Batched Online and Offline Speech Recognition](https://arxiv.org/abs/1910.10032). That work includes a high-performance implementation of a GPU HMM Decoder, a low-latency Neural Net driver, fast Feature Extraction for preprocessing, and new ASR pipelines tailored for GPUs. These different modules have been integrated into the Kaldi ASR framework.
This repository contains a TensorRT Inference Server custom backend for the Kaldi ASR framework. This custom backend calls the high-performance online GPU pipeline from the Kaldi ASR framework. This TensorRT Inference Server integration provides ease-of-use to Kaldi ASR inference: gRPC streaming server, dynamic sequence batching, and multi-instances support. A client connects to the gRPC server, streams audio by sending chunks to the server, and gets back the inferred text as an answer (see [Input/Output](#input-output)). More information about the TensorRT Inference Server can be found [here](https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/).
This TensorRT Inference Server integration is meant to be used with the LibriSpeech model for demonstration purposes. We include a pre-trained version of this model to allow you to easily test this work (see [Quick Start Guide](#quick-start-guide)). Both the TensorRT Inference Server integration and the underlying Kaldi ASR online GPU pipeline are a work in progress and will support more functionality in the future. For example, online iVectors are not yet supported in the Kaldi ASR GPU online pipeline and are currently replaced by a zero vector (see [Known issues](#known-issues)). Support for a custom Kaldi model is experimental (see [Using a custom Kaldi ASR model](#using-a-custom-kaldi-asr-model)).
### Reference model
A reference model is used by all test scripts and benchmarks presented in this repository to illustrate this solution. We are using the Kaldi ASR `LibriSpeech` recipe, available [here](https://github.com/kaldi-asr/kaldi/blob/master/egs/librispeech/s5). It was trained by NVIDIA and is delivered as a pre-trained model.
### Default configuration
Details about parameters can be found in the [Parameters](#parameters) section.
* `model path`: Configured to use the pretrained LibriSpeech model.
* `beam`: 10
* `lattice_beam`: 7
* `max_active`: 10,000
* `frame_subsampling_factor`: 3
* `acoustic_scale`: 1.0
* `num_worker_threads`: 20
* `max_execution_batch_size`: 256
* `max_batch_size`: 4096
* `instance_group.count`: 2
## Setup
### Requirements
This repository contains Dockerfiles that extend the Kaldi and TensorRT Inference Server NVIDIA GPU Cloud (NGC) containers and encapsulate some dependencies. Aside from these dependencies, ensure you have [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) installed.
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
## Quick Start Guide
1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/Kaldi/SpeechRecognition
```
2. Build the client and server containers.
`scripts/docker/build.sh`
3. Download and set up the pre-trained model and eval dataset.
`scripts/docker/launch_download.sh`
The model and dataset are downloaded to the `data/` folder.
4. Start the server.
`scripts/docker/launch_server.sh`
Once you see the line `Starting Metrics Service at 0.0.0.0:8002`, the server is ready to be used. You can then start the client.
Currently, multi-GPU is not supported. By default, GPU 0 is used. You can select a specific GPU by setting `NVIDIA_VISIBLE_DEVICES`:
`NVIDIA_VISIBLE_DEVICES=<GPUID> scripts/docker/launch_server.sh`
5. Start the client.
The following command streams 1000 parallel audio channels to the server. The `-p` option prints the inferred `TEXT` sent back from the server.
`scripts/docker/launch_client.sh -p`
## Advanced
### Parameters
The configuration is done through the `config.pbtxt` file available in the `model-repo/kaldi_online/` directory. It allows you to specify the following:
#### Model path
The following parameters can be modified if you want to use your own Kaldi model.
* `mfcc_filename`
* `ivector_filename`
* `nnet3_rxfilename`
* `fst_rxfilename`
* `word_syms_rxfilename`
#### Model configuration
The model configuration parameters are passed to the model and have an impact on both accuracy and performance. They are standard Kaldi ASR parameters, meaning you can reuse the values that are currently used in your CPU Kaldi ASR pipeline.
* `beam`
* `lattice_beam`
* `max_active`
* `frame_subsampling_factor`
* `acoustic_scale`
#### Inference engine configuration
The inference engine configuration parameters control the behavior of the inference engine. They impact performance, but not accuracy. A small tuning example follows the list.
* `max_batch_size`: The maximum number of inference channels opened at a given time. If set to `4096`, then one instance will handle at most 4096 concurrent users.
* `num_worker_threads`: The number of CPU threads for the postprocessing CPU tasks, such as lattice determinization and text generation from the lattice.
* `max_execution_batch_size`: The size of one execution batch on the GPU. This parameter should be set as large as necessary to saturate the GPU, but no larger. Larger batches lead to higher throughput, smaller batches to lower latency.
* `input.WAV_DATA.dims`: The maximum number of samples per chunk. The value must be a multiple of `frame_subsampling_factor * chunks_per_frame`.
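If the default execution batch size is too large for your GPU, you can lower it in the model configuration before starting the server. A minimal sketch, assuming the shipped `model-repo/kaldi_online/config.pbtxt`, which sets `max_execution_batch_size` to `512`:
```
# Lower the GPU execution batch size of the Kaldi custom backend
# (from the shipped value of 512 down to 256).
sed -i 's/string_value:"512"/string_value:"256"/' model-repo/kaldi_online/config.pbtxt
```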
### Inference process
Inference is done by simulating concurrent users. Each user is assigned one utterance from the LibriSpeech dataset, streams that utterance by cutting it into chunks, and receives the final `TEXT` output once the last chunk has been sent. A parameter sets the number of active users being simulated in parallel.
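For example, to simulate 800 concurrent online users, each streaming its utterance at real-time speed, you could run the client launcher described in the next section as follows:
```
# 800 parallel audio channels (-c), each fed at real-time speed (-o)
scripts/docker/launch_client.sh -c 800 -o
```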
### Client command-line parameters
The client can be configured through a set of parameters that define its behavior. To see the full list of available options and their descriptions, use the `-h` command-line option. The parameters are:
```
-v
-i <Number of iterations on the dataset>
-c <Number of parallel audio channels>
-a <Path to the scp dataset file>
-l <Maximum number of samples per chunk. Must correspond to the server config>
-u <URL for inference service and its gRPC port>
-o : Only feed each channel at realtime speed. Simulates online clients.
-p : Print text outputs
```
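As an example, the following invocation (a sketch, to be adjusted to your setup) runs two passes over the default LibriSpeech evaluation set with 1000 parallel channels, prints the recognized text, and targets the gRPC endpoint on the local host:
```
# -u: gRPC endpoint, -a: eval dataset (scp rspecifier), -i: iterations,
# -c: parallel audio channels, -p: print the inferred text
scripts/docker/launch_client.sh \
    -u localhost:8001 \
    -a scp:/data/datasets/LibriSpeech/test_clean/wav_conv.scp \
    -i 2 \
    -c 1000 \
    -p
```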
### Input/Output
The API is currently experimental.
#### Input
The server expects chunks of audio, each containing up to `input.WAV_DATA.dims` samples. By default, this corresponds to 510 ms of audio per chunk. The last chunk of a sequence can contain fewer samples than this maximum value.
Each chunk is sent as a float array in the `WAV_DATA` input, with the `WAV_DATA_DIM` input containing the number of samples in that chunk. Flags can be set to declare a chunk as the first or the last chunk of a sequence. Finally, each chunk from a given sequence is associated with a `CorrelationID`, and every chunk belonging to the same sequence must carry the same `CorrelationID`.
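As a sanity check on the numbers above: the provided model configuration uses `input.WAV_DATA.dims = 8160` samples and 16 kHz audio, so a full chunk covers `8160 / 16000 = 0.51` seconds, which is where the 510 ms figure comes from.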
#### Output
Once the server receives the final chunk for a sequence (with the `END` flag set), it generates the output associated with that sequence and sends it back to the client. The end-of-sequence procedure is as follows:
1. Process the last chunk.
2. Flush and process the Neural Net context.
3. Generate the full lattice for the sequence.
4. Determinize the lattice.
5. Find the best path in the lattice.
6. Generate the text output for that best path.
7. Send the text back to the client.
Even though only the best path is used, we still generate a full lattice for benchmarking purposes. Partial results (generated after each timestep) are currently not available, but will be added in a future release.
### Using a custom Kaldi ASR model
Support for Kaldi ASR models that are different from the provided LibriSpeech model is experimental. However, it is possible to modify the [Model Path](#model-path) section of the config file `model-repo/kaldi_online/config.pbtxt` to set up your own model.
The models and Kaldi allocators are currently not shared between instances. This means that if your model is large, you may not have enough GPU memory to store two different instances. If that is the case, you can set `count` to `1` in the `instance_group` section of the config file.
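For example, assuming your model files live under `/data/models/MyModel/` (a placeholder path used only for illustration) and follow the same file layout as the LibriSpeech model, you could point the backend at them by rewriting the directory component of the model path parameters:
```
# Rewrite the model path parameters in config.pbtxt
# (/data/models/MyModel is a placeholder for your own model directory).
sed -i 's@/data/models/LibriSpeech@/data/models/MyModel@g' model-repo/kaldi_online/config.pbtxt
```
Note that this only changes the directory: the individual file names (`mfcc.conf`, `ivector_extractor.conf`, `final.mdl`, `HCLG.fst`, `words.txt`) still need to match your model, so you may prefer to edit the five parameters listed in [Model path](#model-path) by hand.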
## Performance
### Metrics
Throughput is measured using the RTFX metric, defined as `RTFX = (number of seconds of audio inferred) / (compute time in seconds)`. It is the inverse of the RTF (Real Time Factor) metric: `RTFX = 1/RTF`.
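For instance, if the server processes 3,600 seconds of audio in 2 seconds of compute time, the throughput is `RTFX = 3600 / 2 = 1800`.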
Latency is defined as the delay between the availability of the last chunk of audio and the reception of the inferred text. More precisely, it is measured as follows:
1. *Client:* Last audio chunk available
2. ***t0** <- Current time*
3. *Client:* Send last audio chunk
4. *Server:* Compute inference of last chunk
5. *Server:* Generate the raw lattice for the full utterance
6. *Server:* Determinize the raw lattice
7. *Server:* Generate the text output associated with the best path in the determinized lattice
8. *Client:* Receive text output
9. *Client:* Call callback with output
10. ***t1** <- Current time*
The latency is then defined as `latency = t1 - t0`.
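For instance, if the last chunk of an utterance becomes available at `t0 = 100.00 s` on the client clock and the corresponding text output is received at `t1 = 100.35 s`, the reported latency for that utterance is `0.35 s`.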
### Results
Our results were obtained by:
1. Building and starting the server as described in [Quick Start Guide](#quick-start-guide).
2. Running `scripts/run_inference_all_v100.sh` and `scripts/run_inference_all_t4.sh`
| GPU | Realtime I/O | Number of parallel audio channels | Throughput (RTFX) | Latency 90% (s) | Latency 95% (s) | Latency 99% (s) | Latency Avg (s) |
| ------ | ------ | ------ | ------ | ------ | ------ | ------ | ------ |
| V100 | No | 2000 | 1769.8 | N/A | N/A | N/A | N/A |
| V100 | Yes | 1500 | 1220 | 0.424 | 0.473 | 0.758 | 0.345 |
| V100 | Yes | 1000 | 867.4 | 0.358 | 0.405 | 0.707 | 0.276 |
| V100 | Yes | 800 | 647.8 | 0.304 | 0.325 | 0.517 | 0.238 |
| T4 | No | 1000 | 906.7 | N/A | N/A | N/A| N/A |
| T4 | Yes | 700 | 629.6 | 0.629 | 0.782 | 1.01 | 0.463 |
| T4 | Yes | 400 | 373.7 | 0.417 | 0.441 | 0.690 | 0.349 |
## Release notes
### Changelog
January 2020
* Initial release
### Known issues
Only mfcc features are supported at this time. The reference model used in the benchmark scripts requires both mfcc and iVector features to deliver the best accuracy. Support for iVector features will be added in a future release.


@ -0,0 +1,77 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required (VERSION 3.5)
add_executable(kaldi_asr_parallel_client kaldi_asr_parallel_client.cc asr_client_imp.cc)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE request
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE protobuf::libprotobuf
)
target_include_directories(
kaldi_asr_parallel_client
PRIVATE
/opt/kaldi/src/
)
target_include_directories(
kaldi_asr_parallel_client
PRIVATE
/opt/kaldi/tools/openfst-1.6.7/include/
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE /opt/kaldi/src/lib/libkaldi-feat.so
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE /opt/kaldi/src/lib/libkaldi-util.so
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE /opt/kaldi/src/lib/libkaldi-matrix.so
)
target_link_libraries(
kaldi_asr_parallel_client
PRIVATE /opt/kaldi/src/lib/libkaldi-base.so
)
install(
TARGETS kaldi_asr_parallel_client
RUNTIME DESTINATION bin
)


@ -0,0 +1,177 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "asr_client_imp.h"
#include <unistd.h>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <map>
#include <numeric>
#define FAIL_IF_ERR(X, MSG) \
{ \
nic::Error err = (X); \
if (!err.IsOk()) { \
std::cerr << "error: " << (MSG) << ": " << err << std::endl; \
exit(1); \
} \
}
void TRTISASRClient::CreateClientContext() {
contextes_.emplace_back();
ClientContext& client = contextes_.back();
FAIL_IF_ERR(nic::InferGrpcStreamContext::Create(
&client.trtis_context, /*corr_id*/ -1, url_, model_name_,
/*model_version*/ -1,
/*verbose*/ false),
"unable to create context");
}
void TRTISASRClient::SendChunk(ni::CorrelationID corr_id,
bool start_of_sequence, bool end_of_sequence,
float* chunk, int chunk_byte_size) {
ClientContext* client = &contextes_[corr_id % ncontextes_];
nic::InferContext& context = *client->trtis_context;
if (start_of_sequence) n_in_flight_.fetch_add(1, std::memory_order_consume);
// Setting options
std::unique_ptr<nic::InferContext::Options> options;
FAIL_IF_ERR(nic::InferContext::Options::Create(&options),
"unable to create inference options");
options->SetBatchSize(1);
options->SetFlags(0);
options->SetCorrelationId(corr_id);
if (start_of_sequence)
options->SetFlag(ni::InferRequestHeader::FLAG_SEQUENCE_START,
start_of_sequence);
if (end_of_sequence) {
options->SetFlag(ni::InferRequestHeader::FLAG_SEQUENCE_END,
end_of_sequence);
for (const auto& output : context.Outputs()) {
options->AddRawResult(output);
}
}
FAIL_IF_ERR(context.SetRunOptions(*options), "unable to set context options");
std::shared_ptr<nic::InferContext::Input> in_wave_data, in_wave_data_dim;
FAIL_IF_ERR(context.GetInput("WAV_DATA", &in_wave_data),
"unable to get WAV_DATA");
FAIL_IF_ERR(context.GetInput("WAV_DATA_DIM", &in_wave_data_dim),
"unable to get WAV_DATA_DIM");
// Wave data input
FAIL_IF_ERR(in_wave_data->Reset(), "unable to reset WAV_DATA");
uint8_t* wave_data = reinterpret_cast<uint8_t*>(chunk);
if (chunk_byte_size < max_chunk_byte_size_) {
std::memcpy(&chunk_buf_[0], chunk, chunk_byte_size);
wave_data = &chunk_buf_[0];
}
FAIL_IF_ERR(in_wave_data->SetRaw(wave_data, max_chunk_byte_size_),
"unable to set data for WAV_DATA");
// Dim
FAIL_IF_ERR(in_wave_data_dim->Reset(), "unable to reset WAV_DATA_DIM");
int nsamples = chunk_byte_size / sizeof(float);
FAIL_IF_ERR(in_wave_data_dim->SetRaw(reinterpret_cast<uint8_t*>(&nsamples),
sizeof(int32_t)),
"unable to set data for WAV_DATA_DIM");
total_audio_ += (static_cast<double>(nsamples) / 16000.); // TODO freq
double start = gettime_monotonic();
FAIL_IF_ERR(context.AsyncRun([corr_id, end_of_sequence, start, this](
nic::InferContext* ctx,
const std::shared_ptr<nic::InferContext::Request>& request) {
if (end_of_sequence) {
double elapsed = gettime_monotonic() - start;
std::string out;
std::map<std::string, std::unique_ptr<nic::InferContext::Result>> results;
ctx->GetAsyncRunResults(request, &results);
if (results.size() != 1) {
std::cerr << "Warning: Could not read output for corr_id " << corr_id
<< std::endl;
} else {
FAIL_IF_ERR(results["TEXT"]->GetRawAtCursor(0, &out),
"unable to get TEXT output");
if (print_results_) {
std::lock_guard<std::mutex> lk(stdout_m_);
std::cout << "CORR_ID " << corr_id << "\t\t" << out << std::endl;
}
{
std::lock_guard<std::mutex> lk(results_m_);
results_.insert({corr_id, {std::move(out), elapsed}});
}
}
n_in_flight_.fetch_sub(1, std::memory_order_relaxed);
}
}),
"unable to run model");
}
void TRTISASRClient::WaitForCallbacks() {
int n;
while ((n = n_in_flight_.load(std::memory_order_consume))) {
usleep(1000);
}
}
void TRTISASRClient::PrintStats() {
double now = gettime_monotonic();
double diff = now - started_at_;
double rtf = total_audio_ / diff;
std::cout << "Throughput:\t" << rtf << " RTFX" << std::endl;
std::vector<double> latencies;
{
std::lock_guard<std::mutex> lk(results_m_);
latencies.reserve(results_.size());
for (auto& result : results_) latencies.push_back(result.second.latency);
}
std::sort(latencies.begin(), latencies.end());
double nresultsf = static_cast<double>(latencies.size());
size_t per90i = static_cast<size_t>(std::floor(90. * nresultsf / 100.));
size_t per95i = static_cast<size_t>(std::floor(95. * nresultsf / 100.));
size_t per99i = static_cast<size_t>(std::floor(99. * nresultsf / 100.));
double lat_90 = latencies[per90i];
double lat_95 = latencies[per95i];
double lat_99 = latencies[per99i];
double avg = std::accumulate(latencies.begin(), latencies.end(), 0.0) /
latencies.size();
std::cout << std::setprecision(3);
std::cout << "Latencies:\t90\t\t95\t\t99\t\tAvg\n";
std::cout << "\t\t" << lat_90 << "\t\t" << lat_95 << "\t\t" << lat_99
<< "\t\t" << avg << std::endl;
}
TRTISASRClient::TRTISASRClient(const std::string& url,
const std::string& model_name,
const int ncontextes, bool print_results)
: url_(url),
model_name_(model_name),
ncontextes_(ncontextes),
print_results_(print_results) {
ncontextes_ = std::max(ncontextes_, 1);
for (int i = 0; i < ncontextes_; ++i) CreateClientContext();
std::shared_ptr<nic::InferContext::Input> in_wave_data;
FAIL_IF_ERR(contextes_[0].trtis_context->GetInput("WAV_DATA", &in_wave_data),
"unable to get WAV_DATA");
max_chunk_byte_size_ = in_wave_data->ByteSize();
chunk_buf_.resize(max_chunk_byte_size_);
shape_ = {max_chunk_byte_size_};
n_in_flight_.store(0);
started_at_ = gettime_monotonic();
total_audio_ = 0;
}


@ -0,0 +1,73 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <atomic>
#include <cstdint>
#include <memory>
#include <mutex>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>
#include "request_grpc.h"
#ifndef TRTIS_KALDI_ASR_CLIENT_H_
#define TRTIS_KALDI_ASR_CLIENT_H_
namespace ni = nvidia::inferenceserver;
namespace nic = nvidia::inferenceserver::client;
// time with arbitrary reference
double inline gettime_monotonic() {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
double time = ts.tv_sec;
time += (double)(ts.tv_nsec) / 1e9;
return time;
}
class TRTISASRClient {
struct ClientContext {
std::unique_ptr<nic::InferContext> trtis_context;
};
std::string url_;
std::string model_name_;
std::vector<ClientContext> contextes_;
int ncontextes_;
std::vector<uint8_t> chunk_buf_;
std::vector<int64_t> shape_;
int max_chunk_byte_size_;
std::atomic<int> n_in_flight_;
double started_at_;
double total_audio_;
bool print_results_;
std::mutex stdout_m_;
struct Result {
std::string text;
double latency;
};
std::unordered_map<ni::CorrelationID, Result> results_;
std::mutex results_m_;
public:
void CreateClientContext();
void SendChunk(uint64_t corr_id, bool start_of_sequence, bool end_of_sequence,
float* chunk, int chunk_byte_size);
void WaitForCallbacks();
void PrintStats();
TRTISASRClient(const std::string& url, const std::string& model_name,
const int ncontextes, bool print_results);
};
#endif // TRTIS_KALDI_ASR_CLIENT_H_


@ -0,0 +1,218 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <unistd.h>
#include <atomic>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "asr_client_imp.h"
#include "feat/wave-reader.h" // to read the wav.scp
#include "util/kaldi-table.h"
using kaldi::BaseFloat;
void Usage(char** argv, const std::string& msg = std::string()) {
if (!msg.empty()) {
std::cerr << "error: " << msg << std::endl;
}
std::cerr << "Usage: scripts/docker/launch_client.sh [options]" << std::endl;
std::cerr << "\t-v" << std::endl;
std::cerr << "\t-i <Number of iterations on the dataset>" << std::endl;
std::cerr << "\t-c <Number of parallel audio channels>" << std::endl;
std::cerr << "\t-a <Path to the scp dataset file>" << std::endl;
std::cerr << "\t-l <Maximum number of samples per chunk. Must correspond to "
"the server config>"
<< std::endl;
std::cerr << "\t-u <URL for inference service and its gRPC port>"
<< std::endl;
std::cerr << "\t-o : Only feed each channel at realtime speed. Simulates "
"online clients."
<< std::endl;
std::cerr << "\t-p : Print text outputs" << std::endl;
std::cerr << std::endl;
exit(1);
}
int main(int argc, char** argv) {
std::cout << "\n";
std::cout << "==================================================\n"
<< "============= TRTIS Kaldi ASR Client =============\n"
<< "==================================================\n"
<< std::endl;
// kaldi namespace TODO
using namespace kaldi;
typedef kaldi::int32 int32;
std::string url = "localhost:8001";
std::string model_name = "kaldi_online";
std::string wav_rspecifier =
"scp:/data/datasets/LibriSpeech/test_clean/wav_conv.scp";
int chunk_length = 8160;
size_t nchannels = 1000;
int niterations = 5;
bool verbose = false;
float samp_freq = 16000;
int ncontextes = 10;
bool online = false;
bool print_results = false;
// Parse commandline...
int opt;
while ((opt = getopt(argc, argv, "va:u:i:c:ophl:")) != -1) {
switch (opt) {
case 'i':
niterations = std::atoi(optarg);
break;
case 'c':
nchannels = std::atoi(optarg);
break;
case 'a':
wav_rspecifier = optarg;
break;
case 'u':
url = optarg;
break;
case 'v':
verbose = true;
break;
case 'o':
online = true;
break;
case 'p':
print_results = true;
break;
case 'l':
chunk_length = std::atoi(optarg);
break;
case 'h':
case '?':
Usage(argv);
break;
}
}
if (niterations <= 0) Usage(argv, "number of iterations must be > 0");
if (nchannels <= 0) Usage(argv, "number of audio channels must be > 0");
if (chunk_length <= 0) Usage(argv, "chunk length must be > 0");
std::cout << "Configuration:" << std::endl;
std::cout << std::endl;
std::cout << "Number of iterations\t\t: " << niterations << std::endl;
std::cout << "Number of parallel channels\t: " << nchannels << std::endl;
std::cout << "Server URL\t\t\t: " << url << std::endl;
std::cout << "Print results\t\t\t: " << (print_results ? "Yes" : "No")
<< std::endl;
std::cout << "Online - Realtime I/O\t\t: " << (online ? "Yes" : "No")
<< std::endl;
std::cout << std::endl;
float chunk_seconds = (double)chunk_length / samp_freq;
// need to read wav files
SequentialTableReader<WaveHolder> wav_reader(wav_rspecifier);
std::atomic<uint64_t> correlation_id;
correlation_id.store(1); // 0 = no correlation
double total_audio = 0;
// pre-loading data
// we don't want to measure I/O
std::vector<std::shared_ptr<WaveData>> all_wav;
{
std::cout << "Loading eval dataset..." << std::flush;
for (; !wav_reader.Done(); wav_reader.Next()) {
std::string utt = wav_reader.Key();
std::shared_ptr<WaveData> wave_data = std::make_shared<WaveData>();
wave_data->Swap(&wav_reader.Value());
all_wav.push_back(wave_data);
total_audio += wave_data->Duration();
}
std::cout << "done" << std::endl;
}
struct Stream {
std::shared_ptr<WaveData> wav;
ni::CorrelationID corr_id;
int offset;
float send_next_chunk_at;
std::atomic<bool> received_output;
Stream(const std::shared_ptr<WaveData>& _wav, ni::CorrelationID _corr_id)
: wav(_wav), corr_id(_corr_id), offset(0), received_output(true) {
send_next_chunk_at = gettime_monotonic();
}
};
std::cout << "Opening GRPC contextes..." << std::flush;
TRTISASRClient asr_client(url, model_name, ncontextes, print_results);
std::cout << "done" << std::endl;
std::cout << "Streaming utterances..." << std::flush;
std::vector<std::unique_ptr<Stream>> curr_tasks, next_tasks;
curr_tasks.reserve(nchannels);
next_tasks.reserve(nchannels);
size_t all_wav_i = 0;
size_t all_wav_max = all_wav.size() * niterations;
while (true) {
while (curr_tasks.size() < nchannels && all_wav_i < all_wav_max) {
// Creating new tasks
uint64_t corr_id = correlation_id.fetch_add(1);
std::unique_ptr<Stream> ptr(new Stream(all_wav[all_wav_i%(all_wav.size())], corr_id));
curr_tasks.emplace_back(std::move(ptr));
++all_wav_i;
}
// If still empty, done
if (curr_tasks.empty()) break;
for (size_t itask = 0; itask < curr_tasks.size(); ++itask) {
Stream& task = *(curr_tasks[itask]);
SubVector<BaseFloat> data(task.wav->Data(), 0);
int32 samp_offset = task.offset;
int32 nsamp = data.Dim();
int32 samp_remaining = nsamp - samp_offset;
int32 num_samp =
chunk_length < samp_remaining ? chunk_length : samp_remaining;
bool is_last_chunk = (chunk_length >= samp_remaining);
SubVector<BaseFloat> wave_part(data, samp_offset, num_samp);
bool is_first_chunk = (samp_offset == 0);
if (online) {
double now = gettime_monotonic();
double wait_for = task.send_next_chunk_at - now;
if (wait_for > 0) usleep(wait_for * 1e6);
}
asr_client.SendChunk(task.corr_id, is_first_chunk, is_last_chunk,
wave_part.Data(), wave_part.SizeInBytes());
task.send_next_chunk_at += chunk_seconds;
if (verbose)
std::cout << "Sending correlation_id=" << task.corr_id
<< " chunk offset=" << num_samp << std::endl;
task.offset += num_samp;
if (!is_last_chunk) next_tasks.push_back(std::move(curr_tasks[itask]));
}
curr_tasks.swap(next_tasks);
next_tasks.clear();
// Showing activity if necessary
if (!print_results && !verbose) std::cout << "." << std::flush;
}
std::cout << "done" << std::endl;
std::cout << "Waiting for all results..." << std::flush;
asr_client.WaitForCallbacks();
std::cout << "done" << std::endl;
asr_client.PrintStats();
return 0;
}


@ -0,0 +1,149 @@
name: "kaldi_online"
platform: "custom"
default_model_filename: "libkaldi-trtisbackend.so"
max_batch_size: 2200
parameters: {
key: "mfcc_filename"
value: {
string_value:"/data/models/LibriSpeech/conf/mfcc.conf"
}
}
parameters: {
key: "ivector_filename"
value: {
string_value:"/data/models/LibriSpeech/conf/ivector_extractor.conf"
}
}
parameters: {
key: "nnet3_rxfilename"
value: {
string_value: "/data/models/LibriSpeech/final.mdl"
}
}
parameters: {
key: "fst_rxfilename"
value: {
string_value: "/data/models/LibriSpeech/HCLG.fst"
}
}
parameters: {
key: "word_syms_rxfilename"
value: {
string_value:"/data/models/LibriSpeech/words.txt"
}
}
parameters: [{
key: "beam"
value: {
string_value:"10"
}
},{
key: "num_worker_threads"
value: {
string_value:"40"
}
},
{
key: "max_execution_batch_size"
value: {
string_value:"512"
}
}]
parameters: {
key: "lattice_beam"
value: {
string_value:"7"
}
}
parameters: {
key: "max_active"
value: {
string_value:"10000"
}
}
parameters: {
key: "frame_subsampling_factor"
value: {
string_value:"3"
}
}
parameters: {
key: "acoustic_scale"
value: {
string_value:"1.0"
}
}
sequence_batching {
max_sequence_idle_microseconds:5000000
control_input [
{
name: "START"
control [
{
kind: CONTROL_SEQUENCE_START
int32_false_true: [ 0, 1 ]
}
]
},
{
name: "READY"
control [
{
kind: CONTROL_SEQUENCE_READY
int32_false_true: [ 0, 1 ]
}
]
},
{
name: "END"
control [
{
kind: CONTROL_SEQUENCE_END
int32_false_true: [ 0, 1 ]
}
]
},
{
name: "CORRID"
control [
{
kind: CONTROL_SEQUENCE_CORRID
data_type: TYPE_UINT64
}
]
}
]
oldest {
max_candidate_sequences:2200
preferred_batch_size:[256,512]
max_queue_delay_microseconds:1000
}
},
input [
{
name: "WAV_DATA"
data_type: TYPE_FP32
dims: [ 8160 ]
},
{
name: "WAV_DATA_DIM"
data_type: TYPE_INT32
dims: [ 1 ]
}
]
output [
{
name: "TEXT"
data_type: TYPE_STRING
dims: [ 1 ]
}
]
instance_group [
{
count: 2
kind: KIND_GPU
}
]


@ -0,0 +1,17 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker build . -f Dockerfile --rm -t trtis_kaldi_server
docker build . -f Dockerfile.client --rm -t trtis_kaldi_client


@ -0,0 +1,18 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
/workspace/scripts/docker/prepare_data.sh
chown -R $1:$2 /data/
mv /data/* /mnt/data/


@ -0,0 +1,22 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker run --rm -it \
--net=host \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-v $PWD/data:/data \
trtis_kaldi_client install/bin/kaldi_asr_parallel_client $@


@ -0,0 +1,24 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Start TRTIS server container for download - need some kaldi tools
nvidia-docker run --rm \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-v $PWD/data:/mnt/data \
trtis_kaldi_server /workspace/scripts/docker/dataset_setup.sh $(id -u) $(id -g)
# --user $(id -u):$(id -g) \


@ -0,0 +1,30 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
NV_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-"0"}
# Start TRTIS server
nvidia-docker run --rm -it \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-p8000:8000 \
-p8001:8001 \
-p8002:8002 \
--name trt_server_asr \
-e NVIDIA_VISIBLE_DEVICES=$NV_VISIBLE_DEVICES \
-v $PWD/data:/data \
-v $PWD/model-repo:/mnt/model-repo \
trtis_kaldi_server trtserver --model-repo=/workspace/model-repo/


@ -0,0 +1,89 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
WORKSPACE=/data/
KALDI_ROOT=/opt/kaldi/
model=LibriSpeech
data=${1:-$WORKSPACE/data/}
datasets=$WORKSPACE/datasets/
models=$WORKSPACE/models/
# base url for downloads.
data_url=www.openslr.org/resources/12
lm_url=www.openslr.org/resources/11
mfccdir=mfcc
mkdir -p $data/$model
mkdir -p $models/$model
mkdir -p $datasets/$model
pushd $KALDI_ROOT/egs/librispeech/s5
. ./cmd.sh
. ./path.sh
. parse_options.sh
# you might not want to do this for interactive shells.
set -e
if [[ "$SKIP_DATA_DOWNLOAD" -ne "1" ]]; then
echo ----------- Fetching dataset -----------
# download the data. Note: we're using the 100 hour setup for
# now; later in the script we'll download more and use it to train neural
# nets.
for part in test-clean test-other; do
local/download_and_untar.sh $data $data_url $part
done
fi
# format the data as Kaldi data directories
echo ----------- Preprocessing dataset -----------
for part in test-clean test-other; do
# use underscore-separated names in data directories.
local/data_prep.sh $data/$model/$part $datasets/$model/$(echo $part | sed s/-/_/g)
# convert the manifests
pushd $datasets/$model/$(echo $part | sed s/-/_/g)
#sed -i 's@workspace@'"${WORKSPACE}"'@' wav.scp
(cat wav.scp | awk '{print $1" "$6}' | sed 's/\.flac/\.wav/g' > wav_conv.scp)
popd
done
if [[ "$SKIP_FLAC2WAV" -ne "1" ]]; then
# Convert flac files to wavs
for flac in $(find $data/$model -name "*.flac"); do
wav=$(echo $flac | sed 's/flac/wav/g')
sox $flac -r 16000 -b 16 $wav
done
echo "Converted flac to wav."
fi
popd >&/dev/null
if [[ "$SKIP_MODEL_DOWNLOAD" -ne "1" ]]; then
echo ----------- Fetching trained model -----------
pushd $models >&/dev/null
wget https://github.com/ryanleary/kaldi-test/releases/download/v0.0/LibriSpeech-trained.tgz -O LibriSpeech-trained.tgz
tar -xzf LibriSpeech-trained.tgz -C $model
cd $model/conf/
find . -name "*.conf" -exec sed -i 's@workspace@'"${WORKSPACE}"'@' {} \;
popd >&/dev/null
fi


@ -0,0 +1,22 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
if [ -d "/mnt/model-repo/kaldi_online" ]; then
ln -s /mnt/model-repo/kaldi_online/config.pbtxt /workspace/model-repo/kaldi_online/
fi
/opt/tensorrtserver/nvidia_entrypoint.sh $@


@ -0,0 +1,30 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
if [[ "$(docker ps | grep trtis_kaldi_server | wc -l)" == "0" ]]; then
printf "\nThe TensorRT Inference Server is currently not running. Please run scripts/docker/launch_server.sh\n\n"
exit 1
fi
printf "\nOffline benchmarks:\n"
scripts/docker/launch_client.sh -i 5 -c 1000
printf "\nOnline benchmarks:\n"
scripts/docker/launch_client.sh -i 10 -c 700 -o
scripts/docker/launch_client.sh -i 10 -c 400 -o


@ -0,0 +1,31 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
if [[ "$(docker ps | grep trtis_kaldi_server | wc -l)" == "0" ]]; then
printf "\nThe TensorRT Inference Server is currently not running. Please run scripts/docker/launch_server.sh\n\n"
exit 1
fi
printf "\nOffline benchmarks:\n"
scripts/docker/launch_client.sh -i 5 -c 2000
printf "\nOnline benchmarks:\n"
scripts/docker/launch_client.sh -i 10 -c 1500 -o
scripts/docker/launch_client.sh -i 10 -c 1000 -o
scripts/docker/launch_client.sh -i 5 -c 800 -o


@ -0,0 +1,5 @@
.PHONY: all
all: kaldibackend
kaldibackend: kaldi-backend.cc kaldi-backend-utils.cc
g++ -fpic -shared -std=c++11 -o libkaldi-trtisbackend.so kaldi-backend.cc kaldi-backend-utils.cc -Icustom-backend-sdk/include custom-backend-sdk/lib/libcustombackend.a -I/opt/kaldi/src/ -I/usr/local/cuda/include -I/opt/kaldi/tools/openfst/include/ -L/opt/kaldi/src/lib/ -lkaldi-cudadecoder


@ -0,0 +1,155 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "kaldi-backend-utils.h"
namespace nvidia {
namespace inferenceserver {
namespace custom {
namespace kaldi_cbe {
int GetInputTensor(CustomGetNextInputFn_t input_fn, void* input_context,
const char* name, const size_t expected_byte_size,
std::vector<uint8_t>* input, const void** out) {
input->clear(); // reset buffer
// The values for an input tensor are not necessarily in one
// contiguous chunk, so we might copy the chunks into 'input' vector.
// If possible, we use the data in place
uint64_t total_content_byte_size = 0;
while (true) {
const void* content;
uint64_t content_byte_size = expected_byte_size - total_content_byte_size;
if (!input_fn(input_context, name, &content, &content_byte_size)) {
return kInputContents;
}
// If 'content' returns nullptr we have all the input.
if (content == nullptr) break;
// If the total amount of content received exceeds what we expect
// then something is wrong.
total_content_byte_size += content_byte_size;
if (total_content_byte_size > expected_byte_size)
return kInputSize;
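// Fast path: the whole tensor arrived as a single contiguous chunk, so it can
// be exposed in place without copying.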
if (content_byte_size == expected_byte_size) {
*out = content;
return kSuccess;
}
input->insert(input->end(), static_cast<const uint8_t*>(content),
static_cast<const uint8_t*>(content) + content_byte_size);
}
// Make sure we end up with exactly the amount of input we expect.
if (total_content_byte_size != expected_byte_size) {
return kInputSize;
}
*out = input->data();
return kSuccess;
}
void LatticeToString(fst::SymbolTable& word_syms,
const kaldi::CompactLattice& dlat, std::string* out_str) {
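// Reduce the lattice to its single best path, then map each word id to its
// text form using the symbol table.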
kaldi::CompactLattice best_path_clat;
kaldi::CompactLatticeShortestPath(dlat, &best_path_clat);
kaldi::Lattice best_path_lat;
fst::ConvertLattice(best_path_clat, &best_path_lat);
std::vector<int32> alignment;
std::vector<int32> words;
kaldi::LatticeWeight weight;
fst::GetLinearSymbolSequence(best_path_lat, &alignment, &words, &weight);
std::ostringstream oss;
for (size_t i = 0; i < words.size(); i++) {
std::string s = word_syms.Find(words[i]);
if (s == "") std::cerr << "Word-id " << words[i] << " not in symbol table.";
oss << s << " ";
}
*out_str = std::move(oss.str());
}
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
std::string* param) {
auto it = model_config_.parameters().find(key);
if (it == model_config_.parameters().end()) {
std::cerr << "Parameter \"" << key
<< "\" missing from config file. Exiting." << std::endl;
return kInvalidModelConfig;
}
*param = it->second.string_value();
return kSuccess;
}
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
int* param) {
std::string tmp;
int err = ReadParameter(model_config_, key, &tmp);
*param = std::stoi(tmp);
return err;
}
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
float* param) {
std::string tmp;
int err = ReadParameter(model_config_, key, &tmp);
*param = std::stof(tmp);
return err;
}
const char* CustomErrorString(int errcode) {
switch (errcode) {
case kSuccess:
return "success";
case kInvalidModelConfig:
return "invalid model configuration";
case kGpuNotSupported:
return "execution on GPU not supported";
case kSequenceBatcher:
return "model configuration must configure sequence batcher";
case kModelControl:
return "'START', 'END', 'CORRID' and 'READY' must be configured as the control inputs";
case kInputOutput:
return "model must have two 1-D inputs (WAV_DATA, WAV_DATA_DIM) and one 1-D output";
case kInputName:
return "model inputs must be named 'WAV_DATA' and 'WAV_DATA_DIM'";
case kOutputName:
return "model output must be named 'TEXT'";
case kInputOutputDataType:
return "model input or output has an unexpected data_type";
case kInputContents:
return "unable to get input tensor values";
case kInputSize:
return "unexpected size for input tensor";
case kOutputBuffer:
return "unable to get buffer for output tensor values";
case kBatchTooBig:
return "unable to execute batch larger than max-batch-size";
case kTimesteps:
return "unable to execute more than 1 timestep at a time";
case kChunkTooBig:
return "a chunk cannot contain more samples than the WAV_DATA dimension";
default:
break;
}
return "unknown error";
}
}  // namespace kaldi_cbe
}  // namespace custom
}  // namespace inferenceserver
}  // namespace nvidia

View file

@ -0,0 +1,66 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lat/lattice-functions.h"
#include "src/core/model_config.h"
#include "src/core/model_config.pb.h"
#include "src/custom/sdk/custom_instance.h"
namespace nvidia {
namespace inferenceserver {
namespace custom {
namespace kaldi_cbe {
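// Error codes returned by the backend; CustomErrorString() maps each one to a
// human-readable message.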
enum ErrorCodes {
kSuccess,
kUnknown,
kInvalidModelConfig,
kGpuNotSupported,
kSequenceBatcher,
kModelControl,
kInputOutput,
kInputName,
kOutputName,
kInputOutputDataType,
kInputContents,
kInputSize,
kOutputBuffer,
kBatchTooBig,
kTimesteps,
kChunkTooBig
};
int GetInputTensor(CustomGetNextInputFn_t input_fn, void* input_context,
const char* name, const size_t expected_byte_size,
std::vector<uint8_t>* input, const void** out);
void LatticeToString(fst::SymbolTable& word_syms,
const kaldi::CompactLattice& dlat, std::string* out_str);
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
std::string* param);
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
int* param);
int ReadParameter(const ModelConfig& model_config_, const std::string& key,
float* param);
const char* CustomErrorString(int errcode);
}  // namespace kaldi_cbe
}  // namespace custom
}  // namespace inferenceserver
}  // namespace nvidia

View file

@ -0,0 +1,401 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "kaldi-backend.h"
#include "kaldi-backend-utils.h"
namespace nvidia {
namespace inferenceserver {
namespace custom {
namespace kaldi_cbe {
Context::Context(const std::string& instance_name,
const ModelConfig& model_config, const int gpu_device)
: instance_name_(instance_name),
model_config_(model_config),
gpu_device_(gpu_device),
// TRTIS's max_batch_size maps to Kaldi's number of decoder channels
num_channels_(model_config_.max_batch_size()),
int32_byte_size_(GetDataTypeByteSize(TYPE_INT32)),
int64_byte_size_(GetDataTypeByteSize(TYPE_INT64)) {}
Context::~Context() { delete word_syms_; }
int Context::ReadModelParameters() {
// Reading config
float beam, lattice_beam;
int max_active;
int frame_subsampling_factor;
float acoustic_scale;
int num_worker_threads;
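// Read every required parameter from the model configuration; the || chain
// short-circuits at the first lookup that fails, leaving err non-zero.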
int err =
ReadParameter(model_config_, "mfcc_filename",
&batched_decoder_config_.feature_opts.mfcc_config) ||
ReadParameter(
model_config_, "ivector_filename",
&batched_decoder_config_.feature_opts.ivector_extraction_config) ||
ReadParameter(model_config_, "beam", &beam) ||
ReadParameter(model_config_, "lattice_beam", &lattice_beam) ||
ReadParameter(model_config_, "max_active", &max_active) ||
ReadParameter(model_config_, "frame_subsampling_factor",
&frame_subsampling_factor) ||
ReadParameter(model_config_, "acoustic_scale", &acoustic_scale) ||
ReadParameter(model_config_, "nnet3_rxfilename", &nnet3_rxfilename_) ||
ReadParameter(model_config_, "fst_rxfilename", &fst_rxfilename_) ||
ReadParameter(model_config_, "word_syms_rxfilename",
&word_syms_rxfilename_) ||
ReadParameter(model_config_, "num_worker_threads", &num_worker_threads) ||
ReadParameter(model_config_, "max_execution_batch_size",
&max_batch_size_);
if (err) return err;
max_batch_size_ = std::max<int>(max_batch_size_, 1);
num_channels_ = std::max<int>(num_channels_, 1);
// Sanity checks
if (beam <= 0) return kInvalidModelConfig;
if (lattice_beam <= 0) return kInvalidModelConfig;
if (max_active <= 0) return kInvalidModelConfig;
if (acoustic_scale <= 0) return kInvalidModelConfig;
if (num_worker_threads <= 0) return kInvalidModelConfig;
if (num_channels_ <= max_batch_size_) return kInvalidModelConfig;
batched_decoder_config_.compute_opts.frame_subsampling_factor =
frame_subsampling_factor;
batched_decoder_config_.compute_opts.acoustic_scale = acoustic_scale;
batched_decoder_config_.decoder_opts.default_beam = beam;
batched_decoder_config_.decoder_opts.lattice_beam = lattice_beam;
batched_decoder_config_.decoder_opts.max_active = max_active;
batched_decoder_config_.num_worker_threads = num_worker_threads;
batched_decoder_config_.max_batch_size = max_batch_size_;
batched_decoder_config_.num_channels = num_channels_;
auto feature_config = batched_decoder_config_.feature_opts;
kaldi::OnlineNnet2FeaturePipelineInfo feature_info(feature_config);
sample_freq_ = feature_info.mfcc_opts.frame_opts.samp_freq;
BaseFloat frame_shift = feature_info.FrameShiftInSeconds();
seconds_per_chunk_ = chunk_num_samps_ / sample_freq_;
int samp_per_frame = static_cast<int>(sample_freq_ * frame_shift);
float n_input_framesf = static_cast<float>(chunk_num_samps_) / samp_per_frame;
bool is_integer = (n_input_framesf == std::floor(n_input_framesf));
if (!is_integer) {
std::cerr << "WAVE_DATA dim must be a multiple fo samples per frame ("
<< samp_per_frame << ")" << std::endl;
return kInvalidModelConfig;
}
int n_input_frames = static_cast<int>(std::floor(n_input_framesf));
batched_decoder_config_.compute_opts.frames_per_chunk = n_input_frames;
return kSuccess;
}
int Context::InitializeKaldiPipeline() {
batch_corr_ids_.reserve(max_batch_size_);
batch_wave_samples_.reserve(max_batch_size_);
batch_is_last_chunk_.reserve(max_batch_size_);
wave_byte_buffers_.resize(max_batch_size_);
output_shape_ = {1, 1};
kaldi::CuDevice::Instantiate()
.SelectAndInitializeGpuIdWithExistingCudaContext(gpu_device_);
kaldi::CuDevice::Instantiate().AllowMultithreading();
// Loading models
{
bool binary;
kaldi::Input ki(nnet3_rxfilename_, &binary);
trans_model_.Read(ki.Stream(), binary);
am_nnet_.Read(ki.Stream(), binary);
kaldi::nnet3::SetBatchnormTestMode(true, &(am_nnet_.GetNnet()));
kaldi::nnet3::SetDropoutTestMode(true, &(am_nnet_.GetNnet()));
kaldi::nnet3::CollapseModel(kaldi::nnet3::CollapseModelConfig(),
&(am_nnet_.GetNnet()));
}
fst::Fst<fst::StdArc>* decode_fst = fst::ReadFstKaldiGeneric(fst_rxfilename_);
cuda_pipeline_.reset(
new kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipeline(
batched_decoder_config_, *decode_fst, am_nnet_, trans_model_));
delete decode_fst;
// Loading word syms for text output
if (word_syms_rxfilename_ != "") {
if (!(word_syms_ = fst::SymbolTable::ReadText(word_syms_rxfilename_))) {
std::cerr << "Could not read symbol table from file "
<< word_syms_rxfilename_;
return kInvalidModelConfig;
}
}
chunk_num_samps_ = cuda_pipeline_->GetNSampsPerChunk();
chunk_num_bytes_ = chunk_num_samps_ * sizeof(BaseFloat);
return kSuccess;
}
int Context::Init() {
return InputOutputSanityCheck() || ReadModelParameters() ||
InitializeKaldiPipeline();
}
bool Context::CheckPayloadError(const CustomPayload& payload) {
int err = payload.error_code;
if (err) std::cerr << "Error: " << CustomErrorString(err) << std::endl;
return (err != 0);
}
int Context::Execute(const uint32_t payload_cnt, CustomPayload* payloads,
CustomGetNextInputFn_t input_fn,
CustomGetOutputFn_t output_fn) {
// kaldi::Timer timer;
if (payload_cnt > num_channels_) return kBatchTooBig;
// Each payload carries one chunk of one sequence; the sequence batcher
// supplies the START/END/READY/CORRID control tensors along with the audio
for (uint32_t pidx = 0; pidx < payload_cnt; ++pidx) {
if (batch_corr_ids_.size() == max_batch_size_) FlushBatch();
CustomPayload& payload = payloads[pidx];
if (payload.batch_size != 1) payload.error_code = kTimesteps;
if (CheckPayloadError(payload)) continue;
// Get input tensors
int32_t start, dim, end, ready;
CorrelationID corr_id;
const BaseFloat* wave_buffer;
payload.error_code = GetSequenceInput(
input_fn, payload.input_context, &corr_id, &start, &ready, &dim, &end,
&wave_buffer, &wave_byte_buffers_[pidx]);
if (CheckPayloadError(payload)) continue;
if (!ready) continue;
if (dim > chunk_num_samps_) payload.error_code = kChunkTooBig;
if (CheckPayloadError(payload)) continue;
kaldi::SubVector<BaseFloat> wave_part(wave_buffer, dim);
// Initialize corr_id if first chunk
if (start) cuda_pipeline_->InitCorrID(corr_id);
// Add to batch
batch_corr_ids_.push_back(corr_id);
batch_wave_samples_.push_back(wave_part);
batch_is_last_chunk_.push_back(end);
if (end) {
// If last chunk, set the callback for that seq
cuda_pipeline_->SetLatticeCallback(
corr_id, [this, &output_fn, &payloads, pidx,
corr_id](kaldi::CompactLattice& clat) {
std::string output;
LatticeToString(*word_syms_, clat, &output);
SetOutputTensor(output, output_fn, payloads[pidx]);
});
}
}
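// Decode whatever remains in the current batch, then block until all lattice
// callbacks (and therefore all TEXT outputs) have completed.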
FlushBatch();
cuda_pipeline_->WaitForLatticeCallbacks();
return kSuccess;
}
int Context::FlushBatch() {
if (!batch_corr_ids_.empty()) {
cuda_pipeline_->DecodeBatch(batch_corr_ids_, batch_wave_samples_,
batch_is_last_chunk_);
batch_corr_ids_.clear();
batch_wave_samples_.clear();
batch_is_last_chunk_.clear();
}
return kSuccess;
}
int Context::InputOutputSanityCheck() {
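// Validate that config.pbtxt matches this backend: sequence batching with the
// START/END/CORRID/READY controls, WAV_DATA and WAV_DATA_DIM inputs, and a
// single TEXT output.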
if (!model_config_.has_sequence_batching()) {
return kSequenceBatcher;
}
auto& batcher = model_config_.sequence_batching();
if (batcher.control_input_size() != 4) {
return kModelControl;
}
std::set<std::string> control_input_names;
for (int i = 0; i < 4; ++i)
control_input_names.insert(batcher.control_input(i).name());
if (!(control_input_names.erase("START") &&
control_input_names.erase("END") &&
control_input_names.erase("CORRID") &&
control_input_names.erase("READY"))) {
return kModelControl;
}
if (model_config_.input_size() != 2) {
return kInputOutput;
}
if ((model_config_.input(0).dims().size() != 1) ||
(model_config_.input(0).dims(0) <= 0) ||
(model_config_.input(1).dims().size() != 1) ||
(model_config_.input(1).dims(0) != 1)) {
return kInputOutput;
}
chunk_num_samps_ = model_config_.input(0).dims(0);
chunk_num_bytes_ = chunk_num_samps_ * sizeof(float);
if ((model_config_.input(0).data_type() != DataType::TYPE_FP32) ||
(model_config_.input(1).data_type() != DataType::TYPE_INT32)) {
return kInputOutputDataType;
}
if ((model_config_.input(0).name() != "WAV_DATA") ||
(model_config_.input(1).name() != "WAV_DATA_DIM")) {
return kInputName;
}
if (model_config_.output_size() != 1) {
return kInputOutput;
}
if ((model_config_.output(0).dims().size() != 1) ||
(model_config_.output(0).dims(0) != 1)) {
return kInputOutput;
}
if (model_config_.output(0).data_type() != DataType::TYPE_STRING) {
return kInputOutputDataType;
}
if (model_config_.output(0).name() != "TEXT") {
return kOutputName;
}
return kSuccess;
}
int Context::GetSequenceInput(CustomGetNextInputFn_t& input_fn,
void* input_context, CorrelationID* corr_id,
int32_t* start, int32_t* ready, int32_t* dim,
int32_t* end, const BaseFloat** wave_buffer,
std::vector<uint8_t>* input_buffer) {
int err;
//&input_buffer[0]: char pointer -> alias with any types
// wave_data[0] will holds the struct
// Get start of sequence tensor
const void* out;
err = GetInputTensor(input_fn, input_context, "WAV_DATA_DIM",
int32_byte_size_, &byte_buffer_, &out);
if (err != kSuccess) return err;
*dim = *reinterpret_cast<const int32_t*>(out);
err = GetInputTensor(input_fn, input_context, "END", int32_byte_size_,
&byte_buffer_, &out);
if (err != kSuccess) return err;
*end = *reinterpret_cast<const int32_t*>(out);
err = GetInputTensor(input_fn, input_context, "START", int32_byte_size_,
&byte_buffer_, &out);
if (err != kSuccess) return err;
*start = *reinterpret_cast<const int32_t*>(out);
err = GetInputTensor(input_fn, input_context, "READY", int32_byte_size_,
&byte_buffer_, &out);
if (err != kSuccess) return err;
*ready = *reinterpret_cast<const int32_t*>(out);
err = GetInputTensor(input_fn, input_context, "CORRID", int64_byte_size_,
&byte_buffer_, &out);
if (err != kSuccess) return err;
*corr_id = *reinterpret_cast<const CorrelationID*>(out);
// Get pointer to speech tensor
err = GetInputTensor(input_fn, input_context, "WAV_DATA", chunk_num_bytes_,
input_buffer, &out);
if (err != kSuccess) return err;
*wave_buffer = reinterpret_cast<const BaseFloat*>(out);
return kSuccess;
}
int Context::SetOutputTensor(const std::string& output,
CustomGetOutputFn_t output_fn,
CustomPayload payload) {
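// The TEXT output is TYPE_STRING: the payload is serialized as a 4-byte length
// followed by the raw characters, exactly as written below.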
uint32_t byte_size_with_size_int = output.size() + sizeof(int32);
// std::cout << output << std::endl;
// copy output from best_path to output buffer
if ((payload.error_code == 0) && (payload.output_cnt > 0)) {
const char* output_name = payload.required_output_names[0];
// output buffer
void* obuffer;
if (!output_fn(payload.output_context, output_name, output_shape_.size(),
&output_shape_[0], byte_size_with_size_int, &obuffer)) {
payload.error_code = kOutputBuffer;
return payload.error_code;
}
// If no error but the 'obuffer' is returned as nullptr, then
// skip writing this output.
if (obuffer != nullptr) {
// std::cout << "writing " << output << std::endl;
int32* buffer_as_int = reinterpret_cast<int32*>(obuffer);
buffer_as_int[0] = output.size();
memcpy(&buffer_as_int[1], output.data(), output.size());
}
}
return kSuccess;
}
/////////////
extern "C" {
int CustomInitialize(const CustomInitializeData* data, void** custom_context) {
// Convert the serialized model config to a ModelConfig object.
ModelConfig model_config;
if (!model_config.ParseFromString(std::string(
data->serialized_model_config, data->serialized_model_config_size))) {
return kInvalidModelConfig;
}
// Create the context and validate that the model configuration is
// something that we can handle.
Context* context = new Context(std::string(data->instance_name), model_config,
data->gpu_device_id);
int err = context->Init();
if (err != kSuccess) {
return err;
}
*custom_context = static_cast<void*>(context);
return kSuccess;
}
int CustomFinalize(void* custom_context) {
if (custom_context != nullptr) {
Context* context = static_cast<Context*>(custom_context);
delete context;
}
return kSuccess;
}
const char* CustomErrorString(void* custom_context, int errcode) {
return CustomErrorString(errcode);
}
int CustomExecute(void* custom_context, const uint32_t payload_cnt,
CustomPayload* payloads, CustomGetNextInputFn_t input_fn,
CustomGetOutputFn_t output_fn) {
if (custom_context == nullptr) {
return kUnknown;
}
Context* context = static_cast<Context*>(custom_context);
return context->Execute(payload_cnt, payloads, input_fn, output_fn);
}
} // extern "C"
}  // namespace kaldi_cbe
}  // namespace custom
}  // namespace inferenceserver
}  // namespace nvidia

View file

@ -0,0 +1,119 @@
// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#define HAVE_CUDA 1 // Loading Kaldi headers with GPU
#include <cfloat>
#include <sstream>
#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h"
#include "fstext/fstext-lib.h"
#include "lat/lattice-functions.h"
#include "nnet3/am-nnet-simple.h"
#include "nnet3/nnet-utils.h"
#include "util/kaldi-thread.h"
#include "src/core/model_config.h"
#include "src/core/model_config.pb.h"
#include "src/custom/sdk/custom_instance.h"
using kaldi::BaseFloat;
namespace nvidia {
namespace inferenceserver {
namespace custom {
namespace kaldi_cbe {
// Context object. All state must be kept in this object.
class Context {
public:
Context(const std::string& instance_name, const ModelConfig& config,
const int gpu_device);
virtual ~Context();
// Initialize the context. Validate that the model configuration,
// etc. is something that we can handle.
int Init();
// Perform custom execution on the payloads.
int Execute(const uint32_t payload_cnt, CustomPayload* payloads,
CustomGetNextInputFn_t input_fn, CustomGetOutputFn_t output_fn);
private:
// init kaldi pipeline
int InitializeKaldiPipeline();
int InputOutputSanityCheck();
int ReadModelParameters();
int GetSequenceInput(CustomGetNextInputFn_t& input_fn, void* input_context,
CorrelationID* corr_id, int32_t* start, int32_t* ready,
int32_t* dim, int32_t* end,
const kaldi::BaseFloat** wave_buffer,
std::vector<uint8_t>* input_buffer);
int SetOutputTensor(const std::string& output, CustomGetOutputFn_t output_fn,
CustomPayload payload);
bool CheckPayloadError(const CustomPayload& payload);
int FlushBatch();
// The name of this instance of the backend.
const std::string instance_name_;
// The model configuration.
const ModelConfig model_config_;
// The GPU device ID to execute on or CUSTOM_NO_GPU_DEVICE if should
// execute on CPU.
const int gpu_device_;
// Models paths
std::string nnet3_rxfilename_, fst_rxfilename_;
std::string word_syms_rxfilename_;
// batch_size
int max_batch_size_;
int num_channels_;
int num_worker_threads_;
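// Batch currently being assembled; one entry per chunk queued for DecodeBatch()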
std::vector<CorrelationID> batch_corr_ids_;
std::vector<kaldi::SubVector<kaldi::BaseFloat>> batch_wave_samples_;
std::vector<bool> batch_is_last_chunk_;
BaseFloat sample_freq_, seconds_per_chunk_;
int chunk_num_bytes_, chunk_num_samps_;
// feature_config includes configuration for the iVector adaptation,
// as well as the basic features.
kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipelineConfig
batched_decoder_config_;
std::unique_ptr<kaldi::cuda_decoder::BatchedThreadedNnet3CudaOnlinePipeline>
cuda_pipeline_;
// Maintain the state of some shared objects
kaldi::TransitionModel trans_model_;
kaldi::nnet3::AmNnetSimple am_nnet_;
fst::SymbolTable* word_syms_;
const uint64_t int32_byte_size_;
const uint64_t int64_byte_size_;
std::vector<int64_t> output_shape_;
std::vector<uint8_t> byte_buffer_;
std::vector<std::vector<uint8_t>> wave_byte_buffers_;
};
}  // namespace kaldi_cbe
}  // namespace custom
}  // namespace inferenceserver
}  // namespace nvidia

View file

@ -0,0 +1,21 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
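# Export only the TRTIS custom-backend entry points; all other symbols stay
# local to the shared library.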
{
global:
CustomErrorString;
CustomExecute;
CustomFinalize;
CustomInitialize;
local: *;
};