[QuartzNet/PyT] Release QuartzNet model

This commit is contained in:
Mikolaj Blaz 2021-09-14 06:03:36 -07:00 committed by Krzysztof Kudrynski
parent 88eb3cff2f
commit 649776f79a
55 changed files with 6160 additions and 0 deletions


@@ -0,0 +1,9 @@
__pycache__
*.pt
results/
datasets/
checkpoints/
*.swp
*.swo
*.swn


@@ -0,0 +1,30 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.07-py3
FROM ${FROM_IMAGE_NAME}
RUN apt update && apt install -y libsndfile1 && apt install -y sox && rm -rf /var/lib/apt/lists/*
WORKDIR /workspace/quartznet
# Install requirements (do this first for better caching)
COPY requirements.txt .
RUN conda install -y pyyaml==5.4.1
RUN pip install --disable-pip-version-check -U -r requirements.txt
RUN pip install --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda110==1.2.0
# Copy rest of files
COPY . .


@@ -0,0 +1,203 @@
Except where otherwise noted, the following license applies to all files in this repo.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 NVIDIA Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@@ -0,0 +1,5 @@
QuartzNet in PyTorch
This repository includes source code (in "common/") from:
* https://github.com/keithito/tacotron and https://github.com/ryanleary/patter licensed under MIT license.


@@ -0,0 +1,674 @@
# QuartzNet For PyTorch
This repository provides a script and recipe to train the QuartzNet model to achieve state-of-the-art accuracy. The content of this repository is tested and maintained by NVIDIA.
## Table Of Contents
- [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Enabling TF32](#enabling-tf32)
* [Glossary](#glossary)
- [Setup](#setup)
* [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Training process](#training-process)
* [Inference process](#inference-process)
- [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results](#training-accuracy-results)
* [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
* [Training stability test](#training-stability-test)
* [Training performance results](#training-performance-results)
* [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
* [Training performance: NVIDIA DGX-2 (16x V100 32GB)](#training-performance-nvidia-dgx-2-16x-v100-32gb)
* [Inference performance results](#inference-performance-results)
* [Inference performance: NVIDIA DGX A100 (1x A100 80GB)](#inference-performance-nvidia-dgx-a100-1x-a100-80gb)
* [Inference performance: NVIDIA DGX-2 (1x V100 32GB)](#inference-performance-nvidia-dgx-2-1x-v100-32gb)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
This repository provides an implementation of the QuartzNet model in PyTorch from the paper [QuartzNet: Deep Automatic Speech Recognition with 1D Time-Channel Separable Convolutions](https://arxiv.org/pdf/1910.10261).
The QuartzNet model is an end-to-end neural acoustic model for automatic speech recognition (ASR) that provides high accuracy at a low memory footprint. The QuartzNet architecture of convolutional layers was designed to facilitate fast GPU inference by allowing whole sub-blocks to be fused into a single GPU kernel. This is important for meeting the strict real-time requirements of ASR systems in deployment.
This repository is a PyTorch implementation of QuartzNet and provides scripts to train the QuartzNet 10x5 model from scratch on the [LibriSpeech](http://www.openslr.org/12) dataset to achieve greedy decoding results that improve upon the original paper.
The repository is self-contained and includes data preparation scripts, training, and inference scripts.
Both training and inference scripts offer the option to use Automatic Mixed Precision (AMP) to benefit from Tensor Cores for better performance.
In addition to providing the hyperparameters for training a model checkpoint, we publish a thorough inference analysis across different NVIDIA GPU platforms, for example, DGX-2, NVIDIA A100 GPU, and T4.
This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 1.4x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Model architecture
QuartzNet is an end-to-end neural acoustic model that is based on efficient, time-channel separable convolutions (Figure 1).
In the audio processing stage, each frame is transformed into mel-scale spectrogram features, which the acoustic model takes as input and outputs a probability distribution over the vocabulary for each frame.
<p align="center">
<img src="./img/model.png" alt="QuartzNet model architecture" width="50%" />
</p>
<p align="center">
<em>Figure 1. Architecture of QuartzNet (<a href="https://arxiv.org/abs/1910.10261">source</a>)
</em>
</p>
### Default configuration
The following features were implemented in this model:
* GPU-supported feature extraction with data augmentation options [SpecAugment](https://arxiv.org/abs/1904.08779) and [Cutout](https://arxiv.org/pdf/1708.04552.pdf) using the DALI library
* offline and online [Speed Perturbation](https://www.danielpovey.com/files/2015_interspeech_augmentation.pdf) using the DALI library
* data-parallel multi-GPU training and evaluation
* AMP with dynamic loss scaling for Tensor Core training
* FP16 inference
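As an illustration of the SpecAugment-style masking listed above, the following simplified sketch applies random frequency and time masks to a mel-spectrogram tensor; the mask counts and widths are illustrative values, and the actual augmentations in this repository are applied through DALI, not this code:
```python
import torch

def spec_augment(spec, freq_masks=2, max_freq=15, time_masks=2, max_time=55):
    """Zero out random frequency bands and time ranges of a (freq, time) spectrogram."""
    spec = spec.clone()
    n_freq, n_time = spec.shape
    for _ in range(freq_masks):
        width = torch.randint(0, max_freq + 1, (1,)).item()
        start = torch.randint(0, max(1, n_freq - width), (1,)).item()
        spec[start:start + width, :] = 0.0       # frequency mask
    for _ in range(time_masks):
        width = torch.randint(0, max_time + 1, (1,)).item()
        start = torch.randint(0, max(1, n_time - width), (1,)).item()
        spec[:, start:start + width] = 0.0       # time mask
    return spec

masked = spec_augment(torch.randn(64, 400))      # 64 mel bins x 400 frames
```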
### Feature support matrix
| **Feature** | **QuartzNet** |
|---------------|---------------|
|[Apex AMP](https://nvidia.github.io/apex/amp.html) | Yes |
|[DALI](https://docs.nvidia.com/deeplearning/dali/release-notes/index.html) | Yes |
#### Features
**DALI**
NVIDIA Data Loading Library (DALI) is a collection of highly optimized building blocks, and an execution engine, to accelerate the pre-processing of the input data for deep learning applications. DALI provides both the performance and the flexibility for accelerating different data pipelines as a single library. This single library can then be easily integrated into different deep learning training and inference applications. For details, see example sources in this repository or see the [DALI documentation](https://docs.nvidia.com/deeplearning/dali/index.html).
**Automatic Mixed Precision (AMP)**
Computation graphs can be modified by PyTorch at runtime to support mixed precision training. A detailed explanation of mixed precision can be found in the next section.
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) blog post.
#### Enabling mixed precision
For training, mixed precision can be enabled by setting the flag: `train.py --amp`. When using bash helper scripts, mixed precision can be enabled with the environment variable `AMP=true`, for example, `AMP=true bash scripts/train.sh`, `AMP=true bash scripts/inference.sh`, etc.
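Under the hood, mixed precision combines automatic casting to FP16 with dynamic loss scaling. The following generic PyTorch sketch illustrates the mechanism; it is not the repository's implementation, and the model and training loop are placeholders:
```python
import torch

model = torch.nn.Linear(1024, 1024).cuda()       # placeholder for the acoustic model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler()             # dynamic loss scaling

for _ in range(10):
    x = torch.randn(16, 1024, device='cuda')
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():              # run eligible ops in FP16
        loss = model(x).pow(2).mean()
    scaler.scale(loss).backward()                # scale the loss to preserve small gradients
    scaler.step(optimizer)
    scaler.update()
```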
#### Enabling TF32
TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
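If needed, TF32 can be inspected or disabled from PyTorch through the standard `torch.backends` switches, as in the sketch below; the printed values reflect typical defaults in recent NGC containers and are not guaranteed for every build:
```python
import torch

# TF32 is used automatically for matmul and cuDNN convolutions on Ampere GPUs.
print(torch.backends.cuda.matmul.allow_tf32)     # typically True in the 21.07 container
print(torch.backends.cudnn.allow_tf32)           # typically True

# Disable TF32 to force full FP32 math, e.g. for numerical comparisons.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
```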
### Glossary
**Time-channel separable (TCS) convolution**
A module composed mainly of two convolutional layers: a 1D depthwise convolutional layer and a pointwise convolutional layer (Figure 2). The former operates across K time frames, the latter across all channels. By decoupling the time and channel axes, the separable module uses fewer parameters and computes the result faster than it otherwise would (a minimal PyTorch sketch follows Figure 2).
<p align="center">
<img src="./img/tcs_conv.png" alt="Time-channel separable (TCS) convolutional module" width="50%" />
</p>
<p align="center">
<em>Figure 2. Time-channel separable (TCS) convolutional module: (a) basic design, (b) TCS with a group shuffle layer, added to increase cross-group interchange</em>
</p>
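A minimal PyTorch sketch of the basic TCS design from Figure 2(a), a depthwise 1D convolution over time followed by a pointwise (1x1) convolution over channels, is shown below; the layer sizes are illustrative and do not correspond to the QuartzNet configuration:
```python
import torch
import torch.nn as nn

class TCSConv1d(nn.Module):
    """Time-channel separable convolution: depthwise over time, then pointwise over channels."""
    def __init__(self, in_channels, out_channels, kernel_size):
        super().__init__()
        self.depthwise = nn.Conv1d(in_channels, in_channels, kernel_size,
                                   padding=kernel_size // 2, groups=in_channels)
        self.pointwise = nn.Conv1d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):                      # x: (batch, channels, time)
        return self.pointwise(self.depthwise(x))

tcs = TCSConv1d(256, 512, kernel_size=33)      # illustrative sizes, K = 33 time frames
out = tcs(torch.randn(4, 256, 100))            # -> (4, 512, 100)
```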
**Automatic Speech Recognition (ASR)**
Uses both an acoustic model and a language model to output the transcript of an input audio signal.
**Acoustic model**
Assigns a probability distribution over a vocabulary of characters given an audio frame. Typically, a large part of the entire ASR model.
**Language model**
Assigns a probability to a sequence of words; given a sequence of words, it estimates how likely the whole sequence is.
**Pre-training**
Training a model on vast amounts of data on the same (or a different) task to build a general understanding.
## Setup
The following section lists the requirements that you need to meet in order to start training the QuartzNet model.
### Requirements
This repository contains a Dockerfile that extends the PyTorch 21.07-py3 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [PyTorch 21.07-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
- Supported GPUs:
- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
- [NVIDIA Turing architecture](https://www.nvidia.com/en-us/design-visualization/technologies/turing-architecture/)
- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
- [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
Further required Python packages are listed in `requirements.txt` and are installed automatically when the Docker container is built. To install them manually, run:
```bash
pip install -r requirements.txt
```
If you cannot use the PyTorch 21.07-py3 NGC container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) to set up the required environment or create your own container.
## Quick Start Guide
To train your model using mixed or TF32 precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the QuartzNet model on the LibriSpeech dataset. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
1. Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/PyTorch/SpeechRecognition/QuartzNet
```
2. Build the QuartzNet PyTorch NGC container.
```bash
bash scripts/docker/build.sh
```
3. Start an interactive session in the NGC container to prepare the dataset, or run training/inference.
Specify a local mountpoint for the dataset with the `DATA_DIR` variable:
```bash
DATA_DIR=<path_on_the_host> bash scripts/docker/launch.sh
```
4. Download and preprocess the dataset.
No GPU is required for data download and preprocessing.
It can take several hours to complete, and requires over 250GB of free disk space.
This repository provides scripts to download and extract LibriSpeech [http://www.openslr.org/12](http://www.openslr.org/12). The dataset contains 1000 hours of 16kHz read English speech derived from public domain audiobooks from the LibriVox project and has been carefully segmented and aligned. For more information, see the [LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS](http://www.danielpovey.com/files/2015_icassp_librispeech.pdf) paper.
Inside the container, download and extract the datasets into the required format for later training and inference:
```bash
bash scripts/download_librispeech.sh
```
After the data download is complete, the following folders should exist:
```bash
datasets/LibriSpeech/
├── dev-clean
├── dev-other
├── test-clean
├── test-other
├── train-clean-100
├── train-clean-360
└── train-other-500
```
Since the container's `/datasets` directory is mounted to `DATA_DIR` on the host, the downloaded dataset is also accessible outside of the container at `$DATA_DIR/LibriSpeech`.
Next, convert the data into WAV files:
```bash
bash scripts/preprocess_librispeech.sh
```
After the data is converted, the following additional files and folders should exist:
```bash
datasets/LibriSpeech/
├── dev-clean-wav
├── dev-other-wav
├── librispeech-train-clean-100-wav.json
├── librispeech-train-clean-360-wav.json
├── librispeech-train-other-500-wav.json
├── librispeech-dev-clean-wav.json
├── librispeech-dev-other-wav.json
├── librispeech-test-clean-wav.json
├── librispeech-test-other-wav.json
├── test-clean-wav
├── test-other-wav
├── train-clean-100-wav
├── train-clean-360-wav
└── train-other-500-wav
```
5. Start training.
Inside the container, use the following script to start training.
Make sure the downloaded and preprocessed dataset is located at `$DATA_DIR/LibriSpeech` on the host, which is mounted as `/datasets/LibriSpeech` inside the container.
```bash
[OPTION1=value1 OPTION2=value2 ...] bash scripts/train.sh
```
By default, automatic mixed precision is disabled, the batch size is 144 over two gradient accumulation steps, and the recipe runs on a total of 8 GPUs. The hyperparameters are tuned for a GPU with at least 32GB of memory and will require adjustment for different configurations (for example, by lowering the batch size and using more gradient accumulation steps).
Options are passed as environment variables. More details on the available options can be found in the [Parameters](#parameters) and [Training process](#training-process) sections.
6. Start validation/evaluation.
Inside the container, use the following script to run evaluation.
Make sure the downloaded and preprocessed dataset is located at `$DATA_DIR/LibriSpeech` on the host, which is mounted as `/datasets/LibriSpeech` inside the container.
```bash
[OPTION1=value1 OPTION2=value2 ...] bash scripts/evaluation.sh
```
By default, this will use full precision, a batch size of 64, and run on a single GPU.
Options are passed as environment variables. More details on the available options can be found in the [Parameters](#parameters) and [Inference process](#inference-process) sections.
7. Start inference/predictions.
Inside the container, use the following script to run inference.
Make sure the downloaded and preprocessed dataset is located at `$DATA_DIR/LibriSpeech` on the host, which is mounted as `/datasets/LibriSpeech` inside the container.
A pretrained model checkpoint can be downloaded from [NGC model repository](https://ngc.nvidia.com/catalog/models).
```bash
[OPTION1=value1 OPTION2=value2 ...] bash scripts/inference.sh
```
By default, this will use single precision, a batch size of 64, and run on a single GPU.
Options are passed as environment variables. More details on the available options can be found in the [Parameters](#parameters) and [Inference process](#inference-process) sections.
Now that you have your model trained and evaluated, you can compare your training results with our [Training accuracy results](#training-accuracy-results). You can also benchmark your performance against the [Training performance benchmark](#training-performance-results) or [Inference performance benchmark](#inference-performance-results) results. Following the steps in these sections ensures that you achieve the same accuracy and performance as stated in the [Results](#results) section.
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
In the `root` directory, the most important files are:
```
quartznet
├── common # data pre-processing, logging, etc.
├── configs # model configurations
├── Dockerfile # container with the basic set of dependencies to run QuartzNet
├── inference.py # entry point for inference
├── quartznet # model-specific code
├── scripts # one-click scripts required for running various supported functionalities
│ ├── docker # contains the scripts for building and launching the container
│ ├── download_librispeech.sh # downloads LibriSpeech dataset
│ ├── evaluation.sh # runs evaluation using the `inference.py` script
│ ├── inference_benchmark.sh # runs the inference benchmark using the `inference_benchmark.py` script
│ ├── inference.sh # runs inference using the `inference.py` script
│ ├── preprocess_librispeech.sh # preprocess LibriSpeech raw data files for training and inference
│ ├── train_benchmark.sh # runs the training performance benchmark using the `train.py` script
│ └── train.sh # runs training using the `train.py` script
├── train.py # entry point for training
└── utils # data downloading and common routines
```
### Parameters
Parameters should be set as environment variables.
The complete list of available parameters for the `scripts/train.sh` script contains:
```bash
DATA_DIR: directory of dataset. (default: '/datasets/LibriSpeech')
MODEL_CONFIG: relative path to model configuration. (default: 'configs/quartznet10x5dr_speedp_online_speca.yaml')
OUTPUT_DIR: directory for results, logs, and created checkpoints. (default: '/results')
CHECKPOINT: a specific model checkpoint to continue training from. To resume training from the last checkpoint, see the RESUME option.
RESUME: resume training from the last checkpoint found in OUTPUT_DIR, or from scratch if there are no checkpoints (default: true)
CUDNN_BENCHMARK: boolean that indicates whether to enable cudnn benchmark mode for using more optimized kernels. (default: true)
NUM_GPUS: number of GPUs to use. (default: 8)
AMP: if set to `true`, enables automatic mixed precision (default: false)
GPU_BATCH_SIZE: batch size for every forward/backward pass. The effective batch size might be higher, if gradient accumulation is enabled (default: 72)
GRAD_ACCUMULATION: number of forward/backward passes until the optimizer updates weights. (default: 2)
LEARNING_RATE: initial learning rate. (default: 0.01)
MIN_LEARNING_RATE: minimum learning rate, despite LR scheduling (default: 1e-5)
LR_POLICY: how to decay LR (default: exponential)
LR_EXP_GAMMA: decay factor for the exponential LR schedule (default: 0.981)
EMA: decay factor for exponential averages of checkpoints (default: 0.999)
SEED: seed for random number generator and used for ensuring reproducibility. (default: 0)
EPOCHS: number of training epochs. (default: 440)
WARMUP_EPOCHS: number of initial epochs with linearly increasing LR. (default: 2)
HOLD_EPOCHS: number of epochs to hold maximum LR after warmup. (default: 140)
SAVE_FREQUENCY: number of epochs between saving the model to disk. (default: 10)
EPOCHS_THIS_JOB: run training for this number of epochs. Does not affect LR schedule like the EPOCHS parameter. (default: 0)
DALI_DEVICE: device to run the DALI pipeline on for calculation of filterbanks. Valid choices: cpu, gpu, none. (default: gpu)
PAD_TO_MAX_DURATION: pad all sequences with zeros to maximum length. (default: false)
EVAL_FREQUENCY: number of steps between evaluations on the validation set. (default: 544)
PREDICTION_FREQUENCY: the number of steps between writing a sample prediction to stdout. (default: 544)
TRAIN_MANIFESTS: lists of .json training set files
VAL_MANIFESTS: lists of .json validation set files
```
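The effective batch size per weight update is the per-pass batch size (`GPU_BATCH_SIZE`) times the number of gradient accumulation steps (`GRAD_ACCUMULATION`) and, for data-parallel training, times the number of GPUs. The following generic sketch illustrates gradient accumulation itself; the values and the model are placeholders, not the training script's internals:
```python
import torch

model = torch.nn.Linear(64, 64)                     # placeholder for the acoustic model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
gpu_batch_size, grad_accumulation = 8, 2            # illustrative values

optimizer.zero_grad()
for step in range(1, 101):
    x = torch.randn(gpu_batch_size, 64)
    loss = model(x).pow(2).mean() / grad_accumulation   # average the loss over accumulation steps
    loss.backward()                                      # gradients accumulate in .grad buffers
    if step % grad_accumulation == 0:
        optimizer.step()                                 # one weight update per accumulated batch
        optimizer.zero_grad()
```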
The complete list of available parameters for the `scripts/inference.sh` script contains:
```bash
DATA_DIR: directory of dataset. (default: '/datasets/LibriSpeech')
MODEL_CONFIG: model configuration. (default: 'configs/quartznet10x5dr_speedp-online_speca.yaml')
OUTPUT_DIR: directory for results and logs. (default: '/results')
CHECKPOINT: model checkpoint path. (required)
DATASET: name of the LibriSpeech subset to use. (default: 'dev-clean')
LOG_FILE: path to the DLLogger .json logfile. (default: '')
CUDNN_BENCHMARK: enable cudnn benchmark mode for using more optimized kernels. (default: false)
MAX_DURATION: filter out recordings longer than MAX_DURATION seconds. (default: "")
PAD_TO_MAX_DURATION: pad all sequences with zeros to maximum length. (default: false)
NUM_GPUS: number of GPUs to use. Note that with > 1 GPUs WER results might be inaccurate due to the batching policy. (default: 1)
NUM_STEPS: number of batches to evaluate, loop the dataset if necessary. (default: 0)
NUM_WARMUP_STEPS: number of initial steps before measuring performance. (default: 0)
AMP: enable FP16 inference with AMP. (default: false)
BATCH_SIZE: data batch size. (default: 64)
EMA: Attempt to load exponentially averaged weights from a checkpoint. (default: true)
SEED: seed for random number generator and used for ensuring reproducibility. (default: 0)
DALI_DEVICE: device to run the DALI pipeline on for calculation of filterbanks. Valid choices: cpu, gpu, none. (default: gpu)
CPU: run inference on CPU. (default: false)
LOGITS_FILE: dump logit matrices to a file. (default: "")
PREDICTION_FILE: save predictions to a file. (default: "${OUTPUT_DIR}/${DATASET}.predictions")
```
The complete list of available parameters for `scripts/evaluation.sh` is the same as for `scripts/inference.sh`; only the following defaults differ:
```bash
PREDICTION_FILE: (default: "")
DATASET: (default: "test-other")
```
The `scripts/inference_benchmark.sh` script pads all input to a fixed duration and computes the mean and the 90th, 95th, and 99th percentile of latency for the specified number of inference steps. Latency is measured in milliseconds per batch. The script measures latency for a single GPU and loops over a number of batch sizes and durations. It extends `scripts/inference.sh` and changes the following defaults:
```bash
BATCH_SIZE_SEQ: batch sizes to measure with. (default: "1 2 4 8 16")
MAX_DURATION_SEQ: input durations (in seconds) to measure with (default: "2 7 16.7")
CUDNN_BENCHMARK: (default: true)
PAD_TO_MAX_DURATION: (default: true)
NUM_WARMUP_STEPS: (default: 10)
NUM_STEPS: (default: 500)
DALI_DEVICE: (default: "cpu")
```
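The reported statistics can be reproduced from raw per-batch timings with a few lines of NumPy, as in this sketch, where synthetic latencies stand in for measured values:
```python
import numpy as np

# Per-batch latencies in milliseconds, collected over NUM_STEPS iterations after
# discarding NUM_WARMUP_STEPS warm-up iterations (synthetic values used here).
latencies_ms = np.random.lognormal(mean=3.6, sigma=0.1, size=500)

print("avg: %.2f ms" % latencies_ms.mean())
for p in (90, 95, 99):
    print("%d%%: %.2f ms" % (p, np.percentile(latencies_ms, p)))
```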
The `scripts/train_benchmark.sh` script pads all input to the same length according to the input argument `MAX_DURATION` and measures average training latency and throughput performance. Latency is measured in seconds per batch, throughput in sequences per second.
Training performance is measured with online speed perturbation and NVIDIA cuDNN benchmark mode enabled.
The script `scripts/train_benchmark.sh` loops over a number of batch sizes and GPU counts.
It extends `scripts/train.sh`; the complete list of available parameters for the `scripts/train_benchmark.sh` script contains:
```bash
ACC_BATCH_SIZE: accumulated (effective) batch size to measure with. (default: "144")
GRAD_ACC_SEQ: the sequence of gradient accumulation settings to measure with. (default: "4 2")
NUM_GPUS_SEQ: number of GPUs to run the training on. (default: "1 4 8")
MODEL_CONFIG: (default: "configs/quartznet10x5dr_speedp-online_train-benchmark.yaml")
TRAIN_MANIFESTS: (default: "$DATA_DIR/librispeech-train-clean-100-wav.json")
RESUME: (default: false)
EPOCHS_THIS_JOB: (default: 2)
EPOCHS: (default: 100000)
SAVE_FREQUENCY: (default: 100000)
EVAL_FREQUENCY: (default: 100000)
GRAD_ACCUMULATION_STEPS: (default: 1)
PAD_TO_MAX_DURATION: (default: true)
EMA: (default: 0)
```
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
```bash
python train.py --help
python inference.py --help
```
### Getting the data
QuartzNet is trained on the LibriSpeech dataset. We use the concatenation of `train-clean-100`, `train-clean-360`, and `train-other-500` for training and `dev-clean` for validation.
This repository contains the `scripts/download_librispeech.sh` and `scripts/preprocess_librispeech.sh` scripts, which automatically download and preprocess the training, test, and development datasets. By default, data is downloaded to the `/datasets/LibriSpeech` directory. A minimum of 250GB of free space is required for download and preprocessing; the final preprocessed dataset is approximately 100GB.
#### Dataset guidelines
The `scripts/preprocess_librispeech.sh` script converts the input audio files to WAV format with a sample rate of 16kHz. The target transcripts are stripped of whitespace characters and lower-cased. No offline augmentations are stored on disk; these are computed online with the DALI library without any impact on training time.
After preprocessing, the script creates JSON metadata files with output file paths, sample rate, target transcripts, and other metadata. These JSON files are used by the training script to identify training and validation datasets.
The QuartzNet model was tuned on audio signals with a sample rate of 16kHz. If you wish to use a different sampling rate, some hyperparameters might need to be changed; specifically, the window size and step size.
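For reference, mel-scale filterbank features of the kind computed by the DALI pipeline can be approximated offline with librosa; the window and step sizes below are illustrative placeholders, as is the input file name:
```python
import numpy as np
import librosa

sample_rate = 16000
window_size = 0.02      # seconds; illustrative, must match the model configuration
window_step = 0.01      # seconds; illustrative

# 'sample.wav' is a hypothetical input file; librosa resamples it to 16 kHz.
audio, _ = librosa.load('sample.wav', sr=sample_rate)
mel = librosa.feature.melspectrogram(
    y=audio, sr=sample_rate, n_mels=64,
    n_fft=int(window_size * sample_rate),
    hop_length=int(window_step * sample_rate))
log_mel = np.log(mel + 1e-20)   # shape: (n_mels, num_frames)
```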
#### Multi-dataset
Training scripts in this repository treat the training subsets of LibriSpeech (`train-clean-100`, `train-clean-360`, `train-other-500`) as three independent training datasets.
To add more datasets, follow the LibriSpeech format, adjust the provided pre-processing scripts to generate metadata JSON files, and pass those files to the `scripts/train.sh` script through the `TRAIN_MANIFESTS` variable.
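If a single combined manifest is preferred over passing several files, metadata files can also be merged with a short script; the sketch below assumes each manifest is a JSON list of utterance entries, and the non-LibriSpeech paths are hypothetical:
```python
import json

# The non-LibriSpeech paths below are hypothetical examples.
manifest_paths = [
    '/datasets/LibriSpeech/librispeech-train-clean-100-wav.json',
    '/datasets/MyCorpus/my-corpus-train-wav.json',
]

entries = []
for path in manifest_paths:
    with open(path) as f:
        entries.extend(json.load(f))   # assumes each manifest is a JSON list of entries

with open('/datasets/combined-train.json', 'w') as f:
    json.dump(entries, f)
```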
### Training process
Training is performed using the `train.py` script along with parameters defined in `scripts/train.sh`.
The `scripts/train.sh` script runs a job on a single node that trains the QuartzNet model from scratch using LibriSpeech as training data. To make training more efficient, we discard audio samples longer than 16.7 seconds from the training dataset; these samples make up less than 1% of the total. Such filtering does not degrade accuracy, but it allows us to decrease the number of time steps in a batch, which requires less GPU memory and increases training speed.
Apart from the default arguments as listed in the [Parameters](#parameters) section, by default the training script:
* Runs on 8 GPUs with at least 32GB of memory and training/evaluation batch size 48, split over three gradient accumulation steps
* Uses TF32 precision (A100 GPU) or FP32 (other GPUs)
* Trains on the concatenation of all 3 LibriSpeech training datasets and evaluates on the LibriSpeech dev-clean dataset
* Maintains an exponential moving average of parameters for evaluation
* Has cuDNN benchmark enabled
* Runs for 260 epochs
* Uses an initial learning rate of 0.02 and an exponential learning rate decay
* Saves a checkpoint every 10 epochs
* Automatically removes old checkpoints and preserves milestone checkpoints
* Runs evaluation on the development dataset every epoch and at the end of training
* Maintains a separate checkpoint with the lowest WER on development set
* Prints out training progress every iteration to `stdout`
* Creates a DLLogger log file and a TensorBoard log
* Calculates speed perturbation online during training
* Uses `SpecAugment` in data pre-processing
* Filters out audio samples longer than 16.7 seconds
* Pads each batch so its length is divisible by 16
* Uses time-channel separable convolutions as described in the paper
* Uses weight decay of 0.001
* Uses [Novograd](https://arxiv.org/pdf/1905.11286.pdf) as the optimizer with betas=(0.95, 0)
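The batch padding mentioned in the list above refers to the time dimension of the feature tensor; a minimal sketch of padding the time axis to a multiple of 16 (shapes are illustrative):
```python
import torch
import torch.nn.functional as F

def pad_time_to_multiple(features, multiple=16):
    """Right-pad the time axis of a (batch, n_mels, time) tensor with zeros."""
    time = features.size(-1)
    pad = (multiple - time % multiple) % multiple
    return F.pad(features, (0, pad))            # pad only the last (time) dimension

feats = torch.randn(8, 64, 437)                 # illustrative batch of filterbank features
print(pad_time_to_multiple(feats).shape)        # torch.Size([8, 64, 448])
```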
Enabling AMP permits batch size 144 with one gradient accumulation step. Since each batch has to be padded to the longest sequence, all GPUs have to wait for the slowest one, and two accumulation steps are slightly faster.
The current training setup improves upon the greedy WER [Results](#results) of the QuartzNet paper.
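The exponential moving average of parameters listed among the training defaults above (controlled by the `EMA` parameter) is maintained alongside the regular weights and is typically the set of weights used for evaluation. A generic sketch of such an average, not the training script's exact bookkeeping:
```python
import copy
import torch

def update_ema(model, ema_model, decay=0.999):
    """In-place update: ema = decay * ema + (1 - decay) * current."""
    with torch.no_grad():
        for p, ema_p in zip(model.parameters(), ema_model.parameters()):
            ema_p.mul_(decay).add_(p, alpha=1.0 - decay)

model = torch.nn.Linear(16, 16)            # placeholder model
ema_model = copy.deepcopy(model)           # the averaged copy used for evaluation

# Call after every optimizer step:
update_ema(model, ema_model, decay=0.999)
```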
### Inference process
Inference is performed using the `inference.py` script along with parameters defined in `scripts/inference.sh`.
The `scripts/inference.sh` script runs the job on a single GPU, taking a pre-trained QuartzNet model checkpoint and running it on the specified dataset.
Apart from the default arguments as listed in the [Parameters](#parameters) section, by default, the inference script:
* Evaluates on the LibriSpeech dev-clean dataset and prints out the final word error rate
* Uses a batch size of 64
* Creates a log file with progress and results which will be stored in the `results` folder
* Pads each batch so its length is divisible by 16
* Does not use data augmentation
* Does greedy decoding and optionally saves the transcriptions in the results folder
* Has the option to save the model output tensors for more complex decoding, for example, beam search
* Has cuDNN benchmark disabled
To view all available options for inference, run `python inference.py --help`.
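Greedy decoding, as mentioned in the list above, selects the most likely symbol at every frame, collapses repeated symbols, and removes CTC blanks. A minimal sketch of this procedure; the character set and blank index are illustrative, not the repository's exact vocabulary handling:
```python
import torch

labels = list("abcdefghijklmnopqrstuvwxyz '")   # illustrative character set
blank_idx = len(labels)                          # assume the CTC blank is the last class

def greedy_ctc_decode(log_probs):
    """log_probs: (time, num_classes) tensor of per-frame log-probabilities."""
    best = log_probs.argmax(dim=-1).tolist()     # most likely class per frame
    out, prev = [], None
    for idx in best:
        if idx != prev and idx != blank_idx:     # collapse repeats, drop blanks
            out.append(labels[idx])
        prev = idx
    return ''.join(out)

print(greedy_ctc_decode(torch.randn(50, len(labels) + 1).log_softmax(dim=-1)))
```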
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To benchmark the training performance with a number of specific configurations, run:
```bash
GRAD_ACC_SEQ=<SEQUENCE> NUM_GPUS_SEQ=<NUMS_OF_GPUS> bash scripts/train_benchmark.sh
```
for example:
```bash
GRAD_ACC_SEQ="12 24" NUM_GPUS_SEQ="4 8" bash scripts/train_benchmark.sh
```
This invocation measures performance in four setups: two different gradient accumulation settings (and thus two batch sizes for each forward/backward pass) times two GPU counts.
By default, this script makes forward/backward pre-allocation passes with all possible audio lengths, which allows training step times to stabilize immediately in cuDNN benchmark mode, and trains for two epochs on the `train-clean-100` subset of LibriSpeech.
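With cuDNN benchmark mode enabled, the first forward/backward pass for every new input shape triggers an algorithm search, so warming up over all audio lengths that can occur keeps these one-off costs out of the measurement. A generic sketch of the idea, with a placeholder model and length grid:
```python
import torch

torch.backends.cudnn.benchmark = True
model = torch.nn.Conv1d(64, 64, kernel_size=33, padding=16).cuda()   # placeholder model

# Warm up cuDNN's algorithm cache for every feature length that can occur in training.
for time_steps in range(16, 1680 + 1, 16):       # lengths padded to multiples of 16
    x = torch.randn(8, 64, time_steps, device='cuda')
    model(x).sum().backward()
torch.cuda.synchronize()
```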
#### Inference performance benchmark
To benchmark the inference performance on a specific batch size and audio length, run:
```bash
BATCH_SIZE_SEQ=<BATCH_SIZES> MAX_DURATION_SEQ=<DURATIONS> bash scripts/inference_benchmark.sh
```
for example:
```bash
BATCH_SIZE_SEQ="24 48" MAX_DURATION_SEQ="2 7 16.7" bash scripts/inference_benchmark.sh
```
The script runs on a single GPU and evaluates on a dataset of fixed-length utterances shorter than `MAX_DURATION`, padded to that duration.
### Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
#### Training accuracy results
##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)
Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 21.07-py3 NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs.
| Number of GPUs | Batch size per GPU | Precision | dev-clean WER | dev-other WER | test-clean WER | test-other WER | Time to train |
|-----|-----|-------|-------|-------|------|-------|------|
| 8 | 144 | mixed | 3.47 | 10.84 | 3.69 | 10.69 | 34 h |
The table reports word error rate (WER) of the acoustic model with greedy decoding on all LibriSpeech dev and test datasets for mixed precision training.
##### Training stability test
The following table compares greedy decoding word error rates across 8 different training runs with different seeds for mixed precision training.
| DGX A100 80GB, FP16, 8x GPU | Seed #1 | Seed #2 | Seed #3 | Seed #4 | Seed #5 | Seed #6 | Seed #7 | Seed #8 | Mean | Std |
|-----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|-------:|------:|
| dev-clean | 3.57 | 3.48 | 3.54 | 3.48 | 3.47 | 3.69 | 3.51 | 3.59 | 3.54 | 0.07 |
| dev-other | 10.68 | 10.78 | 10.47 | 10.72 | 10.84 | 11.03 | 10.67 | 10.86 | 10.76 | 0.15 |
| test-clean | 3.70 | 3.82 | 3.79 | 3.84 | 3.69 | 4.03 | 3.82 | 3.80 | 3.81 | 0.10 |
| test-other | 10.75 | 10.62 | 10.54 | 10.90 | 10.69 | 11.14 | 10.41 | 10.82 | 10.73 | 0.21 |
#### Training performance results
##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
Our results were obtained by running:
```bash
AMP=true NUM_GPUS_SEQ="1" GRAD_ACC_SEQ="16 24" bash scripts/train_benchmark.sh
AMP=true NUM_GPUS_SEQ="4" GRAD_ACC_SEQ="4 6" bash scripts/train_benchmark.sh
AMP=true NUM_GPUS_SEQ="8" GRAD_ACC_SEQ="2 3" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="1" GRAD_ACC_SEQ="16 24" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="4" GRAD_ACC_SEQ="4 6" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="8" GRAD_ACC_SEQ="2 3" bash scripts/train_benchmark.sh
```
in the PyTorch 21.07-py3 NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
| Batch size / GPU | Grad accumulation | GPUs | Throughput - mixed precision | Throughput - TF32 | Throughput speedup (TF32 to mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
|-----:|-----:|-------:|----------:|-------:|--------:|-----:|------:|
| 48 | 24 | 1 | 89.69 | 78.89 | 1.14 | 1.00 | 1.00 |
| 72 | 16 | 1 | 88.70 | 79.01 | 1.12 | 1.00 | 1.00 |
| 48 | 6 | 4 | 343.06 | 303.16 | 1.13 | 3.84 | 3.82 |
| 72 | 4 | 4 | 341.95 | 304.47 | 1.12 | 3.85 | 3.86 |
| 48 | 3 | 8 | 644.27 | 576.37 | 1.12 | 7.31 | 7.18 |
| 72 | 2 | 8 | 651.60 | 583.31 | 1.12 | 7.38 | 7.35 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Training performance: NVIDIA DGX-2 (16x V100 32GB)
Our results were obtained by running:
```bash
AMP=true NUM_GPUS_SEQ="1" GRAD_ACC_SEQ="24 48" bash scripts/train_benchmark.sh
AMP=true NUM_GPUS_SEQ="4" GRAD_ACC_SEQ="6 12" bash scripts/train_benchmark.sh
AMP=true NUM_GPUS_SEQ="8" GRAD_ACC_SEQ="3 6" bash scripts/train_benchmark.sh
AMP=true NUM_GPUS_SEQ="16" GRAD_ACC_SEQ="3" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="1" GRAD_ACC_SEQ="48" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="4" GRAD_ACC_SEQ="12" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="8" GRAD_ACC_SEQ="6" bash scripts/train_benchmark.sh
AMP=false NUM_GPUS_SEQ="16" GRAD_ACC_SEQ="3" bash scripts/train_benchmark.sh
```
in the PyTorch 21.07-py3 NGC container on NVIDIA DGX-2 (16x V100 32GB) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
| Batch size / GPU | Grad accumulation | GPUs | Throughput - mixed precision | Throughput - FP32 | Throughput speedup (FP32 to mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|-----:|-----:|-------:|----------:|-------:|--------:|------:|------:|
| 24 | 48 | 1 | 67.95 | 44.65 | 1.52 | 1.00 | 1.00 |
| 48 | 24 | 1 | 67.49 | - | - | 1.00 | 1.00 |
| 24 | 12 | 4 | 258.56 | 170.18 | 1.52 | 3.81 | 3.81 |
| 48 | 6 | 4 | 254.58 | - | - | - | 3.77 |
| 24 | 6 | 8 | 495.52 | 330.53 | 1.50 | 7.40 | 7.29 |
| 48 | 3 | 8 | 477.87 | - | - | - | 7.08 |
| 24 | 3 | 16 | 872.99 | 616.51 | 1.42 | 13.81 | 12.85 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Inference performance results
##### Inference performance: NVIDIA DGX A100 (1x A100 80GB)
Our results were obtained by running:
```bash
AMP=false bash scripts/inference_benchmark.sh
AMP=true bash scripts/inference_benchmark.sh
```
in the PyTorch 21.07-py3 NGC container on NVIDIA DGX A100 (1x A100 80GB) GPU.
Performance numbers (latency in milliseconds per batch) were averaged over 500 iterations.
| | | FP16 Latency (ms) Percentiles | | | | TF32 Latency (ms) Percentiles | | | | FP16/TF32 speed up |
|-----:|---------------:|------:|------:|------:|------:|------:|------:|------:|------:|------:|
| BS | Duration (s) | 90% | 95% | 99% | Avg | 90% | 95% | 99% | Avg | Avg |
| 1 | 2.0 | 35.51 | 36.36 | 55.57 | 35.71 | 33.23 | 33.86 | 40.05 | 33.23 | 0.93 |
| 2 | 2.0 | 38.05 | 38.91 | 52.67 | 38.21 | 34.17 | 35.17 | 39.32 | 33.73 | 0.88 |
| 4 | 2.0 | 38.43 | 38.98 | 45.44 | 37.78 | 35.02 | 36.00 | 44.10 | 34.75 | 0.92 |
| 8 | 2.0 | 38.63 | 39.37 | 45.43 | 37.94 | 35.49 | 36.70 | 45.94 | 34.53 | 0.91 |
| 16 | 2.0 | 42.33 | 44.58 | 61.02 | 40.28 | 35.66 | 36.93 | 45.38 | 34.78 | 0.86 |
| 1 | 7.0 | 37.72 | 38.54 | 42.56 | 37.28 | 33.23 | 34.16 | 40.54 | 33.13 | 0.89 |
| 2 | 7.0 | 39.44 | 41.35 | 53.62 | 38.56 | 35.15 | 35.81 | 41.83 | 34.82 | 0.90 |
| 4 | 7.0 | 38.39 | 39.48 | 45.01 | 37.98 | 37.54 | 38.51 | 42.67 | 36.12 | 0.95 |
| 8 | 7.0 | 40.82 | 41.76 | 54.20 | 39.43 | 37.67 | 39.97 | 45.24 | 36.12 | 0.92 |
| 16 | 7.0 | 42.80 | 44.80 | 56.92 | 41.52 | 40.66 | 41.96 | 53.24 | 39.24 | 0.95 |
| 1 | 16.7 | 38.22 | 38.98 | 44.15 | 37.80 | 33.89 | 34.98 | 42.66 | 33.23 | 0.88 |
| 2 | 16.7 | 39.84 | 41.09 | 52.50 | 39.34 | 35.86 | 37.16 | 42.04 | 34.39 | 0.87 |
| 4 | 16.7 | 41.02 | 42.64 | 54.96 | 39.50 | 35.98 | 37.02 | 39.30 | 34.87 | 0.88 |
| 8 | 16.7 | 40.93 | 42.06 | 56.26 | 39.36 | 40.93 | 42.06 | 45.50 | 39.34 | 1.00 |
| 16 | 16.7 | 57.21 | 58.65 | 71.33 | 57.78 | 62.74 | 63.82 | 71.13 | 61.49 | 1.06 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Inference performance: NVIDIA DGX-2 (1x V100 32GB)
Our results were obtained by running:
```bash
AMP=false bash scripts/inference_benchmark.sh
AMP=true bash scripts/inference_benchmark.sh
```
in the PyTorch 21.07-py3 NGC container on NVIDIA DGX-2 (1x V100 32GB) GPU.
Performance numbers (latency in milliseconds per batch) were averaged over 500 iterations.
| | | FP16 Latency (ms) Percentiles | | | | FP32 Latency (ms) Percentiles | | | | FP16/FP32 speed up |
|-----:|---------------:|------:|------:|------:|------:|-------:|-------:|-------:|-------:|------:|
| BS | Duration (s) | 90% | 95% | 99% | Avg | 90% | 95% | 99% | Avg | Avg |
| 1 | 2.0 | 36.89 | 38.16 | 41.80 | 35.85 | 33.44 | 33.78 | 38.09 | 33.01 | 0.92 |
| 2 | 2.0 | 40.47 | 41.33 | 45.70 | 40.02 | 32.62 | 33.27 | 36.38 | 32.09 | 0.80 |
| 4 | 2.0 | 41.50 | 42.85 | 49.65 | 41.12 | 34.56 | 34.83 | 37.10 | 34.04 | 0.83 |
| 8 | 2.0 | 49.87 | 50.48 | 51.99 | 49.19 | 34.90 | 35.17 | 36.57 | 34.27 | 0.70 |
| 16 | 2.0 | 46.39 | 46.77 | 47.87 | 40.04 | 45.37 | 45.89 | 47.52 | 44.46 | 1.11 |
| 1 | 7.0 | 48.83 | 49.16 | 52.22 | 48.26 | 33.87 | 34.50 | 36.45 | 33.24 | 0.69 |
| 2 | 7.0 | 41.48 | 41.82 | 45.07 | 41.03 | 42.32 | 42.66 | 43.86 | 41.79 | 1.02 |
| 4 | 7.0 | 42.48 | 43.25 | 47.29 | 41.56 | 37.20 | 38.18 | 39.74 | 36.46 | 0.88 |
| 8 | 7.0 | 39.78 | 40.49 | 44.73 | 38.89 | 46.84 | 47.17 | 48.07 | 44.78 | 1.15 |
| 16 | 7.0 | 49.85 | 50.56 | 53.04 | 44.95 | 60.21 | 60.68 | 64.92 | 57.94 | 1.29 |
| 1 | 16.7 | 40.80 | 41.16 | 42.96 | 40.52 | 42.04 | 42.53 | 44.59 | 37.08 | 0.92 |
| 2 | 16.7 | 41.37 | 41.69 | 43.74 | 40.85 | 35.61 | 36.49 | 40.32 | 34.68 | 0.85 |
| 4 | 16.7 | 50.22 | 51.07 | 54.13 | 49.51 | 40.95 | 41.38 | 44.09 | 40.39 | 0.82 |
| 8 | 16.7 | 44.93 | 45.38 | 49.24 | 44.16 | 62.54 | 62.92 | 65.95 | 61.86 | 1.40 |
| 16 | 16.7 | 70.74 | 71.56 | 75.16 | 69.87 | 102.52 | 103.57 | 108.20 | 101.57 | 1.45 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
## Release notes
We're constantly refining and improving our performance on AI and HPC workloads, even on the same hardware, with frequent updates to our software stack. For our latest performance data, refer to these pages for [AI](https://developer.nvidia.com/deep-learning-performance-training-inference) and [HPC](https://developer.nvidia.com/hpc-application-performance) benchmarks.
### Changelog
September 2021
- Initial release
### Known issues
There are no known issues in this release.


@@ -0,0 +1,247 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import soundfile as sf
import librosa
import torch
import numpy as np
import sox
def audio_from_file(file_path, offset=0, duration=0, trim=False, target_sr=16000):
audio = AudioSegment(file_path, target_sr=target_sr, int_values=False,
offset=offset, duration=duration, trim=trim)
samples = torch.tensor(audio.samples, dtype=torch.float).cuda()
num_samples = torch.tensor(samples.shape[0]).int().cuda()
return (samples.unsqueeze(0), num_samples.unsqueeze(0))
class AudioSegment(object):
"""Monaural audio segment abstraction.
:param samples: Audio samples [num_samples x num_channels].
:type samples: ndarray.float32
:param sample_rate: Audio sample rate.
:type sample_rate: int
:raises TypeError: If the sample data type is not float or int.
"""
def __init__(self, filename, target_sr=None, int_values=False, offset=0,
duration=0, trim=False, trim_db=60):
"""Create audio segment from samples.
Samples are converted to float32 internally, with int scaled to [-1, 1].
Load a file supported by librosa and return as an AudioSegment.
:param filename: path of file to load
:param target_sr: the desired sample rate
:param int_values: if true, load samples as 32-bit integers
:param offset: offset in seconds when loading audio
:param duration: duration in seconds when loading audio
:return: numpy array of samples
"""
with sf.SoundFile(filename, 'r') as f:
dtype = 'int32' if int_values else 'float32'
sample_rate = f.samplerate
if offset > 0:
f.seek(int(offset * sample_rate))
if duration > 0:
samples = f.read(int(duration * sample_rate), dtype=dtype)
else:
samples = f.read(dtype=dtype)
samples = samples.transpose()
samples = self._convert_samples_to_float32(samples)
if target_sr is not None and target_sr != sample_rate:
samples = librosa.core.resample(samples, sample_rate, target_sr)
sample_rate = target_sr
if trim:
samples, _ = librosa.effects.trim(samples, trim_db)
self._samples = samples
self._sample_rate = sample_rate
if self._samples.ndim >= 2:
self._samples = np.mean(self._samples, 1)
def __eq__(self, other):
"""Return whether two objects are equal."""
if type(other) is not type(self):
return False
if self._sample_rate != other._sample_rate:
return False
if self._samples.shape != other._samples.shape:
return False
if np.any(self.samples != other._samples):
return False
return True
def __ne__(self, other):
"""Return whether two objects are unequal."""
return not self.__eq__(other)
def __str__(self):
"""Return human-readable representation of segment."""
return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
"rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate,
self.duration, self.rms_db))
@staticmethod
def _convert_samples_to_float32(samples):
"""Convert sample type to float32.
Audio sample type is usually integer or float-point.
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
bits = np.iinfo(samples.dtype).bits
float32_samples *= (1. / 2 ** (bits - 1))
elif samples.dtype in np.sctypes['float']:
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples
@property
def samples(self):
return self._samples.copy()
@property
def sample_rate(self):
return self._sample_rate
@property
def num_samples(self):
return self._samples.shape[0]
@property
def duration(self):
return self._samples.shape[0] / float(self._sample_rate)
@property
def rms_db(self):
mean_square = np.mean(self._samples ** 2)
return 10 * np.log10(mean_square)
def gain_db(self, gain):
self._samples *= 10. ** (gain / 20.)
def pad(self, pad_size, symmetric=False):
"""Add zero padding to the sample.
The pad size is given in number of samples. If symmetric=True,
`pad_size` will be added to both sides. If false, `pad_size` zeros
will be added only to the end.
"""
self._samples = np.pad(self._samples,
(pad_size if symmetric else 0, pad_size),
mode='constant')
def subsegment(self, start_time=None, end_time=None):
"""Cut the AudioSegment between given boundaries.
Note that this is an in-place transformation.
:param start_time: Beginning of subsegment in seconds.
:type start_time: float
:param end_time: End of subsegment in seconds.
:type end_time: float
:raise ValueError: If start_time or end_time is incorrectly set, e.g. out
of bounds in time.
"""
start_time = 0.0 if start_time is None else start_time
end_time = self.duration if end_time is None else end_time
if start_time < 0.0:
start_time = self.duration + start_time
if end_time < 0.0:
end_time = self.duration + end_time
if start_time < 0.0:
raise ValueError("The slice start position (%f s) is out of "
"bounds." % start_time)
if end_time < 0.0:
raise ValueError("The slice end position (%f s) is out of bounds." %
end_time)
if start_time > end_time:
raise ValueError("The slice start position (%f s) is later than "
"the end position (%f s)." % (start_time, end_time))
if end_time > self.duration:
raise ValueError("The slice end position (%f s) is out of bounds "
"(> %f s)" % (end_time, self.duration))
start_sample = int(round(start_time * self._sample_rate))
end_sample = int(round(end_time * self._sample_rate))
self._samples = self._samples[start_sample:end_sample]
class Perturbation:
def __init__(self, p=0.1, rng=None):
self.p = p
self._rng = random.Random() if rng is None else rng
def maybe_apply(self, segment, sample_rate=None):
if self._rng.random() < self.p:
self(segment, sample_rate)
class SpeedPerturbation(Perturbation):
def __init__(self, min_rate=0.85, max_rate=1.15, discrete=False, p=0.1, rng=None):
super(SpeedPerturbation, self).__init__(p, rng)
assert 0 < min_rate < max_rate
self.min_rate = min_rate
self.max_rate = max_rate
self.discrete = discrete
def __call__(self, data, sample_rate):
if self.discrete:
rate = np.random.choice([self.min_rate, None, self.max_rate])
else:
rate = self._rng.uniform(self.min_rate, self.max_rate)
if rate is not None:
data._samples = sox.Transformer().speed(factor=rate).build_array(
input_array=data._samples, sample_rate_in=sample_rate)
class GainPerturbation(Perturbation):
def __init__(self, min_gain_dbfs=-10, max_gain_dbfs=10, p=0.1, rng=None):
super(GainPerturbation, self).__init__(p, rng)
self._rng = random.Random() if rng is None else rng
self._min_gain_dbfs = min_gain_dbfs
self._max_gain_dbfs = max_gain_dbfs
def __call__(self, data, sample_rate=None):
del sample_rate
gain = self._rng.uniform(self._min_gain_dbfs, self._max_gain_dbfs)
data._samples = data._samples * (10. ** (gain / 20.))
class ShiftPerturbation(Perturbation):
def __init__(self, min_shift_ms=-5.0, max_shift_ms=5.0, p=0.1, rng=None):
super(ShiftPerturbation, self).__init__(p, rng)
self._min_shift_ms = min_shift_ms
self._max_shift_ms = max_shift_ms
def __call__(self, data, sample_rate):
shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
if abs(shift_ms) / 1000 > data.duration:
# TODO: do something smarter than just ignore this condition
return
shift_samples = int(shift_ms * data.sample_rate // 1000)
# print("DEBUG: shift:", shift_samples)
if shift_samples < 0:
data._samples[-shift_samples:] = data._samples[:shift_samples]
data._samples[:-shift_samples] = 0
elif shift_samples > 0:
data._samples[:-shift_samples] = data._samples[shift_samples:]
data._samples[-shift_samples:] = 0
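

# Illustrative sketch only (not part of the released pipeline): shows how the
# classes above are typically wired together. The file path below is hypothetical.
def _example_augment_audio(wav_path="sample.wav"):
    """Load a waveform, apply the perturbations defined above, return a tensor."""
    segment = AudioSegment(wav_path, target_sr=16000, trim=False)
    perturbations = [
        SpeedPerturbation(min_rate=0.85, max_rate=1.15, p=1.0),
        GainPerturbation(min_gain_dbfs=-10, max_gain_dbfs=10, p=1.0),
        ShiftPerturbation(min_shift_ms=-5.0, max_shift_ms=5.0, p=1.0),
    ]
    for p in perturbations:
        # maybe_apply draws from the perturbation's RNG and applies it with probability p
        p.maybe_apply(segment, segment.sample_rate)
    return torch.FloatTensor(segment.samples)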

View file

@ -0,0 +1,182 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import math
import os
import torch
import torch.distributed as dist
from .iterator import DaliIterator, SyntheticDataIterator
from .pipeline import make_dali_asr_pipeline
from common.helpers import print_once
def _parse_json(json_path: str, start_label=0, predicate=lambda json: True):
"""
Parses json file to the format required by DALI.
Args:
json_path: path to json file
start_label: the label, starting from which DALI will assign
consecutive int numbers to every transcript
predicate: function, that accepts a sample descriptor
(i.e. json dictionary) as an argument. If the predicate for a given
sample returns True, it will be included in the dataset.
Returns:
output_files: dict that maps file name to label assigned by DALI
transcripts: dict that maps label assigned by DALI to the transcript
"""
with open(json_path) as f:
librispeech_json = json.load(f)
output_files = {}
transcripts = {}
curr_label = start_label
for original_sample in librispeech_json:
if not predicate(original_sample):
continue
transcripts[curr_label] = original_sample['transcript']
output_files[original_sample['files'][-1]['fname']] = curr_label
curr_label += 1
return output_files, transcripts
def _dict_to_file(d: dict, filename: str):
    with open(filename, "w") as f:
        for key, value in d.items():
            f.write("{} {}\n".format(key, value))
class DaliDataLoader:
"""
DataLoader is the main entry point to the data preprocessing pipeline.
    To use, create an object and then iterate over `data_iterator()`;
    DataLoader will do the rest for you.
    Example:
        data_layer = DaliDataLoader(gpu_id, dataset_path, config_data,
                                    config_features, json_names, symbols,
                                    batch_size, pipeline_type="train")
        for data in data_layer.data_iterator():
            print(data)  # Here's your preprocessed data
Args:
device_type: Which device to use for preprocessing. Choose: "cpu", "gpu"
pipeline_type: Choose: "train", "val", "synth"
"""
def __init__(self, gpu_id, dataset_path: str, config_data: dict,
config_features: dict, json_names: list, symbols: list,
batch_size: int, pipeline_type: str,
grad_accumulation_steps: int = 1,
synth_iters_per_epoch: int = 544, device_type: str = "gpu"):
self.batch_size = batch_size
self.grad_accumulation_steps = grad_accumulation_steps
self.drop_last = (pipeline_type == 'train')
self.device_type = device_type
pipeline_type = self._parse_pipeline_type(pipeline_type)
if pipeline_type == "synth":
self._dali_data_iterator = self._init_synth_iterator(
self.batch_size,
config_features['nfilt'],
iters_per_epoch=synth_iters_per_epoch,
ngpus=torch.distributed.get_world_size())
else:
self._dali_data_iterator = self._init_iterator(
gpu_id=gpu_id,
dataset_path=dataset_path,
config_data=config_data,
config_features=config_features,
json_names=json_names,
symbols=symbols,
train_pipeline=pipeline_type == "train")
def _init_iterator(self, gpu_id, dataset_path, config_data,
config_features, json_names: list, symbols: list,
train_pipeline: bool):
"""Returns an iterator over data preprocessed with Dali."""
def hash_list_of_strings(li):
return str(abs(hash(''.join(li))))
output_files, transcripts = {}, {}
max_duration = config_data['max_duration']
for jname in json_names:
of, tr = _parse_json(
jname if jname[0] == '/' else os.path.join(dataset_path, jname),
len(output_files),
predicate=lambda json: json['original_duration'] <= max_duration)
output_files.update(of)
transcripts.update(tr)
file_list_path = os.path.join(
"/tmp", "asr_dali.file_list." + hash_list_of_strings(json_names))
_dict_to_file(output_files, file_list_path)
self.dataset_size = len(output_files)
print_once('Dataset read by DALI. '
f'Number of samples: {self.dataset_size}')
pipeline = make_dali_asr_pipeline(
config_data=config_data,
config_features=config_features,
device_id=gpu_id,
file_root=dataset_path,
file_list=file_list_path,
device_type=self.device_type,
batch_size=self.batch_size,
train_pipeline=train_pipeline)
return DaliIterator([pipeline], transcripts=transcripts,
symbols=symbols, batch_size=self.batch_size,
reader_name="file_reader",
train_iterator=train_pipeline)
def _init_synth_iterator(self, batch_size, nfeatures, iters_per_epoch,
ngpus):
self.dataset_size = ngpus * iters_per_epoch * batch_size
return SyntheticDataIterator(batch_size, nfeatures, regenerate=True)
@staticmethod
def _parse_pipeline_type(pipeline_type):
pipe = pipeline_type.lower()
assert pipe in ("train", "val", "synth"), \
'Invalid pipeline type (choices: "train", "val", "synth").'
return pipe
def _shard_size(self):
"""
Total number of samples handled by a single GPU in a single epoch.
"""
world_size = dist.get_world_size() if dist.is_initialized() else 1
if self.drop_last:
divisor = world_size * self.batch_size * self.grad_accumulation_steps
return self.dataset_size // divisor * divisor // world_size
else:
return int(math.ceil(self.dataset_size / world_size))
def __len__(self):
"""
Number of batches handled by each GPU.
"""
if self.drop_last:
assert self._shard_size() % self.batch_size == 0, \
f'{self._shard_size()} {self.batch_size}'
return int(math.ceil(self._shard_size() / self.batch_size))
def data_iterator(self):
return self._dali_data_iterator
def __iter__(self):
return self._dali_data_iterator
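

# Worked example (illustration only) of the sharding arithmetic implemented in
# _shard_size() and __len__() above; all the numbers below are hypothetical.
def _example_shard_arithmetic():
    dataset_size = 10_000
    world_size = 8
    batch_size = 16
    grad_accumulation_steps = 2
    # drop_last=True (training): round dataset_size down to a multiple of the
    # global batch (world_size * batch_size * grad_accumulation_steps) and
    # split it evenly across GPUs.
    divisor = world_size * batch_size * grad_accumulation_steps  # 256
    shard_size = dataset_size // divisor * divisor // world_size  # 9984 // 8 = 1248
    # each GPU then sees shard_size / batch_size batches per epoch
    batches_per_gpu = shard_size // batch_size  # 78
    return shard_size, batches_per_gpu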

View file

@ -0,0 +1,183 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import torch
from nvidia.dali.plugin.base_iterator import LastBatchPolicy
from nvidia.dali.plugin.pytorch import DALIGenericIterator
from common.helpers import print_once
from common.text import _clean_text, punctuation_map
def normalize_string(s, symbols, punct_map):
"""
Normalizes string.
Example:
'call me at 8:00 pm!' -> 'call me at eight zero pm'
"""
labels = set(symbols)
try:
text = _clean_text(s, ["english_cleaners"], punct_map).strip()
return ''.join([tok for tok in text if all(t in labels for t in tok)])
except Exception as e:
print_once(f"WARNING: Normalizing failed: {s} {e}")
class DaliIterator(object):
"""Returns batches of data.
Batches are in the form:
(preprocessed_signal, preprocessed_signal_length, transcript,
transcript_length)
This iterator is not meant to be the entry point to a Dali pipeline.
Use DataLoader instead.
"""
def __init__(self, dali_pipelines, transcripts, symbols, batch_size,
reader_name, train_iterator: bool):
self.transcripts = transcripts
self.symbols = symbols
self.batch_size = batch_size
        # in the train pipeline shard_size is set to be divisible by batch_size,
        # so the DROP last-batch policy below does not discard any samples
self.dali_it = DALIGenericIterator(
dali_pipelines,
["audio", "label", "audio_shape"],
reader_name=reader_name,
dynamic_shape=True,
auto_reset=True,
last_batch_policy=LastBatchPolicy.DROP)
@staticmethod
def _str2list(s: str):
"""
Returns list of floats, that represents given string.
'0.' denotes separator
'1.' denotes 'a'
'27.' denotes "'"
Assumes, that the string is lower case.
"""
list = []
for c in s:
if c == "'":
list.append(27.)
else:
list.append(max(0., ord(c) - 96.))
return list
@staticmethod
def _pad_lists(lists: list, pad_val=0):
"""
        Pads lists in place so that they all have the same length.
        Returns a list with the original lengths of the input lists.
"""
max_length = 0
sizes = []
for li in lists:
sizes.append(len(li))
max_length = max_length if len(li) < max_length else len(li)
for li in lists:
li += [pad_val] * (max_length - len(li))
return sizes
def _gen_transcripts(self, labels, normalize_transcripts: bool = True):
"""
Generate transcripts in format expected by NN
"""
if normalize_transcripts:
lists = [
self._str2list(normalize_string(self.transcripts[lab.item()],
self.symbols, punctuation_map(self.symbols)))
for lab in labels]
else:
lists = [self._str2list(self.transcripts[lab.item()])
for lab in labels]
sizes = self._pad_lists(lists)
return (torch.tensor(lists).cuda(),
torch.tensor(sizes, dtype=torch.int32).cuda())
def __next__(self):
data = self.dali_it.__next__()
transcripts, transcripts_lengths = self._gen_transcripts(
data[0]["label"])
return (data[0]["audio"], data[0]["audio_shape"][:, 1], transcripts,
transcripts_lengths)
def next(self):
return self.__next__()
def __iter__(self):
return self
# TODO: refactor
class SyntheticDataIterator(object):
def __init__(self, batch_size, nfeatures, feat_min=-5., feat_max=0.,
txt_min=0., txt_max=23., feat_lens_max=1760, txt_lens_max=231,
regenerate=False):
"""
Args:
batch_size
nfeatures: number of features for melfbanks
feat_min: minimum value in `feat` tensor, used for randomization
feat_max: maximum value in `feat` tensor, used for randomization
txt_min: minimum value in `txt` tensor, used for randomization
txt_max: maximum value in `txt` tensor, used for randomization
regenerate: If True, regenerate random tensors for every iterator
step. If False, generate them only at start.
"""
self.batch_size = batch_size
self.nfeatures = nfeatures
self.feat_min = feat_min
self.feat_max = feat_max
self.feat_lens_max = feat_lens_max
self.txt_min = txt_min
self.txt_max = txt_max
self.txt_lens_max = txt_lens_max
self.regenerate = regenerate
if not self.regenerate:
(self.feat, self.feat_lens, self.txt, self.txt_lens
) = self._generate_sample()
def _generate_sample(self):
feat = ((self.feat_max - self.feat_min)
* np.random.random_sample(
(self.batch_size, self.nfeatures, self.feat_lens_max))
+ self.feat_min)
feat_lens = np.random.randint(0, int(self.feat_lens_max) - 1,
size=self.batch_size)
txt = (self.txt_max - self.txt_min) * np.random.random_sample(
(self.batch_size, self.txt_lens_max)) + self.txt_min
txt_lens = np.random.randint(0, int(self.txt_lens_max) - 1,
size=self.batch_size)
return (torch.Tensor(feat).cuda(),
torch.Tensor(feat_lens).cuda(),
torch.Tensor(txt).cuda(),
torch.Tensor(txt_lens).cuda())
def __next__(self):
if self.regenerate:
return self._generate_sample()
return self.feat, self.feat_lens, self.txt, self.txt_lens
def next(self):
return self.__next__()
def __iter__(self):
return self
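

# Illustration only: how the static helpers above encode transcripts before
# they are sent to the GPU. Not used by the pipeline itself.
def _example_encode_transcripts():
    transcripts = ["cat", "a dog"]
    lists = [DaliIterator._str2list(t) for t in transcripts]
    # 'cat'   -> [3., 1., 20.]
    # 'a dog' -> [1., 0., 4., 15., 7.]   (0. encodes the space separator)
    sizes = DaliIterator._pad_lists(lists)  # pads in place to the longest list
    # lists -> [[3., 1., 20., 0., 0.], [1., 0., 4., 15., 7.]], sizes -> [3, 5]
    return torch.tensor(lists), torch.tensor(sizes, dtype=torch.int32)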

View file

@ -0,0 +1,343 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import math
import multiprocessing
import numpy as np
import nvidia.dali as dali
import nvidia.dali.fn as fn
import nvidia.dali.types as types
import torch
import torch.distributed as dist
def _interleave_lists(*lists):
"""
[*, **, ***], [1, 2, 3], [a, b, c] -> [*, 1, a, **, 2, b, ***, 3, c]
Returns:
iterator over interleaved list
"""
assert all((len(lists[0]) == len(test_l) for test_l in lists)), \
"All lists have to have the same length"
return itertools.chain(*zip(*lists))
def _generate_cutouts(mask_params, nfeatures):
"""
    Generates anchors and shapes of the cutout regions.
    A single call generates one batch of data.
    The output is meant to be passed to DALI's Erase operator.
    Returns:
        anchors = [f0 t0 f1 t1 ...]
        shapes = [f0w t0h f1w t1h ...]
"""
MAX_TIME_DIMENSION = 20 * 16000
freq_anchors = np.random.random(mask_params['freq_num_regions'])
time_anchors = np.random.random(mask_params['time_num_regions'])
both_anchors_freq = np.random.random(mask_params['both_num_regions'])
both_anchors_time = np.random.random(mask_params['both_num_regions'])
anchors = []
for anch in freq_anchors:
anchors.extend([anch, 0])
for anch in time_anchors:
anchors.extend([0, anch])
for t, f in zip(both_anchors_time, both_anchors_freq):
anchors.extend([f, t])
shapes = []
shapes.extend(
_interleave_lists(
np.random.randint(mask_params['freq_min'],
mask_params['freq_max'] + 1,
mask_params['freq_num_regions']),
            # XXX: The actual time dimension of the spectrogram should be passed here.
            # However, in DALI an ArgumentInput cannot come from the GPU,
            # so we leave it to Erase (the masking operator) to handle the oversized region.
[int(MAX_TIME_DIMENSION)] * mask_params['freq_num_regions']
)
)
shapes.extend(
_interleave_lists(
[nfeatures] * mask_params['time_num_regions'],
np.random.randint(mask_params['time_min'],
mask_params['time_max'] + 1,
mask_params['time_num_regions'])
)
)
shapes.extend(
_interleave_lists(
np.random.randint(mask_params['both_min_freq'],
mask_params['both_max_freq'] + 1,
mask_params['both_num_regions']),
np.random.randint(mask_params['both_min_time'],
mask_params['both_max_time'] + 1,
mask_params['both_num_regions'])
)
)
return anchors, shapes
def _tuples2list(tuples: list):
"""
[(a, b), (c, d)] -> [[a, c], [b, d]]
"""
return map(list, zip(*tuples))
def _dali_init_log(args: dict):
if not dist.is_initialized() or dist.get_rank() == 0:
max_len = max([len(ii) for ii in args.keys()])
fmt_string = '\t%' + str(max_len) + 's : %s'
print('Initializing DALI with parameters:')
for keyPair in sorted(args.items()):
print(fmt_string % keyPair)
@dali.pipeline_def
def dali_asr_pipeline(train_pipeline, # True if training, False if validation
file_root,
file_list,
sample_rate,
silence_threshold,
resample_range,
discrete_resample_range,
window_size,
window_stride,
nfeatures,
nfft,
frame_splicing_factor,
dither_coeff,
pad_align,
preemph_coeff,
do_spectrogram_masking=False,
cutouts_generator=None,
shard_id=0,
n_shards=1,
preprocessing_device="gpu"):
do_remove_silence = silence_threshold is not None
def _div_ceil(dividend, divisor):
return (dividend + (divisor - 1)) // divisor
encoded, label = fn.readers.file(
device="cpu", name="file_reader", file_root=file_root,
file_list=file_list, shard_id=shard_id, num_shards=n_shards,
shuffle_after_epoch=train_pipeline)
speed_perturbation_coeffs = None
if resample_range is not None:
if discrete_resample_range:
values = [resample_range[0], 1.0, resample_range[1]]
speed_perturbation_coeffs = fn.random.uniform(device="cpu",
values=values)
else:
speed_perturbation_coeffs = fn.random.uniform(device="cpu",
range=resample_range)
if train_pipeline and speed_perturbation_coeffs is not None:
dec_sample_rate_arg = speed_perturbation_coeffs * sample_rate
elif resample_range is None:
dec_sample_rate_arg = sample_rate
else:
dec_sample_rate_arg = None
audio, _ = fn.decoders.audio(encoded, sample_rate=dec_sample_rate_arg,
dtype=types.FLOAT, downmix=True)
if do_remove_silence:
begin, length = fn.nonsilent_region(audio, cutoff_db=silence_threshold)
audio = fn.slice(audio, begin, length, axes=[0])
# Max duration drop is performed at DataLayer stage
if preprocessing_device == "gpu":
audio = audio.gpu()
if dither_coeff != 0.:
audio = audio + fn.random.normal(device=preprocessing_device
) * dither_coeff
audio = fn.preemphasis_filter(audio, preemph_coeff=preemph_coeff)
spec = fn.spectrogram(audio, nfft=nfft,
window_length=window_size * sample_rate,
window_step=window_stride * sample_rate)
mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate,
nfilter=nfeatures, normalize=True)
log_features = fn.to_decibels(mel_spec, multiplier=np.log(10),
reference=1.0, cutoff_db=math.log(1e-20))
log_features_len = fn.shapes(log_features)
if frame_splicing_factor != 1:
log_features_len = _div_ceil(log_features_len, frame_splicing_factor)
log_features = fn.normalize(log_features, axes=[1])
log_features = fn.pad(log_features, axes=[1], fill_value=0, align=pad_align)
if train_pipeline and do_spectrogram_masking:
anchors, shapes = fn.external_source(source=cutouts_generator,
num_outputs=2, cycle=True)
log_features = fn.erase(log_features, anchor=anchors, shape=shapes,
axes=[0, 1], fill_value=0,
normalized_anchor=True)
# When modifying DALI pipeline returns, make sure you update `output_map`
# in DALIGenericIterator invocation
return log_features.gpu(), label.gpu(), log_features_len.gpu()
def make_dali_asr_pipeline(train_pipeline: bool, device_id, batch_size,
file_root: str, file_list: str, config_data: dict,
config_features: dict, device_type: str = "gpu",
do_resampling: bool = True,
num_cpu_threads: int = multiprocessing.cpu_count()):
max_duration = config_data['max_duration']
sample_rate = config_data['sample_rate']
silence_threshold = -60 if config_data['trim_silence'] else None
    # TODO Take into account resampling probability
# TODO config_features['speed_perturbation']['p']
if do_resampling and config_data['speed_perturbation'] is not None:
resample_range = [config_data['speed_perturbation']['min_rate'],
config_data['speed_perturbation']['max_rate']]
discrete_resample_range = config_data['speed_perturbation']['discrete']
else:
resample_range = None
discrete_resample_range = False
window_size = config_features['window_size']
window_stride = config_features['window_stride']
nfeatures = config_features['n_filt']
nfft = config_features['n_fft']
frame_splicing_factor = config_features['frame_splicing']
dither_coeff = config_features['dither']
pad_align = config_features['pad_align']
pad_to_max_duration = config_features['pad_to_max_duration']
assert not pad_to_max_duration, \
"Padding to max duration currently not supported in DALI"
preemph_coeff = .97
config_spec = config_features['spec_augment']
if config_spec is not None:
mask_time_num_regions = config_spec['time_masks']
mask_time_min = config_spec['min_time']
mask_time_max = config_spec['max_time']
mask_freq_num_regions = config_spec['freq_masks']
mask_freq_min = config_spec['min_freq']
mask_freq_max = config_spec['max_freq']
else:
mask_time_num_regions = 0
mask_time_min = 0
mask_time_max = 0
mask_freq_num_regions = 0
mask_freq_min = 0
mask_freq_max = 0
config_cutout = config_features['cutout_augment']
if config_cutout is not None:
mask_both_num_regions = config_cutout['masks']
mask_both_min_time = config_cutout['min_time']
mask_both_max_time = config_cutout['max_time']
mask_both_min_freq = config_cutout['min_freq']
mask_both_max_freq = config_cutout['max_freq']
else:
mask_both_num_regions = 0
mask_both_min_time = 0
mask_both_max_time = 0
mask_both_min_freq = 0
mask_both_max_freq = 0
nfeatures = config_features['n_filt']
do_spectrogram_masking = \
mask_time_num_regions > 0 or mask_freq_num_regions > 0 or \
mask_both_num_regions > 0
do_remove_silence = silence_threshold is not None
del(config_spec)
del(config_cutout)
del(config_data)
del(config_features)
_dali_init_log(locals())
mask_params = {
'time_num_regions': mask_time_num_regions,
'time_min': mask_time_min,
'time_max': mask_time_max,
'freq_num_regions': mask_freq_num_regions,
'freq_min': mask_freq_min,
'freq_max': mask_freq_max,
'both_num_regions': mask_both_num_regions,
'both_min_time': mask_both_min_time,
'both_max_time': mask_both_max_time,
'both_min_freq': mask_both_min_freq,
'both_max_freq': mask_both_max_freq,
}
def _cutouts_generator():
"""
        Generator that wraps cutout creation in order to randomize inputs
        and allow passing them to DALI's ExternalSource operator.
"""
[anchors, shapes] = _tuples2list(
[_generate_cutouts(mask_params, nfeatures)
for _ in range(batch_size)])
yield (np.array(anchors, dtype=np.float32),
np.array(shapes, dtype=np.float32))
cutouts_gen = _cutouts_generator if do_spectrogram_masking else None
if torch.distributed.is_initialized():
shard_id = torch.distributed.get_rank()
n_shards = torch.distributed.get_world_size()
else:
shard_id = 0
n_shards = 1
preprocessing_device = device_type.lower()
assert preprocessing_device == "cpu" or preprocessing_device == "gpu", \
"Incorrect preprocessing device. Please choose either 'cpu' or 'gpu'"
pipe = dali_asr_pipeline(
train_pipeline=train_pipeline,
file_root=file_root,
file_list=file_list,
sample_rate=sample_rate,
silence_threshold=silence_threshold,
resample_range=resample_range,
discrete_resample_range=discrete_resample_range,
window_size=window_size,
window_stride=window_stride,
nfeatures=nfeatures,
nfft=nfft,
frame_splicing_factor=frame_splicing_factor,
dither_coeff=dither_coeff,
pad_align=pad_align,
preemph_coeff=preemph_coeff,
do_spectrogram_masking=do_spectrogram_masking,
cutouts_generator=cutouts_gen,
shard_id=shard_id,
n_shards=n_shards,
preprocessing_device=preprocessing_device,
batch_size=batch_size,
num_threads=num_cpu_threads,
device_id=device_id
)
return pipe
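

# Minimal, illustrative configuration for building a validation pipeline with
# make_dali_asr_pipeline(). Paths and values below are hypothetical; the real
# configs come from the model configuration files.
def _example_build_pipeline(device_id=0):
    config_data = {
        'max_duration': 16.7,
        'sample_rate': 16000,
        'trim_silence': True,
        'speed_perturbation': None,   # no on-the-fly resampling
    }
    config_features = {
        'window_size': 0.02,
        'window_stride': 0.01,
        'n_filt': 64,
        'n_fft': 512,
        'frame_splicing': 1,
        'dither': 1e-5,
        'pad_align': 16,
        'pad_to_max_duration': False,
        'spec_augment': None,         # masking disabled for validation
        'cutout_augment': None,
    }
    return make_dali_asr_pipeline(
        train_pipeline=False,
        device_id=device_id,
        batch_size=16,
        file_root="/datasets/LibriSpeech",        # hypothetical path
        file_list="/tmp/asr_dali.file_list.0",    # normally produced by DaliDataLoader
        config_data=config_data,
        config_features=config_features,
        device_type="gpu")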

View file

@ -0,0 +1,234 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from pathlib import Path
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from .audio import (audio_from_file, AudioSegment, GainPerturbation,
ShiftPerturbation, SpeedPerturbation)
from .text import _clean_text, punctuation_map
def normalize_string(s, labels, punct_map):
"""Normalizes string.
Example:
'call me at 8:00 pm!' -> 'call me at eight zero pm'
"""
labels = set(labels)
try:
text = _clean_text(s, ["english_cleaners"], punct_map).strip()
return ''.join([tok for tok in text if all(t in labels for t in tok)])
    except Exception as e:
        print(f"WARNING: Normalizing failed: {s} {e}")
        return None
class FilelistDataset(Dataset):
def __init__(self, filelist_fpath):
self.samples = [line.strip() for line in open(filelist_fpath, 'r')]
def __len__(self):
return len(self.samples)
def __getitem__(self, index):
audio, audio_len = audio_from_file(self.samples[index])
return (audio.squeeze(0), audio_len, torch.LongTensor([0]),
torch.LongTensor([0]))
class SingleAudioDataset(FilelistDataset):
def __init__(self, audio_fpath):
self.samples = [audio_fpath]
class AudioDataset(Dataset):
def __init__(self, data_dir, manifest_fpaths, labels,
sample_rate=16000, min_duration=0.1, max_duration=float("inf"),
pad_to_max_duration=False, max_utts=0, normalize_transcripts=True,
sort_by_duration=False, trim_silence=False,
speed_perturbation=None, gain_perturbation=None,
shift_perturbation=None, ignore_offline_speed_perturbation=False):
"""Loads audio, transcript and durations listed in a .json file.
Args:
data_dir: absolute path to dataset folder
            manifest_fpaths: list of paths to manifest json files
                as described above
labels (str): all possible output symbols
min_duration (int): skip audio shorter than threshold
max_duration (int): skip audio longer than threshold
pad_to_max_duration (bool): pad all sequences to max_duration
max_utts (int): limit number of utterances
normalize_transcripts (bool): normalize transcript text
sort_by_duration (bool): sort sequences by increasing duration
trim_silence (bool): trim leading and trailing silence from audio
            ignore_offline_speed_perturbation (bool): discard offline (precomputed)
                speed-perturbed copies and use only the original-speed audio
Returns:
tuple of Tensors
"""
self.data_dir = data_dir
self.labels = labels
self.labels_map = dict([(labels[i], i) for i in range(len(labels))])
self.punctuation_map = punctuation_map(labels)
self.blank_index = len(labels)
self.pad_to_max_duration = pad_to_max_duration
self.sort_by_duration = sort_by_duration
self.max_utts = max_utts
self.normalize_transcripts = normalize_transcripts
self.ignore_offline_speed_perturbation = ignore_offline_speed_perturbation
self.min_duration = min_duration
self.max_duration = max_duration
self.trim_silence = trim_silence
self.sample_rate = sample_rate
perturbations = []
if speed_perturbation is not None:
perturbations.append(SpeedPerturbation(**speed_perturbation))
if gain_perturbation is not None:
perturbations.append(GainPerturbation(**gain_perturbation))
if shift_perturbation is not None:
perturbations.append(ShiftPerturbation(**shift_perturbation))
self.perturbations = perturbations
self.max_duration = max_duration
self.samples = []
self.duration = 0.0
self.duration_filtered = 0.0
for fpath in manifest_fpaths:
self._load_json_manifest(fpath)
if sort_by_duration:
self.samples = sorted(self.samples, key=lambda s: s['duration'])
def __getitem__(self, index):
s = self.samples[index]
rn_indx = np.random.randint(len(s['audio_filepath']))
duration = s['audio_duration'][rn_indx] if 'audio_duration' in s else 0
offset = s.get('offset', 0)
segment = AudioSegment(
s['audio_filepath'][rn_indx], target_sr=self.sample_rate,
offset=offset, duration=duration, trim=self.trim_silence)
for p in self.perturbations:
p.maybe_apply(segment, self.sample_rate)
segment = torch.FloatTensor(segment.samples)
return (segment,
torch.tensor(segment.shape[0]).int(),
torch.tensor(s["transcript"]),
torch.tensor(len(s["transcript"])).int())
def __len__(self):
return len(self.samples)
def _load_json_manifest(self, fpath):
for s in json.load(open(fpath, "r", encoding="utf-8")):
if self.pad_to_max_duration and not self.ignore_offline_speed_perturbation:
# require all perturbed samples to be < self.max_duration
s_max_duration = max(f['duration'] for f in s['files'])
else:
                # otherwise we allow perturbed samples to be > self.max_duration
s_max_duration = s['original_duration']
s['duration'] = s.pop('original_duration')
if not (self.min_duration <= s_max_duration <= self.max_duration):
self.duration_filtered += s['duration']
continue
# Prune and normalize according to transcript
tr = (s.get('transcript', None) or
self.load_transcript(s['text_filepath']))
if not isinstance(tr, str):
print(f'WARNING: Skipped sample (transcript not a str): {tr}.')
self.duration_filtered += s['duration']
continue
if self.normalize_transcripts:
tr = normalize_string(tr, self.labels, self.punctuation_map)
s["transcript"] = self.to_vocab_inds(tr)
files = s.pop('files')
if self.ignore_offline_speed_perturbation:
files = [f for f in files if f['speed'] == 1.0]
s['audio_duration'] = [f['duration'] for f in files]
s['audio_filepath'] = [str(Path(self.data_dir, f['fname']))
for f in files]
self.samples.append(s)
self.duration += s['duration']
if self.max_utts > 0 and len(self.samples) >= self.max_utts:
print(f'Reached max_utts={self.max_utts}. Finished parsing {fpath}.')
break
def load_transcript(self, transcript_path):
with open(transcript_path, 'r', encoding="utf-8") as transcript_file:
transcript = transcript_file.read().replace('\n', '')
return transcript
def to_vocab_inds(self, transcript):
chars = [self.labels_map.get(x, self.blank_index) for x in list(transcript)]
transcript = list(filter(lambda x: x != self.blank_index, chars))
return transcript
def collate_fn(batch):
bs = len(batch)
max_len = lambda l, idx: max(el[idx].size(0) for el in l)
audio = torch.zeros(bs, max_len(batch, 0))
audio_lens = torch.zeros(bs, dtype=torch.int32)
transcript = torch.zeros(bs, max_len(batch, 2))
transcript_lens = torch.zeros(bs, dtype=torch.int32)
for i, sample in enumerate(batch):
audio[i].narrow(0, 0, sample[0].size(0)).copy_(sample[0])
audio_lens[i] = sample[1]
transcript[i].narrow(0, 0, sample[2].size(0)).copy_(sample[2])
transcript_lens[i] = sample[3]
return audio, audio_lens, transcript, transcript_lens
def get_data_loader(dataset, batch_size, multi_gpu=True, shuffle=True,
drop_last=True, num_workers=4):
kw = {'dataset': dataset, 'collate_fn': collate_fn,
'num_workers': num_workers, 'pin_memory': True}
if multi_gpu:
loader_shuffle = False
sampler = DistributedSampler(dataset, shuffle=shuffle)
else:
loader_shuffle = shuffle
sampler = None
return DataLoader(batch_size=batch_size, drop_last=drop_last,
sampler=sampler, shuffle=loader_shuffle, **kw)
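

# Illustrative sketch of wiring the classes above together; the dataset
# directory, manifest name, and label set below are hypothetical.
def _example_build_loader():
    labels = list(" abcdefghijklmnopqrstuvwxyz'")
    dataset = AudioDataset(
        data_dir="/datasets/LibriSpeech",                    # hypothetical
        manifest_fpaths=["librispeech-dev-clean-wav.json"],  # hypothetical
        labels=labels,
        sample_rate=16000,
        max_duration=16.7,
        trim_silence=True,
        speed_perturbation=None)
    # collate_fn pads audio and transcripts to the longest sample in the batch
    return get_data_loader(dataset, batch_size=16, multi_gpu=False,
                           shuffle=True, drop_last=True, num_workers=4)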

View file

@ -0,0 +1,301 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import random
import librosa
import torch
import torch.nn as nn
class BaseFeatures(nn.Module):
"""Base class for GPU accelerated audio preprocessing."""
__constants__ = ["pad_align", "pad_to_max_duration", "max_len"]
def __init__(self, pad_align, pad_to_max_duration, max_duration,
sample_rate, window_size, window_stride, spec_augment=None,
cutout_augment=None):
super(BaseFeatures, self).__init__()
self.pad_align = pad_align
self.pad_to_max_duration = pad_to_max_duration
self.win_length = int(sample_rate * window_size) # frame size
self.hop_length = int(sample_rate * window_stride)
# Calculate maximum sequence length (# frames)
if pad_to_max_duration:
self.max_len = 1 + math.ceil(
(max_duration * sample_rate - self.win_length) / self.hop_length
)
if spec_augment is not None:
self.spec_augment = SpecAugment(**spec_augment)
else:
self.spec_augment = None
if cutout_augment is not None:
self.cutout_augment = CutoutAugment(**cutout_augment)
else:
self.cutout_augment = None
@torch.no_grad()
def calculate_features(self, audio, audio_lens):
return audio, audio_lens
def __call__(self, audio, audio_lens):
dtype = audio.dtype
audio = audio.float()
feat, feat_lens = self.calculate_features(audio, audio_lens)
feat = self.apply_padding(feat)
if self.cutout_augment is not None:
feat = self.cutout_augment(feat)
if self.spec_augment is not None:
feat = self.spec_augment(feat)
feat = feat.to(dtype)
return feat, feat_lens
def apply_padding(self, x):
if self.pad_to_max_duration:
x_size = max(x.size(-1), self.max_len)
else:
x_size = x.size(-1)
if self.pad_align > 0:
pad_amt = x_size % self.pad_align
else:
pad_amt = 0
padded_len = x_size + (self.pad_align - pad_amt if pad_amt > 0 else 0)
return nn.functional.pad(x, (0, padded_len - x.size(-1)))
class SpecAugment(nn.Module):
"""Spec augment. refer to https://arxiv.org/abs/1904.08779
"""
def __init__(self, freq_masks=0, min_freq=0, max_freq=10, time_masks=0,
min_time=0, max_time=10):
super(SpecAugment, self).__init__()
assert 0 <= min_freq <= max_freq
assert 0 <= min_time <= max_time
self.freq_masks = freq_masks
self.min_freq = min_freq
self.max_freq = max_freq
self.time_masks = time_masks
self.min_time = min_time
self.max_time = max_time
@torch.no_grad()
def forward(self, x):
sh = x.shape
mask = torch.zeros(x.shape, dtype=torch.bool, device=x.device)
for idx in range(sh[0]):
for _ in range(self.freq_masks):
w = torch.randint(self.min_freq, self.max_freq + 1, size=(1,)).item()
f0 = torch.randint(0, max(1, sh[1] - w), size=(1,))
mask[idx, f0:f0+w] = 1
for _ in range(self.time_masks):
w = torch.randint(self.min_time, self.max_time + 1, size=(1,)).item()
t0 = torch.randint(0, max(1, sh[2] - w), size=(1,))
mask[idx, :, t0:t0+w] = 1
return x.masked_fill(mask, 0)
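

# Quick illustration (not called anywhere): SpecAugment masks random frequency
# bands and time steps of a (batch, n_filt, n_frames) feature tensor.
# The tensor shape and mask settings below are hypothetical.
def _example_spec_augment():
    features = torch.randn(4, 64, 300)
    augment = SpecAugment(freq_masks=2, max_freq=15, time_masks=2, max_time=20)
    masked = augment(features)
    return masked  # same shape, with the masked regions set to zero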
class CutoutAugment(nn.Module):
"""Cutout. refer to https://arxiv.org/pdf/1708.04552.pdf
"""
def __init__(self, masks=0, min_freq=20, max_freq=20, min_time=5, max_time=5):
super(CutoutAugment, self).__init__()
assert 0 <= min_freq <= max_freq
assert 0 <= min_time <= max_time
self.masks = masks
self.min_freq = min_freq
self.max_freq = max_freq
self.min_time = min_time
self.max_time = max_time
@torch.no_grad()
def forward(self, x):
sh = x.shape
mask = torch.zeros(x.shape, dtype=torch.bool, device=x.device)
for idx in range(sh[0]):
for i in range(self.masks):
w = torch.randint(self.min_freq, self.max_freq + 1, size=(1,)).item()
h = torch.randint(self.min_time, self.max_time + 1, size=(1,)).item()
f0 = int(random.uniform(0, sh[1] - w))
t0 = int(random.uniform(0, sh[2] - h))
mask[idx, f0:f0+w, t0:t0+h] = 1
return x.masked_fill(mask, 0)
@torch.jit.script
def normalize_batch(x, seq_len, normalize_type: str):
# print ("normalize_batch: x, seq_len, shapes: ", x.shape, seq_len, seq_len.shape)
if normalize_type == "per_feature":
x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype,
device=x.device)
x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype,
device=x.device)
for i in range(x.shape[0]):
x_mean[i, :] = x[i, :, :seq_len[i]].mean(dim=1)
x_std[i, :] = x[i, :, :seq_len[i]].std(dim=1)
# make sure x_std is not zero
x_std += 1e-5
return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2)
elif normalize_type == "all_features":
x_mean = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
x_std = torch.zeros(seq_len.shape, dtype=x.dtype, device=x.device)
for i in range(x.shape[0]):
x_mean[i] = x[i, :, :int(seq_len[i])].mean()
x_std[i] = x[i, :, :int(seq_len[i])].std()
# make sure x_std is not zero
x_std += 1e-5
return (x - x_mean.view(-1, 1, 1)) / x_std.view(-1, 1, 1)
else:
return x
@torch.jit.script
def splice_frames(x, frame_splicing: int):
""" Stacks frames together across feature dim
input is batch_size, feature_dim, num_frames
output is batch_size, feature_dim*frame_splicing, num_frames
"""
seq = [x]
    # TORCHSCRIPT: JIT doesn't like range(start, stop)
for n in range(frame_splicing - 1):
seq.append(torch.cat([x[:, :, :n + 1], x[:, :, n + 1:]], dim=2))
return torch.cat(seq, dim=1)
class FilterbankFeatures(BaseFeatures):
# For JIT, https://pytorch.org/docs/stable/jit.html#python-defined-constants
__constants__ = ["dither", "preemph", "n_fft", "hop_length", "win_length",
"log", "frame_splicing", "normalize"]
# torchscript: "center" removed due to a bug
def __init__(self, spec_augment=None, cutout_augment=None,
sample_rate=8000, window_size=0.02, window_stride=0.01,
window="hamming", normalize="per_feature", n_fft=None,
preemph=0.97, n_filt=64, lowfreq=0, highfreq=None, log=True,
dither=1e-5, pad_align=8, pad_to_max_duration=False,
max_duration=float('inf'), frame_splicing=1):
super(FilterbankFeatures, self).__init__(
pad_align=pad_align, pad_to_max_duration=pad_to_max_duration,
max_duration=max_duration, sample_rate=sample_rate,
window_size=window_size, window_stride=window_stride,
spec_augment=spec_augment, cutout_augment=cutout_augment)
torch_windows = {
'hann': torch.hann_window,
'hamming': torch.hamming_window,
'blackman': torch.blackman_window,
'bartlett': torch.bartlett_window,
'none': None,
}
self.n_fft = n_fft or 2 ** math.ceil(math.log2(self.win_length))
self.normalize = normalize
self.log = log
#TORCHSCRIPT: Check whether or not we need this
self.dither = dither
self.frame_splicing = frame_splicing
self.n_filt = n_filt
self.preemph = preemph
highfreq = highfreq or sample_rate / 2
window_fn = torch_windows.get(window, None)
window_tensor = window_fn(self.win_length,
periodic=False) if window_fn else None
filterbanks = torch.tensor(
librosa.filters.mel(sample_rate, self.n_fft, n_mels=n_filt,
fmin=lowfreq, fmax=highfreq),
dtype=torch.float).unsqueeze(0)
# torchscript
self.register_buffer("fb", filterbanks)
self.register_buffer("window", window_tensor)
def get_seq_len(self, seq_len):
return torch.ceil(seq_len.to(dtype=torch.float) / self.hop_length).to(
dtype=torch.int)
# do stft
# TORCHSCRIPT: center removed due to bug
def stft(self, x):
return torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length,
win_length=self.win_length,
window=self.window.to(dtype=torch.float))
@torch.no_grad()
def calculate_features(self, x, seq_len):
dtype = x.dtype
seq_len = self.get_seq_len(seq_len)
# dither
if self.dither > 0:
x += self.dither * torch.randn_like(x)
# do preemphasis
if self.preemph is not None:
x = torch.cat(
(x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1)
x = self.stft(x)
# get power spectrum
x = x.pow(2).sum(-1)
# dot with filterbank energies
x = torch.matmul(self.fb.to(x.dtype), x)
# log features if required
if self.log:
x = torch.log(x + 1e-20)
# frame splicing if required
if self.frame_splicing > 1:
raise ValueError('Frame splicing not supported')
# normalize if required
x = normalize_batch(x, seq_len, normalize_type=self.normalize)
# mask to zero any values beyond seq_len in batch,
# pad to multiple of `pad_align` (for efficiency)
max_len = x.size(-1)
mask = torch.arange(max_len, dtype=seq_len.dtype, device=x.device)
mask = mask.expand(x.size(0), max_len) >= seq_len.unsqueeze(1)
x = x.masked_fill(mask.unsqueeze(1), 0)
# TORCHSCRIPT: Is this del important? It breaks scripting
# del mask
return x.to(dtype), seq_len
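

# Illustrative end-to-end sketch: turn a batch of raw waveforms into padded
# log-mel features with FilterbankFeatures. Shapes and settings are hypothetical.
def _example_extract_features():
    featurizer = FilterbankFeatures(sample_rate=16000, window_size=0.02,
                                    window_stride=0.01, n_filt=64,
                                    pad_align=16)
    audio = torch.randn(2, 16000 * 3)                  # two 3-second clips
    audio_lens = torch.tensor([16000 * 3, 16000 * 2])  # valid lengths in samples
    feats, feat_lens = featurizer(audio, audio_lens)
    # feats: (2, 64, n_frames padded to a multiple of 16); frames beyond each
    # valid length are zeroed; feat_lens holds the valid lengths in frames
    return feats, feat_lens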

View file

@ -0,0 +1,276 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
import re
from collections import OrderedDict
import torch
import torch.distributed as dist
from .metrics import word_error_rate
def print_once(msg):
if not dist.is_initialized() or dist.get_rank() == 0:
print(msg)
def add_ctc_blank(symbols):
return symbols + ['<BLANK>']
def ctc_decoder_predictions_tensor(tensor, labels):
"""
    Takes the output of the greedy CTC decoder and performs the CTC decoding
    algorithm to merge repeated symbols and remove the blank symbol.
    Args:
        tensor: model output tensor
        labels: A list of labels
    Returns:
        list of decoded hypotheses (one string per batch element)
"""
blank_id = len(labels) - 1
hypotheses = []
labels_map = {i: labels[i] for i in range(len(labels))}
prediction_cpu_tensor = tensor.long().cpu()
# iterate over batch
for ind in range(prediction_cpu_tensor.shape[0]):
prediction = prediction_cpu_tensor[ind].numpy().tolist()
# CTC decoding procedure
decoded_prediction = []
previous = len(labels) - 1 # id of a blank symbol
for p in prediction:
if (p != previous or previous == blank_id) and p != blank_id:
decoded_prediction.append(p)
previous = p
hypothesis = ''.join([labels_map[c] for c in decoded_prediction])
hypotheses.append(hypothesis)
return hypotheses
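

# Toy illustration (hypothetical labels) of the CTC collapse rule implemented
# above: repeated symbols are merged and blank frames are dropped.
def _example_ctc_decode():
    labels = add_ctc_blank(['a', 'b', 'c'])          # blank id == 3
    preds = torch.tensor([[0, 0, 3, 1, 1, 3, 2]])    # frames: a a _ b b _ c
    return ctc_decoder_predictions_tensor(preds, labels)  # -> ['abc']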
def greedy_wer(preds, tgt, tgt_lens, labels):
"""
    Decodes greedy CTC predictions and computes the word error rate against
    the reference transcripts.
    Args:
        preds: predictions from the greedy CTC decoder
        tgt: target transcript tensor
        tgt_lens: lengths of the target transcripts
        labels: A list of labels
    Returns:
        word error rate, an example hypothesis, and an example reference
"""
with torch.no_grad():
references = gather_transcripts([tgt], [tgt_lens], labels)
hypotheses = ctc_decoder_predictions_tensor(preds, labels)
wer, _, _ = word_error_rate(hypotheses, references)
return wer, hypotheses[0], references[0]
def gather_losses(losses_list):
return [torch.mean(torch.stack(losses_list))]
def gather_predictions(predictions_list, labels):
results = []
for prediction in predictions_list:
results += ctc_decoder_predictions_tensor(prediction, labels=labels)
return results
def gather_transcripts(transcript_list, transcript_len_list, labels):
results = []
labels_map = {i: labels[i] for i in range(len(labels))}
# iterate over workers
for txt, lens in zip(transcript_list, transcript_len_list):
for t, l in zip(txt.long().cpu(), lens.long().cpu()):
t = list(t.numpy())
results.append(''.join([labels_map[c] for c in t[:l]]))
return results
def process_evaluation_batch(tensors, global_vars, labels):
"""
Processes results of an iteration and saves it in global_vars
Args:
tensors: dictionary with results of an evaluation iteration, e.g. loss, predictions, transcript, and output
        global_vars: dictionary where processed results of the iteration are saved
labels: A list of labels
"""
for kv, v in tensors.items():
if kv.startswith('loss'):
global_vars['EvalLoss'] += gather_losses(v)
elif kv.startswith('predictions'):
global_vars['preds'] += gather_predictions(v, labels)
elif kv.startswith('transcript_length'):
transcript_len_list = v
elif kv.startswith('transcript'):
transcript_list = v
elif kv.startswith('output'):
global_vars['logits'] += v
global_vars['txts'] += gather_transcripts(
transcript_list, transcript_len_list, labels)
def process_evaluation_epoch(aggregates, tag=None):
"""
    Processes results from each worker at the end of evaluation and combines them into the final result
Args:
aggregates: dictionary containing information of entire evaluation
Return:
wer: final word error rate
loss: final loss
"""
if 'losses' in aggregates:
eloss = torch.mean(torch.stack(aggregates['losses'])).item()
else:
eloss = None
hypotheses = aggregates['preds']
references = aggregates['txts']
wer, scores, num_words = word_error_rate(hypotheses, references)
multi_gpu = dist.is_initialized()
if multi_gpu:
if eloss is not None:
eloss /= dist.get_world_size()
eloss_tensor = torch.tensor(eloss).cuda()
dist.all_reduce(eloss_tensor)
eloss = eloss_tensor.item()
scores_tensor = torch.tensor(scores).cuda()
dist.all_reduce(scores_tensor)
scores = scores_tensor.item()
num_words_tensor = torch.tensor(num_words).cuda()
dist.all_reduce(num_words_tensor)
num_words = num_words_tensor.item()
wer = scores * 1.0 / num_words
return wer, eloss
def num_weights(module):
return sum(p.numel() for p in module.parameters() if p.requires_grad)
class Checkpointer(object):
def __init__(self, save_dir, model_name, keep_milestones=[100, 200, 300]):
self.save_dir = save_dir
self.keep_milestones = keep_milestones
self.model_name = model_name
tracked = [
            (int(re.search(r'epoch(\d+)_', f).group(1)), f)
for f in glob.glob(f'{save_dir}/{self.model_name}_epoch*_checkpoint.pt')]
tracked = sorted(tracked, key=lambda t: t[0])
self.tracked = OrderedDict(tracked)
def save(self, model, ema_model, optimizer, scaler, epoch, step, best_wer,
is_best=False):
"""Saves model checkpoint for inference/resuming training.
Args:
model: the model, optionally wrapped by DistributedDataParallel
ema_model: model with averaged weights, can be None
optimizer: optimizer
epoch (int): epoch during which the model is saved
step (int): number of steps since beginning of training
best_wer (float): lowest recorded WER on the dev set
is_best (bool, optional): set name of checkpoint to 'best'
and overwrite the previous one
"""
rank = 0
if dist.is_initialized():
dist.barrier()
rank = dist.get_rank()
if rank != 0:
return
# Checkpoint already saved
if not is_best and epoch in self.tracked:
return
unwrap_ddp = lambda model: getattr(model, 'module', model)
state = {
'epoch': epoch,
'step': step,
'best_wer': best_wer,
'state_dict': unwrap_ddp(model).state_dict(),
'ema_state_dict': unwrap_ddp(ema_model).state_dict() if ema_model is not None else None,
'optimizer': optimizer.state_dict(),
'scaler': scaler.state_dict(),
}
if is_best:
fpath = os.path.join(
self.save_dir, f"{self.model_name}_best_checkpoint.pt")
else:
fpath = os.path.join(
self.save_dir, f"{self.model_name}_epoch{epoch}_checkpoint.pt")
print_once(f"Saving {fpath}...")
torch.save(state, fpath)
if not is_best:
# Remove old checkpoints; keep milestones and the last two
self.tracked[epoch] = fpath
for epoch in set(list(self.tracked)[:-2]) - set(self.keep_milestones):
try:
os.remove(self.tracked[epoch])
except:
pass
del self.tracked[epoch]
    def last_checkpoint(self):
        tracked = list(self.tracked.values())
        if len(tracked) >= 1:
            try:
                torch.load(tracked[-1], map_location='cpu')
                return tracked[-1]
            except:
                print_once(f'Last checkpoint {tracked[-1]} appears corrupted.')
                if len(tracked) >= 2:
                    return tracked[-2]
        return None
def load(self, fpath, model, ema_model, optimizer, scaler, meta):
print_once(f'Loading model from {fpath}')
checkpoint = torch.load(fpath, map_location="cpu")
unwrap_ddp = lambda model: getattr(model, 'module', model)
state_dict = checkpoint['state_dict']
unwrap_ddp(model).load_state_dict(state_dict, strict=True)
if ema_model is not None:
if checkpoint.get('ema_state_dict') is not None:
key = 'ema_state_dict'
else:
key = 'state_dict'
print_once('WARNING: EMA weights not found in the checkpoint.')
print_once('WARNING: Initializing EMA model with regular params.')
state_dict = checkpoint[key]
unwrap_ddp(ema_model).load_state_dict(state_dict, strict=True)
optimizer.load_state_dict(checkpoint['optimizer'])
scaler.load_state_dict(checkpoint['scaler'])
meta['start_epoch'] = checkpoint.get('epoch')
meta['best_wer'] = checkpoint.get('best_wer', meta['best_wer'])
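

# Minimal illustration (hypothetical toy model, optimizer, and directory) of the
# Checkpointer save/resume round trip defined above.
def _example_checkpointing(save_dir="/tmp/ckpt"):
    os.makedirs(save_dir, exist_ok=True)
    model = torch.nn.Linear(10, 10)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scaler = torch.cuda.amp.GradScaler(enabled=False)
    ckpt = Checkpointer(save_dir, model_name="QuartzNet")
    ckpt.save(model, None, optimizer, scaler, epoch=1, step=100, best_wer=1.0)
    meta = {'best_wer': 1.0, 'start_epoch': 0}
    last = ckpt.last_checkpoint()
    if last is not None:
        ckpt.load(last, model, None, optimizer, scaler, meta)
    return meta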

View file

@ -0,0 +1,59 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def __levenshtein(a, b):
"""Calculates the Levenshtein distance between two sequences."""
n, m = len(a), len(b)
if n > m:
# Make sure n <= m, to use O(min(n,m)) space
a, b = b, a
n, m = m, n
current = list(range(n + 1))
for i in range(1, m + 1):
previous, current = current, [i] + [0] * n
for j in range(1, n + 1):
add, delete = previous[j] + 1, current[j - 1] + 1
change = previous[j - 1]
if a[j - 1] != b[i - 1]:
change = change + 1
current[j] = min(add, delete, change)
return current[n]
def word_error_rate(hypotheses, references):
"""Computes average Word Error Rate (WER) between two text lists."""
scores = 0
words = 0
len_diff = len(references) - len(hypotheses)
if len_diff > 0:
raise ValueError("Uneqal number of hypthoses and references: "
"{0} and {1}".format(len(hypotheses), len(references)))
elif len_diff < 0:
hypotheses = hypotheses[:len_diff]
for h, r in zip(hypotheses, references):
h_list = h.split()
r_list = r.split()
words += len(r_list)
scores += __levenshtein(h_list, r_list)
    if words != 0:
        wer = 1.0 * scores / words
    else:
        wer = float('inf')
return wer, scores, words
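

# Worked example (illustration only, hypothetical strings) for the WER
# computation above.
def _example_wer():
    hypotheses = ["the cat sat", "hello there"]
    references = ["the cat sat on the mat", "hello there"]
    # word-level edit distances: 3 (three missing words) and 0,
    # over 6 + 2 = 8 reference words -> WER = 3 / 8 = 0.375
    wer, scores, words = word_error_rate(hypotheses, references)
    return wer, scores, words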

View file

@ -0,0 +1,269 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch.optim import Optimizer
import math
def lr_policy(step, epoch, initial_lr, optimizer, steps_per_epoch, warmup_epochs,
hold_epochs, num_epochs=None, policy='linear', min_lr=1e-5,
exp_gamma=None):
"""
    Learning rate schedule with warmup, hold, and decay phases.
    Args:
        step: current iteration number
        epoch: current epoch number
        initial_lr: base learning rate
        optimizer: optimizer whose param groups get the new learning rate
        steps_per_epoch: number of iterations per epoch
        warmup_epochs: number of epochs with linearly increasing learning rate
        hold_epochs: number of epochs with constant learning rate after warmup
        num_epochs: total number of epochs (required by the 'legacy' policy)
        policy: 'legacy' or 'exponential'
        min_lr: lower bound on the learning rate
        exp_gamma: per-epoch decay factor for the 'exponential' policy
"""
warmup_steps = warmup_epochs * steps_per_epoch
hold_steps = hold_epochs * steps_per_epoch
if policy == 'legacy':
assert num_epochs is not None
tot_steps = num_epochs * steps_per_epoch
if step < warmup_steps:
a = (step + 1) / (warmup_steps + 1)
elif step < warmup_steps + hold_steps:
a = 1.0
else:
a = (((tot_steps - step)
/ (tot_steps - warmup_steps - hold_steps)) ** 2)
elif policy == 'exponential':
assert exp_gamma is not None
if step < warmup_steps:
a = (step + 1) / (warmup_steps + 1)
elif step < warmup_steps + hold_steps:
a = 1.0
else:
a = exp_gamma ** (epoch - warmup_epochs - hold_epochs)
else:
raise ValueError
new_lr = max(a * initial_lr, min_lr)
for param_group in optimizer.param_groups:
param_group['lr'] = new_lr
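

# Illustration only: how lr_policy drives the 'exponential' schedule. The tiny
# model and all numbers below are hypothetical.
def _example_lr_schedule():
    model = torch.nn.Linear(4, 4)
    opt = torch.optim.SGD(model.parameters(), lr=0.01)
    steps_per_epoch, warmup_epochs, hold_epochs = 100, 2, 1
    lrs = []
    for epoch in range(6):
        for step_in_epoch in range(steps_per_epoch):
            step = epoch * steps_per_epoch + step_in_epoch
            lr_policy(step, epoch, initial_lr=0.01, optimizer=opt,
                      steps_per_epoch=steps_per_epoch,
                      warmup_epochs=warmup_epochs, hold_epochs=hold_epochs,
                      policy='exponential', exp_gamma=0.981)
        lrs.append(opt.param_groups[0]['lr'])
    # warmup for 2 epochs, hold for 1, then decay by exp_gamma every epoch
    return lrs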
class AdamW(Optimizer):
"""Implements AdamW algorithm.
    It extends `Adam: A Method for Stochastic Optimization`_ with decoupled
    weight decay, as proposed in `Decoupled Weight Decay Regularization`
    (https://arxiv.org/abs/1711.05101).
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float, optional): learning rate (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
Adam: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
weight_decay=0, amsgrad=False):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
if not 0.0 <= betas[0] < 1.0:
raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
if not 0.0 <= betas[1] < 1.0:
raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
defaults = dict(lr=lr, betas=betas, eps=eps,
weight_decay=weight_decay, amsgrad=amsgrad)
super(AdamW, self).__init__(params, defaults)
def __setstate__(self, state):
super(AdamW, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('amsgrad', False)
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
amsgrad = group['amsgrad']
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
if amsgrad:
# Maintains max of all exp. moving avg. of sq. grad. values
state['max_exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
if amsgrad:
max_exp_avg_sq = state['max_exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
# Decay the first and second moment running average coefficient
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
if amsgrad:
# Maintains the maximum of all 2nd moment running avg. till now
torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
# Use the max. for normalizing running avg. of gradient
denom = max_exp_avg_sq.sqrt().add_(group['eps'])
else:
denom = exp_avg_sq.sqrt().add_(group['eps'])
bias_correction1 = 1 - beta1 ** state['step']
bias_correction2 = 1 - beta2 ** state['step']
step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.add_(torch.mul(p.data, group['weight_decay']).addcdiv_(exp_avg, denom, value=1), alpha=-step_size)
return loss
class Novograd(Optimizer):
"""
Implements Novograd algorithm.
Args:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float, optional): learning rate (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square (default: (0.95, 0))
eps (float, optional): term added to the denominator to improve
numerical stability (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
grad_averaging: gradient averaging
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False)
"""
def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8,
weight_decay=0, grad_averaging=False, amsgrad=False):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
if not 0.0 <= betas[0] < 1.0:
raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
if not 0.0 <= betas[1] < 1.0:
raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
defaults = dict(lr=lr, betas=betas, eps=eps,
weight_decay=weight_decay,
grad_averaging=grad_averaging,
amsgrad=amsgrad)
super(Novograd, self).__init__(params, defaults)
def __setstate__(self, state):
super(Novograd, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('amsgrad', False)
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('Sparse gradients are not supported.')
amsgrad = group['amsgrad']
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device)
if amsgrad:
# Maintains max of all exp. moving avg. of sq. grad. values
state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
if amsgrad:
max_exp_avg_sq = state['max_exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
norm = torch.sum(torch.pow(grad, 2))
if exp_avg_sq == 0:
exp_avg_sq.copy_(norm)
else:
exp_avg_sq.mul_(beta2).add_(norm, alpha=1 - beta2)
if amsgrad:
# Maintains the maximum of all 2nd moment running avg. till now
torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
# Use the max. for normalizing running avg. of gradient
denom = max_exp_avg_sq.sqrt().add_(group['eps'])
else:
denom = exp_avg_sq.sqrt().add_(group['eps'])
grad.div_(denom)
if group['weight_decay'] != 0:
grad.add_(p.data, alpha=group['weight_decay'])
if group['grad_averaging']:
grad.mul_(1 - beta1)
exp_avg.mul_(beta1).add_(grad)
p.data.add_(exp_avg, alpha=-group['lr'])
return loss
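A minimal usage sketch of the optimizer above; the toy linear model and random data are hypothetical stand-ins, and the Novograd class is assumed to be importable from this module.
import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(64, 29)                       # stand-in for the acoustic model
opt = Novograd(model.parameters(), lr=0.01, betas=(0.95, 0.5), weight_decay=1e-3)
x, y = torch.randn(8, 64), torch.randn(8, 29)
opt.zero_grad()
F.mse_loss(model(x), y).backward()
opt.step()                                      # applies the update defined above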

View file

@@ -0,0 +1,128 @@
import torch
import numpy as np
from torch.utils.data.sampler import Sampler
class DistributedSampler(Sampler):
def __init__(self, dataset, batch_size, world_size, rank):
"""
Constructor for the DistributedSampler.
:param dataset: dataset
:param batch_size: local batch size
:param world_size: number of distributed workers
:param rank: rank of the current process
"""
self.dataset = dataset
self.world_size = world_size
self.rank = rank
self.epoch = 0
self.batch_size = batch_size
self.global_batch_size = batch_size * world_size
self.data_len = len(self.dataset)
self.num_samples = self.data_len // self.global_batch_size \
* self.global_batch_size
def distribute_batches(self, indices):
"""
Assigns batches to workers.
Consecutive ranks get consecutive batches.
:param indices: torch.tensor with batch indices
"""
assert len(indices) == self.num_samples
indices = indices.view(-1, self.batch_size)
indices = indices[self.rank::self.world_size].contiguous()
indices = indices.view(-1)
indices = indices.tolist()
assert len(indices) == self.num_samples // self.world_size
return indices
def reshuffle_batches(self, indices, rng):
"""
Permutes global batches
:param indices: torch.tensor with batch indices
:param rng: instance of torch.Generator
"""
indices = indices.view(-1, self.global_batch_size)
num_batches = indices.shape[0]
order = torch.randperm(num_batches, generator=rng)
indices = indices[order, :]
indices = indices.view(-1)
return indices
def __iter__(self):
g = torch.Generator()
g.manual_seed(self.epoch)
# generate permutation
indices = torch.randperm(self.data_len, generator=g)
# make indices evenly divisible by (batch_size * world_size)
indices = indices[:self.num_samples]
# assign batches to workers
indices = self.distribute_batches(indices)
return iter(indices)
def set_epoch(self, epoch):
"""
Sets current epoch index.
Epoch index is used to seed RNG in __iter__() function.
:param epoch: index of current epoch
"""
self.epoch = epoch
def __len__(self):
return self.num_samples // self.world_size
class BucketingSampler(DistributedSampler):
def __init__(self, dataset, batch_size, num_buckets, world_size, rank):
"""
Bucketing sampler with approx. equally-sized buckets.
:param dataset: dataset
:param batch_size: local batch size
:param num_buckets: number of buckets
:param world_size: number of distributed workers
:param rank: rank of the current process
"""
super().__init__(dataset, batch_size, world_size, rank)
self.num_buckets = num_buckets
len_ids = np.argsort([sample['duration'] for sample in dataset.samples])
self.buckets = [torch.from_numpy(t)
for t in np.array_split(len_ids, num_buckets)]
global_bs = self.global_batch_size
def __iter__(self):
g = torch.Generator()
g.manual_seed(self.epoch)
global_bsz = self.global_batch_size
indices = []
for bid in range(self.num_buckets):
# random shuffle within current bucket
perm = torch.randperm(len(self.buckets[bid]), generator=g)
bucket_indices = self.buckets[bid][perm]
# add samples from current bucket to indices for current epoch
indices.append(bucket_indices)
indices = torch.cat(indices)
# make indices evenly divisible by global batch size
length = len(indices) // global_bsz * global_bsz
indices = indices[:length]
assert len(indices) % self.global_batch_size == 0
# perform global reshuffle of all global batches
indices = self.reshuffle_batches(indices, g)
# distribute batches to individual workers
indices = self.distribute_batches(indices)
return iter(indices)
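A rough single-process sketch (world_size=1, rank=0) of driving the sampler with a plain DataLoader; the toy dataset below is hypothetical and only mimics the samples-with-duration structure the constructor expects.
import torch
from torch.utils.data import DataLoader, Dataset

class ToyAudioDataset(Dataset):   # hypothetical stand-in for AudioDataset
    def __init__(self, n=256):
        self.samples = [{'duration': float(i % 17) + 1.0} for i in range(n)]
    def __len__(self):
        return len(self.samples)
    def __getitem__(self, idx):
        return torch.tensor(self.samples[idx]['duration'])

dataset = ToyAudioDataset()
sampler = BucketingSampler(dataset, batch_size=16, num_buckets=4,
                           world_size=1, rank=0)
loader = DataLoader(dataset, batch_size=16, sampler=sampler, drop_last=True)
for epoch in range(2):
    sampler.set_epoch(epoch)      # reseeds the per-epoch shuffle in __iter__
    for batch in loader:
        pass                      # training step goes here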

View file

@@ -0,0 +1,173 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import atexit
import glob
import os
import re
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
import dllogger
from dllogger import StdOutBackend, JSONStreamBackend, Verbosity
tb_loggers = {}
class TBLogger:
"""
dummies: pad TensorBoard with empty dummy plots so that the legend
always fits alongside the real plots
"""
def __init__(self, enabled, log_dir, name, interval=1, dummies=True):
self.enabled = enabled
self.interval = interval
self.cache = {}
if self.enabled:
self.summary_writer = SummaryWriter(
log_dir=os.path.join(log_dir, name),
flush_secs=120, max_queue=200)
atexit.register(self.summary_writer.close)
if dummies:
for key in ('aaa', 'zzz'):
self.summary_writer.add_scalar(key, 0.0, 1)
def log(self, step, data):
for k, v in data.items():
self.log_value(step, k, v.item() if type(v) is torch.Tensor else v)
def log_value(self, step, key, val, stat='mean'):
if self.enabled:
if key not in self.cache:
self.cache[key] = []
self.cache[key].append(val)
if len(self.cache[key]) == self.interval:
agg_val = getattr(np, stat)(self.cache[key])
self.summary_writer.add_scalar(key, agg_val, step)
del self.cache[key]
def log_grads(self, step, model):
if self.enabled:
norms = [p.grad.norm().item() for p in model.parameters()
if p.grad is not None]
for stat in ('max', 'min', 'mean'):
self.log_value(step, f'grad_{stat}', getattr(np, stat)(norms),
stat=stat)
def unique_log_fpath(log_fpath):
if not os.path.isfile(log_fpath):
return log_fpath
# Avoid overwriting old logs
saved = sorted([int(re.search(r'\.(\d+)', f).group(1))
for f in glob.glob(f'{log_fpath}.*')])
log_num = (saved[-1] if saved else 0) + 1
return f'{log_fpath}.{log_num}'
def stdout_step_format(step):
if isinstance(step, str):
return step
fields = []
if len(step) > 0:
fields.append("epoch {:>4}".format(step[0]))
if len(step) > 1:
fields.append("iter {:>4}".format(step[1]))
if len(step) > 2:
fields[-1] += "/{}".format(step[2])
return " | ".join(fields)
def stdout_metric_format(metric, metadata, value):
name = metadata.get("name", metric + " : ")
unit = metadata.get("unit", None)
format = f'{{{metadata.get("format", "")}}}'
fields = [name, format.format(value) if value is not None else value, unit]
fields = [f for f in fields if f is not None]
return "| " + " ".join(fields)
def init_log(args):
enabled = (args.local_rank == 0)
if enabled:
fpath = args.log_file or os.path.join(args.output_dir, 'nvlog.json')
backends = [JSONStreamBackend(Verbosity.DEFAULT,
unique_log_fpath(fpath)),
StdOutBackend(Verbosity.VERBOSE,
step_format=stdout_step_format,
metric_format=stdout_metric_format)]
else:
backends = []
dllogger.init(backends=backends)
dllogger.metadata("train_lrate", {"name": "lrate", "format": ":>3.2e"})
for id_, pref in [('train', ''), ('train_avg', 'avg train '),
('dev', ' avg dev '), ('dev_ema', ' EMA dev ')]:
dllogger.metadata(f"{id_}_loss",
{"name": f"{pref}loss", "format": ":>7.2f"})
dllogger.metadata(f"{id_}_wer",
{"name": f"{pref}wer", "format": ":>6.2f"})
dllogger.metadata(f"{id_}_throughput",
{"name": f"{pref}utts/s", "format": ":>5.0f"})
dllogger.metadata(f"{id_}_took",
{"name": "took", "unit": "s", "format": ":>5.2f"})
tb_subsets = ['train', 'dev', 'dev_ema'] if args.ema else ['train', 'dev']
global tb_loggers
tb_loggers = {s: TBLogger(enabled, args.output_dir, name=s)
for s in tb_subsets}
log_parameters(vars(args), tb_subset='train')
def log(step, tb_total_steps=None, subset='train', data={}):
if tb_total_steps is not None:
tb_loggers[subset].log(tb_total_steps, data)
if subset != '':
data = {f'{subset}_{key}': val for key, val in data.items()}
dllogger.log(step, data=data)
def log_grads_tb(tb_total_steps, grads, tb_subset='train'):
tb_loggers[tb_subset].log_grads(tb_total_steps, grads)
def log_parameters(data, verbosity=0, tb_subset=None):
for k, v in data.items():
dllogger.log(step="PARAMETER", data={k: v}, verbosity=verbosity)
if tb_subset is not None and tb_loggers[tb_subset].enabled:
tb_data = {k: v for k, v in data.items()
if type(v) in (str, bool, int, float)}
tb_loggers[tb_subset].summary_writer.add_hparams(tb_data, {})
def flush_log():
dllogger.flush()
for tbl in tb_loggers.values():
if tbl.enabled:
tbl.summary_writer.flush()
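A rough usage sketch of the helpers above; the namespace fields are the minimal set that init_log() reads and are assumed here, not taken from the training script.
from types import SimpleNamespace

args = SimpleNamespace(local_rank=0, log_file=None, output_dir='.', ema=False)
init_log(args)
for it in range(1, 4):
    log(step=(1, it, 3), tb_total_steps=it, subset='train',
        data={'loss': 1.23, 'lrate': 1e-3})
flush_log()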

View file

@@ -0,0 +1,19 @@
Copyright (c) 2017 Keith Ito
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View file

@@ -0,0 +1,32 @@
# Copyright (c) 2017 Keith Ito
""" from https://github.com/keithito/tacotron """
import re
import string
from . import cleaners
def _clean_text(text, cleaner_names, *args):
for name in cleaner_names:
cleaner = getattr(cleaners, name)
if not cleaner:
raise Exception('Unknown cleaner: %s' % name)
text = cleaner(text, *args)
return text
def punctuation_map(labels):
# Punctuation to remove
punctuation = string.punctuation
punctuation = punctuation.replace("+", "")
punctuation = punctuation.replace("&", "")
# TODO We might also want to consider:
# @ -> at
# # -> number, pound, hashtag
# ~ -> tilde
# _ -> underscore
# % -> percent
# If a punctuation symbol is inside our vocab, we do not remove it from the text
for l in labels:
punctuation = punctuation.replace(l, "")
# Turn all punctuation to whitespace
table = str.maketrans(punctuation, " " * len(punctuation))
return table
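Illustrative example of the resulting table, assuming a lowercase character vocabulary: in-vocabulary symbols (here the apostrophe) plus '+' and '&' survive, everything else maps to a space.
labels = [" ", "'"] + list("abcdefghijklmnopqrstuvwxyz")
table = punctuation_map(labels)
print("it's 5+5, ok & fine!".translate(table))   # -> "it's 5+5  ok & fine "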

View file

@@ -0,0 +1,107 @@
# Copyright (c) 2017 Keith Ito
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" from https://github.com/keithito/tacotron
Modified to add punctuation removal
"""
'''
Cleaners are transformations that run over the input text at both training and eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
'''
import re
from unidecode import unidecode
from .numbers import normalize_numbers
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'),
]]
def expand_abbreviations(text):
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text
def expand_numbers(text):
return normalize_numbers(text)
def lowercase(text):
return text.lower()
def collapse_whitespace(text):
return re.sub(_whitespace_re, ' ', text)
def convert_to_ascii(text):
return unidecode(text)
def remove_punctuation(text, table):
text = text.translate(table)
text = re.sub(r'&', " and ", text)
text = re.sub(r'\+', " plus ", text)
return text
def basic_cleaners(text):
'''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
text = lowercase(text)
text = collapse_whitespace(text)
return text
def transliteration_cleaners(text):
'''Pipeline for non-English text that transliterates to ASCII.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text
def english_cleaners(text, table=None):
'''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = expand_numbers(text)
text = expand_abbreviations(text)
if table is not None:
text = remove_punctuation(text, table)
text = collapse_whitespace(text)
return text
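A rough end-to-end example of the English pipeline above (no punctuation table passed); the expected output is approximate.
print(english_cleaners("Dr. Müller bought 2 books for $3.50."))
# roughly: "doctor muller bought two books for three dollars, fifty cents."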

View file

@@ -0,0 +1,99 @@
# Copyright (c) 2017 Keith Ito
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" from https://github.com/keithito/tacotron
Modified to add support for time and slight tweaks to _expand_number
"""
import inflect
import re
_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')
_time_re = re.compile(r'([0-9]{1,2}):([0-9]{2})')
def _remove_commas(m):
return m.group(1).replace(',', '')
def _expand_decimal_point(m):
return m.group(1).replace('.', ' point ')
def _expand_dollars(m):
match = m.group(1)
parts = match.split('.')
if len(parts) > 2:
return match + ' dollars' # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = 'dollar' if dollars == 1 else 'dollars'
return '%s %s' % (dollars, dollar_unit)
elif cents:
cent_unit = 'cent' if cents == 1 else 'cents'
return '%s %s' % (cents, cent_unit)
else:
return 'zero dollars'
def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))
def _expand_number(m):
if int(m.group(0)[0]) == 0:
return _inflect.number_to_words(m.group(0), andword='', group=1)
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return 'two thousand'
elif num > 2000 and num < 2010:
return 'two thousand ' + _inflect.number_to_words(num % 100)
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + ' hundred'
else:
return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
# Add check for number phones and other large numbers
elif num > 1000000000 and num % 10000 != 0:
return _inflect.number_to_words(num, andword='', group=1)
else:
return _inflect.number_to_words(num, andword='')
def _expand_time(m):
mins = int(m.group(2))
if mins == 0:
return _inflect.number_to_words(m.group(1))
return " ".join([_inflect.number_to_words(m.group(1)), _inflect.number_to_words(m.group(2))])
def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r'\1 pounds', text)
text = re.sub(_dollars_re, _expand_dollars, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
text = re.sub(_time_re, _expand_time, text)
return text
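Illustrative only: commas are stripped, while ordinals and plain numbers are spelled out.
print(normalize_numbers("the 3rd of 1,000 entries"))
# roughly: "the third of one thousand entries"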

View file

@@ -0,0 +1,19 @@
# Copyright (c) 2017 Keith Ito
""" from https://github.com/keithito/tacotron """
'''
Defines the set of symbols used in text input to the model.
The default is a set of ASCII characters that works well for English or text
that has been run through Unidecode. For other data, you can modify _characters.
See TRAINING_DATA.md for details.
'''
from . import cmudict
_pad = '_'
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + s for s in cmudict.valid_symbols]
# Export all symbols:
symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet

View file

@@ -0,0 +1,20 @@
import numpy as np
class BenchmarkStats:
""" Tracks statistics used for benchmarking. """
def __init__(self):
self.utts = []
self.times = []
self.losses = []
def update(self, utts, times, losses):
self.utts.append(utts)
self.times.append(times)
self.losses.append(losses)
def get(self, n_epochs):
throughput = sum(self.utts[-n_epochs:]) / sum(self.times[-n_epochs:])
return {'throughput': throughput, 'benchmark_epochs_num': n_epochs,
'loss': np.mean(self.losses[-n_epochs:])}
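A quick sketch with made-up numbers: feed per-epoch utterance counts, wall-clock times and losses, then report throughput over the last two epochs.
stats = BenchmarkStats()
stats.update(utts=1200, times=60.0, losses=0.85)
stats.update(utts=1180, times=58.5, losses=0.80)
print(stats.get(n_epochs=2))
# roughly: {'throughput': 20.1, 'benchmark_epochs_num': 2, 'loss': 0.825}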

View file

@@ -0,0 +1,151 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "QuartzNet"
labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
input_val:
audio_dataset: &val_dataset
sample_rate: &sample_rate 16000
trim_silence: true
normalize_transcripts: true
filterbank_features: &val_features
normalize: per_feature
sample_rate: *sample_rate
window_size: 0.02
window_stride: 0.01
window: hann
n_filt: &n_filt 64
n_fft: 512
frame_splicing: &frame_splicing 1
dither: 0.00001
pad_align: 16
# For training we keep samples < 16.7s and apply augmentation
input_train:
audio_dataset:
<<: *val_dataset
max_duration: 16.7
ignore_offline_speed_perturbation: true
speed_perturbation:
min_rate: 0.85
max_rate: 1.15
filterbank_features:
<<: *val_features
max_duration: 16.7
spec_augment:
freq_masks: 2
max_freq: 15
time_masks: 2
max_time: 55
quartznet:
encoder:
init: xavier_uniform
in_feats: *n_filt
frame_splicing: *frame_splicing
activation: relu
use_conv_masks: true
blocks:
- &Conv1
filters: 256
repeat: 1
kernel_size: [33]
dilation: [1]
stride: [2]
dropout: 0.0
residual: false
separable: true
- &B1
filters: 256
repeat: 5
kernel_size: [33]
dilation: [1]
stride: [1]
dropout: 0.0
residual: true
separable: true
- *B1
- *B1
- &B2
filters: 256
repeat: 5
kernel_size: [39]
dilation: [1]
stride: [1]
dropout: 0.0
residual: true
separable: true
- *B2
- *B2
- &B3
filters: 512
repeat: 5
kernel_size: [51]
dilation: [1]
stride: [1]
dropout: 0.0
residual: true
separable: true
- *B3
- *B3
- &B4
filters: 512
repeat: 5
kernel_size: [63]
dilation: [1]
stride: [1]
dropout: 0.0
residual: true
separable: true
- *B4
- *B4
- &B5
filters: 512
repeat: 5
kernel_size: [75]
dilation: [1]
stride: [1]
dropout: 0.0
residual: true
separable: true
- *B5
- *B5
- &Conv2
filters: 512
repeat: 1
kernel_size: [87]
dilation: [2]
stride: [1]
dropout: 0.0
residual: false
separable: true
- &Conv3
filters: &enc_feats 1024
repeat: 1
kernel_size: [1]
dilation: [1]
stride: [1]
dropout: 0.0
residual: false
separable: false
decoder:
in_feats: *enc_feats
init: xavier_uniform

View file

@@ -0,0 +1,151 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "QuartzNet"
labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
input_val:
audio_dataset: &val_dataset
sample_rate: &sample_rate 16000
trim_silence: true
normalize_transcripts: true
filterbank_features: &val_features
normalize: per_feature
sample_rate: *sample_rate
window_size: 0.02
window_stride: 0.01
window: hann
n_filt: &n_filt 64
n_fft: 512
frame_splicing: &frame_splicing 1
dither: 0.00001
pad_align: 16
# For training we keep samples < 16.7s and apply augmentation
input_train:
audio_dataset:
<<: *val_dataset
max_duration: 16.7
ignore_offline_speed_perturbation: true
speed_perturbation:
min_rate: 0.85
max_rate: 1.15
filterbank_features:
<<: *val_features
max_duration: 16.7
spec_augment:
freq_masks: 2
max_freq: 20
time_masks: 2
max_time: 75
quartznet:
encoder:
init: xavier_uniform
in_feats: *n_filt
frame_splicing: *frame_splicing
activation: relu
use_conv_masks: true
blocks:
- &Conv1
filters: 256
repeat: 1
kernel_size: [33]
dilation: [1]
stride: [2]
dropout: 0.2
residual: false
separable: true
- &B1
filters: 256
repeat: 5
kernel_size: [33]
dilation: [1]
stride: [1]
dropout: 0.2
residual: true
separable: true
- *B1
- *B1
- &B2
filters: 256
repeat: 5
kernel_size: [39]
dilation: [1]
stride: [1]
dropout: 0.2
residual: true
separable: true
- *B2
- *B2
- &B3
filters: 512
repeat: 5
kernel_size: [51]
dilation: [1]
stride: [1]
dropout: 0.2
residual: true
separable: true
- *B3
- *B3
- &B4
filters: 512
repeat: 5
kernel_size: [63]
dilation: [1]
stride: [1]
dropout: 0.2
residual: true
separable: true
- *B4
- *B4
- &B5
filters: 512
repeat: 5
kernel_size: [75]
dilation: [1]
stride: [1]
dropout: 0.2
residual: true
separable: true
- *B5
- *B5
- &Conv2
filters: 512
repeat: 1
kernel_size: [87]
dilation: [2]
stride: [1]
dropout: 0.2
residual: false
separable: true
- &Conv3
filters: &enc_feats 1024
repeat: 1
kernel_size: [1]
dilation: [1]
stride: [1]
dropout: 0.2
residual: false
separable: false
decoder:
in_feats: *enc_feats
init: xavier_uniform

Binary image file added (117 KiB); not shown.

Binary image file added (37 KiB); not shown.

View file

@@ -0,0 +1,390 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import math
import os
import random
import time
from heapq import nlargest
from itertools import chain, repeat
from pathlib import Path
from tqdm import tqdm
import dllogger
import torch
import numpy as np
import torch.distributed as distrib
from dllogger import JSONStreamBackend, StdOutBackend, Verbosity
from quartznet import config
from common import helpers
from common.dali.data_loader import DaliDataLoader
from common.dataset import (AudioDataset, FilelistDataset, get_data_loader,
SingleAudioDataset)
from common.features import BaseFeatures, FilterbankFeatures
from common.helpers import print_once, process_evaluation_epoch
from quartznet.model import GreedyCTCDecoder, QuartzNet
from common.tb_dllogger import stdout_metric_format, unique_log_fpath
def get_parser():
parser = argparse.ArgumentParser(description='QuartzNet inference')
parser.add_argument('--batch_size', default=16, type=int,
help='Data batch size')
parser.add_argument('--steps', default=0, type=int,
help='Eval this many steps for every worker')
parser.add_argument('--warmup_steps', default=0, type=int,
help='Burn-in period before measuring latencies')
parser.add_argument('--model_config', type=str, required=True,
help='Relative model config path given dataset folder')
parser.add_argument('--dataset_dir', type=str,
help='Absolute path to dataset folder')
parser.add_argument('--val_manifests', type=str, nargs='+',
help='Relative path to evaluation dataset manifest files')
parser.add_argument('--ckpt', default=None, type=str,
help='Path to model checkpoint')
parser.add_argument('--amp', '--fp16', action='store_true',
help='Use FP16 precision')
parser.add_argument('--cudnn_benchmark', action='store_true',
help='Enable cudnn benchmark')
parser.add_argument('--cpu', action='store_true',
help='Run inference on CPU')
parser.add_argument("--seed", default=None, type=int, help='Random seed')
parser.add_argument('--local_rank', default=os.getenv('LOCAL_RANK', 0),
type=int, help='GPU id used for distributed training')
io = parser.add_argument_group('feature and checkpointing setup')
io.add_argument('--dali_device', type=str, choices=['none', 'cpu', 'gpu'],
default='gpu', help='Use DALI pipeline for fast data processing')
io.add_argument('--save_predictions', type=str, default=None,
help='Save predictions in text form at this location')
io.add_argument('--save_logits', default=None, type=str,
help='Save output logits under specified path')
io.add_argument('--transcribe_wav', type=str,
help='Path to a single .wav file (16KHz)')
io.add_argument('--transcribe_filelist', type=str,
help='Path to a filelist with one .wav path per line')
io.add_argument('-o', '--output_dir', default='results/',
help='Output folder to save audio (file per phrase)')
io.add_argument('--log_file', type=str, default=None,
help='Path to a DLLogger log file')
io.add_argument('--ema', action='store_true',
help='Load averaged model weights')
io.add_argument('--torchscript', action='store_true',
help='Evaluate with a TorchScripted model')
io.add_argument('--torchscript_export', action='store_true',
help='Export the model with torch.jit to the output_dir')
io.add_argument('--override_config', type=str, action='append',
help='Overrides arbitrary config value.'
' Syntax: `--override_config nested.config.key=val`.')
return parser
def durs_to_percentiles(durations, ratios):
durations = np.asarray(durations) * 1000 # in ms
latency = durations
latency = latency[5:]
mean_latency = np.mean(latency)
latency_worst = nlargest(math.ceil((1 - min(ratios)) * len(latency)), latency)
latency_ranges = get_percentile(ratios, latency_worst, len(latency))
latency_ranges[0.5] = mean_latency
return latency_ranges
def get_percentile(ratios, arr, nsamples):
res = {}
for a in ratios:
idx = max(int(nsamples * (1 - a)), 0)
res[a] = arr[idx]
return res
def torchscript_export(data_loader, audio_processor, model, greedy_decoder,
output_dir, use_amp, use_conv_masks, model_config, device,
save):
audio_processor.to(device)
for batch in data_loader:
batch = [t.to(device, non_blocking=True) for t in batch]
audio, audio_len, _, _ = batch
feats, feat_lens = audio_processor(audio, audio_len)
break
print("\nExporting featurizer...")
print("\nNOTE: Dithering causes warnings about non-determinism.\n")
ts_feat = torch.jit.trace(audio_processor, (audio, audio_len))
print("\nExporting acoustic model...")
model(feats, feat_lens)
ts_acoustic = torch.jit.trace(model, (feats, feat_lens))
print("\nExporting decoder...")
log_probs = model(feats, feat_lens)
ts_decoder = torch.jit.script(greedy_decoder, log_probs)
print("\nJIT export complete.")
if save:
precision = "fp16" if use_amp else "fp32"
module_name = f'{os.path.basename(model_config)}_{precision}'
ts_feat.save(os.path.join(output_dir, module_name + "_feat.pt"))
ts_acoustic.save(os.path.join(output_dir, module_name + "_acoustic.pt"))
ts_decoder.save(os.path.join(output_dir, module_name + "_decoder.pt"))
return ts_feat, ts_acoustic, ts_decoder
def main():
parser = get_parser()
args = parser.parse_args()
log_fpath = args.log_file or str(Path(args.output_dir, 'nvlog_infer.json'))
log_fpath = unique_log_fpath(log_fpath)
dllogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, log_fpath),
StdOutBackend(Verbosity.VERBOSE,
metric_format=stdout_metric_format)])
[dllogger.log("PARAMETER", {k: v}) for k, v in vars(args).items()]
for step in ['DNN', 'data+DNN', 'data']:
for c in [0.99, 0.95, 0.9, 0.5]:
cs = 'avg' if c == 0.5 else f'{int(100*c)}%'
dllogger.metadata(f'{step.lower()}_latency_{c}',
{'name': f'{step} latency {cs}',
'format': ':>7.2f', 'unit': 'ms'})
dllogger.metadata(
'eval_wer', {'name': 'WER', 'format': ':>3.2f', 'unit': '%'})
if args.cpu:
device = torch.device('cpu')
else:
assert torch.cuda.is_available()
device = torch.device('cuda')
torch.backends.cudnn.benchmark = args.cudnn_benchmark
if args.seed is not None:
torch.manual_seed(args.seed + args.local_rank)
np.random.seed(args.seed + args.local_rank)
random.seed(args.seed + args.local_rank)
# set up distributed training
multi_gpu = not args.cpu and int(os.environ.get('WORLD_SIZE', 1)) > 1
if multi_gpu:
torch.cuda.set_device(args.local_rank)
distrib.init_process_group(backend='nccl', init_method='env://')
print_once(f'Inference with {distrib.get_world_size()} GPUs')
cfg = config.load(args.model_config)
config.apply_config_overrides(cfg, args)
symbols = helpers.add_ctc_blank(cfg['labels'])
use_dali = args.dali_device in ('cpu', 'gpu')
dataset_kw, features_kw = config.input(cfg, 'val')
measure_perf = args.steps > 0
# dataset
if args.transcribe_wav or args.transcribe_filelist:
if use_dali:
print("DALI supported only with input .json files; disabling")
use_dali = False
assert not args.pad_to_max_duration
assert not (args.transcribe_wav and args.transcribe_filelist)
if args.transcribe_wav:
dataset = SingleAudioDataset(args.transcribe_wav)
else:
dataset = FilelistDataset(args.transcribe_filelist)
data_loader = get_data_loader(dataset,
batch_size=1,
multi_gpu=multi_gpu,
shuffle=False,
num_workers=0,
drop_last=(True if measure_perf else False))
_, features_kw = config.input(cfg, 'val')
feat_proc = FilterbankFeatures(**features_kw)
elif use_dali:
# pad_to_max_duration is not supported by DALI - have simple padders
if features_kw['pad_to_max_duration']:
feat_proc = BaseFeatures(
pad_align=features_kw['pad_align'],
pad_to_max_duration=True,
max_duration=features_kw['max_duration'],
sample_rate=features_kw['sample_rate'],
window_size=features_kw['window_size'],
window_stride=features_kw['window_stride'])
features_kw['pad_to_max_duration'] = False
else:
feat_proc = None
data_loader = DaliDataLoader(
gpu_id=args.local_rank or 0,
dataset_path=args.dataset_dir,
config_data=dataset_kw,
config_features=features_kw,
json_names=args.val_manifests,
batch_size=args.batch_size,
pipeline_type=("train" if measure_perf else "val"), # no drop_last
device_type=args.dali_device,
symbols=symbols)
else:
dataset = AudioDataset(args.dataset_dir,
args.val_manifests,
symbols,
**dataset_kw)
data_loader = get_data_loader(dataset,
args.batch_size,
multi_gpu=multi_gpu,
shuffle=False,
num_workers=4,
drop_last=False)
feat_proc = FilterbankFeatures(**features_kw)
model = QuartzNet(encoder_kw=config.encoder(cfg),
decoder_kw=config.decoder(cfg, n_classes=len(symbols)))
if args.ckpt is not None:
print(f'Loading the model from {args.ckpt} ...')
checkpoint = torch.load(args.ckpt, map_location="cpu")
key = 'ema_state_dict' if args.ema else 'state_dict'
state_dict = checkpoint[key]
model.load_state_dict(state_dict, strict=True)
model.to(device)
model.eval()
if feat_proc is not None:
feat_proc.to(device)
feat_proc.eval()
if args.amp:
model = model.half()
if args.torchscript:
greedy_decoder = GreedyCTCDecoder()
feat_proc, model, greedy_decoder = torchscript_export(
data_loader, feat_proc, model, greedy_decoder, args.output_dir,
use_amp=args.amp, use_conv_masks=True, model_config=args.model_config,
device=device, save=args.torchscript_export)
if multi_gpu:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank)
agg = {'txts': [], 'preds': [], 'logits': []}
dur = {'data': [], 'dnn': [], 'data+dnn': []}
looped_loader = chain.from_iterable(repeat(data_loader))
greedy_decoder = GreedyCTCDecoder()
sync = lambda: torch.cuda.synchronize() if device.type == 'cuda' else None
steps = args.steps + args.warmup_steps or len(data_loader)
with torch.no_grad():
for it, batch in enumerate(tqdm(looped_loader, initial=1, total=steps)):
if use_dali:
feats, feat_lens, txt, txt_lens = batch
if feat_proc is not None:
feats, feat_lens = feat_proc(feats, feat_lens)
else:
batch = [t.to(device, non_blocking=True) for t in batch]
audio, audio_lens, txt, txt_lens = batch
feats, feat_lens = feat_proc(audio, audio_lens)
sync()
t1 = time.perf_counter()
if args.amp:
feats = feats.half()
if model.encoder.use_conv_masks:
log_probs, log_prob_lens = model(feats, feat_lens)
else:
log_probs = model(feats, feat_lens)
preds = greedy_decoder(log_probs)
sync()
t2 = time.perf_counter()
# burn-in period; wait for a new loader due to num_workers
if it >= 1 and (args.steps == 0 or it >= args.warmup_steps):
dur['data'].append(t1 - t0)
dur['dnn'].append(t2 - t1)
dur['data+dnn'].append(t2 - t0)
if txt is not None:
agg['txts'] += helpers.gather_transcripts([txt], [txt_lens],
symbols)
agg['preds'] += helpers.gather_predictions([preds], symbols)
agg['logits'].append(log_probs)
if it + 1 == steps:
break
sync()
t0 = time.perf_counter()
# communicate the results
if args.transcribe_wav:
for idx, p in enumerate(agg['preds']):
print_once(f'Prediction {idx+1: >3}: {p}')
elif args.transcribe_filelist:
pass
elif not multi_gpu or distrib.get_rank() == 0:
wer, _ = process_evaluation_epoch(agg)
dllogger.log(step=(), data={'eval_wer': 100 * wer})
if args.save_predictions:
with open(args.save_predictions, 'w') as f:
f.write('\n'.join(agg['preds']))
if args.save_logits:
logits = torch.cat(agg['logits'], dim=0).cpu()
torch.save(logits, args.save_logits)
# report timings
if len(dur['data']) >= 20:
ratios = [0.9, 0.95, 0.99]
for stage in dur:
lat = durs_to_percentiles(dur[stage], ratios)
for k in [0.99, 0.95, 0.9, 0.5]:
kk = str(k).replace('.', '_')
dllogger.log(step=(), data={f'{stage.lower()}_latency_{kk}': lat[k]})
else:
print_once('Not enough samples to measure latencies.')
if __name__ == "__main__":
main()

View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -a
: ${NUM_GPUS:=16}
: ${GPU_BATCH_SIZE:=36}
: ${GRAD_ACCUMULATION:=2}
: ${AMP:=true}
bash scripts/train.sh "$@"

View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -a
: ${NUM_GPUS:=8}
: ${GPU_BATCH_SIZE:=36}
: ${GRAD_ACCUMULATION:=4}
: ${AMP:=true}
bash scripts/train.sh "$@"

View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -a
: ${NUM_GPUS:=16}
: ${GPU_BATCH_SIZE:=36}
: ${GRAD_ACCUMULATION:=2}
: ${AMP:=false}
bash scripts/train.sh "$@"

View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -a
: ${NUM_GPUS:=8}
: ${GPU_BATCH_SIZE:=36}
: ${GRAD_ACCUMULATION:=4}
: ${AMP:=false}
bash scripts/train.sh "$@"

View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -a
: ${NUM_GPUS:=8}
: ${GPU_BATCH_SIZE:=72}
: ${GRAD_ACCUMULATION:=2}
: ${AMP:=true}
bash scripts/train.sh "$@"

View file

@@ -0,0 +1,10 @@
#!/bin/bash
set -a
: ${NUM_GPUS:=8}
: ${GPU_BATCH_SIZE:=72}
: ${GRAD_ACCUMULATION:=2}
: ${AMP:=false}
bash scripts/train.sh "$@"

View file

@@ -0,0 +1,140 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import inspect
from ast import literal_eval
from contextlib import suppress
from numbers import Number
import yaml
from common.audio import GainPerturbation, ShiftPerturbation, SpeedPerturbation
from common.dataset import AudioDataset
from common.features import (CutoutAugment, FilterbankFeatures, SpecAugment)
from quartznet.model import JasperDecoderForCTC, JasperBlock, JasperEncoder
def default_args(klass):
sig = inspect.signature(klass.__init__)
return {k: v.default for k, v in sig.parameters.items() if k != 'self'}
def load(fpath):
cfg = yaml.safe_load(open(fpath, 'r'))
# Reload to deep copy shallow copies, which were made with yaml anchors
yaml.Dumper.ignore_aliases = lambda *args: True
cfg = yaml.dump(cfg)
cfg = yaml.safe_load(cfg)
return cfg
def validate_and_fill(klass, user_conf, ignore_unk=[], optional=[]):
conf = default_args(klass)
for k, v in user_conf.items():
assert k in conf or k in ignore_unk, f'Unknown param {k} for {klass}'
conf[k] = v
# Keep only mandatory or optional-nonempty
conf = {k: v for k, v in conf.items()
if k not in optional or v is not inspect.Parameter.empty}
# Validate
for k, v in conf.items():
assert v is not inspect.Parameter.empty, \
f'Value for {k} not specified for {klass}'
return conf
def input(conf_yaml, split='train'):
conf = copy.deepcopy(conf_yaml[f'input_{split}'])
conf_dataset = conf.pop('audio_dataset')
conf_features = conf.pop('filterbank_features')
# Validate known inner classes
inner_classes = [
(conf_dataset, 'speed_perturbation', SpeedPerturbation),
(conf_dataset, 'gain_perturbation', GainPerturbation),
(conf_dataset, 'shift_perturbation', ShiftPerturbation),
(conf_features, 'spec_augment', SpecAugment),
(conf_features, 'cutout_augment', CutoutAugment),
]
for conf_tgt, key, klass in inner_classes:
if key in conf_tgt:
conf_tgt[key] = validate_and_fill(klass, conf_tgt[key])
for k in conf:
raise ValueError(f'Unknown key {k}')
# Validate outer classes
conf_dataset = validate_and_fill(
AudioDataset, conf_dataset,
optional=['data_dir', 'labels', 'manifest_fpaths'])
# klass = feature_class(conf_features['feature_type'])
# conf_features = validate_and_fill(
# klass, conf_features, ignore_unk=['feature_type'])
conf_features = validate_and_fill(
FilterbankFeatures, conf_features) # , ignore_unk=['feature_type'])
# Check params shared between classes
shared = ['sample_rate', 'max_duration', 'pad_to_max_duration']
for sh in shared:
assert conf_dataset[sh] == conf_features[sh], (
f'{sh} should match in Dataset and FeatureProcessor: '
f'{conf_dataset[sh]}, {conf_features[sh]}')
return conf_dataset, conf_features
def encoder(conf):
"""Validate config for JasperEncoder and subsequent JasperBlocks"""
# Validate, but don't overwrite with defaults
for blk in conf['quartznet']['encoder']['blocks']:
validate_and_fill(JasperBlock, blk, optional=['infilters'],
ignore_unk=['residual_dense'])
return validate_and_fill(JasperEncoder, conf['quartznet']['encoder'])
def decoder(conf, n_classes):
decoder_kw = {'n_classes': n_classes, **conf['quartznet']['decoder']}
return validate_and_fill(JasperDecoderForCTC, decoder_kw)
def apply_config_overrides(conf, args):
if args.override_config is None:
return
for override_key_val in args.override_config:
key, val = override_key_val.split('=')
with suppress(TypeError, ValueError):
val = literal_eval(val)
apply_nested_config_override(conf, key, val)
def apply_nested_config_override(conf, key_str, val):
fields = key_str.split('.')
for f in fields[:-1]:
conf = conf[f]
f = fields[-1]
assert (f not in conf
or type(val) is type(conf[f])
or (isinstance(val, Number) and isinstance(conf[f], Number)))
conf[f] = val
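Illustrative sketch of the helpers above, mirroring what `--override_config input_val.audio_dataset.max_duration=...` resolves to in scripts/inference.sh; the YAML path is the default config referenced there.
cfg = load('configs/quartznet15x5_speedp-online-1.15_speca.yaml')
apply_nested_config_override(cfg, 'input_val.audio_dataset.max_duration', 20.0)
apply_nested_config_override(cfg, 'input_val.filterbank_features.max_duration', 20.0)
dataset_kw, features_kw = input(cfg, 'val')   # validated kwargs for dataset/features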

View file

@@ -0,0 +1,391 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
import torch.nn.functional as F
activations = {
"hardtanh": nn.Hardtanh,
"relu": nn.ReLU,
"selu": nn.SELU,
}
def init_weights(m, mode='xavier_uniform'):
if type(m) == nn.Conv1d or type(m) == MaskedConv1d:
if mode == 'xavier_uniform':
nn.init.xavier_uniform_(m.weight, gain=1.0)
elif mode == 'xavier_normal':
nn.init.xavier_normal_(m.weight, gain=1.0)
elif mode == 'kaiming_uniform':
nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
elif mode == 'kaiming_normal':
nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
else:
raise ValueError("Unknown Initialization mode: {0}".format(mode))
elif type(m) == nn.BatchNorm1d:
if m.track_running_stats:
m.running_mean.zero_()
m.running_var.fill_(1)
m.num_batches_tracked.zero_()
if m.affine:
nn.init.ones_(m.weight)
nn.init.zeros_(m.bias)
def compute_new_kernel_size(kernel_size, kernel_width):
new_kernel_size = max(int(kernel_size * kernel_width), 1)
# If kernel is even shape, round up to make it odd
if new_kernel_size % 2 == 0:
new_kernel_size += 1
return new_kernel_size
def get_same_padding(kernel_size, stride, dilation):
if stride > 1 and dilation > 1:
raise ValueError("Only stride OR dilation may be greater than 1")
return (kernel_size // 2) * dilation
class GroupShuffle(nn.Module):
def __init__(self, groups, channels):
super(GroupShuffle, self).__init__()
self.groups = groups
self.channels_per_group = channels // groups
def forward(self, x):
sh = x.shape
x = x.view(-1, self.groups, self.channels_per_group, sh[-1])
x = torch.transpose(x, 1, 2).contiguous()
x = x.view(-1, self.groups * self.channels_per_group, sh[-1])
return x
class MaskedConv1d(nn.Conv1d):
"""1D convolution with sequence masking
"""
__constants__ = ["masked"]
def __init__(self, in_channels, out_channels, kernel_size, stride=1,
padding=0, dilation=1, groups=1, bias=False, use_mask=True,
heads=-1):
# Jasper refactor compat
assert heads == -1 # Unsupported
masked = use_mask
super(MaskedConv1d, self).__init__(
in_channels, out_channels, kernel_size, stride=stride,
padding=padding, dilation=dilation, groups=groups, bias=bias)
self.masked = masked
def get_seq_len(self, lens):
pad, ks = self.padding[0], self.kernel_size[0]
return torch.div(lens + 2 * pad - self.dilation[0] * (ks - 1) - 1,
self.stride[0], rounding_mode='trunc') + 1
def forward(self, x, x_lens=None):
if self.masked:
max_len = x.size(2)
idxs = torch.arange(max_len, dtype=x_lens.dtype, device=x.device)
mask = idxs.expand(x_lens.size(0), max_len) >= x_lens.unsqueeze(1)
x = x.masked_fill(mask.unsqueeze(1).to(device=x.device), 0)
x_lens = self.get_seq_len(x_lens)
return super(MaskedConv1d, self).forward(x), x_lens
class JasperBlock(nn.Module):
__constants__ = ["conv_mask", "separable", "res", "mconv"]
def __init__(self, infilters, filters, repeat=3, kernel_size=11,
kernel_size_factor=1, stride=1, dilation=1, padding='same',
dropout=0.2, activation=None, residual=True, groups=1,
separable=False, heads=-1, normalization="batch",
norm_groups=1, residual_panes=[], use_conv_masks=False):
super(JasperBlock, self).__init__()
# Fix params being passed as list, but default to ints
wrap = lambda v: [v] if type(v) is int else v
kernel_size = wrap(kernel_size)
dilation = wrap(dilation)
padding = wrap(padding)
stride = wrap(stride)
if padding != "same":
raise ValueError("currently only 'same' padding is supported")
kernel_size_factor = float(kernel_size_factor)
if type(kernel_size) in (list, tuple):
kernel_size = [compute_new_kernel_size(k, kernel_size_factor)
for k in kernel_size]
else:
kernel_size = compute_new_kernel_size(kernel_size,
kernel_size_factor)
padding_val = get_same_padding(kernel_size[0], stride[0], dilation[0])
self.conv_mask = use_conv_masks
self.separable = separable
infilters_loop = infilters
conv = nn.ModuleList()
for _ in range(repeat - 1):
conv.extend(
self._get_conv_bn_layer(
infilters_loop, filters, kernel_size=kernel_size,
stride=stride, dilation=dilation, padding=padding_val,
groups=groups, heads=heads, separable=separable,
normalization=normalization, norm_groups=norm_groups)
)
conv.extend(self._get_act_dropout_layer(drop_prob=dropout,
activation=activation))
infilters_loop = filters
conv.extend(
self._get_conv_bn_layer(
infilters_loop, filters, kernel_size=kernel_size, stride=stride,
dilation=dilation, padding=padding_val, groups=groups,
heads=heads, separable=separable, normalization=normalization,
norm_groups=norm_groups)
)
self.mconv = conv
res_panes = residual_panes.copy()
self.dense_residual = residual
if residual:
res_list = nn.ModuleList()
if len(residual_panes) == 0:
res_panes = [infilters]
self.dense_residual = False
for ip in res_panes:
res_list.append(nn.ModuleList(
self._get_conv_bn_layer(ip, filters, kernel_size=1,
normalization=normalization,
norm_groups=norm_groups, stride=[1])
))
self.res = res_list
else:
self.res = None
self.mout = nn.Sequential(*self._get_act_dropout_layer(
drop_prob=dropout, activation=activation))
def _get_conv(self, in_channels, out_channels, kernel_size=11, stride=1,
dilation=1, padding=0, bias=False, groups=1, heads=-1,
separable=False):
kw = {'in_channels': in_channels, 'out_channels': out_channels,
'kernel_size': kernel_size, 'stride': stride, 'dilation': dilation,
'padding': padding, 'bias': bias, 'groups': groups}
if self.conv_mask:
return MaskedConv1d(**kw, heads=heads, use_mask=self.conv_mask)
else:
return nn.Conv1d(**kw)
def _get_conv_bn_layer(self, in_channels, out_channels, kernel_size=11,
stride=1, dilation=1, padding=0, bias=False,
groups=1, heads=-1, separable=False,
normalization="batch", norm_groups=1):
if norm_groups == -1:
norm_groups = out_channels
if separable:
layers = [
self._get_conv(in_channels, in_channels, kernel_size,
stride=stride, dilation=dilation, padding=padding,
bias=bias, groups=in_channels, heads=heads),
self._get_conv(in_channels, out_channels, kernel_size=1,
stride=1, dilation=1, padding=0, bias=bias,
groups=groups),
]
else:
layers = [
self._get_conv(in_channels, out_channels, kernel_size,
stride=stride, dilation=dilation,
padding=padding, bias=bias, groups=groups)
]
if normalization == "group":
layers.append(nn.GroupNorm(num_groups=norm_groups,
num_channels=out_channels))
elif normalization == "instance":
layers.append(nn.GroupNorm(num_groups=out_channels,
num_channels=out_channels))
elif normalization == "layer":
layers.append(nn.GroupNorm(num_groups=1, num_channels=out_channels))
elif normalization == "batch":
layers.append(nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.1))
else:
raise ValueError(
f"Normalization method ({normalization}) does not match"
f" one of [batch, layer, group, instance]."
)
if groups > 1:
layers.append(GroupShuffle(groups, out_channels))
return layers
def _get_act_dropout_layer(self, drop_prob=0.2, activation=None):
if activation is None:
activation = nn.Hardtanh(min_val=0.0, max_val=20.0)
layers = [activation, nn.Dropout(p=drop_prob)]
return layers
def forward(self, xs, xs_lens=None):
if not self.conv_mask:
xs_lens = 0
# compute forward convolutions
out = xs[-1]
lens = xs_lens
for i, l in enumerate(self.mconv):
# if we're doing masked convolutions, we need to pass in and
# possibly update the sequence lengths
# if (i % 4) == 0 and self.conv_mask:
if isinstance(l, MaskedConv1d):
out, lens = l(out, lens)
else:
out = l(out)
# compute the residuals
if self.res is not None:
for i, layer in enumerate(self.res):
res_out = xs[i]
for j, res_layer in enumerate(layer):
if isinstance(res_layer, MaskedConv1d):
res_out, _ = res_layer(res_out, xs_lens)
else:
res_out = res_layer(res_out)
out = out + res_out
# compute the output
out = self.mout(out)
if self.res is not None and self.dense_residual:
out = xs + [out]
else:
out = [out]
return (out, lens) if self.conv_mask else (out, None)
class JasperEncoder(nn.Module):
__constants__ = ["use_conv_masks"]
def __init__(self, in_feats, activation, frame_splicing=1,
init='xavier_uniform', use_conv_masks=False, blocks=[]):
super(JasperEncoder, self).__init__()
self.use_conv_masks = use_conv_masks
self.layers = nn.ModuleList()
in_feats *= frame_splicing
all_residual_panes = []
for i, blk in enumerate(blocks):
blk['activation'] = activations[activation]()
has_residual_dense = blk.pop('residual_dense', False)
if has_residual_dense:
all_residual_panes += [in_feats]
blk['residual_panes'] = all_residual_panes
else:
blk['residual_panes'] = []
self.layers.append(
JasperBlock(in_feats, use_conv_masks=use_conv_masks, **blk))
in_feats = blk['filters']
self.apply(lambda x: init_weights(x, mode=init))
def forward(self, x, x_lens=None):
out, out_lens = [x], x_lens
for layer in self.layers:
out, out_lens = layer(out, out_lens)
return out, out_lens
class JasperDecoderForCTC(nn.Module):
def __init__(self, in_feats, n_classes, init='xavier_uniform'):
super(JasperDecoderForCTC, self).__init__()
self.layers = nn.Sequential(
nn.Conv1d(in_feats, n_classes, kernel_size=1, bias=True),)
self.apply(lambda x: init_weights(x, mode=init))
def forward(self, enc_out):
out = self.layers(enc_out[-1]).transpose(1, 2)
return F.log_softmax(out, dim=2)
class GreedyCTCDecoder(nn.Module):
@torch.no_grad()
def forward(self, log_probs):
return log_probs.argmax(dim=-1, keepdim=False).int()
class QuartzNet(nn.Module):
def __init__(self, encoder_kw, decoder_kw, transpose_in=False):
super(QuartzNet, self).__init__()
self.transpose_in = transpose_in
self.encoder = JasperEncoder(**encoder_kw)
self.decoder = JasperDecoderForCTC(**decoder_kw)
def forward(self, x, x_lens=None):
if self.encoder.use_conv_masks:
assert x_lens is not None
enc, enc_lens = self.encoder(x, x_lens)
out = self.decoder(enc)
return out, enc_lens
else:
if self.transpose_in:
x = x.transpose(1, 2)
enc, _ = self.encoder(x)
out = self.decoder(enc)
return out # XXX torchscript refuses to output None
# TODO Explicitly add x_lens=None for inference (now x can be a Tensor or tuple)
def infer(self, x):
if self.encoder.use_conv_masks:
return self.forward(x)
else:
ret = self.forward(x[0])
return ret, len(ret)
class CTCLossNM:
def __init__(self, n_classes):
self._criterion = nn.CTCLoss(blank=n_classes-1, reduction='none')
def __call__(self, log_probs, targets, input_length, target_length):
input_length = input_length.long()
target_length = target_length.long()
targets = targets.long()
loss = self._criterion(log_probs.transpose(1, 0), targets,
input_length, target_length)
# note that this is different from reduction = 'mean'
# because we are not dividing by target lengths
return torch.mean(loss)
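A minimal shape-check sketch using a single hypothetical block rather than the released 15x5 configuration; it only demonstrates how encoder_kw/decoder_kw are consumed and how sequence lengths shrink with the stride-2 block.
import torch

encoder_kw = dict(
    in_feats=64, activation='relu', frame_splicing=1, use_conv_masks=True,
    blocks=[dict(filters=256, repeat=1, kernel_size=[33], dilation=[1],
                 stride=[2], dropout=0.0, residual=False, separable=True)])
decoder_kw = dict(in_feats=256, n_classes=29)   # 28 labels + CTC blank
model = QuartzNet(encoder_kw, decoder_kw).eval()

feats = torch.randn(4, 64, 500)                 # (batch, n_filt, time)
feat_lens = torch.full((4,), 500, dtype=torch.int)
with torch.no_grad():
    log_probs, enc_lens = model(feats, feat_lens)
print(log_probs.shape, enc_lens)                # expected (4, 250, 29), lengths 250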

View file

@@ -0,0 +1,6 @@
tqdm==4.53.0
librosa==0.8.0
soundfile
sox==1.4.1
pyyaml
git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger

View file

@@ -0,0 +1,3 @@
#!/bin/bash
docker build . --rm -t quartznet

View file

@@ -0,0 +1,24 @@
#!/bin/bash
SCRIPT_DIR=$(cd $(dirname $0); pwd)
QN_REPO=${QN_REPO:-"${SCRIPT_DIR}/../.."}
DATA_DIR=${1:-${DATA_DIR:-${QN_REPO}"/datasets"}}
CHECKPOINT_DIR=${2:-${CHECKPOINT_DIR:-${QN_REPO}"/checkpoints"}}
RESULT_DIR=${3:-${RESULT_DIR:-${QN_REPO}"/results"}}
PROGRAM_PATH=${PROGRAM_PATH}
MOUNTS=""
MOUNTS+=" -v $DATA_DIR:/datasets"
MOUNTS+=" -v $CHECKPOINT_DIR:/checkpoints"
MOUNTS+=" -v $RESULT_DIR:/results"
MOUNTS+=" -v ${QN_REPO}:/quartznet"
docker run -it --rm --gpus all \
--env PYTHONDONTWRITEBYTECODE=1 \
--shm-size=4g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
$MOUNTS \
-w /quartznet \
quartznet:latest bash $PROGRAM_PATH

View file

@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
DATA_SET="LibriSpeech"
DATA_ROOT_DIR="/datasets"
DATA_DIR="${DATA_ROOT_DIR}/${DATA_SET}"
if [ ! -d "$DATA_DIR" ]
then
mkdir --mode 755 $DATA_DIR
python utils/download_librispeech.py \
utils/librispeech.csv \
$DATA_DIR \
-e ${DATA_ROOT_DIR}/
else
echo "Directory $DATA_DIR already exists."
fi

View file

@@ -0,0 +1,21 @@
#!/bin/bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -a
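# PREDICTION_FILE defaults to an empty string here, so inference.sh does not
# write a predictions file unless the caller overrides it; all other settings
# are forwarded to the inference script.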
: ${PREDICTION_FILE:=}
bash ./scripts/inference.sh "$@"

View file

@ -0,0 +1,63 @@
#!/bin/bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
: ${DATA_DIR:=${1:-"/datasets/LibriSpeech"}}
: ${MODEL_CONFIG:=${2:-"configs/quartznet15x5_speedp-online-1.15_speca.yaml"}}
: ${OUTPUT_DIR:=${3:-"/results"}}
: ${CHECKPOINT:=${4:-"/checkpoints/quartznet_fp16.pt"}}
: ${DATASET:="test-other"}
: ${LOG_FILE:=""}
: ${CUDNN_BENCHMARK:=false}
: ${MAX_DURATION:=""}
: ${PAD_TO_MAX_DURATION:=false}
: ${NUM_GPUS:=1}
: ${NUM_STEPS:=0}
: ${NUM_WARMUP_STEPS:=0}
: ${AMP:=false}
: ${BATCH_SIZE:=64}
: ${EMA:=true}
: ${SEED:=0}
: ${DALI_DEVICE:="gpu"}
: ${CPU:=false}
: ${LOGITS_FILE:=}
: ${PREDICTION_FILE:="${OUTPUT_DIR}/${DATASET}.predictions"}
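# Illustrative invocation (values are examples; any of the variables above can
# be overridden the same way):
#   DATASET=test-clean BATCH_SIZE=32 AMP=true bash ./scripts/inference.sh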
mkdir -p "$OUTPUT_DIR"
ARGS="--dataset_dir=$DATA_DIR"
ARGS+=" --val_manifest=$DATA_DIR/librispeech-${DATASET}-wav.json"
ARGS+=" --model_config=$MODEL_CONFIG"
ARGS+=" --output_dir=$OUTPUT_DIR"
ARGS+=" --batch_size=$BATCH_SIZE"
ARGS+=" --seed=$SEED"
ARGS+=" --dali_device=$DALI_DEVICE"
ARGS+=" --steps $NUM_STEPS"
ARGS+=" --warmup_steps $NUM_WARMUP_STEPS"
[ "$AMP" = true ] && ARGS+=" --amp"
[ "$EMA" = true ] && ARGS+=" --ema"
[ "$CUDNN_BENCHMARK" = true ] && ARGS+=" --cudnn_benchmark"
[ -n "$CHECKPOINT" ] && ARGS+=" --ckpt=${CHECKPOINT}"
[ -n "$LOG_FILE" ] && ARGS+=" --log_file $LOG_FILE"
[ -n "$PREDICTION_FILE" ] && ARGS+=" --save_prediction $PREDICTION_FILE"
[ -n "$LOGITS_FILE" ] && ARGS+=" --logits_save_to $LOGITS_FILE"
[ "$CPU" == "true" ] && ARGS+=" --cpu"
[ -n "$MAX_DURATION" ] && ARGS+=" --override_config input_val.audio_dataset.max_duration=$MAX_DURATION" \
ARGS+=" --override_config input_val.filterbank_features.max_duration=$MAX_DURATION"
[ "$PAD_TO_MAX_DURATION" = true ] && ARGS+=" --override_config input_val.audio_dataset.pad_to_max_duration=True" \
ARGS+=" --override_config input_val.filterbank_features.pad_to_max_duration=True"
python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS inference.py $ARGS

View file

@ -0,0 +1,37 @@
#!/bin/bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -a
: ${OUTPUT_DIR:=${3:-"/results"}}
: ${CUDNN_BENCHMARK:=true}
: ${PAD_TO_MAX_DURATION:=true}
: ${NUM_WARMUP_STEPS:=10}
: ${NUM_STEPS:=500}
: ${AMP:=false}
: ${DALI_DEVICE:="cpu"}
: ${BATCH_SIZE_SEQ:="1 2 4 8 16"}
: ${MAX_DURATION_SEQ:="2 7 16.7"}
for MAX_DURATION in $MAX_DURATION_SEQ; do
for BATCH_SIZE in $BATCH_SIZE_SEQ; do
LOG_FILE="$OUTPUT_DIR/perf-infer_dali-${DALI_DEVICE}_amp-${AMP}_dur${MAX_DURATION}_bs${BATCH_SIZE}.json"
bash ./scripts/inference.sh "$@"
done
done

View file

@ -0,0 +1,51 @@
#!/usr/bin/env bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/train-clean-100 \
--dest_dir /datasets/LibriSpeech/train-clean-100-wav \
--output_json /datasets/LibriSpeech/librispeech-train-clean-100-wav.json \
--speed 0.9 1.1
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/train-clean-360 \
--dest_dir /datasets/LibriSpeech/train-clean-360-wav \
--output_json /datasets/LibriSpeech/librispeech-train-clean-360-wav.json \
--speed 0.9 1.1
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/train-other-500 \
--dest_dir /datasets/LibriSpeech/train-other-500-wav \
--output_json /datasets/LibriSpeech/librispeech-train-other-500-wav.json \
--speed 0.9 1.1
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/dev-clean \
--dest_dir /datasets/LibriSpeech/dev-clean-wav \
--output_json /datasets/LibriSpeech/librispeech-dev-clean-wav.json
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/dev-other \
--dest_dir /datasets/LibriSpeech/dev-other-wav \
--output_json /datasets/LibriSpeech/librispeech-dev-other-wav.json
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/test-clean \
--dest_dir /datasets/LibriSpeech/test-clean-wav \
--output_json /datasets/LibriSpeech/librispeech-test-clean-wav.json
python ./utils/convert_librispeech.py \
--input_dir /datasets/LibriSpeech/test-other \
--dest_dir /datasets/LibriSpeech/test-other-wav \
--output_json /datasets/LibriSpeech/librispeech-test-other-wav.json

View file

@ -0,0 +1,100 @@
#!/bin/bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
export OMP_NUM_THREADS=1
: ${DATA_DIR:=${1:-"/datasets/LibriSpeech"}}
: ${MODEL_CONFIG:=${2:-"configs/quartznet15x5_speedp-online-1.15_speca.yaml"}}
: ${OUTPUT_DIR:=${3:-"/results"}}
: ${CHECKPOINT:=${4:-}}
: ${CUDNN_BENCHMARK:=true}
: ${NUM_GPUS:=8}
: ${AMP:=false}
: ${GPU_BATCH_SIZE:=72}
: ${GRAD_ACCUMULATION:=2}
: ${OPTIMIZER:=fused_novograd}
: ${LEARNING_RATE:=0.01}
: ${LR_POLICY:=exponential}
: ${LR_EXP_GAMMA:=0.981}
: ${EMA:=0.999}
: ${MULTI_TENSOR_EMA:=true}
: ${SEED:=0}
: ${EPOCHS:=260}
: ${WARMUP_EPOCHS:=2}
: ${HOLD_EPOCHS:=140}
: ${SAVE_FREQUENCY:=10}
: ${EPOCHS_THIS_JOB:=0}
: ${DALI_DEVICE:="gpu"}
: ${PAD_TO_MAX_DURATION:=false}
: ${EVAL_FREQUENCY:=241}
: ${PREDICTION_FREQUENCY:=241}
: ${TRAIN_MANIFESTS:="$DATA_DIR/librispeech-train-clean-100-wav.json \
$DATA_DIR/librispeech-train-clean-360-wav.json \
$DATA_DIR/librispeech-train-other-500-wav.json"}
: ${VAL_MANIFESTS:="$DATA_DIR/librispeech-dev-clean-wav.json"}
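# Illustrative invocation (values are examples): a 4-GPU run that preserves the
# reference global batch size of 8 x 72 x 2 = 1152 could use
#   NUM_GPUS=4 GPU_BATCH_SIZE=72 GRAD_ACCUMULATION=4 AMP=true bash ./scripts/train.sh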
mkdir -p "$OUTPUT_DIR"
ARGS="--dataset_dir=$DATA_DIR"
ARGS+=" --val_manifests $VAL_MANIFESTS"
ARGS+=" --train_manifests $TRAIN_MANIFESTS"
ARGS+=" --model_config=$MODEL_CONFIG"
ARGS+=" --output_dir=$OUTPUT_DIR"
ARGS+=" --lr=$LEARNING_RATE"
ARGS+=" --gpu_batch_size=$GPU_BATCH_SIZE"
ARGS+=" --min_lr=1e-5"
ARGS+=" --lr_policy=$LR_POLICY"
ARGS+=" --lr_exp_gamma=$LR_EXP_GAMMA"
ARGS+=" --epochs=$EPOCHS"
ARGS+=" --warmup_epochs=$WARMUP_EPOCHS"
ARGS+=" --hold_epochs=$HOLD_EPOCHS"
ARGS+=" --epochs_this_job=$EPOCHS_THIS_JOB"
ARGS+=" --ema=$EMA"
ARGS+=" --seed=$SEED"
ARGS+=" --optimizer=$OPTIMIZER"
ARGS+=" --weight_decay=1e-3"
ARGS+=" --resume"
ARGS+=" --save_frequency=$SAVE_FREQUENCY"
ARGS+=" --keep_milestones 100 200"
ARGS+=" --save_best_from=200"
ARGS+=" --log_frequency=1"
ARGS+=" --eval_frequency=$EVAL_FREQUENCY"
ARGS+=" --prediction_frequency=$PREDICTION_FREQUENCY"
ARGS+=" --grad_accumulation=$GRAD_ACCUMULATION "
ARGS+=" --dali_device=$DALI_DEVICE"
[ "$AMP" = true ] && ARGS+=" --amp"
[ "$CUDNN_BENCHMARK" = true ] && ARGS+=" --cudnn_benchmark"
[ -n "$MAX_DURATION" ] && ARGS+=" --override_config input_train.audio_dataset.max_duration=$MAX_DURATION" \
ARGS+=" --override_config input_train.filterbank_features.max_duration=$MAX_DURATION"
[ "$PAD_TO_MAX_DURATION" = true ] && ARGS+=" --override_config input_train.audio_dataset.pad_to_max_duration=True" \
ARGS+=" --override_config input_train.filterbank_features.pad_to_max_duration=True"
[ -n "$CHECKPOINT" ] && ARGS+=" --ckpt=${CHECKPOINT}"
[ -n "$LOG_FILE" ] && ARGS+=" --log_file $LOG_FILE"
[ -n "$PRE_ALLOCATE" ] && ARGS+=" --pre_allocate_range $PRE_ALLOCATE"
[ "$MULTI_TENSOR_EMA" = true ] && ARGS+=" --multi_tensor_ema"
[ -n "$BENCHMARK_EPOCHS" ] && ARGS+=" --benchmark_epochs_num=$BENCHMARK_EPOCHS"
GBS=$(($NUM_GPUS * $GPU_BATCH_SIZE * $GRAD_ACCUMULATION))
if [ $GBS -ne $((8 * 144)) ]; then
echo -e "\nWARNING: Global batch size changed from $((8 * 144)) to ${GBS}."
sleep 3
fi
echo -e "\nAMP=$AMP,""${NUM_GPUS}x${GPU_BATCH_SIZE}x${GRAD_ACCUMULATION}" \
"(global batch size ${GBS})\n"
: ${DISTRIBUTED:="-m torch.distributed.launch --nproc_per_node=$NUM_GPUS"}
python $DISTRIBUTED train.py $ARGS

View file

@ -0,0 +1,57 @@
#!/bin/bash
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -a
: ${DATA_DIR:=${1:-"/datasets/LibriSpeech"}}
: ${OUTPUT_DIR:=${3:-"/results"}}
: ${TRAIN_MANIFESTS:="$DATA_DIR/librispeech-train-clean-100-wav.json"}
: ${BENCHMARK_EPOCHS:=20}
: ${EPOCHS:=100000}
: ${RESUME:=false}
: ${SAVE_FREQUENCY:=100000}
: ${EVAL_FREQUENCY:=100000}
: ${LEARNING_RATE:=0.0001}
: ${AMP:=false}
: ${EMA:=0}
: ${DALI_DEVICE:="gpu"}
: ${NUM_GPUS_SEQ:="8 4 1"}
: ${ACC_BATCH_SIZE:="144"}
: ${GRAD_ACC_SEQ:="4 2"}
# A range of batch lengths for LibriSpeech
# with continuous speed perturbation (0.85, 1.15) and max duration 16.7s
: ${PRE_ALLOCATE:="1408 1920"}
for NUM_GPUS in $NUM_GPUS_SEQ; do
for GRAD_ACCUMULATION in $GRAD_ACC_SEQ; do
# Scale the number of epochs to the number of GPUs
BMARK=$((BENCHMARK_EPOCHS * NUM_GPUS / 8))
BMARK=$((BMARK < 2 ? 2 : BMARK))
BMARK=$((BMARK > BENCHMARK_EPOCHS ? BENCHMARK_EPOCHS : BMARK))
EPOCHS_THIS_JOB=$((BMARK + 1))
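# GPU_BATCH_SIZE is chosen so that the global batch size stays fixed at
# ACC_BATCH_SIZE * 8 = 1152 samples for every (NUM_GPUS, GRAD_ACCUMULATION) pair.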
GPU_BATCH_SIZE=$((ACC_BATCH_SIZE / $GRAD_ACCUMULATION * 8 / $NUM_GPUS))
LOG_FILE="$OUTPUT_DIR/perf-train_dali-${DALI_DEVICE}_amp-${AMP}_"
LOG_FILE+="1x${NUM_GPUS}x${GPU_BATCH_SIZE}x${GRAD_ACCUMULATION}.json"
BENCHMARK_EPOCHS=$BMARK bash ./scripts/train.sh "$@"
done
done

View file

@ -0,0 +1,558 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import copy
import os
import random
import time
try:
import nvidia_dlprof_pytorch_nvtx as pyprof
except ImportError:
import pyprof
import torch
import amp_C
import numpy as np
import torch.cuda.profiler as profiler
import torch.distributed as dist
from apex.optimizers import FusedLAMB, FusedNovoGrad
from contextlib import suppress as empty_context
from common import helpers
from common.dali.data_loader import DaliDataLoader
from common.dataset import AudioDataset, get_data_loader
from common.features import BaseFeatures, FilterbankFeatures
from common.helpers import (Checkpointer, greedy_wer, num_weights, print_once,
process_evaluation_epoch)
from common.optimizers import AdamW, lr_policy, Novograd
from common.tb_dllogger import flush_log, init_log, log
from common.utils import BenchmarkStats
from quartznet import config
from quartznet.model import CTCLossNM, GreedyCTCDecoder, QuartzNet
def parse_args():
parser = argparse.ArgumentParser(description='QuartzNet')
training = parser.add_argument_group('training setup')
training.add_argument('--epochs', default=400, type=int,
help='Number of epochs for the entire training; influences the lr schedule')
training.add_argument("--warmup_epochs", default=0, type=int,
help='Initial epochs of increasing learning rate')
training.add_argument("--hold_epochs", default=0, type=int,
help='Constant max learning rate epochs after warmup')
training.add_argument('--epochs_this_job', default=0, type=int,
help=('Run for a number of epochs with no effect on the lr schedule. '
'Useful for restarting the training.'))
training.add_argument('--cudnn_benchmark', action='store_true', default=True,
help='Enable cudnn benchmark')
training.add_argument('--amp', '--fp16', action='store_true', default=False,
help='Use pytorch native mixed precision training')
training.add_argument('--seed', default=1, type=int, help='Random seed')
training.add_argument('--local_rank', default=os.getenv('LOCAL_RANK', 0), type=int,
help='GPU id used for distributed training')
training.add_argument('--pre_allocate_range', default=None, type=int, nargs=2,
help='Warmup with batches of length [min, max] before training')
training.add_argument('--pyprof', action='store_true', help='Enable pyprof profiling')
optim = parser.add_argument_group('optimization setup')
optim.add_argument('--gpu_batch_size', default=32, type=int,
help='Batch size for a single forward/backward pass. '
'The effective batch size is gpu_batch_size * grad_accumulation.')
optim.add_argument('--lr', default=1e-3, type=float,
help='Peak learning rate')
optim.add_argument("--min_lr", default=1e-5, type=float,
help='minimum learning rate')
optim.add_argument("--lr_policy", default='exponential', type=str,
choices=['exponential', 'legacy'], help='lr scheduler')
optim.add_argument("--lr_exp_gamma", default=0.99, type=float,
help='gamma factor for exponential lr scheduler')
optim.add_argument('--weight_decay', default=1e-3, type=float,
help='Weight decay for the optimizer')
optim.add_argument('--grad_accumulation', '--update-freq', default=1, type=int,
help='Number of accumulation steps')
optim.add_argument('--optimizer', default='novograd', type=str,
choices=['novograd', 'adamw', 'lamb98', 'fused_novograd'],
help='Optimization algorithm')
optim.add_argument('--ema', type=float, default=0.0,
help='Discount factor for exp averaging of model weights')
optim.add_argument('--multi_tensor_ema', action='store_true',
help='Use multi_tensor_apply for EMA')
io = parser.add_argument_group('feature and checkpointing setup')
io.add_argument('--dali_device', type=str, choices=['none', 'cpu', 'gpu'],
default='gpu', help='Use DALI pipeline for fast data processing')
io.add_argument('--resume', action='store_true',
help='Try to resume from last saved checkpoint.')
io.add_argument('--ckpt', default=None, type=str,
help='Path to a checkpoint for resuming training')
io.add_argument('--save_frequency', default=10, type=int,
help='Checkpoint saving frequency in epochs')
io.add_argument('--keep_milestones', default=[100, 200, 300], type=int, nargs='+',
help='Milestone checkpoints to keep from removing')
io.add_argument('--save_best_from', default=380, type=int,
help='Epoch on which to begin tracking best checkpoint (dev WER)')
io.add_argument('--eval_frequency', default=200, type=int,
help='Number of steps between evaluations on dev set')
io.add_argument('--log_frequency', default=25, type=int,
help='Number of steps between printing training stats')
io.add_argument('--prediction_frequency', default=100, type=int,
help='Number of steps between printing sample decodings')
io.add_argument('--model_config', type=str, required=True,
help='Path of the model configuration file')
io.add_argument('--train_manifests', type=str, required=True, nargs='+',
help='Paths of the training dataset manifest file')
io.add_argument('--val_manifests', type=str, required=True, nargs='+',
help='Paths of the evaluation datasets manifest files')
io.add_argument('--dataset_dir', required=True, type=str,
help='Root dir of dataset')
io.add_argument('--output_dir', type=str, required=True,
help='Directory for logs and checkpoints')
io.add_argument('--log_file', type=str, default=None,
help='Path to save the training logfile.')
io.add_argument('--benchmark_epochs_num', type=int, default=1,
help='Number of epochs included in the final average throughput.')
io.add_argument('--override_config', type=str, action='append',
help='Overrides arbitrary config value.'
' Syntax: `--override_config nested.config.key=val`.')
return parser.parse_args()
def reduce_tensor(tensor, num_gpus):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.ReduceOp.SUM)
return rt.true_divide(num_gpus)
def init_multi_tensor_ema(model, ema_model):
model_weights = list(model.state_dict().values())
ema_model_weights = list(ema_model.state_dict().values())
ema_overflow_buf = torch.cuda.IntTensor([0])
return model_weights, ema_model_weights, ema_overflow_buf
def apply_multi_tensor_ema(decay, model_weights, ema_model_weights, overflow_buf):
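    # Fused counterpart of apply_ema below: updates every weight tensor as
    # ema = decay * ema + (1 - decay) * model using apex's multi_tensor_axpby
    # kernel (65536 is the per-launch chunk size, in elements).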
amp_C.multi_tensor_axpby(
65536, overflow_buf,
[ema_model_weights, model_weights, ema_model_weights],
decay, 1-decay, -1)
def apply_ema(model, ema_model, decay):
if not decay:
return
sd = getattr(model, 'module', model).state_dict()
for k, v in ema_model.state_dict().items():
v.copy_(decay * v + (1 - decay) * sd[k])
@torch.no_grad()
def evaluate(epoch, step, val_loader, val_feat_proc, labels, model,
ema_model, ctc_loss, greedy_decoder, use_amp, use_dali=False):
for model, subset in [(model, 'dev'), (ema_model, 'dev_ema')]:
if model is None:
continue
model.eval()
start_time = time.time()
agg = {'losses': [], 'preds': [], 'txts': []}
for batch in val_loader:
if use_dali:
# with DALI, the data is already on GPU
feat, feat_lens, txt, txt_lens = batch
if val_feat_proc is not None:
feat, feat_lens = val_feat_proc(feat, feat_lens)
else:
batch = [t.cuda(non_blocking=True) for t in batch]
audio, audio_lens, txt, txt_lens = batch
feat, feat_lens = val_feat_proc(audio, audio_lens)
with torch.cuda.amp.autocast(enabled=use_amp):
log_probs, enc_lens = model(feat, feat_lens)
loss = ctc_loss(log_probs, txt, enc_lens, txt_lens)
pred = greedy_decoder(log_probs)
agg['losses'] += helpers.gather_losses([loss])
agg['preds'] += helpers.gather_predictions([pred], labels)
agg['txts'] += helpers.gather_transcripts([txt], [txt_lens], labels)
wer, loss = process_evaluation_epoch(agg)
log((epoch,), step, subset, {'loss': loss, 'wer': 100.0 * wer,
'took': time.time() - start_time})
model.train()
return wer
def main():
args = parse_args()
assert(torch.cuda.is_available())
assert args.prediction_frequency % args.log_frequency == 0
torch.backends.cudnn.benchmark = args.cudnn_benchmark
# set up distributed training
multi_gpu = int(os.environ.get('WORLD_SIZE', 1)) > 1
if multi_gpu:
torch.cuda.set_device(args.local_rank)
dist.init_process_group(backend='nccl', init_method='env://')
world_size = dist.get_world_size()
print_once(f'Distributed training with {world_size} GPUs\n')
else:
world_size = 1
torch.manual_seed(args.seed + args.local_rank)
np.random.seed(args.seed + args.local_rank)
random.seed(args.seed + args.local_rank)
init_log(args)
cfg = config.load(args.model_config)
config.apply_config_overrides(cfg, args)
symbols = helpers.add_ctc_blank(cfg['labels'])
assert args.grad_accumulation >= 1
batch_size = args.gpu_batch_size
print_once('Setting up datasets...')
train_dataset_kw, train_features_kw = config.input(cfg, 'train')
val_dataset_kw, val_features_kw = config.input(cfg, 'val')
use_dali = args.dali_device in ('cpu', 'gpu')
if use_dali:
assert train_dataset_kw['ignore_offline_speed_perturbation'], \
"DALI doesn't support offline speed perturbation"
# pad_to_max_duration is not supported by DALI - have simple padders
if train_features_kw['pad_to_max_duration']:
train_feat_proc = BaseFeatures(
pad_align=train_features_kw['pad_align'],
pad_to_max_duration=True,
max_duration=train_features_kw['max_duration'],
sample_rate=train_features_kw['sample_rate'],
window_size=train_features_kw['window_size'],
window_stride=train_features_kw['window_stride'])
train_features_kw['pad_to_max_duration'] = False
else:
train_feat_proc = None
if val_features_kw['pad_to_max_duration']:
val_feat_proc = BaseFeatures(
pad_align=val_features_kw['pad_align'],
pad_to_max_duration=True,
max_duration=val_features_kw['max_duration'],
sample_rate=val_features_kw['sample_rate'],
window_size=val_features_kw['window_size'],
window_stride=val_features_kw['window_stride'])
val_features_kw['pad_to_max_duration'] = False
else:
val_feat_proc = None
train_loader = DaliDataLoader(gpu_id=args.local_rank,
dataset_path=args.dataset_dir,
config_data=train_dataset_kw,
config_features=train_features_kw,
json_names=args.train_manifests,
batch_size=batch_size,
grad_accumulation_steps=args.grad_accumulation,
pipeline_type="train",
device_type=args.dali_device,
symbols=symbols)
val_loader = DaliDataLoader(gpu_id=args.local_rank,
dataset_path=args.dataset_dir,
config_data=val_dataset_kw,
config_features=val_features_kw,
json_names=args.val_manifests,
batch_size=batch_size,
pipeline_type="val",
device_type=args.dali_device,
symbols=symbols)
else:
train_dataset_kw, train_features_kw = config.input(cfg, 'train')
train_dataset = AudioDataset(args.dataset_dir,
args.train_manifests,
symbols,
**train_dataset_kw)
train_loader = get_data_loader(train_dataset,
batch_size,
multi_gpu=multi_gpu,
shuffle=True,
num_workers=4)
train_feat_proc = FilterbankFeatures(**train_features_kw)
val_dataset_kw, val_features_kw = config.input(cfg, 'val')
val_dataset = AudioDataset(args.dataset_dir,
args.val_manifests,
symbols,
**val_dataset_kw)
val_loader = get_data_loader(val_dataset,
batch_size,
multi_gpu=multi_gpu,
shuffle=False,
num_workers=4,
drop_last=False)
val_feat_proc = FilterbankFeatures(**val_features_kw)
dur = train_dataset.duration / 3600
dur_f = train_dataset.duration_filtered / 3600
nsampl = len(train_dataset)
print_once(f'Training samples: {nsampl} ({dur:.1f}h, '
f'filtered {dur_f:.1f}h)')
if train_feat_proc is not None:
train_feat_proc.cuda()
if val_feat_proc is not None:
val_feat_proc.cuda()
steps_per_epoch = len(train_loader) // args.grad_accumulation
# set up the model
model = QuartzNet(encoder_kw=config.encoder(cfg),
decoder_kw=config.decoder(cfg, n_classes=len(symbols)))
model.cuda()
ctc_loss = CTCLossNM(n_classes=len(symbols))
greedy_decoder = GreedyCTCDecoder()
print_once(f'Model size: {num_weights(model) / 10**6:.1f}M params\n')
# optimization
kw = {'lr': args.lr, 'weight_decay': args.weight_decay}
if args.optimizer == "novograd":
optimizer = Novograd(model.parameters(), **kw)
elif args.optimizer == "adamw":
optimizer = AdamW(model.parameters(), **kw)
elif args.optimizer == 'lamb98':
optimizer = FusedLAMB(model.parameters(), betas=(0.9, 0.98), eps=1e-9,
**kw)
elif args.optimizer == 'fused_novograd':
optimizer = FusedNovoGrad(model.parameters(), betas=(0.95, 0),
bias_correction=False, reg_inside_moment=True,
grad_averaging=False, **kw)
else:
raise ValueError(f'Invalid optimizer "{args.optimizer}"')
scaler = torch.cuda.amp.GradScaler(enabled=args.amp)
adjust_lr = lambda step, epoch, optimizer: lr_policy(
step, epoch, args.lr, optimizer, steps_per_epoch=steps_per_epoch,
warmup_epochs=args.warmup_epochs, hold_epochs=args.hold_epochs,
num_epochs=args.epochs, policy=args.lr_policy, min_lr=args.min_lr,
exp_gamma=args.lr_exp_gamma)
if args.ema > 0:
ema_model = copy.deepcopy(model)
else:
ema_model = None
if multi_gpu:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank)
if args.pyprof:
pyprof.init(enable_function_stack=True)
# load checkpoint
meta = {'best_wer': 10**6, 'start_epoch': 0}
checkpointer = Checkpointer(args.output_dir, 'QuartzNet',
args.keep_milestones)
if args.resume:
args.ckpt = checkpointer.last_checkpoint() or args.ckpt
if args.ckpt is not None:
checkpointer.load(args.ckpt, model, ema_model, optimizer, scaler, meta)
start_epoch = meta['start_epoch']
best_wer = meta['best_wer']
epoch = 1
step = start_epoch * steps_per_epoch + 1
if args.pyprof:
torch.autograd.profiler.emit_nvtx().__enter__()
profiler.start()
# training loop
model.train()
if args.ema > 0.0:
mt_ema_params = init_multi_tensor_ema(model, ema_model)
# pre-allocate
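    # (Presumably this warms up the CUDA caching allocator: synthetic batches
    # covering the expected range of frame lengths are run forward and backward
    # once, so the largest activation buffers are allocated before real training.)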
if args.pre_allocate_range is not None:
n_feats = train_features_kw['n_filt']
pad_align = train_features_kw['pad_align']
a, b = args.pre_allocate_range
for n_frames in range(a, b + pad_align, pad_align):
print_once(f'Pre-allocation ({batch_size}x{n_feats}x{n_frames})...')
feat = torch.randn(batch_size, n_feats, n_frames, device='cuda')
feat_lens = torch.ones(batch_size, device='cuda').fill_(n_frames)
txt = torch.randint(high=len(symbols)-1, size=(batch_size, 100),
device='cuda')
txt_lens = torch.ones(batch_size, device='cuda').fill_(100)
with torch.cuda.amp.autocast(enabled=args.amp):
log_probs, enc_lens = model(feat, feat_lens)
del feat
loss = ctc_loss(log_probs, txt, enc_lens, txt_lens)
loss.backward()
model.zero_grad()
torch.cuda.empty_cache()
bmark_stats = BenchmarkStats()
for epoch in range(start_epoch + 1, args.epochs + 1):
if multi_gpu and not use_dali:
train_loader.sampler.set_epoch(epoch)
epoch_utts = 0
epoch_loss = 0
accumulated_batches = 0
epoch_start_time = time.time()
epoch_eval_time = 0
for batch in train_loader:
if accumulated_batches == 0:
step_loss = 0
step_utts = 0
step_start_time = time.time()
if use_dali:
# with DALI, the data is already on GPU
feat, feat_lens, txt, txt_lens = batch
if train_feat_proc is not None:
feat, feat_lens = train_feat_proc(feat, feat_lens)
else:
batch = [t.cuda(non_blocking=True) for t in batch]
audio, audio_lens, txt, txt_lens = batch
feat, feat_lens = train_feat_proc(audio, audio_lens)
# Use context manager to prevent redundant accumulation of gradients
if (multi_gpu and accumulated_batches + 1 < args.grad_accumulation):
ctx = model.no_sync()
else:
ctx = empty_context()
with ctx:
with torch.cuda.amp.autocast(enabled=args.amp):
log_probs, enc_lens = model(feat, feat_lens)
loss = ctc_loss(log_probs, txt, enc_lens, txt_lens)
loss /= args.grad_accumulation
if multi_gpu:
reduced_loss = reduce_tensor(loss.data, world_size)
else:
reduced_loss = loss
if torch.isnan(reduced_loss).any():
print_once(f'WARNING: loss is NaN; skipping update')
continue
else:
step_loss += reduced_loss.item()
step_utts += batch[0].size(0) * world_size
epoch_utts += batch[0].size(0) * world_size
accumulated_batches += 1
scaler.scale(loss).backward()
if accumulated_batches % args.grad_accumulation == 0:
epoch_loss += step_loss
scaler.step(optimizer)
scaler.update()
adjust_lr(step, epoch, optimizer)
optimizer.zero_grad()
if args.ema > 0.0:
apply_multi_tensor_ema(args.ema, *mt_ema_params)
if step % args.log_frequency == 0:
preds = greedy_decoder(log_probs)
wer, pred_utt, ref = greedy_wer(preds, txt, txt_lens, symbols)
if step % args.prediction_frequency == 0:
print_once(f' Decoded: {pred_utt[:90]}')
print_once(f' Reference: {ref[:90]}')
step_time = time.time() - step_start_time
log((epoch, step % steps_per_epoch or steps_per_epoch, steps_per_epoch),
step, 'train',
{'loss': step_loss,
'wer': 100.0 * wer,
'throughput': step_utts / step_time,
'took': step_time,
'lrate': optimizer.param_groups[0]['lr']})
step_start_time = time.time()
if step % args.eval_frequency == 0:
tik = time.time()
wer = evaluate(epoch, step, val_loader, val_feat_proc,
symbols, model, ema_model, ctc_loss,
greedy_decoder, args.amp, use_dali)
if wer < best_wer and epoch >= args.save_best_from:
checkpointer.save(model, ema_model, optimizer, scaler,
epoch, step, best_wer, is_best=True)
best_wer = wer
epoch_eval_time += time.time() - tik
step += 1
accumulated_batches = 0
# end of step
# The DALI iterator needs to be exhausted;
# if not using DALI, simulate drop_last=True with grad accumulation
if not use_dali and step > steps_per_epoch * epoch:
break
epoch_time = time.time() - epoch_start_time
epoch_loss /= steps_per_epoch
log((epoch,), None, 'train_avg', {'throughput': epoch_utts / epoch_time,
'took': epoch_time,
'loss': epoch_loss})
bmark_stats.update(epoch_utts, epoch_time, epoch_loss)
if epoch % args.save_frequency == 0 or epoch in args.keep_milestones:
checkpointer.save(model, ema_model, optimizer, scaler, epoch, step,
best_wer)
if 0 < args.epochs_this_job <= epoch - start_epoch:
print_once(f'Finished after {args.epochs_this_job} epochs.')
break
# end of epoch
if args.pyprof:
profiler.stop()
torch.autograd.profiler.emit_nvtx().__exit__(None, None, None)
log((), None, 'train_avg', bmark_stats.get(args.benchmark_epochs_num))
if epoch == args.epochs:
evaluate(epoch, step, val_loader, val_feat_proc, symbols, model,
ema_model, ctc_loss, greedy_decoder, args.amp, use_dali)
checkpointer.save(model, ema_model, optimizer, scaler, epoch, step,
best_wer)
flush_log()
if __name__ == "__main__":
main()

View file

@ -0,0 +1,81 @@
#!/usr/bin/env python
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import glob
import multiprocessing
import json
import pandas as pd
from preprocessing_utils import parallel_preprocess
parser = argparse.ArgumentParser(description='Preprocess LibriSpeech.')
parser.add_argument('--input_dir', type=str, required=True,
help='LibriSpeech collection input dir')
parser.add_argument('--dest_dir', type=str, required=True,
help='Output dir')
parser.add_argument('--output_json', type=str, default='./',
help='Name of the output JSON file.')
parser.add_argument('-s', '--speed', type=float, nargs='*',
help='Speed perturbation ratios')
parser.add_argument('--target_sr', type=int, default=None,
help='Target sample rate. '
'Defaults to the input sample rate.')
parser.add_argument('--overwrite', action='store_true',
help='Overwrite file if exists')
parser.add_argument('--parallel', type=int, default=multiprocessing.cpu_count(),
help='Number of parallel worker processes for audio conversion')
args = parser.parse_args()
args.input_dir = args.input_dir.rstrip('/')
args.dest_dir = args.dest_dir.rstrip('/')
def build_input_arr(input_dir):
txt_files = glob.glob(os.path.join(input_dir, '**', '*.trans.txt'),
recursive=True)
input_data = []
for txt_file in txt_files:
rel_path = os.path.relpath(txt_file, input_dir)
with open(txt_file) as fp:
for line in fp:
fname, _, transcript = line.partition(' ')
input_data.append(dict(input_relpath=os.path.dirname(rel_path),
input_fname=fname+'.flac',
transcript=transcript))
return input_data
print("[%s] Scaning input dir..." % args.output_json)
dataset = build_input_arr(input_dir=args.input_dir)
print("[%s] Converting audio files..." % args.output_json)
dataset = parallel_preprocess(dataset=dataset,
input_dir=args.input_dir,
dest_dir=args.dest_dir,
target_sr=args.target_sr,
speed=args.speed,
overwrite=args.overwrite,
parallel=args.parallel)
print("[%s] Generating json..." % args.output_json)
df = pd.DataFrame(dataset, dtype=object)
# Save JSON with the json module; df.to_json() produces backslashes in file paths
dataset = df.to_dict(orient='records')
with open(args.output_json, 'w') as fp:
json.dump(dataset, fp, indent=2)

View file

@ -0,0 +1,72 @@
#!/usr/bin/env python
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import pandas as pd
from download_utils import download_file, md5_checksum, extract
parser = argparse.ArgumentParser(description='Download, verify and extract dataset files')
parser.add_argument('csv', type=str,
help='CSV file with urls and checksums to download.')
parser.add_argument('dest', type=str,
help='Download destination folder.')
parser.add_argument('-e', type=str, default=None,
help='Extraction destination folder. Defaults to the download folder if not provided.')
parser.add_argument('--skip_download', action='store_true',
help='Skip downloading the files')
parser.add_argument('--skip_checksum', action='store_true',
help='Skip checksum')
parser.add_argument('--skip_extract', action='store_true',
help='Skip extracting files')
args = parser.parse_args()
args.e = args.e or args.dest
df = pd.read_csv(args.csv, delimiter=',')
if not args.skip_download:
for url in df.url:
fname = url.split('/')[-1]
print("Downloading %s:" % fname)
download_file(url=url, dest_folder=args.dest, fname=fname)
else:
print("Skipping file download")
if not args.skip_checksum:
for index, row in df.iterrows():
url = row['url']
md5 = row['md5']
fname = url.split('/')[-1]
fpath = os.path.join(args.dest, fname)
print("Verifing %s: " % fname, end='')
ret = md5_checksum(fpath=fpath, target_hash=md5)
print("Passed" if ret else "Failed")
else:
print("Skipping checksum")
if not args.skip_extract:
for url in df.url:
fname = url.split('/')[-1]
fpath = os.path.join(args.dest, fname)
print("Decompressing %s:" % fpath)
extract(fpath=fpath, dest_folder=args.e)
else:
print("Skipping file extraction")

View file

@ -0,0 +1,71 @@
#!/usr/bin/env python
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import requests
import os
import tarfile
import tqdm
def download_file(url, dest_folder, fname, overwrite=False):
fpath = os.path.join(dest_folder, fname)
if os.path.isfile(fpath):
if overwrite:
print("Overwriting existing file")
else:
print("File exists, skipping download.")
return
tmp_fpath = fpath + '.tmp'
if not os.path.exists(os.path.dirname(tmp_fpath)):
os.makedirs(os.path.dirname(tmp_fpath))
r = requests.get(url, stream=True)
file_size = int(r.headers['Content-Length'])
chunk_size = 1024 * 1024 # 1MB
total_chunks = int(file_size / chunk_size)
with open(tmp_fpath, 'wb') as fp:
content_iterator = r.iter_content(chunk_size=chunk_size)
chunks = tqdm.tqdm(content_iterator, total=total_chunks,
unit='MB', desc=fpath, leave=True)
for chunk in chunks:
fp.write(chunk)
os.rename(tmp_fpath, fpath)
def md5_checksum(fpath, target_hash):
file_hash = hashlib.md5()
with open(fpath, "rb") as fp:
for chunk in iter(lambda: fp.read(1024*1024), b""):
file_hash.update(chunk)
return file_hash.hexdigest() == target_hash
def extract(fpath, dest_folder):
if fpath.endswith('.tar.gz'):
mode = 'r:gz'
elif fpath.endswith('.tar'):
mode = 'r:'
else:
raise IOError('fpath has unknown extension: %s' % fpath)
with tarfile.open(fpath, mode) as tar:
members = tar.getmembers()
for member in tqdm.tqdm(iterable=members, total=len(members), leave=True):
tar.extract(path=dest_folder, member=member)
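# Minimal self-check sketch (illustrative only, not used by the download
# pipeline): verifies md5_checksum against a digest computed directly with
# hashlib on a temporary file.
if __name__ == "__main__":
    import tempfile
    payload = b"quartznet md5 self-check"
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(payload)
        tmp_path = tmp.name
    try:
        assert md5_checksum(tmp_path, hashlib.md5(payload).hexdigest())
        print("md5_checksum self-check passed")
    finally:
        os.remove(tmp_path)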

View file

@ -0,0 +1,5 @@
url,md5
http://www.openslr.org/resources/12/dev-clean.tar.gz,42e2234ba48799c1f50f24a7926300a1
http://www.openslr.org/resources/12/dev-other.tar.gz,c8d0bcc9cca99d4f8b62fcc847357931
http://www.openslr.org/resources/12/test-clean.tar.gz,32fa31d27d2e1cad72775fee3f4849a9
http://www.openslr.org/resources/12/test-other.tar.gz,fb5a50374b501bb3bac4815ee91d3135

View file

@ -0,0 +1,8 @@
url,md5
http://www.openslr.org/resources/12/dev-clean.tar.gz,42e2234ba48799c1f50f24a7926300a1
http://www.openslr.org/resources/12/dev-other.tar.gz,c8d0bcc9cca99d4f8b62fcc847357931
http://www.openslr.org/resources/12/test-clean.tar.gz,32fa31d27d2e1cad72775fee3f4849a9
http://www.openslr.org/resources/12/test-other.tar.gz,fb5a50374b501bb3bac4815ee91d3135
http://www.openslr.org/resources/12/train-clean-100.tar.gz,2a93770f6d5c6c964bc36631d331a522
http://www.openslr.org/resources/12/train-clean-360.tar.gz,c0e676e450a7ff2f54aeade5171606fa
http://www.openslr.org/resources/12/train-other-500.tar.gz,d1a0fd59409feb2c614ce4d30c387708

View file

@ -0,0 +1,76 @@
#!/usr/bin/env python
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import multiprocessing
import librosa
import functools
import sox
from tqdm import tqdm
def preprocess(data, input_dir, dest_dir, target_sr=None, speed=None,
overwrite=True):
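    # `data` is a single record as produced by build_input_arr (see
    # convert_librispeech.py): a dict with 'input_relpath', 'input_fname'
    # (a .flac file) and 'transcript'. The returned dict holds the lower-cased
    # transcript and one file entry per generated wav (original speed plus each
    # speed-perturbed copy).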
speed = speed or []
speed.append(1)
speed = list(set(speed))  # Make unique
input_fname = os.path.join(input_dir,
data['input_relpath'],
data['input_fname'])
input_sr = sox.file_info.sample_rate(input_fname)
target_sr = target_sr or input_sr
os.makedirs(os.path.join(dest_dir, data['input_relpath']), exist_ok=True)
output_dict = {}
output_dict['transcript'] = data['transcript'].lower().strip()
output_dict['files'] = []
fname = os.path.splitext(data['input_fname'])[0]
for s in speed:
output_fname = fname + '{}.wav'.format('' if s==1 else '-{}'.format(s))
output_fpath = os.path.join(dest_dir,
data['input_relpath'],
output_fname)
if not os.path.exists(output_fpath) or overwrite:
cbn = sox.Transformer().speed(factor=s).convert(target_sr)
cbn.build(input_fname, output_fpath)
file_info = sox.file_info.info(output_fpath)
file_info['fname'] = os.path.join(os.path.basename(dest_dir),
data['input_relpath'],
output_fname)
file_info['speed'] = s
output_dict['files'].append(file_info)
if s == 1:
file_info = sox.file_info.info(output_fpath)
output_dict['original_duration'] = file_info['duration']
output_dict['original_num_samples'] = file_info['num_samples']
return output_dict
def parallel_preprocess(dataset, input_dir, dest_dir, target_sr, speed, overwrite, parallel):
with multiprocessing.Pool(parallel) as p:
func = functools.partial(preprocess,
input_dir=input_dir, dest_dir=dest_dir,
target_sr=target_sr, speed=speed, overwrite=overwrite)
dataset = list(tqdm(p.imap(func, dataset), total=len(dataset)))
return dataset