# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Name string for this configuration.
name: "QuartzNet"
# Label set: 28 entries -- the space character, the letters a-z, and the
# apostrophe. The flow sequence spans two lines; the continuation line is
# indented so it stays part of the same `labels` value.
labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
         "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]

# Input settings keyed `input_val` (presumably the validation/evaluation
# pipeline -- confirm against the loader). The anchors defined here are
# reused elsewhere in this file:
#   &val_dataset    -> merged into input_train.audio_dataset
#   &val_features   -> merged into input_train.filterbank_features
#   &sample_rate    -> filterbank_features.sample_rate (below)
#   &n_filt         -> quartznet.encoder.in_feats
#   &frame_splicing -> quartznet.encoder.frame_splicing
input_val:
  audio_dataset: &val_dataset
    sample_rate: &sample_rate 16000
    trim_silence: true
    normalize_transcripts: true

  filterbank_features: &val_features
    normalize: per_feature
    # Alias keeps the feature extractor's rate identical to the dataset's.
    sample_rate: *sample_rate
    # NOTE(review): window_size / window_stride look like seconds -- confirm.
    window_size: 0.02
    window_stride: 0.01
    window: hann
    n_filt: &n_filt 64
    n_fft: 512
    frame_splicing: &frame_splicing 1
    dither: 0.00001
    pad_align: 16

# For training we keep samples < 16.7s and apply augmentation.
#
# Both sub-sections start from the `input_val` settings via the YAML merge key
# (`<<`) and then add training-only keys on top.
#
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file. `speed_perturbation` is placed under `audio_dataset` and
# `spec_augment` under `filterbank_features`; confirm against the code that
# consumes this config.
input_train:
  audio_dataset:
    <<: *val_dataset
    max_duration: 16.7
    ignore_offline_speed_perturbation: true

    speed_perturbation:
      min_rate: 0.85
      max_rate: 1.15

  filterbank_features:
    <<: *val_features
    max_duration: 16.7

    spec_augment:
      freq_masks: 2
      max_freq: 20
      time_masks: 2
      max_time: 75

# Model definition: an encoder made of a list of blocks, and a decoder.
#
# Encoder block order (16 entries):
#   Conv1, B1 x3, B2 x3, B3 x3, B4 x3, B5 x3, Conv2, Conv3
# Each B* block is written once under an anchor and then repeated twice with
# an alias, so editing the anchored definition changes all three copies.
#
# NOTE(review): the nesting was reconstructed from a flattened copy of this
# file; `decoder` is placed as a sibling of `encoder`. The visible text ends
# right after `decoder.init`, so confirm that no further decoder keys follow.
quartznet:
  encoder:
    init: xavier_uniform
    # Aliases to the anchors defined under input_val.filterbank_features.
    in_feats: *n_filt
    frame_splicing: *frame_splicing
    activation: relu
    use_conv_masks: true
    blocks:
      # Only block with stride 2; no residual connection.
      - &Conv1
        filters: 256
        repeat: 1
        kernel_size: [33]
        dilation: [1]
        stride: [2]
        dropout: 0.2
        residual: false
        separable: true

      # B1: 256 filters, kernel 33 (used 3 times).
      - &B1
        filters: 256
        repeat: 5
        kernel_size: [33]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B1
      - *B1

      # B2: 256 filters, kernel 39 (used 3 times).
      - &B2
        filters: 256
        repeat: 5
        kernel_size: [39]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B2
      - *B2

      # B3: 512 filters, kernel 51 (used 3 times).
      - &B3
        filters: 512
        repeat: 5
        kernel_size: [51]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B3
      - *B3

      # B4: 512 filters, kernel 63 (used 3 times).
      - &B4
        filters: 512
        repeat: 5
        kernel_size: [63]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B4
      - *B4

      # B5: 512 filters, kernel 75 (used 3 times).
      - &B5
        filters: 512
        repeat: 5
        kernel_size: [75]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B5
      - *B5

      # Only block with dilation 2; no residual connection.
      - &Conv2
        filters: 512
        repeat: 1
        kernel_size: [87]
        dilation: [2]
        stride: [1]
        dropout: 0.2
        residual: false
        separable: true

      # Final block: kernel size 1, the only non-separable block. Its filter
      # count is anchored as &enc_feats and reused as the decoder input width.
      - &Conv3
        filters: &enc_feats 1024
        repeat: 1
        kernel_size: [1]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: false
        separable: false

  decoder:
    # Alias to the last encoder block's filter count (1024).
    in_feats: *enc_feats
    init: xavier_uniform