# DeepLearningExamples/PyTorch/SpeechRecognition/QuartzNet/configs/quartznet15x5_speedp-online-1.15_speca_drop0.2.yaml

# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
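
# QuartzNet 15x5 training configuration with online speed perturbation (0.85-1.15x),
# SpecAugment, and 0.2 dropout.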
name: "QuartzNet"
labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
"n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
input_val:
  audio_dataset: &val_dataset
    sample_rate: &sample_rate 16000
    trim_silence: true
    normalize_transcripts: true

  filterbank_features: &val_features
    normalize: per_feature
    sample_rate: *sample_rate
    window_size: 0.02
    window_stride: 0.01
    window: hann
    n_filt: &n_filt 64
    n_fft: 512
    frame_splicing: &frame_splicing 1
    dither: 0.00001
    pad_align: 16

# For training we keep samples < 16.7s and apply augmentation
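# Speed perturbation is applied online (on the fly) in the 0.85-1.15x range;
# SpecAugment masks up to 2 frequency bands (max 20 bins) and 2 time spans
# (max 75 steps).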
input_train:
  audio_dataset:
    <<: *val_dataset
    max_duration: 16.7
    ignore_offline_speed_perturbation: true

    speed_perturbation:
      min_rate: 0.85
      max_rate: 1.15

  filterbank_features:
    <<: *val_features
    max_duration: 16.7

    spec_augment:
      freq_masks: 2
      max_freq: 20
      time_masks: 2
      max_time: 75
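
# Encoder layout: C1 (stride-2 separable conv), then B1-B5, each repeated 3x via
# YAML aliases -> 15 blocks of 5 separable-conv sub-layers each ("15x5"),
# followed by C2 and the pointwise C3.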
quartznet:
  encoder:
    init: xavier_uniform
    in_feats: *n_filt
    frame_splicing: *frame_splicing
    activation: relu
    use_conv_masks: true
    blocks:
      - &Conv1
        filters: 256
        repeat: 1
        kernel_size: [33]
        dilation: [1]
        stride: [2]
        dropout: 0.2
        residual: false
        separable: true
      - &B1
        filters: 256
        repeat: 5
        kernel_size: [33]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B1
      - *B1
      - &B2
        filters: 256
        repeat: 5
        kernel_size: [39]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B2
      - *B2
      - &B3
        filters: 512
        repeat: 5
        kernel_size: [51]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B3
      - *B3
      - &B4
        filters: 512
        repeat: 5
        kernel_size: [63]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B4
      - *B4
      - &B5
        filters: 512
        repeat: 5
        kernel_size: [75]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B5
      - *B5
      - &Conv2
        filters: 512
        repeat: 1
        kernel_size: [87]
        dilation: [2]
        stride: [1]
        dropout: 0.2
        residual: false
        separable: true
      - &Conv3
        filters: &enc_feats 1024
        repeat: 1
        kernel_size: [1]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: false
        separable: false
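
  # Decoder: projects the 1024-dim encoder output onto the label set above plus
  # the CTC blank, one prediction per frame.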
  decoder:
    in_feats: *enc_feats
    init: xavier_uniform