# DeepLearningExamples/PyTorch/SpeechRecognition/QuartzNet/configs/quartznet15x5_speedp-online-1.15_speca_drop0.2.yaml

# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: "QuartzNet"
# Symbol list, 28 entries: the space character, lowercase "a" through "z",
# and the apostrophe. Entry order is significant to any consumer that indexes
# into this list.
labels: [
  " ",
  "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
  "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
  "'"]

# Input settings for validation (per the key name). Both sub-mappings are
# anchored (&val_dataset, &val_features) so that `input_train` can pull them
# in with a `<<` merge and add its own keys on top.
input_val:
  audio_dataset: &val_dataset
    # Anchored as &sample_rate and aliased by filterbank_features.sample_rate
    # below, so the two sections cannot drift apart.
    # NOTE(review): presumably Hz -- confirm against the data loader.
    sample_rate: &sample_rate 16000
    trim_silence: true
    normalize_transcripts: true

  filterbank_features: &val_features
    normalize: per_feature
    sample_rate: *sample_rate
    # NOTE(review): window_size / window_stride look like seconds
    # (i.e. 20 ms / 10 ms) -- confirm against the feature extractor.
    window_size: 0.02
    window_stride: 0.01
    window: hann
    # Anchored as &n_filt; aliased by quartznet.encoder.in_feats.
    n_filt: &n_filt 64
    n_fft: 512
    # Anchored as &frame_splicing; aliased by quartznet.encoder.frame_splicing.
    frame_splicing: &frame_splicing 1
    dither: 0.00001
    pad_align: 16

# For training we keep samples < 16.7s and apply augmentation.
# Each sub-mapping starts from the matching validation mapping via a `<<`
# merge key and then adds training-only keys. The merge is shallow, and keys
# written explicitly here take precedence over merged ones (none collide at
# present).
input_train:
  audio_dataset:
    <<: *val_dataset
    max_duration: 16.7
    ignore_offline_speed_perturbation: true

    # Lower and upper bound of the speed-perturbation rate.
    # NOTE(review): presumably a rate in [min_rate, max_rate] is drawn per
    # sample at load time -- confirm in the dataset code.
    speed_perturbation:
      min_rate: 0.85
      max_rate: 1.15

  filterbank_features:
    <<: *val_features
    # Same value as audio_dataset.max_duration above. The two are not linked
    # by an anchor, so they must be changed together.
    max_duration: 16.7

    # NOTE(review): presumably the number of masks and the maximum mask
    # extent along the frequency and time axes -- confirm units (bins/frames)
    # against the augmentation code.
    spec_augment:
      freq_masks: 2
      max_freq: 20
      time_masks: 2
      max_time: 75

# Network definition.
#
# `encoder.blocks` is an ordered list of 18 block specs: Conv1, then five
# anchored specs B1..B5 that each appear three times (one definition followed
# by two aliases), then Conv2 and Conv3. Every spec uses dropout 0.2.
quartznet:
  encoder:
    init: xavier_uniform
    # Aliases of anchors defined under input_val.filterbank_features, so the
    # encoder input settings follow the feature settings automatically.
    in_feats: *n_filt
    frame_splicing: *frame_splicing
    activation: relu
    use_conv_masks: true
    blocks:
      # Conv1: the only spec with stride 2; no residual.
      - &Conv1
        filters: 256
        repeat: 1
        kernel_size: [33]
        dilation: [1]
        stride: [2]
        dropout: 0.2
        residual: false
        separable: true

      # B1 (x3): 256 filters, kernel 33, repeat 5.
      - &B1
        filters: 256
        repeat: 5
        kernel_size: [33]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B1
      - *B1

      # B2 (x3): 256 filters, kernel 39, repeat 5.
      - &B2
        filters: 256
        repeat: 5
        kernel_size: [39]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B2
      - *B2

      # B3 (x3): 512 filters, kernel 51, repeat 5.
      - &B3
        filters: 512
        repeat: 5
        kernel_size: [51]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B3
      - *B3

      # B4 (x3): 512 filters, kernel 63, repeat 5.
      - &B4
        filters: 512
        repeat: 5
        kernel_size: [63]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B4
      - *B4

      # B5 (x3): 512 filters, kernel 75, repeat 5.
      - &B5
        filters: 512
        repeat: 5
        kernel_size: [75]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B5
      - *B5

      # Conv2: the only spec with dilation 2; kernel 87, no residual.
      - &Conv2
        filters: 512
        repeat: 1
        kernel_size: [87]
        dilation: [2]
        stride: [1]
        dropout: 0.2
        residual: false
        separable: true

      # Conv3: kernel 1 and the only non-separable spec. Its filter count is
      # anchored as &enc_feats and aliased by decoder.in_feats.
      - &Conv3
        filters: &enc_feats 1024
        repeat: 1
        kernel_size: [1]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: false
        separable: false

  decoder:
    # Alias of the last encoder spec's filter count (1024).
    in_feats: *enc_feats
    init: xavier_uniform