# DeepLearningExamples/PyTorch/SpeechRecognition/Jasper/configs/jasper10x5dr_speedp-online-discrete.yaml

# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Model name and its label list. The list holds 28 entries: the space
# character, the lowercase letters a-z, and the apostrophe.
# NOTE(review): presumably the character vocabulary of the model and
# order-sensitive for the consumer -- confirm before reordering.
name: "Jasper"
labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
         "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]

# Validation input settings. Both sub-mappings are anchored (&val_dataset,
# &val_features) so that input_train can merge them in and extend them.
input_val:
  audio_dataset: &val_dataset
    # Anchored as &sample_rate and aliased by filterbank_features below.
    # NOTE(review): presumably in Hz -- confirm against the data loader.
    sample_rate: &sample_rate 16000
    trim_silence: true
    normalize_transcripts: true

  filterbank_features: &val_features
    normalize: per_feature
    # Alias of audio_dataset.sample_rate, so the two cannot drift apart.
    sample_rate: *sample_rate
    # NOTE(review): window_size / window_stride look like seconds
    # (i.e. 20 ms / 10 ms) -- confirm against the feature extractor.
    window_size: 0.02
    window_stride: 0.01
    window: hann
    # Anchored as &n_filt; jasper.encoder.in_feats aliases this value.
    n_filt: &n_filt 64
    n_fft: 512
    # Anchored as &frame_splicing; jasper.encoder.frame_splicing aliases it.
    frame_splicing: &frame_splicing 1
    dither: 0.00001
    pad_align: 16

# For training we keep samples < 16.7s and apply augmentation.
# Both sub-mappings start from their validation counterparts through the
# merge key (<<) and then add training-only keys. The merge is shallow:
# keys written explicitly here take precedence over merged-in ones.
input_train:
  audio_dataset:
    <<: *val_dataset
    max_duration: 16.7
    ignore_offline_speed_perturbation: true

    # Speed perturbation bounded by min_rate and max_rate.
    # NOTE(review): `discrete: true` presumably restricts sampling to a
    # fixed set of rates inside that range -- confirm against the loader.
    speed_perturbation:
      discrete: true
      min_rate: 0.9
      max_rate: 1.1

  filterbank_features:
    <<: *val_features
    # Same value as audio_dataset.max_duration above; keep the two in sync.
    max_duration: 16.7

    # Both mask counts are 0 in this config.
    # NOTE(review): this presumably turns spec_augment into a no-op, leaving
    # max_freq / max_time unused -- confirm this is intentional.
    spec_augment:
      freq_masks: 0
      max_freq: 20
      time_masks: 0
      max_time: 75

# Model architecture: encoder settings followed by its ordered block list.
jasper:
  encoder:
    init: xavier_uniform
    # Alias of input_val.filterbank_features.n_filt (64), so the encoder
    # input width follows the feature configuration.
    in_feats: *n_filt
    # Alias of input_val.filterbank_features.frame_splicing (1).
    frame_splicing: *frame_splicing
    activation: relu
    use_conv_masks: true
    # 13 entries in order: Conv1, then B1..B5 each listed twice (definition
    # plus alias, every one with repeat: 5), then Conv2 and Conv3.
    # Across the list, filters grow from 256 to 1024 and dropout from
    # 0.2 to 0.4.
    blocks:
    # Conv1: the only entry with stride 2; no residual connection.
    - &Conv1
      filters: 256
      repeat: 1
      kernel_size: [11]
      stride: [2]
      dilation: [1]
      dropout: 0.2
      residual: false
    # B1: anchored here and reused once via the alias that follows.
    - &B1
      filters: 256
      repeat: 5
      kernel_size: [11]
      stride: [1]
      dilation: [1]
      dropout: 0.2
      residual: true
      residual_dense: true
    - *B1
    # B2: anchored here and reused once via the alias that follows.
    - &B2
      filters: 384
      repeat: 5
      kernel_size: [13]
      stride: [1]
      dilation: [1]
      dropout: 0.2
      residual: true
      residual_dense: true
    - *B2
    # B3: anchored here and reused once via the alias that follows.
    - &B3
      filters: 512
      repeat: 5
      kernel_size: [17]
      stride: [1]
      dilation: [1]
      dropout: 0.2
      residual: true
      residual_dense: true
    - *B3
    # B4: anchored here and reused once via the alias that follows;
    # dropout rises to 0.3 from this block on.
    - &B4
      filters: 640
      repeat: 5
      kernel_size: [21]
      stride: [1]
      dilation: [1]
      dropout: 0.3
      residual: true
      residual_dense: true
    - *B4
    # B5: anchored here and reused once via the alias that follows.
    - &B5
      filters: 768
      repeat: 5
      kernel_size: [25]
      stride: [1]
      dilation: [1]
      dropout: 0.3
      residual: true
      residual_dense: true
    - *B5
    # Conv2: the only entry with dilation 2; no residual connection.
    - &Conv2
      filters: 896
      repeat: 1
      kernel_size: [29]
      stride: [1]
      dilation: [2]
      dropout: 0.4
      residual: false
    # Conv3: kernel size 1. Its filter count is anchored as &enc_feats and
    # aliased by jasper.decoder.in_feats.
    - &Conv3
      filters: &enc_feats 1024
      repeat: 1
      kernel_size: [1]
      stride: [1]
      dilation: [1]
      dropout: 0.4
      residual: false

  # Decoder settings. in_feats aliases &enc_feats, the filter count of the
  # Conv3 encoder block (1024), so the decoder input width tracks the
  # encoder output width. init uses the same scheme as the encoder.
  decoder:
    in_feats: *enc_feats
    init: xavier_uniform