# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Configuration for the model named "QuartzNet": the symbol set, the
# validation/training input settings, and the encoder/decoder layout.
#
# Anchors (&name) defined in the validation section are reused further down
# through aliases (*name) and merge keys (<<), so the loader must support the
# YAML 1.1 merge-key extension.

name: "QuartzNet"

# 28 symbols: space, the letters a-z, and the apostrophe.
labels: [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
         "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]

# Validation input. Both sub-mappings are anchored so that the training input
# can inherit them and only add or override keys.
input_val:
  audio_dataset: &val_dataset
    sample_rate: &sample_rate 16000
    trim_silence: true
    normalize_transcripts: true

  filterbank_features: &val_features
    normalize: per_feature
    # Same value as audio_dataset.sample_rate (shared via anchor).
    sample_rate: *sample_rate
    # presumably seconds -- TODO confirm against the feature extractor
    window_size: 0.02
    window_stride: 0.01
    window: hann
    # Anchored: also used as the encoder's in_feats.
    n_filt: &n_filt 64
    n_fft: 512
    # Anchored: also passed to the encoder.
    frame_splicing: &frame_splicing 1
    dither: 0.00001
    pad_align: 16

# For training we keep samples < 16.7s and apply augmentation
input_train:
  audio_dataset:
    # Shallow merge of the validation dataset settings; keys below add to it.
    <<: *val_dataset
    max_duration: 16.7
    ignore_offline_speed_perturbation: true

    # NOTE(review): nested under audio_dataset by assumption -- confirm
    # against the dataset loader.
    speed_perturbation:
      min_rate: 0.85
      max_rate: 1.15

  filterbank_features:
    # Shallow merge of the validation feature settings; keys below add to it.
    <<: *val_features
    max_duration: 16.7

    # NOTE(review): nested under filterbank_features by assumption -- confirm
    # against the feature extractor.
    spec_augment:
      freq_masks: 2
      max_freq: 20
      time_masks: 2
      max_time: 75

quartznet:
  encoder:
    init: xavier_uniform
    # Tied to the filterbank settings above through aliases.
    in_feats: *n_filt
    frame_splicing: *frame_splicing
    activation: relu
    use_conv_masks: true

    # Ordered list of encoder blocks. B1..B5 are each defined once and then
    # repeated twice by alias, so every B block appears three times in a row.
    blocks:
      # The only block with a stride other than 1.
      - &Conv1
        filters: 256
        repeat: 1
        kernel_size: [33]
        dilation: [1]
        stride: [2]
        dropout: 0.2
        residual: false
        separable: true

      - &B1
        filters: 256
        repeat: 5
        kernel_size: [33]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B1
      - *B1

      - &B2
        filters: 256
        repeat: 5
        kernel_size: [39]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B2
      - *B2

      - &B3
        filters: 512
        repeat: 5
        kernel_size: [51]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B3
      - *B3

      - &B4
        filters: 512
        repeat: 5
        kernel_size: [63]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B4
      - *B4

      - &B5
        filters: 512
        repeat: 5
        kernel_size: [75]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: true
        separable: true
      - *B5
      - *B5

      # The only block with a dilation other than 1.
      - &Conv2
        filters: 512
        repeat: 1
        kernel_size: [87]
        dilation: [2]
        stride: [1]
        dropout: 0.2
        residual: false
        separable: true

      # The only non-separable block; its filter count is anchored so the
      # decoder's in_feats always matches the encoder's final width.
      - &Conv3
        filters: &enc_feats 1024
        repeat: 1
        kernel_size: [1]
        dilation: [1]
        stride: [1]
        dropout: 0.2
        residual: false
        separable: false

  decoder:
    # Alias of the last encoder block's filters (1024).
    in_feats: *enc_feats
    init: xavier_uniform