Adding parallel transcribe for ASR models - supports multi-gpu/multi-node (#3017)
* added transcribe_speech_parallel.py. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* fixed style. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* removed comments. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* fixed bug. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* fixed bug. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* added comments inside the script. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* fixed speed_collate_fn for TTS. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* fixed speed_collate_fn for TTS. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* fixed speed_collate_fn for TTS. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* fixed speed_collate_fn for TTS. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* added return_sample_id. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* added return_sample_id. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* merged dataset configs. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* merged dataset configs. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* dropped sample_ids from train and validation. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* dropped sample_ids from train and validation. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* reverted tts patches. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* reverted tts patches. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* fixed the default values in the dataset's config. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* fixed the default values in the dataset's config. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* Fixed the bug for optional outputs. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* Fixed the bug for optional outputs. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* addressed some comments. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* disabled dali support for return_sample_id. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* disabled dali support for return_sample_id. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* converted the config to omegaconf. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* converted the config to omegaconf. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* converted the config to omegaconf. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* moved wer/cer calculation to the end. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* moved wer/cer calculation to the end. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* moved wer/cer calculation to the end. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* moved wer/cer calculation to the end. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* moved wer/cer calculation to the end. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* calculates global wer instead of per sample. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* calculates global wer instead of per sample. Signed-off-by: Vahid <vnoroozi@nvidia.com>
* calculates global wer instead of per sample. Signed-off-by: Vahid <vnoroozi@nvidia.com>
This commit is contained in: parent b12ac8ae85, commit cfcf694e30
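For quick reference, the new script is driven by Hydra-style overrides. A minimal multi-GPU run might look like the following (the model name and paths are illustrative; the script's docstring below documents the full set of options):

python examples/asr/transcribe_speech_parallel.py \
    model=stt_en_conformer_ctc_large \
    predict_ds.manifest_filepath=/dataset/manifest_file.json \
    predict_ds.batch_size=16 \
    trainer.gpus=2 \
    output_path=/tmp/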
@@ -28,9 +28,6 @@ model:
   log_prediction: true # enables logging sample predictions in the output during training

   model_defaults:
-    se: true
-    se_context_size: -1
-    # encoder / decoder / joint values
     enc_hidden: ${model.encoder.d_model}
     pred_hidden: 640
     joint_hidden: 640
@@ -107,8 +104,10 @@ model:

     # Multi-headed Attention Module's params
     self_attention_model: rel_pos # rel_pos or abs_pos
-    n_heads: 8
-    xscaling: true # scales up the inputs by sqrt(d_model)
+    n_heads: 8 # may need to be lower for smaller d_models
+    # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
+    att_context_size: [-1, -1] # -1 means unlimited context
+    xscaling: true # scales up the input embeddings by sqrt(d_model)
     untie_biases: true # unties the biases of the TransformerXL layers
     pos_emb_max_len: 5000

@@ -31,9 +31,6 @@ model:
           "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]

   model_defaults:
-    se: true
-    se_context_size: -1
-    # encoder / decoder / joint values
     enc_hidden: ${model.encoder.d_model}
     pred_hidden: 640
     joint_hidden: 640
@@ -102,8 +99,10 @@ model:

     # Multi-headed Attention Module's params
     self_attention_model: rel_pos # rel_pos or abs_pos
-    n_heads: 8
-    xscaling: true # scales up the inputs by sqrt(d_model)
+    n_heads: 8 # may need to be lower for smaller d_models
+    # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
+    att_context_size: [-1, -1] # -1 means unlimited context
+    xscaling: true # scales up the input embeddings by sqrt(d_model)
     untie_biases: true # unties the biases of the TransformerXL layers
     pos_emb_max_len: 5000

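Both Conformer configs gain the att_context_size knob, a [left, right] pair of step counts for self-attention. The committed default of [-1, -1] keeps full context; a limited-context variant would look like the line below (the 128/128 values are purely illustrative and not part of this commit):

    att_context_size: [128, 128] # attend to at most 128 steps on each side of the current step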
examples/asr/transcribe_speech_parallel.py (new file, 189 lines)
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
# ASR transcribe/inference with multi-GPU/multi-node support for large datasets
# It supports both tarred and non-tarred datasets
# Arguments
#     model: path to a nemo/PTL checkpoint file or name of a pretrained model
#     predict_ds: config of the dataset/dataloader
#     output_path: path to store the predictions
#     return_predictions: whether to return the predictions as output other than writing into the files
#     use_cer: whether to calculate the error in terms of CER or use the default WER
#
# Results of each GPU/worker are written into a file named 'predictions_{rank}.json', and aggregated results of all workers are written into 'predictions_all.json'

Example for non-tarred datasets:

python transcribe_speech_parallel.py \
    model=stt_en_conformer_ctc_large \
    predict_ds.manifest_filepath=/dataset/manifest_file.json \
    predict_ds.batch_size=16 \
    output_path=/tmp/

Example for tarred datasets:

python transcribe_speech_parallel.py \
    predict_ds.is_tarred=true \
    predict_ds.manifest_filepath=/tarred_dataset/tarred_audio_manifest.json \
    predict_ds.tarred_audio_filepaths=/tarred_dataset/audio__OP_0..127_CL_.tar \
    ...

By default the trainer uses all the GPUs available and default precision is FP32.
By setting the trainer config you may control these configs. For example to do the predictions with AMP on just two GPUs:

python transcribe_speech_parallel.py \
    trainer.precision=16 \
    trainer.gpus=2 \
    ...

You may control the dataloader's config by setting the predict_ds:

python transcribe_speech_parallel.py \
    predict_ds.num_workers=8 \
    predict_ds.min_duration=2.0 \
    predict_ds.sample_rate=16000 \
    model=stt_en_conformer_ctc_small \
    ...

"""


import itertools
import json
import os
from dataclasses import dataclass, is_dataclass
from typing import Optional

import pytorch_lightning as ptl
import torch
from omegaconf import OmegaConf

from nemo.collections.asr.data.audio_to_text_dataset import ASRPredictionWriter
from nemo.collections.asr.metrics.rnnt_wer import RNNTDecodingConfig
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.collections.asr.models import ASRModel
from nemo.collections.asr.models.configs.asr_models_config import ASRDatasetConfig
from nemo.core.config import TrainerConfig, hydra_runner
from nemo.utils import logging
from nemo.utils.get_rank import is_global_rank_zero


@dataclass
class ParallelTranscriptionConfig:
    model: Optional[str] = None  # name
    predict_ds: ASRDatasetConfig = ASRDatasetConfig(return_sample_id=True, num_workers=4)
    output_path: Optional[str] = None
    # when return_predictions is enabled, the prediction call would keep all the predictions in memory and return them when prediction is done
    return_predictions: bool = False
    use_cer: bool = False

    # decoding strategy for RNNT models
    rnnt_decoding: RNNTDecodingConfig = RNNTDecodingConfig()
    trainer: TrainerConfig = TrainerConfig(gpus=-1, accelerator="ddp")


def match_train_config(predict_ds, train_ds):
    # It copies the important configurations from the train dataset of the model
    # into the predict_ds to be used for prediction. It is needed to match the training configurations.
    if train_ds is None:
        return

    predict_ds.sample_rate = train_ds.get("sample_rate", 16000)
    cfg_name_list = [
        "int_values",
        "use_start_end_token",
        "blank_index",
        "unk_index",
        "normalize",
        "parser",
        "eos_id",
        "bos_id",
        "pad_id",
    ]

    if is_dataclass(predict_ds):
        predict_ds = OmegaConf.structured(predict_ds)
    for cfg_name in cfg_name_list:
        if hasattr(train_ds, cfg_name):
            setattr(predict_ds, cfg_name, getattr(train_ds, cfg_name))

    return predict_ds


@hydra_runner(config_name="TranscriptionConfig", schema=ParallelTranscriptionConfig)
def main(cfg: ParallelTranscriptionConfig):
    if cfg.model.endswith(".nemo"):
        logging.info("Attempting to initialize from .nemo file")
        model = ASRModel.restore_from(restore_path=cfg.model, map_location="cpu")
    elif cfg.model.endswith(".ckpt"):
        logging.info("Attempting to initialize from .ckpt file")
        model = ASRModel.load_from_checkpoint(checkpoint_path=cfg.model, map_location="cpu")
    else:
        logging.info(
            "Attempting to initialize from a pretrained model as the model name does not have the extension of .nemo or .ckpt"
        )
        model = ASRModel.from_pretrained(model_name=cfg.model, map_location="cpu")

    trainer = ptl.Trainer(**cfg.trainer)

    cfg.predict_ds.return_sample_id = True
    cfg.predict_ds = match_train_config(predict_ds=cfg.predict_ds, train_ds=model.cfg.train_ds)
    data_loader = model._setup_dataloader_from_config(cfg.predict_ds)

    os.makedirs(cfg.output_path, exist_ok=True)
    # trainer.global_rank is not valid before predict() is called. Need this hack to find the correct global_rank.
    global_rank = trainer.node_rank * trainer.num_gpus + int(os.environ.get("LOCAL_RANK", 0))
    output_file = os.path.join(cfg.output_path, f"predictions_{global_rank}.json")
    predictor_writer = ASRPredictionWriter(dataset=data_loader.dataset, output_file=output_file)
    trainer.callbacks.extend([predictor_writer])

    predictions = trainer.predict(model=model, dataloaders=data_loader, return_predictions=cfg.return_predictions)
    if predictions is not None:
        predictions = list(itertools.chain.from_iterable(predictions))
    samples_num = predictor_writer.close_output_file()

    logging.info(
        f"Prediction on rank {global_rank} is done for {samples_num} samples and results are stored in {output_file}."
    )

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    samples_num = 0
    pred_text_list = []
    text_list = []
    if is_global_rank_zero():
        output_file = os.path.join(cfg.output_path, f"predictions_all.json")
        logging.info(f"Prediction files are being aggregated in {output_file}.")
        with open(output_file, 'w') as outf:
            for rank in range(trainer.world_size):
                input_file = os.path.join(cfg.output_path, f"predictions_{rank}.json")
                with open(input_file, 'r') as inpf:
                    lines = inpf.readlines()
                    for line in lines:
                        item = json.loads(line)
                        pred_text_list.append(item["pred_text"])
                        text_list.append(item["text"])
                        outf.write(json.dumps(item) + "\n")
                        samples_num += 1
        wer_cer = word_error_rate(hypotheses=pred_text_list, references=text_list, use_cer=cfg.use_cer)
        logging.info(
            f"Prediction is done for {samples_num} samples in total on all workers and results are aggregated in {output_file}."
        )
        logging.info("{} for all predictions is {:.4f}.".format("CER" if cfg.use_cer else "WER", wer_cer))


if __name__ == '__main__':
    main()
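For orientation, each rank writes one JSON line per utterance into predictions_{rank}.json, using the fields populated by ASRPredictionWriter further down in this diff; a record would look roughly like this (values illustrative):

{"audio_filepath": "/data/audio/utt_0001.wav", "duration": 3.2, "text": "reference transcript", "pred_text": "predicted transcript"}

Rank 0 then concatenates the per-rank files into predictions_all.json and computes the global WER/CER from the text and pred_text fields.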
@@ -50,7 +50,14 @@ def _speech_collate_fn(batch, pad_id):
             encoded tokens, and encoded tokens length. This collate func
             assumes the signals are 1d torch tensors (i.e. mono audio).
     """
-    _, audio_lengths, _, tokens_lengths = zip(*batch)
+    packed_batch = list(zip(*batch))
+    if len(packed_batch) == 5:
+        _, audio_lengths, _, tokens_lengths, sample_ids = packed_batch
+    elif len(packed_batch) == 4:
+        sample_ids = None
+        _, audio_lengths, _, tokens_lengths = packed_batch
+    else:
+        raise ValueError("Expects 4 or 5 tensors in the batch!")
     max_audio_len = 0
     has_audio = audio_lengths[0] is not None
     if has_audio:
@@ -58,7 +65,11 @@ def _speech_collate_fn(batch, pad_id):
         max_tokens_len = max(tokens_lengths).item()

     audio_signal, tokens = [], []
-    for sig, sig_len, tokens_i, tokens_i_len in batch:
+    for b in batch:
+        if len(b) == 5:
+            sig, sig_len, tokens_i, tokens_i_len, _ = b
+        else:
+            sig, sig_len, tokens_i, tokens_i_len = b
         if has_audio:
             sig_len = sig_len.item()
             if sig_len < max_audio_len:
@@ -78,8 +89,11 @@ def _speech_collate_fn(batch, pad_id):
         audio_signal, audio_lengths = None, None
     tokens = torch.stack(tokens)
     tokens_lengths = torch.stack(tokens_lengths)
-    return audio_signal, audio_lengths, tokens, tokens_lengths
+    if sample_ids is None:
+        return audio_signal, audio_lengths, tokens, tokens_lengths
+    else:
+        sample_ids = torch.tensor(sample_ids, dtype=torch.int32)
+        return audio_signal, audio_lengths, tokens, tokens_lengths, sample_ids


 class ASRManifestProcessor:
@@ -115,7 +129,7 @@ class ASRManifestProcessor:
         self.parser = parser

         self.collection = collections.ASRAudioText(
-            manifests_files=manifest_filepath.split(','),
+            manifests_files=manifest_filepath,
             parser=parser,
             min_duration=min_duration,
             max_duration=max_duration,
@@ -164,6 +178,7 @@ class _AudioTextDataset(Dataset):
         normalize: whether to normalize transcript text (default): True
         bos_id: Id of beginning of sequence symbol to append if not None
         eos_id: Id of end of sequence symbol to append if not None
+        return_sample_id (bool): whether to return the sample_id as a part of each sample
     """

     @property
@@ -175,6 +190,7 @@ class _AudioTextDataset(Dataset):
             'a_sig_length': NeuralType(tuple('B'), LengthsType()),
             'transcripts': NeuralType(('B', 'T'), LabelsType()),
             'transcript_length': NeuralType(tuple('B'), LengthsType()),
+            'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True),
         }

     def __init__(
@@ -191,6 +207,7 @@ class _AudioTextDataset(Dataset):
         bos_id: Optional[int] = None,
         eos_id: Optional[int] = None,
         pad_id: int = 0,
+        return_sample_id: bool = False,
     ):
         self.manifest_processor = ASRManifestProcessor(
             manifest_filepath=manifest_filepath,
@@ -204,6 +221,10 @@ class _AudioTextDataset(Dataset):
         )
         self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor)
         self.trim = trim
+        self.return_sample_id = return_sample_id
+
+    def get_manifest_sample(self, sample_id):
+        return self.manifest_processor.collection[sample_id]

     def __getitem__(self, index):
         sample = self.manifest_processor.collection[index]
@@ -219,7 +240,10 @@ class _AudioTextDataset(Dataset):

         t, tl = self.manifest_processor.process_text(index)

-        output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long()
+        if self.return_sample_id:
+            output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), index
+        else:
+            output = f, fl, torch.tensor(t).long(), torch.tensor(tl).long()

         return output

@@ -258,6 +282,7 @@ class AudioToCharDataset(_AudioTextDataset):
         normalize: whether to normalize transcript text (default): True
         bos_id: Id of beginning of sequence symbol to append if not None
         eos_id: Id of end of sequence symbol to append if not None
+        return_sample_id (bool): whether to return the sample_id as a part of each sample
     """

     @property
@@ -269,6 +294,7 @@ class AudioToCharDataset(_AudioTextDataset):
             'a_sig_length': NeuralType(tuple('B'), LengthsType()),
             'transcripts': NeuralType(('B', 'T'), LabelsType()),
             'transcript_length': NeuralType(tuple('B'), LengthsType()),
+            'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True),
         }

     def __init__(
@@ -289,6 +315,7 @@ class AudioToCharDataset(_AudioTextDataset):
         eos_id: Optional[int] = None,
         pad_id: int = 0,
         parser: Union[str, Callable] = 'en',
+        return_sample_id: bool = False,
     ):
         self.labels = labels

@@ -309,6 +336,7 @@ class AudioToCharDataset(_AudioTextDataset):
             bos_id=bos_id,
             eos_id=eos_id,
             pad_id=pad_id,
+            return_sample_id=return_sample_id,
         )


@@ -806,6 +834,7 @@ class AudioToBPEDataset(_AudioTextDataset):
         trim: Whether to trim silence segments
         use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS]
             tokens to beginning and ending of speech respectively.
+        return_sample_id (bool): whether to return the sample_id as a part of each sample
     """

     @property
@@ -817,6 +846,7 @@ class AudioToBPEDataset(_AudioTextDataset):
             'a_sig_length': NeuralType(tuple('B'), LengthsType()),
             'transcripts': NeuralType(('B', 'T'), LabelsType()),
             'transcript_length': NeuralType(tuple('B'), LengthsType()),
+            'sample_id': NeuralType(tuple('B'), LengthsType(), optional=True),
         }

     def __init__(
@@ -831,6 +861,7 @@ class AudioToBPEDataset(_AudioTextDataset):
         max_utts: int = 0,
         trim: bool = False,
         use_start_end_token: bool = True,
+        return_sample_id: bool = False,
     ):
         if use_start_end_token and hasattr(tokenizer, 'bos_token'):
             bos_id = tokenizer.bos_id
@@ -868,6 +899,7 @@ class AudioToBPEDataset(_AudioTextDataset):
             eos_id=eos_id,
             pad_id=pad_id,
             trim=trim,
+            return_sample_id=return_sample_id,
         )


@@ -956,12 +988,13 @@ class _TarredAudioToTextDataset(IterableDataset):
             sampled at least once during 1 epoch.
         global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
         world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
+        return_sample_id (bool): whether to return the sample_id as a part of each sample
     """

     def __init__(
         self,
         audio_tar_filepaths: Union[str, List[str]],
-        manifest_filepath: Union[str, List[str]],
+        manifest_filepath: str,
         parser: Callable,
         sample_rate: int,
         int_values: bool = False,
@@ -977,6 +1010,7 @@ class _TarredAudioToTextDataset(IterableDataset):
         shard_strategy: str = "scatter",
         global_rank: int = 0,
         world_size: int = 0,
+        return_sample_id: bool = False,
     ):
         self.collection = collections.ASRAudioText(
             manifests_files=manifest_filepath,
@@ -992,6 +1026,7 @@ class _TarredAudioToTextDataset(IterableDataset):
         self.eos_id = eos_id
         self.bos_id = bos_id
         self.pad_id = pad_id
+        self.return_sample_id = return_sample_id

         valid_shard_strategies = ['scatter', 'replicate']
         if shard_strategy not in valid_shard_strategies:
@@ -1118,7 +1153,13 @@ class _TarredAudioToTextDataset(IterableDataset):
             t = t + [self.eos_id]
             tl += 1

-        return f, fl, torch.tensor(t).long(), torch.tensor(tl).long()
+        if self.return_sample_id:
+            return f, fl, torch.tensor(t).long(), torch.tensor(tl).long(), manifest_idx
+        else:
+            return f, fl, torch.tensor(t).long(), torch.tensor(tl).long()
+
+    def get_manifest_sample(self, sample_id):
+        return self.collection[sample_id]

     def __iter__(self):
         return self._dataset.__iter__()
@@ -1208,6 +1249,7 @@ class TarredAudioToCharDataset(_TarredAudioToTextDataset):
             sampled at least once during 1 epoch.
         global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
         world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
+        return_sample_id (bool): whether to return the sample_id as a part of each sample
     """

     def __init__(
@@ -1233,6 +1275,7 @@ class TarredAudioToCharDataset(_TarredAudioToTextDataset):
         shard_strategy: str = "scatter",
         global_rank: int = 0,
         world_size: int = 0,
+        return_sample_id: bool = False,
     ):
         self.labels = labels

@@ -1258,6 +1301,7 @@ class TarredAudioToCharDataset(_TarredAudioToTextDataset):
             shard_strategy=shard_strategy,
             global_rank=global_rank,
             world_size=world_size,
+            return_sample_id=return_sample_id,
         )


@@ -1332,12 +1376,13 @@ class TarredAudioToBPEDataset(_TarredAudioToTextDataset):
             sampled at least once during 1 epoch.
         global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
         world_size (int): Total number of processes, used for partitioning shards. Defaults to 0.
+        return_sample_id (bool): whether to return the sample_id as a part of each sample
     """

     def __init__(
         self,
         audio_tar_filepaths: Union[str, List[str]],
-        manifest_filepath: Union[str, List[str]],
+        manifest_filepath: str,
         tokenizer: 'nemo.collections.common.tokenizers.TokenizerSpec',
         sample_rate: int,
         int_values: bool = False,
@@ -1351,6 +1396,7 @@ class TarredAudioToBPEDataset(_TarredAudioToTextDataset):
         shard_strategy: str = "scatter",
         global_rank: int = 0,
         world_size: int = 0,
+        return_sample_id: bool = False,
     ):
         if use_start_end_token and hasattr(tokenizer, 'bos_token'):
             bos_id = tokenizer.bos_id
@@ -1393,4 +1439,5 @@ class TarredAudioToBPEDataset(_TarredAudioToTextDataset):
             shard_strategy=shard_strategy,
             global_rank=global_rank,
             world_size=world_size,
+            return_sample_id=return_sample_id,
         )

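The collate change above accepts both the old 4-tuple dataset items and the new 5-tuple items that end with the manifest index. A standalone sketch (plain PyTorch, not the NeMo function itself) of that contract:

import torch

# A 4-tuple item is (audio, audio_len, tokens, tokens_len); with return_sample_id=True the
# dataset appends the manifest index of the utterance as a fifth element.
item_without_id = (torch.zeros(16000), torch.tensor(16000), torch.tensor([3, 7]), torch.tensor(2))
item_with_id = item_without_id + (42,)  # 42 = illustrative manifest index

for item in (item_without_id, item_with_id):
    assert len(item) in (4, 5)  # anything else would raise inside _speech_collate_fn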
@@ -140,6 +140,7 @@ class _AudioTextDALIDataset(Iterator):
         global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
         world_size (int): Total number of processes, used for partitioning shards. Defaults to 1.
         preprocessor_cfg (DictConfig): Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor.
+        return_sample_id (bool): whether to return the sample_id as a part of each sample (not supported yet).
     """

     def __init__(
@@ -162,8 +163,15 @@ class _AudioTextDALIDataset(Iterator):
         global_rank: int = 0,
         world_size: int = 1,
         preprocessor_cfg: DictConfig = None,
+        return_sample_id: bool = False,
     ):
         self.drop_last = drop_last  # used by lr_scheduler
+        if return_sample_id:
+            raise ValueError(
+                "Currently DALI data layers don't support returning the sample_id and return_sample_id can not be enabled."
+            )
+        self.return_sample_id = return_sample_id
+
         if not HAVE_DALI:
             raise ModuleNotFoundError(
                 f"{self} requires NVIDIA DALI to be installed. "
@@ -519,6 +527,7 @@ class AudioToCharDALIDataset(_AudioTextDALIDataset):
         global_rank (int): Worker rank, used for partitioning shards. Defaults to 0.
         world_size (int): Total number of processes, used for partitioning shards. Defaults to 1.
         preprocessor_cfg (DictConfig): Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor.
+        return_sample_id (bool): whether to return the sample_id as a part of each sample (not supported yet).
     """

     def __init__(
@@ -545,6 +554,7 @@ class AudioToCharDALIDataset(_AudioTextDALIDataset):
         global_rank: int = 0,
         world_size: int = 1,
         preprocessor_cfg: DictConfig = None,
+        return_sample_id: bool = False,
     ):
         self.labels = labels

@@ -571,6 +581,7 @@ class AudioToCharDALIDataset(_AudioTextDALIDataset):
             global_rank=global_rank,
             world_size=world_size,
             preprocessor_cfg=preprocessor_cfg,
+            return_sample_id=return_sample_id,
         )


@@ -607,6 +618,7 @@ class AudioToBPEDALIDataset(_AudioTextDALIDataset):
         preprocessor_cfg (DictConfig): Preprocessor configuration. Supports AudioToMelSpectrogramPreprocessor and AudioToMFCCPreprocessor.
         use_start_end_token (bool): Boolean which dictates whether to add [BOS] and [EOS] tokens to beginning and
             ending of speech respectively.
+        return_sample_id (bool): whether to return the sample_id as a part of each sample (not supported yet).
     """

     def __init__(
@@ -627,7 +639,9 @@ class AudioToBPEDALIDataset(_AudioTextDALIDataset):
         world_size: int = 1,
         preprocessor_cfg: DictConfig = None,
         use_start_end_token: bool = True,
+        return_sample_id: bool = False,
     ):
+
         if use_start_end_token and hasattr(tokenizer, 'bos_token'):
             bos_id = tokenizer.bos_id
         else:
@@ -670,4 +684,5 @@ class AudioToBPEDALIDataset(_AudioTextDALIDataset):
             global_rank=global_rank,
             world_size=world_size,
             preprocessor_cfg=preprocessor_cfg,
+            return_sample_id=return_sample_id,
         )

@@ -12,11 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional, Union
+import json
+from typing import Any, List, Optional, Union

 import torch
 from omegaconf import DictConfig, open_dict
 from omegaconf.listconfig import ListConfig
+from pytorch_lightning.callbacks import BasePredictionWriter
 from torch.utils.data import ChainDataset

 from nemo.collections.asr.data import audio_to_text, audio_to_text_dali
@@ -91,6 +93,7 @@ def get_char_dataset(config: dict, augmentor: Optional['AudioAugmentor'] = None)
         normalize=config.get('normalize_transcripts', False),
         trim=config.get('trim_silence', False),
         parser=config.get('parser', 'en'),
+        return_sample_id=config.get('return_sample_id', False),
     )
     return dataset

@@ -120,6 +123,7 @@ def get_bpe_dataset(
         max_utts=config.get('max_utts', 0),
         trim=config.get('trim_silence', False),
         use_start_end_token=config.get('use_start_end_token', True),
+        return_sample_id=config.get('return_sample_id', False),
     )
     return dataset

@@ -182,6 +186,7 @@ def get_tarred_dataset(
                 shard_strategy=config.get('tarred_shard_strategy', 'scatter'),
                 global_rank=global_rank,
                 world_size=world_size,
+                return_sample_id=config.get('return_sample_id', False),
             )
         else:
             dataset = audio_to_text.TarredAudioToBPEDataset(
@@ -200,6 +205,7 @@ def get_tarred_dataset(
                 shard_strategy=config.get('tarred_shard_strategy', 'scatter'),
                 global_rank=global_rank,
                 world_size=world_size,
+                return_sample_id=config.get('return_sample_id', False),
             )

         datasets.append(dataset)
@@ -251,6 +257,7 @@ def get_dali_char_dataset(
         global_rank=global_rank,
         world_size=world_size,
         preprocessor_cfg=preprocessor_cfg,
+        return_sample_id=config.get('return_sample_id', False),
     )
     return dataset

@@ -295,10 +302,44 @@ def get_dali_bpe_dataset(
         global_rank=global_rank,
         world_size=world_size,
         preprocessor_cfg=preprocessor_cfg,
+        return_sample_id=config.get('return_sample_id', False),
     )
     return dataset


+class ASRPredictionWriter(BasePredictionWriter):
+    def __init__(self, dataset, output_file: str):
+        super().__init__(write_interval="batch")
+        self.outf = open(output_file, 'w')
+        self.dataset = dataset
+        self.samples_num = 0
+
+    def write_on_batch_end(
+        self,
+        trainer,
+        pl_module: 'LightningModule',
+        prediction: Any,
+        batch_indices: List[int],
+        batch: Any,
+        batch_idx: int,
+        dataloader_idx: int,
+    ):
+        for sample_id, transcribed_text in prediction:
+            item = {}
+            sample = self.dataset.get_manifest_sample(sample_id)
+            item["audio_filepath"] = sample.audio_file
+            item["duration"] = sample.duration
+            item["text"] = sample.text_raw
+            item["pred_text"] = transcribed_text
+            self.outf.write(json.dumps(item) + "\n")
+            self.samples_num += 1
+        return
+
+    def close_output_file(self):
+        self.outf.close()
+        return self.samples_num
+
+
 def convert_to_config_list(initial_list):
     if initial_list is None or initial_list == []:
         raise ValueError("manifest_filepaths and tarred_audio_filepaths must not be empty.")

@@ -12,16 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from nemo.collections.asr.models.configs.asr_models_config import (
+    ASRDatasetConfig,
+    EncDecCTCConfig,
+    EncDecCTCModelConfig,
+)
 from nemo.collections.asr.models.configs.classification_models_config import (
     EncDecClassificationConfig,
     EncDecClassificationDatasetConfig,
     EncDecClassificationModelConfig,
 )
-from nemo.collections.asr.models.configs.ctc_models_config import (
-    EncDecCTCConfig,
-    EncDecCTCDatasetConfig,
-    EncDecCTCModelConfig,
-)
 from nemo.collections.asr.models.configs.matchboxnet_config import (
     EncDecClassificationModelConfigBuilder,
     MatchboxNetModelConfig,

@@ -27,15 +27,15 @@ from nemo.core.config import modelPT as model_cfg


 @dataclass
-class EncDecCTCDatasetConfig(nemo.core.classes.dataset.DatasetConfig):
-    manifest_filepath: Optional[str] = None
+class ASRDatasetConfig(nemo.core.classes.dataset.DatasetConfig):
+    manifest_filepath: Optional[Any] = None
     sample_rate: int = MISSING
     labels: List[str] = MISSING
     trim_silence: bool = False

     # Tarred dataset support
     is_tarred: bool = False
-    tarred_audio_filepaths: Optional[str] = None
+    tarred_audio_filepaths: Optional[Any] = None
     tarred_shard_strategy: str = "scatter"
     shuffle_n: int = 0

@@ -54,6 +54,7 @@ class EncDecCTCDatasetConfig(nemo.core.classes.dataset.DatasetConfig):
     bos_id: Optional[int] = None
     pad_id: int = 0
     use_start_end_token: bool = False
+    return_sample_id: Optional[bool] = False


 @dataclass
@@ -66,9 +67,9 @@ class EncDecCTCConfig(model_cfg.ModelConfig):
     labels: List[str] = MISSING

     # Dataset configs
-    train_ds: EncDecCTCDatasetConfig = EncDecCTCDatasetConfig(manifest_filepath=None, shuffle=True)
-    validation_ds: EncDecCTCDatasetConfig = EncDecCTCDatasetConfig(manifest_filepath=None, shuffle=False)
-    test_ds: EncDecCTCDatasetConfig = EncDecCTCDatasetConfig(manifest_filepath=None, shuffle=False)
+    train_ds: ASRDatasetConfig = ASRDatasetConfig(manifest_filepath=None, shuffle=True)
+    validation_ds: ASRDatasetConfig = ASRDatasetConfig(manifest_filepath=None, shuffle=False)
+    test_ds: ASRDatasetConfig = ASRDatasetConfig(manifest_filepath=None, shuffle=False)

     # Optimizer / Scheduler config
     optim: Optional[model_cfg.OptimConfig] = model_cfg.OptimConfig(sched=model_cfg.SchedConfig())

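Note that manifest_filepath and tarred_audio_filepaths are now typed Optional[Any] rather than Optional[str], presumably so that the structured config also accepts non-string values such as lists of paths; illustratively (the paths below are placeholders, not from this commit):

single_manifest = "/data/manifest.json"
multiple_manifests = ["/data/part1.json", "/data/part2.json"]  # also representable under Optional[Any]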
@@ -17,7 +17,7 @@ from typing import Any, Callable, List, Optional

 from omegaconf import MISSING

-from nemo.collections.asr.models.configs import ctc_models_config as ctc_cfg
+from nemo.collections.asr.models.configs import asr_models_config as ctc_cfg
 from nemo.collections.asr.modules.audio_preprocessing import (
     AudioToMelSpectrogramPreprocessorConfig,
     SpectrogramAugmentationConfig,
@@ -174,13 +174,11 @@ class JasperModelConfig(ctc_cfg.EncDecCTCConfig):
     labels: List[str] = MISSING

     # Dataset configs
-    train_ds: ctc_cfg.EncDecCTCDatasetConfig = ctc_cfg.EncDecCTCDatasetConfig(
+    train_ds: ctc_cfg.ASRDatasetConfig = ctc_cfg.ASRDatasetConfig(
         manifest_filepath=None, shuffle=True, trim_silence=True
     )
-    validation_ds: ctc_cfg.EncDecCTCDatasetConfig = ctc_cfg.EncDecCTCDatasetConfig(
-        manifest_filepath=None, shuffle=False
-    )
-    test_ds: ctc_cfg.EncDecCTCDatasetConfig = ctc_cfg.EncDecCTCDatasetConfig(manifest_filepath=None, shuffle=False)
+    validation_ds: ctc_cfg.ASRDatasetConfig = ctc_cfg.ASRDatasetConfig(manifest_filepath=None, shuffle=False)
+    test_ds: ctc_cfg.ASRDatasetConfig = ctc_cfg.ASRDatasetConfig(manifest_filepath=None, shuffle=False)

     # Optimizer / Scheduler config
     optim: Optional[model_cfg.OptimConfig] = model_cfg.OptimConfig(sched=model_cfg.SchedConfig())
@@ -262,13 +260,13 @@ class EncDecCTCModelConfigBuilder(model_cfg.ModelConfigBuilder):
     # Note: Autocomplete for users wont work without these overrides
     # But practically it is not needed since python will infer at runtime

-    # def set_train_ds(self, cfg: Optional[ctc_cfg.EncDecCTCDatasetConfig] = None):
+    # def set_train_ds(self, cfg: Optional[ctc_cfg.ASRDatasetConfig] = None):
     #     super().set_train_ds(cfg)
     #
-    # def set_validation_ds(self, cfg: Optional[ctc_cfg.EncDecCTCDatasetConfig] = None):
+    # def set_validation_ds(self, cfg: Optional[ctc_cfg.ASRDatasetConfig] = None):
     #     super().set_validation_ds(cfg)
     #
-    # def set_test_ds(self, cfg: Optional[ctc_cfg.EncDecCTCDatasetConfig] = None):
+    # def set_test_ds(self, cfg: Optional[ctc_cfg.ASRDatasetConfig] = None):
     #     super().set_test_ds(cfg)

     def _finalize_cfg(self):

@@ -506,6 +506,7 @@ class EncDecCTCModel(ASRModel, ExportableEncDecModel, ASRModuleMixin):
             "input_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True),
             "processed_signal": NeuralType(('B', 'D', 'T'), SpectrogramType(), optional=True),
             "processed_signal_length": NeuralType(tuple('B'), LengthsType(), optional=True),
+            "sample_id": NeuralType(tuple('B'), LengthsType(), optional=True),
         }

     @property
@@ -596,6 +597,22 @@ class EncDecCTCModel(ASRModel, ExportableEncDecModel, ASRModuleMixin):

         return {'loss': loss_value, 'log': tensorboard_logs}

+    def predict_step(self, batch, batch_idx, dataloader_idx=0):
+        signal, signal_len, transcript, transcript_len, sample_id = batch
+        if isinstance(batch, DALIOutputs) and batch.has_processed_signal:
+            log_probs, encoded_len, predictions = self.forward(
+                processed_signal=signal, processed_signal_length=signal_len
+            )
+        else:
+            log_probs, encoded_len, predictions = self.forward(input_signal=signal, input_signal_length=signal_len)
+
+        transcribed_texts = self._wer.ctc_decoder_predictions_tensor(
+            predictions=predictions, predictions_len=encoded_len, return_hypotheses=False,
+        )
+
+        sample_id = sample_id.cpu().detach().numpy()
+        return list(zip(sample_id, transcribed_texts))
+
     def validation_step(self, batch, batch_idx, dataloader_idx=0):
         signal, signal_len, transcript, transcript_len = batch
         if isinstance(batch, DALIOutputs) and batch.has_processed_signal:

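The new predict_step returns one (sample_id, transcribed_text) pair per utterance in the batch; a sketch of the shape the writer callback consumes (the values below are made up):

# What a single predict_step call returns, conceptually:
batch_predictions = [(0, "hello world"), (1, "good morning")]
# ASRPredictionWriter.write_on_batch_end iterates these pairs, maps each sample_id back to its
# manifest entry (audio_filepath, duration, reference text), and appends one JSON line per pair.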
@@ -647,7 +647,6 @@ class EncDecRNNTModel(ASRModel, ASRModuleMixin, ExportableEncDecJointModel):
         if hasattr(self, '_trainer') and self._trainer is not None:
             log_every_n_steps = self._trainer.log_every_n_steps
             sample_id = self._trainer.global_step
-
         else:
             log_every_n_steps = 1
             sample_id = batch_nb
@@ -699,6 +698,23 @@ class EncDecRNNTModel(ASRModel, ASRModuleMixin, ExportableEncDecJointModel):

         return {'loss': loss_value}

+    def predict_step(self, batch, batch_idx, dataloader_idx=0):
+        signal, signal_len, transcript, transcript_len, sample_id = batch
+
+        # forward() only performs encoder forward
+        if isinstance(batch, DALIOutputs) and batch.has_processed_signal:
+            encoded, encoded_len = self.forward(processed_signal=signal, processed_signal_length=signal_len)
+        else:
+            encoded, encoded_len = self.forward(input_signal=signal, input_signal_length=signal_len)
+        del signal
+
+        best_hyp_text, all_hyp_text = self.decoding.rnnt_decoder_predictions_tensor(
+            encoder_output=encoded, encoded_lengths=encoded_len, return_hypotheses=False
+        )
+
+        sample_id = sample_id.cpu().detach().numpy()
+        return list(zip(sample_id, best_hyp_text))
+
     def validation_step(self, batch, batch_idx, dataloader_idx=0):
         signal, signal_len, transcript, transcript_len = batch

@@ -269,6 +269,7 @@ class Typing(ABC):
         # Precompute metadata
         metadata = TypecheckMetadata(original_types=output_types, ignore_collections=ignore_collections)
         out_types_list = list(metadata.base_types.items())
+        mandatory_out_types_list = list(metadata.mandatory_types.items())

         # First convert all outputs to list/tuple format to check correct number of outputs
         if type(out_objects) in (list, tuple):
@@ -287,15 +288,15 @@ class Typing(ABC):
             pass

         # In all other cases, python will wrap multiple outputs into an outer tuple.
-        # As such, now the number of elements in this outer tuple should exactly match
-        # the number of output types defined.
-        elif len(out_types_list) != len(out_container):
+        # Allow number of output arguments to be <= total output neural types and >= mandatory outputs.
+        elif len(out_container) > len(out_types_list) or len(out_container) < len(mandatory_out_types_list):
             raise TypeError(
-                "Number of output arguments provided ({}) is not as expected ({}).\n"
-                "This can be either because insufficient number of output NeuralTypes were provided,"
+                "Number of output arguments provided ({}) is not as expected. It should be larger than {} and less than {}.\n"
+                "This can be either because insufficient/extra number of output NeuralTypes were provided,"
                 "or the provided NeuralTypes {} should enable container support "
                 "(add '[]' to the NeuralType definition)".format(
-                    len(out_container), len(output_types), output_types
+                    len(out_container), len(out_types_list), len(mandatory_out_types_list), output_types
                 )
             )

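The relaxed check means a method may now return anywhere between the number of mandatory output types and the total number of declared output types, the extra slots being optional outputs such as the new sample_id. A schematic of the new bounds test (illustrative only, not the actual decorator internals):

# Declared output types (the last one marked optional=True) vs. what forward() actually returned:
out_types_list = ["outputs", "encoded_lengths", "greedy_predictions", "sample_id"]
mandatory_out_types_list = out_types_list[:3]
out_container = ("log_probs", "encoded_len", "predictions")  # three objects returned

# Accepted iff mandatory count <= returned count <= total declared count:
ok = len(mandatory_out_types_list) <= len(out_container) <= len(out_types_list)
assert ok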
@@ -105,5 +105,5 @@ class DatasetConfig:
     batch_size: int = 32
     drop_last: bool = False
     shuffle: bool = False
-    num_workers: Optional[int] = None
+    num_workers: Optional[int] = 0
     pin_memory: bool = True

@@ -237,7 +237,7 @@ class TestEncDecCTCModel:
             os.rename(old_tokenizer_dir + '.bkp', old_tokenizer_dir)

     @pytest.mark.unit
-    def test_EncDecCTCDatasetConfig_for_AudioToBPEDataset(self):
+    def test_ASRDatasetConfig_for_AudioToBPEDataset(self):
         # ignore some additional arguments as dataclass is generic
         IGNORE_ARGS = [
             'is_tarred',
@@ -261,10 +261,7 @@ class TestEncDecCTCModel:
         REMAP_ARGS = {'trim_silence': 'trim', 'labels': 'tokenizer'}

         result = assert_dataclass_signature_match(
-            audio_to_text.AudioToBPEDataset,
-            configs.EncDecCTCDatasetConfig,
-            ignore_args=IGNORE_ARGS,
-            remap_args=REMAP_ARGS,
+            audio_to_text.AudioToBPEDataset, configs.ASRDatasetConfig, ignore_args=IGNORE_ARGS, remap_args=REMAP_ARGS,
         )
         signatures_match, cls_subset, dataclass_subset = result

@@ -273,7 +270,7 @@ class TestEncDecCTCModel:
         assert dataclass_subset is None

     @pytest.mark.unit
-    def test_EncDecCTCDatasetConfig_for_TarredAudioToBPEDataset(self):
+    def test_ASRDatasetConfig_for_TarredAudioToBPEDataset(self):
         # ignore some additional arguments as dataclass is generic
         IGNORE_ARGS = [
             'is_tarred',
@@ -303,7 +300,7 @@ class TestEncDecCTCModel:

         result = assert_dataclass_signature_match(
             audio_to_text.TarredAudioToBPEDataset,
-            configs.EncDecCTCDatasetConfig,
+            configs.ASRDatasetConfig,
             ignore_args=IGNORE_ARGS,
             remap_args=REMAP_ARGS,
         )

@@ -240,7 +240,7 @@ class TestEncDecCTCModel:
         # trainer and exp manager should be there

     @pytest.mark.unit
-    def test_EncDecCTCDatasetConfig_for_AudioToCharDataset(self):
+    def test_ASRDatasetConfig_for_AudioToCharDataset(self):
         # ignore some additional arguments as dataclass is generic
         IGNORE_ARGS = [
             'is_tarred',
@@ -259,10 +259,7 @@ class TestEncDecCTCModel:
         REMAP_ARGS = {'trim_silence': 'trim'}

         result = assert_dataclass_signature_match(
-            audio_to_text.AudioToCharDataset,
-            configs.EncDecCTCDatasetConfig,
-            ignore_args=IGNORE_ARGS,
-            remap_args=REMAP_ARGS,
+            audio_to_text.AudioToCharDataset, configs.ASRDatasetConfig, ignore_args=IGNORE_ARGS, remap_args=REMAP_ARGS,
         )
         signatures_match, cls_subset, dataclass_subset = result

@@ -271,7 +268,7 @@ class TestEncDecCTCModel:
         assert dataclass_subset is None

     @pytest.mark.unit
-    def test_EncDecCTCDatasetConfig_for_TarredAudioToCharDataset(self):
+    def test_ASRDatasetConfig_for_TarredAudioToCharDataset(self):
         # ignore some additional arguments as dataclass is generic
         IGNORE_ARGS = [
             'is_tarred',
@@ -294,7 +291,7 @@ class TestEncDecCTCModel:

         result = assert_dataclass_signature_match(
             audio_to_text.TarredAudioToCharDataset,
-            configs.EncDecCTCDatasetConfig,
+            configs.ASRDatasetConfig,
             ignore_args=IGNORE_ARGS,
             remap_args=REMAP_ARGS,
         )

@@ -341,7 +341,6 @@ class TestASRDatasets:

         num_samples = 10
         batch_size = 1
-        device = 'gpu' if torch.cuda.is_available() else 'cpu'
         texts = []

         with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8') as f:
@@ -394,7 +393,6 @@ class TestASRDatasets:
         )
         ref_preprocessor = EncDecCTCModel.from_config_dict(preprocessor_cfg)

-        count = 0
         for ref_data, dali_data in zip(ref_dataloader, dali_dataset):
             ref_audio, ref_audio_len, _, _ = ref_data
             ref_features, ref_features_len = ref_preprocessor(input_signal=ref_audio, length=ref_audio_len)