Merge branch 'asr_telephony' of github.com:NVIDIA/NeMo into asr_telephony
commit e101445a06

Jenkinsfile (vendored) | 33
@ -1265,6 +1265,39 @@ pipeline {
|
|||
}
|
||||
}
|
||||
|
||||
stage('L2: NMT Megatron Model Parallel Size 2 Encoder') {
|
||||
when {
|
||||
anyOf{
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
steps{
|
||||
sh 'cd examples/nlp/machine_translation && \
|
||||
python enc_dec_nmt.py \
|
||||
--config-path=conf \
|
||||
--config-name=megatron \
|
||||
model.encoder.model_name=megatron-bert-uncased \
|
||||
model.encoder.checkpoint_file=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \
|
||||
model.encoder.hidden_size=1024 \
|
||||
model.encoder.num_attention_heads=16 \
|
||||
model.encoder.num_layers=24 \
|
||||
model.encoder.max_position_embeddings=512 \
|
||||
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
|
||||
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
|
||||
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
|
||||
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
|
||||
model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
|
||||
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
|
||||
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
|
||||
trainer.gpus=[0,1] \
|
||||
+trainer.fast_dev_run=true \
|
||||
exp_manager=null \
|
||||
'
|
||||
}
|
||||
}
|
||||
|
||||
stage('L2: NMT Tarred Dataset Creation') {
|
||||
when {
|
||||
anyOf {
|
||||
|
|
|
@ -42,7 +42,7 @@ a
|
|||
|
||||
a:visited
|
||||
{
|
||||
color: #b6b6b6;
|
||||
color: #218219;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
Model Name,Model Base Class,Model Card
|
||||
QuartzNet15x5Base-En,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels"
|
||||
stt_en_jasper10x5dr,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr"
|
||||
stt_en_citrinet_256,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256"
|
||||
stt_en_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512"
|
||||
stt_en_citrinet_1024,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024"
|
||||
stt_ca_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_quartznet15x5"
|
||||
stt_it_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_quartznet15x5"
|
||||
stt_fr_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_quartznet15x5"
|
||||
stt_es_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5"
|
||||
stt_de_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_quartznet15x5"
|
||||
stt_pl_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_quartznet15x5"
|
||||
stt_ru_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_quartznet15x5"
|
||||
stt_zh_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5"
|
||||
stt_zh_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512"
|
||||
stt_en_conformer_ctc_small,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small"
|
||||
stt_en_conformer_ctc_medium,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium"
|
||||
stt_en_conformer_ctc_large,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large"
|
||||
stt_en_conformer_ctc_small_ls,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small_ls"
|
||||
stt_en_conformer_ctc_medium_ls,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium_ls"
|
||||
stt_en_conformer_ctc_large_ls,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls"
|
|
|
@ -1,11 +1,12 @@
|
|||
Model Name,Model Base Class,Model Card
|
||||
QuartzNet15x5Base-En,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels"
|
||||
stt_zh_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5"
|
||||
stt_en_jasper10x5dr,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr"
|
||||
stt_en_citrinet_256,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256"
|
||||
stt_en_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512"
|
||||
stt_en_citrinet_1024,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024"
|
||||
stt_zh_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512"
|
||||
stt_en_citrinet_256_gamma_0_25,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25"
|
||||
stt_en_citrinet_512_gamma_0_25,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25"
|
||||
stt_en_citrinet_1024_gamma_0_25,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25"
|
||||
stt_en_conformer_ctc_small,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small"
|
||||
stt_en_conformer_ctc_medium,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium"
|
||||
stt_en_conformer_ctc_large,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large"
|
||||
|
|
|
|
@ -1,3 +1,3 @@
|
|||
Model,Model Base Class,Model Card
|
||||
stt_es_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5"
|
||||
|
||||
stt_es_citrinet_512,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_512"
|
||||
|
|
|
|
@ -1,3 +1,2 @@
|
|||
Model,Model Base Class,Model Card
|
||||
stt_zh_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5"
|
||||
stt_zh_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512"
|
||||
|
|
|
|
@ -71,14 +71,16 @@ To perform inference and transcribe a sample of speech after loading the model,
|
|||
Setting the argument ``logprobs`` to ``True`` returns the log probabilities instead of transcriptions. For more information, see :doc:`./api.html#modules`.
|
||||
The audio files should be 16 kHz, mono-channel WAV files.
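For example, a minimal transcription sketch (the model name is illustrative; ``transcribe()`` and its ``logprobs`` argument are the NeMo APIs referred to above):

.. code-block:: Python

    import nemo.collections.asr as nemo_asr

    # Load a pre-trained CTC model from NGC (any name from the tables below works).
    asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

    # Text transcriptions of 16 kHz mono WAV files.
    texts = asr_model.transcribe(paths2audio_files=["sample.wav"])

    # Per-file log-probability tensors instead of text.
    logprobs = asr_model.transcribe(paths2audio_files=["sample.wav"], logprobs=True)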
|
||||
|
||||
Automatic Speech Recognition Models
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
Fine-tuning on Different Datasets
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:file: data/asr_results.csv
|
||||
:align: left
|
||||
:widths: 30, 30, 40
|
||||
:header-rows: 1
|
||||
There are multiple ASR tutorials provided in the :ref:`Tutorials <tutorials>` section. As a demonstration, most of these tutorials explain how to instantiate a pre-trained model and prepare it for fine-tuning on a dataset in the same language.
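The core pattern those tutorials follow looks roughly like this (a sketch; the model name and tokenizer path are placeholders, and ``change_vocabulary`` is the BPE-model method used when switching to a new vocabulary):

.. code-block:: Python

    import nemo.collections.asr as nemo_asr

    # Start from a pre-trained English checkpoint.
    asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_en_citrinet_512")

    # Swap in a tokenizer built on the target-language corpus before fine-tuning.
    asr_model.change_vocabulary(new_tokenizer_dir="/path/to/new_tokenizer", new_tokenizer_type="bpe")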
|
||||
|
||||
|
||||
Automatic Speech Recognition Models
|
||||
-----------------------------------
|
||||
|
||||
Below is a list of all the ASR models that are available in NeMo for specific languages, as well as auxiliary language models for certain languages.
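The same information is also available programmatically from each model class (a short sketch; the exact entries depend on the installed NeMo version):

.. code-block:: Python

    import nemo.collections.asr as nemo_asr

    # Each entry is a PretrainedModelInfo with a model name and an NGC location.
    for info in nemo_asr.models.EncDecCTCModelBPE.list_available_models():
        print(info.pretrained_model_name, info.location)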
|
||||
|
||||
Language Models for ASR
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
@ -89,7 +91,8 @@ Language Models for ASR
|
|||
:widths: 30, 30, 40
|
||||
:header-rows: 1
|
||||
|
||||
|
||||
|
|
||||
|
||||
Speech Recognition (Languages)
|
||||
------------------------------
|
||||
|
||||
|
|
docs/source/core/export.rst (new file) | 185
|
@ -0,0 +1,185 @@
|
|||
Exporting NeMo Models
|
||||
=====================
|
||||
|
||||
Exporting Models
|
||||
----------------
|
||||
|
||||
Most of the NeMo models can be exported to ONNX or TorchScript to be deployed for inference in optimized execution environments, such as Jarvis or Triton Inference Server.
|
||||
The export interface is provided by the ``Exportable`` mix-in class. If a model extends ``Exportable``, it can be exported as follows:
|
||||
|
||||
.. code-block:: Python
|
||||
|
||||
from nemo.core.classes import ModelPT, Exportable
|
||||
# deriving from Exportable
|
||||
class MyExportableModel(ModelPT, Exportable):
|
||||
...
|
||||
|
||||
mymodel = MyExportableModel.from_pretrained(model_name="MyModelName")
|
||||
|
||||
# exporting pre-trained model to ONNX file for deployment.
|
||||
mymodel.export('mymodel.onnx', [options])
|
||||
|
||||
|
||||
How to Use Model Export
|
||||
-----------------------
|
||||
The following arguments are for ``Exportable.export()``. In most cases, you should only supply the name of the output file and use all defaults:
|
||||
.. code-block:: Python
|
||||
def export(
|
||||
self,
|
||||
output: str,
|
||||
input_example=None,
|
||||
output_example=None,
|
||||
verbose=False,
|
||||
export_params=True,
|
||||
do_constant_folding=True,
|
||||
keep_initializers_as_inputs=False,
|
||||
onnx_opset_version: int = 13,
|
||||
try_script: bool = False,
|
||||
set_eval: bool = True,
|
||||
check_trace: bool = False,
|
||||
use_dynamic_axes: bool = True,
|
||||
dynamic_axes=None,
|
||||
check_tolerance=0.01,
|
||||
):
|
||||
|
||||
The ``output``, ``input_example``, ``output_example``, ``verbose``, ``export_params``, ``do_constant_folding``, ``keep_initializers_as_inputs``, ``onnx_opset_version``, and ``set_eval`` options have the same semantics as in PyTorch's ``onnx.export()`` and ``jit.trace()`` functions and are passed through. For more information about PyTorch's ``onnx.export()``, refer to the `torch.onnx functions documentation
|
||||
<https://pytorch.org/docs/stable/onnx.html#functions>`_.
|
||||
|
||||
The file extension of the ``output`` parameter determines the export format: ``.onnx`` -> ONNX; ``.pt`` or ``.ts`` -> TorchScript. If ``input_example`` is ``None``, ``Exportable.input_example()`` is called.
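For example (a sketch using the documented defaults):

.. code-block:: Python

    # The same model, exported twice; the format is chosen by the file extension.
    mymodel.export('mymodel.onnx')  # ONNX
    mymodel.export('mymodel.ts')    # TorchScript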
|
||||
|
||||
**TorchScript-specific**: If ``try_script`` is ``True``, ``export()`` tries ``jit.script()`` before ``jit.trace()``.
|
||||
The ``check_trace`` arg is passed through to ``jit.trace()``.
|
||||
**ONNX-specific**: If ``use_dynamic_axes`` is ``True``, ``onnx.export()`` is called with dynamic axes. If ``dynamic_axes`` is ``None``, they are inferred from the model's ``input_types`` definition (the batch dimension is dynamic, as are duration-like dimensions).
|
||||
|
||||
If ``check_trace`` is ``True``, the resulting ONNX model is also run on ``input_example`` and the results are compared to ``output_example`` using the ``check_tolerance`` argument. Note the relatively high default tolerance.
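For instance (a sketch; every argument comes from the signature above):

.. code-block:: Python

    mymodel.export(
        'mymodel.onnx',
        check_trace=True,        # run the exported model on input_example and compare outputs
        check_tolerance=0.01,
        use_dynamic_axes=True,   # infer dynamic axes from input_types
    )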
|
||||
|
||||
|
||||
How to Make Model Exportable
|
||||
----------------------------
|
||||
|
||||
If you are simply using NeMo models, the previous example is all you need to know.
|
||||
If you write your own models, this section highlights what you need to be aware of when extending ``Exportable``.
|
||||
|
||||
Exportable Hooks and Overrides
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You should not normally need to override the ``Exportable`` default methods. However, ``Exportable.export()`` relies on the assumption that certain methods are available in your class.
|
||||
|
||||
.. code-block:: Python
|
||||
|
||||
@property
|
||||
def input_example(self):  # => Tuple(input, [(input, ...)], [Dict])
|
||||
"""
|
||||
Generates input examples for tracing etc.
|
||||
Returns:
|
||||
A tuple of input examples.
|
||||
"""
|
||||
This function should return a tuple of (normally) Tensors, one for each model input (i.e., the args of ``forward()``). The last element may be a ``Dict`` that specifies non-positional arguments by name, as per the Torch ``export()`` convention. For more information, refer to `Using dictionaries to handle Named Arguments as model inputs
|
||||
<https://pytorch.org/docs/stable/onnx.html#using-dictionaries-to-handle-named-arguments-as-model-inputs>`_.
|
||||
Note: ``Dict`` currently does not work with TorchScript ``trace()``.
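As an illustration, a minimal ``input_example`` for a hypothetical model whose ``forward()`` takes ``(audio_signal, length)`` might look like this (shapes and names are assumptions for the sketch, not part of any NeMo model):

.. code-block:: Python

    import torch

    @property
    def input_example(self):
        # Two random 1-second, 16 kHz signals plus their lengths.
        device = next(self.parameters()).device
        audio_signal = torch.randn(2, 16000, device=device)
        length = torch.full((2,), 16000, dtype=torch.int64, device=device)
        return (audio_signal, length)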
|
||||
.. code-block:: Python
|
||||
|
||||
@property
|
||||
def input_types(self):
|
||||
@property
|
||||
def output_types(self):
|
||||
|
||||
These are needed for inferring input/output names and dynamic axes. If your model derives from ``ModelPT``, they are already there. Another common scenario is that your model contains one or more modules that process input and generate output. In that case, override the ``Exportable`` methods ``input_module()`` and ``output_module()`` to point to them, as in this example:
|
||||
|
||||
.. code-block:: Python
|
||||
|
||||
@property
|
||||
def input_module(self):
|
||||
return self.fastpitch
|
||||
|
||||
@property
|
||||
def output_module(self):
|
||||
return self.fastpitch
|
||||
|
||||
Your model should also have an export-friendly ``forward()`` method - that can mean different things for ONNX and TorchScript. For ONNX, you can't have forced named parameters without defaults, like ``forward(self, *, text)``. For TorchScript, you should avoid ``None`` and use ``Optional`` instead. The criteria are fairly volatile and may change with every PyTorch version, so it's a trial-and-error process. There is also the general issue that, in many cases, ``forward()`` for inference can be simplified to use fewer inputs and outputs. To address this, ``Exportable`` looks for a ``forward_for_export()`` method in your model and uses it instead of ``forward()`` for export:
|
||||
|
||||
.. code-block:: Python
|
||||
# Uses forced named args, many default parameters.
|
||||
def forward(
|
||||
self,
|
||||
*,
|
||||
text,
|
||||
durs=None,
|
||||
pitch=None,
|
||||
speaker=0,
|
||||
pace=1.0,
|
||||
spec=None,
|
||||
attn_prior=None,
|
||||
mel_lens=None,
|
||||
input_lens=None,
|
||||
):
|
||||
# Passes through all self.fastpitch outputs
|
||||
return self.fastpitch(
|
||||
text=text,
|
||||
durs=durs,
|
||||
pitch=pitch,
|
||||
speaker=speaker,
|
||||
pace=pace,
|
||||
spec=spec,
|
||||
attn_prior=attn_prior,
|
||||
mel_lens=mel_lens,
|
||||
input_lens=input_lens,
|
||||
)
|
||||
|
||||
|
||||
# Uses less inputs, no '*', returns less outputs:
|
||||
def forward_for_export(self, text):
|
||||
(
|
||||
spect,
|
||||
durs_predicted,
|
||||
log_durs_predicted,
|
||||
pitch_predicted,
|
||||
attn_soft,
|
||||
attn_logprob,
|
||||
attn_hard,
|
||||
attn_hard_dur,
|
||||
pitch,
|
||||
) = self.fastpitch(text=text)
|
||||
return spect, durs_predicted, log_durs_predicted, pitch_predicted
|
||||
|
||||
To stay consistent with ``input_types()``/``output_types()``, ``Exportable`` also provides hooks that let you exclude particular inputs/outputs from the export process:
|
||||
|
||||
.. code-block:: Python
|
||||
|
||||
@property
|
||||
def disabled_deployment_input_names(self):
|
||||
"""Implement this method to return a set of input names disabled for export"""
|
||||
return set(["durs", "pitch", "speaker", "pace", "spec", "attn_prior", "mel_lens", "input_lens"])
|
||||
|
||||
@property
|
||||
def disabled_deployment_output_names(self):
|
||||
|
||||
|
||||
Another common requirement for models that are being exported is to apply certain network modifications for inference efficiency before exporting - for example, disabling masks in some convolutions or removing batch normalizations. A better style is to make these happen on ``ModelPT.eval()`` (and be reversed on ``.train()``), but that is not always feasible, so ``Exportable`` provides the following hook to run them:
|
||||
|
||||
.. code-block:: Python
|
||||
|
||||
def _prepare_for_export(self, **kwargs):
|
||||
"""
|
||||
Override this method to prepare module for export. This is in-place operation.
|
||||
Base version does common necessary module replacements (Apex etc)
|
||||
"""
|
||||
# do graph modifications specific for this model
|
||||
replace_1D_2D = kwargs.get('replace_1D_2D', False)
|
||||
replace_for_export(self, replace_1D_2D)
|
||||
# call base method for common set of modifications
|
||||
Exportable._prepare_for_export(self, **kwargs)
|
||||
|
||||
|
||||
Exportable Model Code
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Most importantly, the actual Torch code in your model should be ONNX- or TorchScript-compatible (ideally, both).
|
||||
#. Ensure the code is written in Torch - avoid bare `NumPy or Python operands <https://pytorch.org/docs/stable/onnx.html#write-pytorch-model-in-torch-way>`_.
|
||||
#. Make your model ``Exportable`` and add an export unit test so that any operation or construct not supported in ONNX/TorchScript is caught immediately (a minimal sketch follows).
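A minimal sketch of such a test (pytest style; the class and model name come from the earlier example and are placeholders):

.. code-block:: Python

    import os
    import tempfile

    def test_my_model_onnx_export():
        model = MyExportableModel.from_pretrained(model_name="MyModelName")
        with tempfile.TemporaryDirectory() as tmpdir:
            out = os.path.join(tmpdir, "model.onnx")
            model.export(out)
            assert os.path.exists(out)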
|
||||
|
||||
For more information, refer to the PyTorch documentation:
|
||||
- `List of supported operators <https://pytorch.org/docs/stable/onnx.html#supported-operators>`_
|
||||
- `Tracing vs. scripting <https://pytorch.org/docs/stable/onnx.html#tracing-vs-scripting>`_
|
||||
- `AlexNet example <https://pytorch.org/docs/stable/onnx.html#example-end-to-end-alexnet-from-pytorch-to-onnx>`_
|
||||
|
|
@ -17,6 +17,7 @@ NVIDIA NeMo User Guide
|
|||
:name: core
|
||||
|
||||
core/core
|
||||
core/export
|
||||
|
||||
|
||||
.. toctree::
|
||||
|
|
|
@ -457,6 +457,111 @@ can be used to compute sacreBLEU scores.
|
|||
|
||||
cat test.en-es.translations | sacrebleu test.es
|
||||
|
||||
Pretrained Encoders
|
||||
-------------------
|
||||
|
||||
Pretrained BERT encoders from either `HuggingFace Transformers <https://huggingface.co/models>`__
|
||||
or `Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`__
|
||||
can be used to train NeMo NMT models.
|
||||
|
||||
The ``library`` flag takes values: ``huggingface``, ``megatron``, and ``nemo``.
|
||||
|
||||
The ``model_name`` flag is used to indicate a *named* model architecture.
|
||||
For example, we can use ``bert_base_cased`` from HuggingFace or ``megatron-bert-345m-cased`` from Megatron-LM.
|
||||
|
||||
The ``pretrained`` flag indicates whether or not to download the pretrained weights (``pretrained=True``) or
|
||||
instantiate the same model architecture with random weights (``pretrained=False``).
|
||||
|
||||
To use a custom model architecture from a specific library, use ``model_name=null`` and then add the
|
||||
custom configuration under the ``encoder`` configuration.
|
||||
|
||||
HuggingFace
|
||||
^^^^^^^^^^^
|
||||
|
||||
We have provided a `HuggingFace config file <https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/machine_translation/conf/huggingface.yaml>`__
|
||||
to use with HuggingFace encoders.
|
||||
|
||||
To use the config file from CLI:
|
||||
|
||||
.. code ::
|
||||
|
||||
--config-path=conf \
|
||||
--config-name=huggingface \
|
||||
|
||||
As an example, we can configure the NeMo NMT encoder to use ``bert-base-cased`` from HuggingFace
|
||||
by using the ``huggingface`` config file and setting
|
||||
|
||||
.. code ::
|
||||
|
||||
model.encoder.pretrained=true \
|
||||
model.encoder.model_name=bert-base-cased \
|
||||
|
||||
To use a custom architecture from HuggingFace we can use
|
||||
|
||||
.. code ::
|
||||
|
||||
+model.encoder._target_=transformers.BertConfig \
|
||||
+model.encoder.hidden_size=1536 \
|
||||
|
||||
Note that the ``+`` symbol is needed when an argument is not already present in the YAML config file.
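The same kind of override can also be expressed programmatically with OmegaConf when composing the config outside of Hydra (a sketch; the keys mirror the CLI overrides above):

.. code-block:: Python

    from omegaconf import OmegaConf

    base_cfg = OmegaConf.load('conf/huggingface.yaml')
    # Equivalent of the +model.encoder._target_ / +model.encoder.hidden_size CLI overrides.
    overrides = OmegaConf.create(
        {'model': {'encoder': {'_target_': 'transformers.BertConfig', 'hidden_size': 1536}}}
    )
    cfg = OmegaConf.merge(base_cfg, overrides)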
|
||||
|
||||
Megatron
|
||||
^^^^^^^^
|
||||
|
||||
We have provided a `Megatron config file <https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/machine_translation/conf/megatron.yaml>`__
|
||||
to use with Megatron encoders.
|
||||
|
||||
To use the config file from CLI:
|
||||
|
||||
.. code ::
|
||||
|
||||
--config-path=conf \
|
||||
--config-name=megatron \
|
||||
|
||||
The ``checkpoint_file`` should be the path to Megatron-LM checkpoint:
|
||||
|
||||
.. code ::
|
||||
|
||||
/path/to/your/megatron/checkpoint/model_optim_rng.pt
|
||||
|
||||
If your Megatron model requires model parallelism, then ``checkpoint_file`` should point to the directory containing the
|
||||
standard Megatron-LM checkpoint format:
|
||||
|
||||
.. code ::
|
||||
|
||||
3.9b_bert_no_rng
|
||||
├── mp_rank_00
|
||||
│ └── model_optim_rng.pt
|
||||
├── mp_rank_01
|
||||
│ └── model_optim_rng.pt
|
||||
├── mp_rank_02
|
||||
│ └── model_optim_rng.pt
|
||||
└── mp_rank_03
|
||||
└── model_optim_rng.pt
|
||||
|
||||
As an example, to train a NeMo NMT model with a 3.9B Megatron BERT encoder,
|
||||
we would use the following encoder configuration:
|
||||
|
||||
.. code ::
|
||||
|
||||
model.encoder.checkpoint_file=/path/to/megatron/checkpoint/3.9b_bert_no_rng \
|
||||
model.encoder.hidden_size=2560 \
|
||||
model.encoder.num_attention_heads=40 \
|
||||
model.encoder.num_layers=48 \
|
||||
model.encoder.max_position_embeddings=512 \
|
||||
|
||||
To train with the Megatron 345M BERT encoder, we would use
|
||||
|
||||
.. code ::
|
||||
|
||||
model.encoder.model_name=megatron-bert-cased \
|
||||
model.encoder.checkpoint_file=/path/to/your/megatron/checkpoint/model_optim_rng.pt \
|
||||
model.encoder.hidden_size=1024 \
|
||||
model.encoder.num_attention_heads=16 \
|
||||
model.encoder.num_layers=24 \
|
||||
model.encoder.max_position_embeddings=512 \
|
||||
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
|
|
|
@ -43,6 +43,9 @@ To run a tutorial:
|
|||
* - ASR
|
||||
- Online ASR inference with Microphone
|
||||
- `Online ASR Microphone <https://github.com/NVIDIA/NeMo/blob/v1.0.2/tutorials/asr/02_Online_ASR_Microphone_Demo.ipynb>`_
|
||||
* - ASR
|
||||
- Fine-tuning CTC Models on New Languages
|
||||
- `ASR CTC Language Fine-Tuning <https://colab.research.google.com/github/NVIDIA/NeMo/blob/main/tutorials/asr/10_ASR_CTC_Language_Finetuning.ipynb>`_
|
||||
* - ASR
|
||||
- Speech Commands
|
||||
- `Speech Commands <https://colab.research.google.com/github/NVIDIA/NeMo/blob/v1.0.2/tutorials/asr/03_Speech_Commands.ipynb>`_
|
||||
|
|
|
@ -132,6 +132,17 @@ def main(cfg: DictConfig) -> None:
|
|||
trainer.fit(model)
|
||||
if cfg.model.nemo_path:
|
||||
model.save_to(cfg.model.nemo_path)
|
||||
else:
|
||||
data_dir = cfg.model.dataset.get('data_dir', None)
|
||||
dialogues_example_dir = cfg.model.dataset.get('dialogues_example_dir', None)
|
||||
|
||||
if data_dir is None or dialogues_example_dir is None:
|
||||
raise ValueError('No dataset directory provided. Skipping evaluation. ')
|
||||
elif not os.path.exists(data_dir):
|
||||
raise ValueError(f'{data_dir} is not found, skipping evaluation on the test set.')
|
||||
else:
|
||||
model.update_data_dirs(data_dir=data_dir, dialogues_example_dir=dialogues_example_dir)
|
||||
model._cfg.dataset = cfg.model.dataset
|
||||
|
||||
if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.ds_item is not None:
|
||||
gpu = 1 if cfg.trainer.gpus != 0 else 0
|
||||
|
|
examples/nlp/machine_translation/conf/megatron.yaml (new file) | 160
|
@ -0,0 +1,160 @@
|
|||
name: MegatronEncoder
|
||||
do_training: True # set to False if only preprocessing data
|
||||
do_testing: False # set to True to run evaluation on test data after training
|
||||
|
||||
model:
|
||||
beam_size: 4
|
||||
len_pen: 0.6
|
||||
max_generation_delta: 5
|
||||
label_smoothing: 0.1
|
||||
shared_tokenizer: false
|
||||
preproc_out_dir: null
|
||||
src_language: 'en'
|
||||
tgt_language: 'de'
|
||||
|
||||
train_ds:
|
||||
src_file_name: null
|
||||
tgt_file_name: null
|
||||
use_tarred_dataset: False # if true tar_file_name and meta_file_name will be used (or created automatically)
|
||||
# config for preprocessing training data and creating a tarred dataset automatically
|
||||
tar_file_prefix: parallel # prefix for tar file names
|
||||
tar_files: null # if data has already been preprocessed (rest of config ignored)
|
||||
metadata_file: null # metadata for tarred dataset
|
||||
lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding
|
||||
num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile
|
||||
tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled
|
||||
shard_strategy: scatter # tarred dataset shard distribution strategy
|
||||
n_preproc_jobs: -2 # number of processes to use for data preprocessing (-2 means all but 2)
|
||||
tokens_in_batch: 512
|
||||
clean: true
|
||||
max_seq_length: 512
|
||||
shuffle: true
|
||||
num_samples: -1
|
||||
drop_last: false
|
||||
pin_memory: false
|
||||
num_workers: 8
|
||||
|
||||
validation_ds:
|
||||
src_file_name: null
|
||||
tgt_file_name: null
|
||||
tokens_in_batch: 512
|
||||
clean: false
|
||||
max_seq_length: 512
|
||||
shuffle: false
|
||||
num_samples: -1
|
||||
drop_last: false
|
||||
pin_memory: false
|
||||
num_workers: 8
|
||||
|
||||
test_ds:
|
||||
src_file_name: null
|
||||
tgt_file_name: null
|
||||
tokens_in_batch: 512
|
||||
clean: false
|
||||
max_seq_length: 512
|
||||
shuffle: false
|
||||
num_samples: -1
|
||||
drop_last: false
|
||||
pin_memory: false
|
||||
num_workers: 8
|
||||
|
||||
optim:
|
||||
name: adam
|
||||
lr: 0.001
|
||||
betas:
|
||||
- 0.9
|
||||
- 0.98
|
||||
weight_decay: 0.0
|
||||
sched:
|
||||
name: InverseSquareRootAnnealing
|
||||
min_lr: 0.0
|
||||
last_epoch: -1
|
||||
warmup_ratio: 0.1
|
||||
|
||||
encoder_tokenizer:
|
||||
library: megatron
|
||||
tokenizer_model: null
|
||||
vocab_file: null
|
||||
special_tokens: null
|
||||
vocab_size: null
|
||||
model_name: null
|
||||
|
||||
decoder_tokenizer:
|
||||
library: yttm
|
||||
tokenizer_model: null
|
||||
vocab_file: null
|
||||
special_tokens: null
|
||||
vocab_size: null
|
||||
|
||||
encoder:
|
||||
library: megatron
|
||||
|
||||
# If using a pretrained megatron bert model from NGC, then use the corresponding model name
|
||||
# For example, 'megatron-bert-345m-uncased'.
|
||||
# If restoring from a local checkpoint, then use either 'megatron-bert-uncased' or 'megatron-bert-cased'
|
||||
model_name: megatron-bert-uncased # or megatron-bert-cased
|
||||
|
||||
# If restoring from a model parallel checkpoint, then checkpoint_file should be a path to
|
||||
# the directory containing the megatron-lm checkpoints. The directory will have the structure:
|
||||
|
||||
# /path/to/my/checkpoint/
|
||||
# ├── mp_rank_00
|
||||
# │ └── model_optim_rng.pt
|
||||
# └── mp_rank_01
|
||||
# └── model_optim_rng.pt
|
||||
|
||||
# If not using a model parallel checkpoint, then use the full path to the checkpoint:
|
||||
|
||||
# /path/to/my/checkpoint/model_optim_rng.pt
|
||||
checkpoint_file: null
|
||||
vocab_file : null
|
||||
|
||||
pretrained: true # only pretrained=true supported for now
|
||||
|
||||
# model architecture configuration
|
||||
hidden_size: 1024
|
||||
num_attention_heads: 16
|
||||
num_layers: 24
|
||||
max_position_embeddings: 512
|
||||
num_tokentypes: 0
|
||||
|
||||
decoder:
|
||||
library: nemo
|
||||
model_name: null
|
||||
pretrained: false
|
||||
max_sequence_length: 512
|
||||
num_token_types: 2
|
||||
embedding_dropout: 0.1
|
||||
learn_positional_encodings: false
|
||||
hidden_size: 512
|
||||
inner_size: 2048
|
||||
num_layers: 6
|
||||
num_attention_heads: 8
|
||||
ffn_dropout: 0.1
|
||||
attn_score_dropout: 0.1
|
||||
attn_layer_dropout: 0.1
|
||||
hidden_act: relu
|
||||
pre_ln: false
|
||||
|
||||
head:
|
||||
num_layers: 1
|
||||
activation: relu
|
||||
log_softmax: true
|
||||
dropout: 0.0
|
||||
use_transformer_init: true
|
||||
|
||||
trainer:
|
||||
gpus: 4
|
||||
num_nodes: 1
|
||||
max_epochs: 200
|
||||
amp_level: O2 # O1/O2 for mixed precision
|
||||
precision: 16 # Should be set to 16 for O1 and O2; default is 16 as PT ignores it when amp_level is O0
|
||||
accelerator: ddp
|
||||
checkpoint_callback: False
|
||||
logger: False
|
||||
log_every_n_steps: 50 # Interval of logging.
|
||||
check_val_every_n_epoch: 1
|
||||
|
||||
exp_manager:
|
||||
name: ${name}
|
||||
files_to_copy: []
|
|
@ -21,6 +21,7 @@ from pytorch_lightning import Trainer
|
|||
from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc
|
||||
from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecModelConfig
|
||||
from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel
|
||||
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin
|
||||
from nemo.core.config import hydra_runner
|
||||
from nemo.core.config.modelPT import NemoConfig
|
||||
from nemo.core.config.pytorch_lightning import TrainerConfig
|
||||
|
@ -108,7 +109,9 @@ def main(cfg: MTEncDecConfig) -> None:
|
|||
logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
|
||||
|
||||
# training is managed by PyTorch Lightning
|
||||
trainer = Trainer(**cfg.trainer)
|
||||
trainer_cfg = OmegaConf.to_container(cfg.trainer)
|
||||
trainer_cfg.pop('plugins', None)
|
||||
trainer = Trainer(plugins=[NLPDDPPlugin()], **trainer_cfg)
|
||||
|
||||
# tokenizers will be trained and tarred training data will be created if needed
|
||||
# model config is then updated
|
||||
|
|
|
@ -63,6 +63,27 @@ class EncDecCTCModelBPE(EncDecCTCModel, ASRBPEMixin):
|
|||
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024",
|
||||
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024/versions/1.0.0rc1/files/stt_en_citrinet_1024.nemo",
|
||||
)
|
||||
results.append(model)
|
||||
|
||||
model = PretrainedModelInfo(
|
||||
pretrained_model_name="stt_en_citrinet_256_gamma_0_25",
|
||||
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25",
|
||||
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_256_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_256_gamma_0_25.nemo",
|
||||
)
|
||||
results.append(model)
|
||||
|
||||
model = PretrainedModelInfo(
|
||||
pretrained_model_name="stt_en_citrinet_512_gamma_0_25",
|
||||
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25",
|
||||
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_512_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_512_gamma_0_25.nemo",
|
||||
)
|
||||
results.append(model)
|
||||
|
||||
model = PretrainedModelInfo(
|
||||
pretrained_model_name="stt_en_citrinet_1024_gamma_0_25",
|
||||
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25",
|
||||
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_1024_gamma_0_25.nemo",
|
||||
)
|
||||
|
||||
results.append(model)
|
||||
|
||||
|
|
|
@ -64,13 +64,6 @@ class EncDecCTCModel(ASRModel, ExportableEncDecModel, ASRModuleMixin):
|
|||
)
|
||||
results.append(model)
|
||||
|
||||
model = PretrainedModelInfo(
|
||||
pretrained_model_name="stt_zh_quartznet15x5",
|
||||
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5",
|
||||
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_quartznet15x5/versions/1.0.0rc1/files/stt_zh_quartznet15x5.nemo",
|
||||
)
|
||||
results.append(model)
|
||||
|
||||
model = PretrainedModelInfo(
|
||||
pretrained_model_name="stt_en_jasper10x5dr",
|
||||
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr",
|
||||
|
|
|
@ -64,7 +64,7 @@ class MTDataPreproc:
|
|||
self.world_size = trainer.num_nodes * trainer.num_gpus
|
||||
|
||||
if hasattr(cfg, 'train_ds'):
|
||||
supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece']
|
||||
supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'megatron']
|
||||
supported_train_tokenizers = ['yttm', 'sentencepiece']
|
||||
|
||||
if (
|
||||
|
@ -182,7 +182,7 @@ class MTDataPreproc:
|
|||
# Preprocess data and cache for use during training
|
||||
if self.global_rank == 0:
|
||||
logging.info(
|
||||
f"Using tarred dataset for src: {cfg.train_ds.get('src_file_name')} and tgt: {cfg.train_ds.get('tgt_file_name')}"
|
||||
f"Creating tarred dataset for src: {cfg.train_ds.get('src_file_name')} and tgt: {cfg.train_ds.get('tgt_file_name')}"
|
||||
)
|
||||
|
||||
if not cfg.get('multilingual'):
|
||||
|
@ -247,6 +247,7 @@ class MTDataPreproc:
|
|||
logging.info(
|
||||
f"Using tarred dataset created in folder(s) {outdir_list} and metadata created at {self._cfg.train_ds.metadata_file}"
|
||||
)
|
||||
|
||||
elif cfg.train_ds.get('tar_files') is not None and cfg.train_ds.get('metadata_file') is None:
|
||||
raise ValueError('A metadata file is required for tarred dataset but cfg.metadata_file is None.')
|
||||
elif cfg.train_ds.get('tar_files') is None and cfg.train_ds.get('metadata_file') is not None:
|
||||
|
|
|
@ -36,6 +36,7 @@ from nemo.collections.nlp.modules.common.lm_utils import get_lm_model
|
|||
from nemo.collections.nlp.parts.utils_funcs import tensor2list
|
||||
from nemo.core.classes.common import PretrainedModelInfo, typecheck
|
||||
from nemo.core.neural_types import NeuralType
|
||||
from nemo.utils import logging
|
||||
from nemo.utils.get_rank import is_global_rank_zero
|
||||
|
||||
__all__ = ['SGDQAModel']
|
||||
|
@ -543,6 +544,21 @@ class SGDQAModel(NLPModel):
|
|||
|
||||
self.data_prepared = True
|
||||
|
||||
def update_data_dirs(self, data_dir: str, dialogues_example_dir: str):
|
||||
"""
|
||||
Update data directories
|
||||
|
||||
Args:
|
||||
data_dir: path to data directory
|
||||
dialogues_example_dir: path to the preprocessed dialogue examples directory; it will be created if it does not exist.
|
||||
"""
|
||||
if not os.path.exists(data_dir):
|
||||
raise ValueError(f"{data_dir} is not found")
|
||||
self._cfg.dataset.data_dir = data_dir
|
||||
self._cfg.dataset.dialogues_example_dir = dialogues_example_dir
|
||||
logging.info(f'Setting model.dataset.data_dir to {data_dir}.')
|
||||
logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.')
|
||||
|
||||
def setup_training_data(self, train_data_config: Optional[DictConfig] = None):
|
||||
self.prepare_data()
|
||||
self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, split=train_data_config.ds_item)
|
||||
|
@ -577,4 +593,19 @@ class SGDQAModel(NLPModel):
|
|||
|
||||
@classmethod
|
||||
def list_available_models(cls) -> Optional[PretrainedModelInfo]:
|
||||
pass
|
||||
"""
|
||||
This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud.
|
||||
|
||||
Returns:
|
||||
List of available pre-trained models.
|
||||
"""
|
||||
result = []
|
||||
|
||||
result.append(
|
||||
PretrainedModelInfo(
|
||||
pretrained_model_name="sgdqa_bertbasecased",
|
||||
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/sgdqa_bertbasecased/versions/1.0.0/files/sgdqa_bertbasecased.nemo",
|
||||
description="Dialogue State Tracking model finetuned from NeMo BERT Base Cased on Google SGD dataset which has a joint goal accuracy of 59.72% on dev set and 45.85% on test set.",
|
||||
)
|
||||
)
|
||||
return result
|
||||
|
|
|
@ -25,7 +25,7 @@ from nemo.collections.nlp.modules.common.transformer.transformer import (
|
|||
NeMoTransformerConfig,
|
||||
NeMoTransformerEncoderConfig,
|
||||
)
|
||||
from nemo.core.config.modelPT import ModelConfig, OptimConfig, SchedConfig
|
||||
from nemo.core.config.modelPT import OptimConfig, SchedConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
|
@ -132,6 +132,7 @@ class MTEncDecModel(EncDecNLPModel):
|
|||
library = encoder_cfg_dict.pop('library', 'nemo')
|
||||
model_name = encoder_cfg_dict.pop('model_name', None)
|
||||
pretrained = encoder_cfg_dict.pop('pretrained', False)
|
||||
checkpoint_file = encoder_cfg_dict.pop('checkpoint_file', None)
|
||||
self.encoder = get_transformer(
|
||||
library=library,
|
||||
model_name=model_name,
|
||||
|
@ -139,6 +140,7 @@ class MTEncDecModel(EncDecNLPModel):
|
|||
config_dict=encoder_cfg_dict,
|
||||
encoder=True,
|
||||
pre_ln_final_layer_norm=encoder_cfg_dict.get('pre_ln_final_layer_norm', False),
|
||||
checkpoint_file=checkpoint_file,
|
||||
)
|
||||
|
||||
# decoder from NeMo, Megatron-LM, or HuggingFace
|
||||
|
@ -383,7 +385,7 @@ class MTEncDecModel(EncDecNLPModel):
|
|||
decoder_model_name=None,
|
||||
):
|
||||
|
||||
supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece']
|
||||
supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'megatron']
|
||||
if (
|
||||
encoder_tokenizer_library not in supported_tokenizers
|
||||
or decoder_tokenizer_library not in supported_tokenizers
|
||||
|
|
|
@ -32,6 +32,7 @@ from transformers import TRANSFORMERS_CACHE
|
|||
|
||||
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
|
||||
from nemo.collections.nlp.modules import BertModule, MegatronBertEncoder
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_encoder import MegatronEncoderModule
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank
|
||||
from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer
|
||||
from nemo.collections.nlp.parts.nlp_overrides import NLPCheckpointConnector
|
||||
|
@ -430,21 +431,19 @@ class NLPModel(ModelPT, Exportable):
|
|||
@rank_zero_only
|
||||
def register_megatron_checkpoint_version(self):
|
||||
""" Adds checkpoint version to .nemo archive """
|
||||
if self.bert_model is None:
|
||||
raise ValueError('Instantiate self.bert_model before registering megatron checkpoint version.')
|
||||
if self.has_megatron_encoder:
|
||||
checkpoint_version = get_checkpoint_version()
|
||||
if checkpoint_version is None:
|
||||
raise ValueError('Unable to get megatron checkpoint version.')
|
||||
else:
|
||||
checkpoint_version_dict = {'checkpoint_version': checkpoint_version}
|
||||
checkpoint_version_path = 'megatron_checkpoint_version.json'
|
||||
checkpoint_version_src = os.path.join(NEMO_NLP_TMP, checkpoint_version_path)
|
||||
with open(checkpoint_version_src, 'w') as f:
|
||||
f.write(json.dumps(checkpoint_version_dict))
|
||||
self.register_artifact(checkpoint_version_path, checkpoint_version_src)
|
||||
else:
|
||||
# get encoder config and create source for artifact
|
||||
if isinstance(self.bert_model, MegatronBertEncoder):
|
||||
checkpoint_version = get_checkpoint_version()
|
||||
if checkpoint_version is None:
|
||||
raise ValueError('Unable to get megatron checkpoint version.')
|
||||
else:
|
||||
checkpoint_version_dict = {'checkpoint_version': checkpoint_version}
|
||||
checkpoint_version_path = 'megatron_checkpoint_version.json'
|
||||
checkpoint_version_src = os.path.join(NEMO_NLP_TMP, checkpoint_version_path)
|
||||
with open(checkpoint_version_src, 'w') as f:
|
||||
f.write(json.dumps(checkpoint_version_dict))
|
||||
self.register_artifact(checkpoint_version_path, checkpoint_version_src)
|
||||
raise ValueError('Registering Megatron checkpoint version but no Megatron encoder detected.')
|
||||
|
||||
@staticmethod
|
||||
def _unpack_nemo_file(path2file: str, out_folder: str) -> str:
|
||||
|
@ -461,3 +460,39 @@ class NLPModel(ModelPT, Exportable):
|
|||
@property
|
||||
def output_module(self):
|
||||
return self.classifier
|
||||
|
||||
@property
|
||||
def has_megatron_encoder(self):
|
||||
if hasattr(self, 'bert_model'):
|
||||
if isinstance(self.bert_model, MegatronBertEncoder):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
elif hasattr(self, 'encoder'):
|
||||
if isinstance(self.encoder, MegatronEncoderModule):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
|
||||
@property
|
||||
def is_model_parallel_initialized(self):
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_group is not None:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def restore_megatron_encoder_weights(self):
|
||||
""" Model parallel weights need to be restored after DDP is initialized and
|
||||
model parallel ranks are known.
|
||||
"""
|
||||
if hasattr(self, 'bert_model'):
|
||||
if isinstance(self.bert_model, MegatronBertEncoder):
|
||||
logging.info(f"Restoring from pretrained model parallel checkpoint: {self.bert_model._restore_path}")
|
||||
self.bert_model.restore_weights(self.bert_model._restore_path)
|
||||
elif hasattr(self, 'encoder'):
|
||||
if isinstance(self.encoder, MegatronEncoderModule):
|
||||
logging.info(f"Restoring from pretrained model parallel checkpoint: {self.encoder.checkpoint_file}")
|
||||
self.encoder._encoder.restore_weights(self.encoder.checkpoint_file)
|
||||
|
|
|
@ -33,6 +33,7 @@ from nemo.collections.nlp.modules.common.megatron.megatron_utils import (
|
|||
from nemo.collections.nlp.modules.common.transformer.transformer import NeMoTransformerConfig
|
||||
from nemo.collections.nlp.modules.common.transformer.transformer_utils import (
|
||||
get_huggingface_transformer,
|
||||
get_megatron_transformer,
|
||||
get_nemo_transformer,
|
||||
)
|
||||
from nemo.utils import logging
|
||||
|
@ -176,4 +177,16 @@ def get_transformer(
|
|||
model_name=model_name, pretrained=pretrained, config_dict=config_dict, encoder=encoder
|
||||
)
|
||||
|
||||
elif library == 'megatron':
|
||||
model = get_megatron_transformer(
|
||||
model_name=model_name,
|
||||
pretrained=pretrained,
|
||||
config_dict=config_dict,
|
||||
encoder=encoder,
|
||||
checkpoint_file=checkpoint_file,
|
||||
)
|
||||
|
||||
else:
|
||||
raise ValueError("Libary must be 'nemo', 'huggingface' or 'megatron'")
|
||||
|
||||
return model
|
||||
|
|
|
@ -65,6 +65,13 @@ class MegatronBertEncoder(BertModule):
|
|||
self._app_state = None
|
||||
self._model_name = model_name
|
||||
|
||||
if 'vocab_size' in config:
|
||||
self._vocab_size = config.pop('vocab_size')
|
||||
else:
|
||||
self._vocab_size = None
|
||||
|
||||
self._hidden_size = config.get('hidden_size')
|
||||
|
||||
if not os.path.exists(vocab_file):
|
||||
raise ValueError(f'Vocab file not found at {vocab_file}')
|
||||
|
||||
|
@ -76,6 +83,8 @@ class MegatronBertEncoder(BertModule):
|
|||
config['lazy_mpu_init'] = True
|
||||
config['onnx_safe'] = True
|
||||
|
||||
num_tokentypes = config.pop('num_tokentypes', 2)
|
||||
|
||||
# if 'model_parallel_size' in config:
|
||||
if self._model_parallel_size is not None:
|
||||
app_state = AppState()
|
||||
|
@ -109,7 +118,7 @@ class MegatronBertEncoder(BertModule):
|
|||
logging.info(f'Megatron-lm argparse args: {args}')
|
||||
|
||||
self.language_model, self._language_model_key = get_language_model(
|
||||
attention_mask_func=bert_attention_mask_func, num_tokentypes=2, add_pooler=False
|
||||
attention_mask_func=bert_attention_mask_func, num_tokentypes=num_tokentypes, add_pooler=False
|
||||
)
|
||||
|
||||
self.config = OmegaConf.create(config)
|
||||
|
@ -151,8 +160,18 @@ class MegatronBertEncoder(BertModule):
|
|||
"""
|
||||
return self._hidden_size
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
"""
|
||||
Property returning vocab size.
|
||||
|
||||
Returns:
|
||||
vocab size.
|
||||
"""
|
||||
return self._vocab_size
|
||||
|
||||
@typecheck()
|
||||
def forward(self, input_ids, attention_mask, token_type_ids):
|
||||
def forward(self, input_ids, attention_mask, token_type_ids=None):
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_size is None:
|
||||
self.complete_lazy_init()
|
||||
|
|
nemo/collections/nlp/modules/common/megatron/megatron_encoder.py (new file) | 101
|
@ -0,0 +1,101 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from nemo.collections.nlp.modules.common.encoder_module import EncoderModule
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_utils import get_megatron_lm_model
|
||||
from nemo.core.classes.common import typecheck
|
||||
from nemo.utils import logging
|
||||
|
||||
|
||||
class MegatronEncoderModule(EncoderModule):
|
||||
""" Class for using Megatron encoders in NeMo NLP."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_name: Optional[str] = None,
|
||||
pretrained: bool = True,
|
||||
config_dict: Optional[dict] = None,
|
||||
checkpoint_file: Optional[str] = None,
|
||||
vocab_file: Optional[str] = None,
|
||||
):
|
||||
"""Gets Megatron BERT based model to be used as an Encoder in NeMo NLP.
|
||||
Use the model_name arg to get a named model architecture.
|
||||
Available model names can be found with get_megatron_lm_models_list().
|
||||
Use the pretrained arg to get the named model architecture with or without pretrained weights.
|
||||
|
||||
Use config_dict to pass in arguments needed for Megatron-LM.
|
||||
For example, to instantiate a Megatron BERT large model we would do:
|
||||
config_dict={
|
||||
'hidden_size': 1024,
|
||||
'num_attention_heads': 16,
|
||||
'num_layers': 24,
|
||||
'max_position_embeddings': 512,
|
||||
}
|
||||
|
||||
|
||||
Args:
|
||||
model_name (Optional[str]): Named model Megatron architecture from NeMo. Defaults to None.
|
||||
pretrained (bool): Use True to get pretrained weights.
|
||||
False will use the same architecture but with randomly initialized weights.
|
||||
Not implemented yet for Megatron encoders.
|
||||
Defaults to True.
|
||||
config_dict (Optional[dict], optional): Use for configuration of the Megatron model. Defaults to None.
|
||||
checkpoint_file (Optional[str], optional): Provide weights for the transformer from a local checkpoint.
|
||||
If using model parallel then this should be a directory. Defaults to None.
|
||||
vocab_file (Optional[str], optional): Path to vocab file that was used when pretraining the Megatron model.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
if not pretrained:
|
||||
raise ValueError('We currently only support pretrained Megatron models. Please set pretrained=True')
|
||||
|
||||
if not checkpoint_file and not model_name:
|
||||
raise ValueError(
|
||||
'Currently Megatron models must be loaded from a pretrained model name or a pretrained checkpoint.'
|
||||
)
|
||||
|
||||
if model_name or checkpoint_file:
|
||||
model, checkpoint_file = get_megatron_lm_model(
|
||||
pretrained_model_name=model_name,
|
||||
config_dict=config_dict,
|
||||
checkpoint_file=checkpoint_file,
|
||||
vocab_file=vocab_file,
|
||||
)
|
||||
|
||||
self._checkpoint_file = checkpoint_file
|
||||
self._hidden_size = model.hidden_size
|
||||
self._vocab_size = model.vocab_size
|
||||
|
||||
self._encoder = model
|
||||
|
||||
@typecheck()
|
||||
def forward(self, input_ids, encoder_mask):
|
||||
encoder_hidden_states = self._encoder.forward(
|
||||
input_ids=input_ids, attention_mask=encoder_mask, token_type_ids=None
|
||||
)
|
||||
return encoder_hidden_states
|
||||
|
||||
@property
|
||||
def checkpoint_file(self) -> Optional[str]:
|
||||
return self._checkpoint_file
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> Optional[int]:
|
||||
return self._hidden_size
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> Optional[int]:
|
||||
return self._vocab_size
|
|
@ -111,7 +111,7 @@ def get_nmt_tokenizer(
|
|||
):
|
||||
"""
|
||||
Args:
|
||||
model_name: if using a pretrained model from NeMo or HuggingFace
|
||||
model_name: if using a pretrained model from NeMo, HuggingFace, or Megatron
|
||||
tokenizer_model: tokenizer model file of sentencepiece or youtokentome
|
||||
special_tokens: dict of special tokens
|
||||
vocab_file: path to vocab file
|
||||
|
@ -138,7 +138,12 @@ def get_nmt_tokenizer(
|
|||
return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
|
||||
model_path=tokenizer_model, special_tokens=special_tokens_dict
|
||||
)
|
||||
elif library == 'megatron':
|
||||
logging.info(
|
||||
f'Getting Megatron tokenizer with pretrained model name: {model_name} and custom vocab file: {vocab_file}'
|
||||
)
|
||||
return get_tokenizer(tokenizer_name=model_name, vocab_file=vocab_file)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
'Currently we only support "yttm", "huggingface", and "sentencepiece" tokenizer library.'
|
||||
'Currently we only support "yttm", "huggingface", "megatron", and "sentencepiece" tokenizer library.'
|
||||
)
|
||||
|
|
|
@ -17,8 +17,10 @@ from typing import Optional, Union
|
|||
|
||||
from omegaconf.dictconfig import DictConfig
|
||||
|
||||
from nemo.collections.nlp.modules.common.encoder_module import EncoderModule
|
||||
from nemo.collections.nlp.modules.common.huggingface.huggingface_decoder import HuggingFaceDecoderModule
|
||||
from nemo.collections.nlp.modules.common.huggingface.huggingface_encoder import HuggingFaceEncoderModule
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_encoder import MegatronEncoderModule
|
||||
from nemo.collections.nlp.modules.common.transformer.transformer import TransformerDecoderNM, TransformerEncoderNM
|
||||
|
||||
|
||||
|
@ -110,9 +112,33 @@ def get_huggingface_transformer(
|
|||
config_dict: Optional[Union[dict, DictConfig]] = None,
|
||||
encoder: bool = True,
|
||||
) -> Union[HuggingFaceEncoderModule, HuggingFaceDecoderModule]:
|
||||
|
||||
if encoder:
|
||||
model = HuggingFaceEncoderModule(model_name, pretrained, config_dict)
|
||||
else:
|
||||
model = HuggingFaceDecoderModule(model_name, pretrained, config_dict)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def get_megatron_transformer(
|
||||
model_name: Optional[str] = None,
|
||||
pretrained: bool = True,
|
||||
config_dict: Optional[Union[dict, DictConfig]] = None,
|
||||
encoder: bool = True,
|
||||
checkpoint_file: str = None,
|
||||
) -> MegatronEncoderModule:
|
||||
|
||||
vocab_file = config_dict.pop('vocab_file', None)
|
||||
if encoder:
|
||||
model = MegatronEncoderModule(
|
||||
model_name=model_name,
|
||||
pretrained=pretrained,
|
||||
config_dict=config_dict,
|
||||
checkpoint_file=checkpoint_file,
|
||||
vocab_file=vocab_file,
|
||||
)
|
||||
else:
|
||||
raise ValueError('Megatron decoders are not currently supported.')
|
||||
|
||||
return model
|
||||
|
|
|
@ -29,6 +29,7 @@ from pytorch_lightning.utilities.cloud_io import atomic_save
|
|||
from torch.nn.parallel import DistributedDataParallel
|
||||
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_bert import MegatronBertEncoder
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_encoder import MegatronEncoderModule
|
||||
from nemo.utils import AppState, logging
|
||||
|
||||
|
||||
|
@ -52,20 +53,18 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
# call PTL init ddp
|
||||
super().init_ddp_connection()
|
||||
|
||||
# init model parallel
|
||||
# init model parallel if needed
|
||||
app_state = AppState()
|
||||
|
||||
if app_state.model_parallel_size is not None:
|
||||
|
||||
if isinstance(self.lightning_module.bert_model, MegatronBertEncoder):
|
||||
|
||||
if app_state.model_parallel_group is None:
|
||||
self.init_model_parallel(app_state.global_rank, app_state.world_size)
|
||||
if self.lightning_module.has_megatron_encoder and not self.lightning_module.is_model_parallel_initialized:
|
||||
self.init_model_parallel(app_state.global_rank, app_state.world_size)
|
||||
|
||||
def start_training(self, trainer: 'Trainer') -> None:
|
||||
""" PTL Hook that is called after DPP is initialized. """
|
||||
|
||||
if isinstance(self.lightning_module.bert_model, MegatronBertEncoder):
|
||||
if self.lightning_module.has_megatron_encoder:
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_size is not None:
|
||||
# mpu grad clipping needs parameters to have the attribute model_parallel
|
||||
|
@ -74,12 +73,8 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
if not hasattr(p, 'model_parallel'):
|
||||
p.model_parallel = False
|
||||
|
||||
# TODO: figure out how to override clip gradients again
|
||||
# Update PTL trainer to use our _clip_gradients
|
||||
# self._trainer.accelerator_backend._clip_gradients = self._clip_gradients
|
||||
|
||||
if get_checkpoint_version():
|
||||
# Restored from .nemo, checkpoint_version will already be set
|
||||
if get_checkpoint_version() is not None:
|
||||
# megatron checkpoint already restored
|
||||
pass
|
||||
elif trainer.resume_from_checkpoint is not None:
|
||||
# PTL auto-resuming, need to update checkpoint name
|
||||
|
@ -98,10 +93,13 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
logging.warning('Megatron-lm checkpoint version not found. Setting checkpoint_version to 0.')
|
||||
set_checkpoint_version(0)
|
||||
else:
|
||||
logging.info(
|
||||
f"Restoring from pretrained model parallel checkpoint: {self.lightning_module.bert_model._restore_path}"
|
||||
)
|
||||
self.lightning_module.bert_model.restore_weights(self.lightning_module.bert_model._restore_path)
|
||||
self.lightning_module.restore_megatron_encoder_weights()
|
||||
else:
|
||||
if get_checkpoint_version() is not None:
|
||||
# megatron checkpoint already restored
|
||||
pass
|
||||
else:
|
||||
self.lightning_module.restore_megatron_encoder_weights()
|
||||
|
||||
self.lightning_module.register_megatron_checkpoint_version()
|
||||
|
||||
|
@ -113,7 +111,7 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
|
||||
if app_state.model_parallel_size is not None:
|
||||
|
||||
if isinstance(self.lightning_module.bert_model, MegatronBertEncoder):
|
||||
if self.lightning_module.has_megatron_encoder:
|
||||
# check megatron checkpoint version
|
||||
checkpoint_version = get_checkpoint_version()
|
||||
if checkpoint_version is None:
|
||||
|
@ -140,6 +138,7 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
device_ids=device_ids,
|
||||
output_device=device_ids[0],
|
||||
process_group=app_state.data_parallel_group,
|
||||
find_unused_parameters=True,
|
||||
**self._ddp_kwargs,
|
||||
)
|
||||
|
||||
|
@ -168,7 +167,6 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
app_state.data_parallel_size = mpu.get_data_parallel_world_size()
|
||||
logging.info(f'mp_rank: {app_state.model_parallel_rank}')
|
||||
logging.info(f'dp_rank: {app_state.data_parallel_rank}')
|
||||
# TODO: get random seed from PTL
|
||||
seed = os.environ.get("PL_GLOBAL_SEED", 1234)
|
||||
# random seed must be set for megatron model parallel init
|
||||
_set_random_seed(seed)
|
||||
|
|
|
@ -30,6 +30,7 @@ from pytorch_lightning.callbacks import ModelCheckpoint
|
|||
from pytorch_lightning.loggers import LoggerCollection as _LoggerCollection
|
||||
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
|
||||
from pytorch_lightning.utilities import rank_zero_only
|
||||
from pytorch_lightning.utilities.types import _METRIC
|
||||
|
||||
from nemo.constants import NEMO_ENV_VARNAME_VERSION
|
||||
from nemo.utils import app_state, logging
|
||||
|
@ -645,6 +646,73 @@ class NeMoModelCheckpoint(ModelCheckpoint):
|
|||
trainer.checkpoint_connector.restore(self.best_model_path, on_gpu=trainer.on_gpu)
|
||||
pl_module.save_to(save_path=os.path.join(self.dirpath, self.prefix + self.postfix))
|
||||
|
||||
def _del_model(self, filepath: str) -> None:
|
||||
""" Overrides PTL method to account for model parallel checkpoints.
|
||||
Updates checkpoint path based on model parallel rank.
|
||||
"""
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_size is not None:
|
||||
# filepath needs to be updated to include mp_rank
|
||||
dirname = os.path.dirname(filepath)
|
||||
basename = os.path.basename(filepath)
|
||||
filepath = f'{dirname}/mp_rank_{app_state.model_parallel_rank:02d}/{basename}'
|
||||
|
||||
# each model parallel rank needs to remove its model
|
||||
if app_state.data_parallel_rank == 0:
|
||||
if self._fs.exists(filepath):
|
||||
self._fs.rm(filepath)
|
||||
logging.info(f"Removed model parallel checkpoint: {filepath}")
|
||||
|
||||
else:
|
||||
return super()._del_model(filepath)
|
||||
|
||||
def _save_last_checkpoint(self, trainer: 'pl.Trainer', monitor_candidates: Dict[str, _METRIC]) -> None:
|
||||
""" Overrides PTL method to account for model parallel checkpoints.
|
||||
Checks for data parallel rank 0 rather than global rank 0.
|
||||
"""
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_size is not None:
|
||||
if not self.save_last:
|
||||
return
|
||||
|
||||
filepath = self._format_checkpoint_name(self.CHECKPOINT_NAME_LAST, monitor_candidates)
|
||||
filepath = os.path.join(self.dirpath, f"{filepath}{self.FILE_EXTENSION}")
|
||||
|
||||
self._save_model(trainer, filepath)
|
||||
|
||||
# for model parallel we need to delete models for each model parallel rank
|
||||
if self.last_model_path and self.last_model_path != filepath and app_state.data_parallel_rank == 0:
|
||||
self._del_model(self.last_model_path)
|
||||
|
||||
self.last_model_path = filepath
|
||||
|
||||
else:
|
||||
return super()._save_last_checkpoint(trainer, monitor_candidates)
|
||||
|
||||
def _save_none_monitor_checkpoint(self, trainer: 'pl.Trainer', monitor_candidates: Dict[str, _METRIC]) -> None:
|
||||
""" Overrides PTL method to account for model parallel checkpoints.
|
||||
Checks for data parallel rank 0 rather than global rank 0.
|
||||
"""
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_size is not None:
|
||||
if self.monitor is not None or self.save_top_k == 0:
|
||||
return
|
||||
|
||||
filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, trainer)
|
||||
self._save_model(trainer, filepath)
|
||||
|
||||
if (
|
||||
self.save_top_k is None
|
||||
and self.best_model_path
|
||||
and self.best_model_path != filepath
|
||||
and app_state.data_parallel_rank == 0
|
||||
):
|
||||
self._del_model(self.best_model_path)
|
||||
|
||||
self.best_model_path = filepath
|
||||
else:
|
||||
return super()._save_none_monitor_checkpoint(trainer, monitor_candidates)
|
||||
|
||||
|
||||
def configure_checkpointing(trainer: 'pytorch_lightning.Trainer', log_dir: Path, name: str, params: 'DictConfig'):
|
||||
""" Adds ModelCheckpoint to trainer. Raises CheckpointMisconfigurationError if trainer already has a ModelCheckpoint
|
||||
|
|
tutorials/asr/10_ASR_CTC_Language_Finetuning.ipynb (new file) | 2163
(File diff suppressed because it is too large.)