Merge branch 'asr_telephony' of github.com:NVIDIA/NeMo into asr_telephony

This commit is contained in:
jbalam 2021-06-17 17:55:08 -07:00
commit e101445a06
29 changed files with 3042 additions and 83 deletions

Jenkinsfile vendored
View file

@ -1265,6 +1265,39 @@ pipeline {
}
}
stage('L2: NMT Megatron Model Parallel Size 2 Encoder') {
when {
anyOf{
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps{
sh 'cd examples/nlp/machine_translation && \
python enc_dec_nmt.py \
--config-path=conf \
--config-name=megatron \
model.encoder.model_name=megatron-bert-uncased \
model.encoder.checkpoint_file=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \
model.encoder.hidden_size=1024 \
model.encoder.num_attention_heads=16 \
model.encoder.num_layers=24 \
model.encoder.max_position_embeddings=512 \
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
trainer.gpus=[0,1] \
+trainer.fast_dev_run=true \
exp_manager=null \
'
}
}
stage('L2: NMT Tarred Dataset Creation') {
when {
anyOf {

View file

@ -42,7 +42,7 @@ a
a:visited
{
color: #b6b6b6;
color: #218219;
}

View file

@ -1,21 +0,0 @@
Model Name,Model Base Class,Model Card
QuartzNet15x5Base-En,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels"
stt_en_jasper10x5dr,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr"
stt_en_citrinet_256,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256"
stt_en_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512"
stt_en_citrinet_1024,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024"
stt_ca_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_quartznet15x5"
stt_it_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_quartznet15x5"
stt_fr_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_quartznet15x5"
stt_es_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5"
stt_de_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_quartznet15x5"
stt_pl_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_quartznet15x5"
stt_ru_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_quartznet15x5"
stt_zh_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5"
stt_zh_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512"
stt_en_conformer_ctc_small,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small"
stt_en_conformer_ctc_medium,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium"
stt_en_conformer_ctc_large,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large"
stt_en_conformer_ctc_small_ls,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small_ls"
stt_en_conformer_ctc_medium_ls,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium_ls"
stt_en_conformer_ctc_large_ls,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls"

View file

@ -1,11 +1,12 @@
Model Name,Model Base Class,Model Card
QuartzNet15x5Base-En,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels"
stt_zh_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5"
stt_en_jasper10x5dr,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr"
stt_en_citrinet_256,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256"
stt_en_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512"
stt_en_citrinet_1024,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024"
stt_zh_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512"
stt_en_citrinet_256_gamma_0_25,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25"
stt_en_citrinet_512_gamma_0_25,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25"
stt_en_citrinet_1024_gamma_0_25,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25"
stt_en_conformer_ctc_small,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small"
stt_en_conformer_ctc_medium,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium"
stt_en_conformer_ctc_large,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large"


View file

@ -1,3 +1,3 @@
Model,Model Base Class,Model Card
stt_es_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5"
stt_es_citrinet_512,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_512"


View file

@ -1,3 +1,2 @@
Model,Model Base Class,Model Card
stt_zh_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5"
stt_zh_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512"


View file

@ -71,14 +71,16 @@ To perform inference and transcribe a sample of speech after loading the model,
Setting the argument ``logprobs`` to ``True`` returns the log probabilities instead of transcriptions. For more information, see :doc:`./api.html#modules`.
The audio files should be 16 kHz mono-channel ``.wav`` files.
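For instance, a minimal sketch of the ``logprobs`` usage, assuming the NeMo ASR collection is installed and the ``QuartzNet15x5Base-En`` checkpoint is available from NGC (file names are placeholders):

.. code-block:: Python

    import nemo.collections.asr as nemo_asr

    asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

    # Default: returns text transcriptions for the given 16 kHz mono-channel .wav files.
    transcriptions = asr_model.transcribe(paths2audio_files=['sample1.wav', 'sample2.wav'])

    # With logprobs=True, log probabilities are returned instead of text.
    log_probs = asr_model.transcribe(paths2audio_files=['sample1.wav'], logprobs=True)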
Automatic Speech Recognition Models
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Fine-tuning on Different Datasets
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. csv-table::
:file: data/asr_results.csv
:align: left
:widths: 30, 30, 40
:header-rows: 1
There are multiple ASR tutorials provided in the :ref:`Tutorials <tutorials>` section. Most of these tutorials demonstrate how to instantiate a pre-trained model and prepare it for fine-tuning on a dataset in the same language.
Automatic Speech Recognition Models
-----------------------------------
Below is a list of all the ASR models that are available in NeMo for specific languages, as well as auxiliary language models for certain languages.
Language Models for ASR
^^^^^^^^^^^^^^^^^^^^^^^
@ -89,7 +91,8 @@ Language Models for ASR
:widths: 30, 30, 40
:header-rows: 1
|
Speech Recognition (Languages)
------------------------------

docs/source/core/export.rst Normal file
View file

@ -0,0 +1,185 @@
Exporting NeMo Models
=====================
Exporting Models
----------------
Most NeMo models can be exported to ONNX or TorchScript to be deployed for inference in optimized execution environments, such as Jarvis or Triton Inference Server.
The export interface is provided by the ``Exportable`` mix-in class. If a model extends ``Exportable``, it can be exported as follows:
.. code-block:: Python
from nemo.core.classes import ModelPT, Exportable
# deriving from Exportable
class MyExportableModel(ModelPT, Exportable):
...
mymodel = MyExportableModel.from_pretrained(model_name="MyModelName")
# exporting pre-trained model to ONNX file for deployment.
mymodel.export('mymodel.onnx', [options])
How to Use Model Export
-----------------------
The following arguments are for ``Exportable.export()``. In most cases, you should only supply the name of the output file and use all defaults:
.. code-block:: Python
def export(
self,
output: str,
input_example=None,
output_example=None,
verbose=False,
export_params=True,
do_constant_folding=True,
keep_initializers_as_inputs=False,
onnx_opset_version: int = 13,
try_script: bool = False,
set_eval: bool = True,
check_trace: bool = False,
use_dynamic_axes: bool = True,
dynamic_axes=None,
check_tolerance=0.01,
):
The ``output``, ``input_example``, ``output_example``, ``verbose``, ``export_params``, ``do_constant_folding``, ``keep_initializers_as_inputs``, ``onnx_opset_version``, and ``set_eval`` options have the same semantics as in the PyTorch ``onnx.export()`` and ``jit.trace()`` functions and are passed through. For more information about PyTorch's ``onnx.export()``, refer to the `torch.onnx functions documentation
<https://pytorch.org/docs/stable/onnx.html#functions>`_.
The file extension of the ``output`` parameter determines the export format: ``.onnx`` produces ONNX, while ``.pt`` or ``.ts`` produces TorchScript. If ``input_example`` is ``None``, ``Exportable.input_example()`` is called.
**TorchScript-specific**: If ``try_script`` is ``True``, ``export()`` tries ``jit.script()`` before ``jit.trace()``.
The ``check_trace`` arg is passed through to ``jit.trace()``.
**ONNX-specific**: If ``use_dynamic_axes`` is ``True``, ``onnx.export()`` is called with dynamic axes. If ``dynamic_axes`` is ``None``, they are inferred from the model's ``input_types`` definition (the batch dimension is dynamic, as are duration and similar dimensions).
If ``check_trace`` is ``True``, the resulting ONNX model is also run on ``input_example`` and its outputs are compared to ``output_example`` using the ``check_tolerance`` argument. Note the relatively high default tolerance.
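For instance, a minimal sketch of typical export calls, assuming a pre-trained NeMo ASR model such as ``QuartzNet15x5Base-En`` (output file names are placeholders):

.. code-block:: Python

    import nemo.collections.asr as nemo_asr

    model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

    # .onnx extension -> ONNX; check_trace also runs the exported model on the
    # input example and compares outputs within check_tolerance.
    model.export('asr_model.onnx', check_trace=True)

    # .ts extension -> TorchScript; try_script attempts jit.script() before jit.trace().
    model.export('asr_model.ts', try_script=True)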
How to Make Model Exportable
----------------------------
If you are simply using NeMo models, the previous example is all you need to know.
If you write your own models, this section highlights the things you need to be aware of after extending ``Exportable``.
Exportable Hooks and Overrides
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
You should not normally need to override the ``Exportable`` default methods. However, ``Exportable.export()`` relies on the assumption that certain methods are available in your class.
.. code-block:: Python
@property
def input_example(self)  # => Tuple(input, [input, ...], [Dict])
"""
Generates input examples for tracing etc.
Returns:
A tuple of input examples.
"""
This function should return a tuple of (normally) Tensors, one per model input (i.e., per argument of ``forward()``). The last element may be a ``Dict`` to specify non-positional arguments by name, as per the Torch ``export()`` convention. For more information, refer to `Using dictionaries to handle Named Arguments as model inputs
<https://pytorch.org/docs/stable/onnx.html#using-dictionaries-to-handle-named-arguments-as-model-inputs>`_.
Note: ``Dict`` currently does not work with TorchScript ``trace()``.
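As an illustration only, here is a hypothetical ``input_example()`` for a model whose ``forward()`` takes an audio signal and its length; the shapes and names are made up for this sketch:

.. code-block:: Python

    import torch

    def input_example(self):
        """Generates a random input example for tracing/export (illustrative shapes only)."""
        device = next(self.parameters()).device
        audio_signal = torch.randn(2, 16000, device=device)                 # (batch, time)
        length = torch.full((2,), 16000, dtype=torch.int64, device=device)  # valid lengths
        return (audio_signal, length)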
.. code-block:: Python
@property
def input_types(self):
@property
def output_types(self):
These are needed for inferring input/output names and dynamic axes. If your model derives from ``ModelPT``, they are already there. Another common scenario is that your model contains one or more modules that process the input and generate the output. In that case, override the ``Exportable`` methods ``input_module()`` and ``output_module()`` to point to them, as in this example:
.. code-block:: Python
@property
def input_module(self):
return self.fastpitch
@property
def output_module(self):
return self.fastpitch
Your model should also have an export-friendly ``forward()`` method, which can mean different things for ONNX and TorchScript. For ONNX, you can't have forced named parameters without defaults, like ``forward(self, *, text)``. For TorchScript, you should avoid ``None`` and use ``Optional`` instead. The criteria are highly volatile and may change with every PyTorch version, so it's a trial-and-error process. There is also the general issue that in many cases ``forward()`` for inference can be simplified and even use fewer inputs/outputs. To address this, ``Exportable`` looks for a ``forward_for_export()`` method in your model and, if present, uses it instead of ``forward()`` for export:
.. code-block:: Python
# Uses forced named args, many default parameters.
def forward(
self,
*,
text,
durs=None,
pitch=None,
speaker=0,
pace=1.0,
spec=None,
attn_prior=None,
mel_lens=None,
input_lens=None,
):
# Passes through all self.fastpitch outputs
return self.fastpitch(
text=text,
durs=durs,
pitch=pitch,
speaker=speaker,
pace=pace,
spec=spec,
attn_prior=attn_prior,
mel_lens=mel_lens,
input_lens=input_lens,
)
# Uses fewer inputs, no '*', and returns fewer outputs:
def forward_for_export(self, text):
(
spect,
durs_predicted,
log_durs_predicted,
pitch_predicted,
attn_soft,
attn_logprob,
attn_hard,
attn_hard_dur,
pitch,
) = self.fastpitch(text=text)
return spect, durs_predicted, log_durs_predicted, pitch_predicted
To stay consistent with ``input_types()``/``output_types()``, ``Exportable`` also provides hooks that let you exclude particular inputs/outputs from the export process:
.. code-block:: Python
@property
def disabled_deployment_input_names(self):
"""Implement this method to return a set of input names disabled for export"""
return set(["durs", "pitch", "speaker", "pace", "spec", "attn_prior", "mel_lens", "input_lens"])
@property
def disabled_deployment_output_names(self):
"""Implement this method to return a set of output names disabled for export"""
Another common requirement for exported models is to apply certain network modifications for inference efficiency before exporting, such as disabling masks in some convolutions or removing batch normalizations. A better practice is to make these changes happen in ``ModelPT.eval()`` (and reverse them in ``.train()``), but that is not always feasible, so ``Exportable`` provides the following hook to run them:
.. code-block:: Python
def _prepare_for_export(self, **kwargs):
"""
Override this method to prepare module for export. This is in-place operation.
Base version does common necessary module replacements (Apex etc)
"""
# do graph modifications specific for this model
replace_1D_2D = kwargs.get('replace_1D_2D', False)
replace_for_export(self, replace_1D_2D)
# call base method for common set of modifications
Exportable._prepare_for_export(self, **kwargs)
Exportable Model Code
~~~~~~~~~~~~~~~~~~~~~
Most importantly, the actual Torch code in your model should be ONNX- or TorchScript-compatible (ideally, both).
#. Ensure the code is written in Torch; avoid bare `NumPy or Python operands <https://pytorch.org/docs/stable/onnx.html#write-pytorch-model-in-torch-way>`_.
#. Make your model ``Exportable`` and add an export unit test to immediately catch any operation or construct not supported in ONNX/TorchScript; a minimal test sketch follows this list.
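Such a test could look roughly like this, assuming a hypothetical ``MyExportableModel`` class like the one in the first example:

.. code-block:: Python

    import os
    import tempfile

    def test_my_model_export_onnx():
        # MyExportableModel is a hypothetical model that extends Exportable.
        model = MyExportableModel().eval()
        with tempfile.TemporaryDirectory() as tmpdir:
            out_file = os.path.join(tmpdir, 'my_model.onnx')
            # check_trace=True also runs the exported ONNX on the input example
            # and compares outputs within check_tolerance.
            model.export(out_file, check_trace=True)
            assert os.path.exists(out_file)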
For more information, refer to the PyTorch documentation:
- `List of supported operators <https://pytorch.org/docs/stable/onnx.html#supported-operators>`_
- `Tracing vs. scripting <https://pytorch.org/docs/stable/onnx.html#tracing-vs-scripting>`_
- `AlexNet example <https://pytorch.org/docs/stable/onnx.html#example-end-to-end-alexnet-from-pytorch-to-onnx>`_

View file

@ -17,6 +17,7 @@ NVIDIA NeMo User Guide
:name: core
core/core
core/export
.. toctree::

View file

@ -457,6 +457,111 @@ can be used to compute sacreBLEU scores.
cat test.en-es.translations | sacrebleu test.es
Pretrained Encoders
-------------------
Pretrained BERT encoders from either `HuggingFace Transformers <https://huggingface.co/models>`__
or `Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`__
can be used to train NeMo NMT models.
The ``library`` flag takes values: ``huggingface``, ``megatron``, and ``nemo``.
The ``model_name`` flag is used to indicate a *named* model architecture.
For example, we can use ``bert-base-cased`` from HuggingFace or ``megatron-bert-345m-cased`` from Megatron-LM.
The ``pretrained`` flag indicates whether to download the pretrained weights (``pretrained=True``) or
instantiate the same model architecture with random weights (``pretrained=False``).
To use a custom model architecture from a specific library, use ``model_name=null`` and then add the
custom configuration under the ``encoder`` configuration.
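For example, the three flags can be combined on the command line like this (the values shown are illustrative):

.. code ::

    model.encoder.library=huggingface \
    model.encoder.model_name=bert-base-cased \
    model.encoder.pretrained=true \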
HuggingFace
^^^^^^^^^^^
We have provided a `HuggingFace config file <https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/machine_translation/conf/huggingface.yaml>`__
to use with HuggingFace encoders.
To use the config file from the CLI:
.. code ::
--config-path=conf \
--config-name=huggingface \
As an example, we can configure the NeMo NMT encoder to use ``bert-base-cased`` from HuggingFace
by using the ``huggingface`` config file and setting
.. code ::
model.encoder.pretrained=true \
model.encoder.model_name=bert-base-cased \
To use a custom architecture from HuggingFace we can use
.. code ::
+model.encoder._target_=transformers.BertConfig \
+model.encoder.hidden_size=1536 \
Note that the ``+`` symbol is needed when the arguments are not already present in the YAML config file.
Megatron
^^^^^^^^
We have provided a `Megatron config file <https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/machine_translation/conf/megatron.yaml>`__
to use with Megatron encoders.
To use the config file from the CLI:
.. code ::
--config-path=conf \
--config-name=megatron \
The ``checkpoint_file`` should be the path to the Megatron-LM checkpoint:
.. code ::
/path/to/your/megatron/checkpoint/model_optim_rng.pt
If your Megatron model requires model parallelism, ``checkpoint_file`` should instead point to the directory containing the
standard Megatron-LM checkpoint format:
.. code ::
3.9b_bert_no_rng
├── mp_rank_00
│ └── model_optim_rng.pt
├── mp_rank_01
│ └── model_optim_rng.pt
├── mp_rank_02
│ └── model_optim_rng.pt
└── mp_rank_03
└── model_optim_rng.pt
As an example, to train a NeMo NMT model with a 3.9B Megatron BERT encoder,
we would use the following encoder configuration:
.. code ::
model.encoder.checkpoint_file=/path/to/megatron/checkpoint/3.9b_bert_no_rng \
model.encoder.hidden_size=2560 \
model.encoder.num_attention_heads=40 \
model.encoder.num_layers=48 \
model.encoder.max_position_embeddings=512 \
To train with a Megatron 345M BERT encoder, we would use
.. code ::
model.encoder.model_name=megatron-bert-cased \
model.encoder.checkpoint_file=/path/to/your/megatron/checkpoint/model_optim_rng.pt \
model.encoder.hidden_size=1024 \
model.encoder.num_attention_heads=16 \
model.encoder.num_layers=24 \
model.encoder.max_position_embeddings=512 \
References
----------

View file

@ -43,6 +43,9 @@ To run a tutorial:
* - ASR
- Online ASR inference with Microphone
- `Online ASR Microphone <https://github.com/NVIDIA/NeMo/blob/v1.0.2/tutorials/asr/02_Online_ASR_Microphone_Demo.ipynb>`_
* - ASR
- Fine-tuning CTC Models on New Languages
- `ASR CTC Language Fine-Tuning <https://colab.research.google.com/github/NVIDIA/NeMo/blob/main/tutorials/asr/10_ASR_CTC_Language_Finetuning.ipynb>`_
* - ASR
- Speech Commands
- `Speech Commands <https://colab.research.google.com/github/NVIDIA/NeMo/blob/v1.0.2/tutorials/asr/03_Speech_Commands.ipynb>`_

View file

@ -132,6 +132,17 @@ def main(cfg: DictConfig) -> None:
trainer.fit(model)
if cfg.model.nemo_path:
model.save_to(cfg.model.nemo_path)
else:
data_dir = cfg.model.dataset.get('data_dir', None)
dialogues_example_dir = cfg.model.dataset.get('dialogues_example_dir', None)
if data_dir is None or dialogues_example_dir is None:
raise ValueError('No dataset directory provided for evaluation.')
elif not os.path.exists(data_dir):
raise ValueError(f'{data_dir} is not found, cannot evaluate on the test set.')
else:
model.update_data_dirs(data_dir=data_dir, dialogues_example_dir=dialogues_example_dir)
model._cfg.dataset = cfg.model.dataset
if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.ds_item is not None:
gpu = 1 if cfg.trainer.gpus != 0 else 0

View file

@ -0,0 +1,160 @@
name: MegatronEncoder
do_training: True # set to False if only preprocessing data
do_testing: False # set to True to run evaluation on test data after training
model:
beam_size: 4
len_pen: 0.6
max_generation_delta: 5
label_smoothing: 0.1
shared_tokenizer: false
preproc_out_dir: null
src_language: 'en'
tgt_language: 'de'
train_ds:
src_file_name: null
tgt_file_name: null
use_tarred_dataset: False # if true tar_file_name and meta_file_name will be used (or created automatically)
# config for preprocessing training data and creating a tarred dataset automatically
tar_file_prefix: parallel # prefix for tar file names
tar_files: null # if data has already been preprocessed (rest of config ignored)
metadata_file: null # metadata for tarred dataset
lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding
num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile
tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled
shard_strategy: scatter # tarred dataset shard distribution strategy
n_preproc_jobs: -2 # number of processes to use for data preprocessing (-2 means all but 2)
tokens_in_batch: 512
clean: true
max_seq_length: 512
shuffle: true
num_samples: -1
drop_last: false
pin_memory: false
num_workers: 8
validation_ds:
src_file_name: null
tgt_file_name: null
tokens_in_batch: 512
clean: false
max_seq_length: 512
shuffle: false
num_samples: -1
drop_last: false
pin_memory: false
num_workers: 8
test_ds:
src_file_name: null
tgt_file_name: null
tokens_in_batch: 512
clean: false
max_seq_length: 512
shuffle: false
num_samples: -1
drop_last: false
pin_memory: false
num_workers: 8
optim:
name: adam
lr: 0.001
betas:
- 0.9
- 0.98
weight_decay: 0.0
sched:
name: InverseSquareRootAnnealing
min_lr: 0.0
last_epoch: -1
warmup_ratio: 0.1
encoder_tokenizer:
library: megatron
tokenizer_model: null
vocab_file: null
special_tokens: null
vocab_size: null
model_name: null
decoder_tokenizer:
library: yttm
tokenizer_model: null
vocab_file: null
special_tokens: null
vocab_size: null
encoder:
library: megatron
# If using a pretrained megatron bert model from NGC, then use the corresponding model name
# For example, 'megatron-bert-345m-uncased'.
# If restoring from a local checkpoint, then use either 'megatron-bert-uncased' or 'megatron-bert-cased'
model_name: megatron-bert-uncased # or megatron-bert-cased
# If restoring from a model parallel checkpoint, then checkpoint_file should be a path to
# the directory containing the megatron-lm checkpoints. The directory will have the structure:
# /path/to/my/checkpoint/
# ├── mp_rank_00
# │ └── model_optim_rng.pt
# └── mp_rank_01
# └── model_optim_rng.pt
# If not using a model parallel checkpoint, then use the full path to the checkpoint:
# /path/to/my/checkpoint/model_optim_rng.pt
checkpoint_file: null
vocab_file : null
pretrained: true # only pretrained=true supported for now
# model architecture configuration
hidden_size: 1024
num_attention_heads: 16
num_layers: 24
max_position_embeddings: 512
num_tokentypes: 0
decoder:
library: nemo
model_name: null
pretrained: false
max_sequence_length: 512
num_token_types: 2
embedding_dropout: 0.1
learn_positional_encodings: false
hidden_size: 512
inner_size: 2048
num_layers: 6
num_attention_heads: 8
ffn_dropout: 0.1
attn_score_dropout: 0.1
attn_layer_dropout: 0.1
hidden_act: relu
pre_ln: false
head:
num_layers: 1
activation: relu
log_softmax: true
dropout: 0.0
use_transformer_init: true
trainer:
gpus: 4
num_nodes: 1
max_epochs: 200
amp_level: O2 # O1/O2 for mixed precision
precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
accelerator: ddp
checkpoint_callback: False
logger: False
log_every_n_steps: 50 # Interval of logging.
check_val_every_n_epoch: 1
exp_manager:
name: ${name}
files_to_copy: []

View file

@ -21,6 +21,7 @@ from pytorch_lightning import Trainer
from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc
from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecModelConfig
from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin
from nemo.core.config import hydra_runner
from nemo.core.config.modelPT import NemoConfig
from nemo.core.config.pytorch_lightning import TrainerConfig
@ -108,7 +109,9 @@ def main(cfg: MTEncDecConfig) -> None:
logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
# training is managed by PyTorch Lightning
trainer = Trainer(**cfg.trainer)
trainer_cfg = OmegaConf.to_container(cfg.trainer)
trainer_cfg.pop('plugins', None)
trainer = Trainer(plugins=[NLPDDPPlugin()], **trainer_cfg)
# tokenizers will be trained and tarred training data will be created if needed
# model config is then updated

View file

@ -63,6 +63,27 @@ class EncDecCTCModelBPE(EncDecCTCModel, ASRBPEMixin):
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024/versions/1.0.0rc1/files/stt_en_citrinet_1024.nemo",
)
results.append(model)
model = PretrainedModelInfo(
pretrained_model_name="stt_en_citrinet_256_gamma_0_25",
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_256_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_256_gamma_0_25.nemo",
)
results.append(model)
model = PretrainedModelInfo(
pretrained_model_name="stt_en_citrinet_512_gamma_0_25",
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_512_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_512_gamma_0_25.nemo",
)
results.append(model)
model = PretrainedModelInfo(
pretrained_model_name="stt_en_citrinet_1024_gamma_0_25",
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_1024_gamma_0_25.nemo",
)
results.append(model)

View file

@ -64,13 +64,6 @@ class EncDecCTCModel(ASRModel, ExportableEncDecModel, ASRModuleMixin):
)
results.append(model)
model = PretrainedModelInfo(
pretrained_model_name="stt_zh_quartznet15x5",
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_quartznet15x5/versions/1.0.0rc1/files/stt_zh_quartznet15x5.nemo",
)
results.append(model)
model = PretrainedModelInfo(
pretrained_model_name="stt_en_jasper10x5dr",
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr",

View file

@ -64,7 +64,7 @@ class MTDataPreproc:
self.world_size = trainer.num_nodes * trainer.num_gpus
if hasattr(cfg, 'train_ds'):
supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece']
supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'megatron']
supported_train_tokenizers = ['yttm', 'sentencepiece']
if (
@ -182,7 +182,7 @@ class MTDataPreproc:
# Preprocess data and cache for use during training
if self.global_rank == 0:
logging.info(
f"Using tarred dataset for src: {cfg.train_ds.get('src_file_name')} and tgt: {cfg.train_ds.get('tgt_file_name')}"
f"Creating tarred dataset for src: {cfg.train_ds.get('src_file_name')} and tgt: {cfg.train_ds.get('tgt_file_name')}"
)
if not cfg.get('multilingual'):
@ -247,6 +247,7 @@ class MTDataPreproc:
logging.info(
f"Using tarred dataset created in folder(s) {outdir_list} and metadata created at {self._cfg.train_ds.metadata_file}"
)
elif cfg.train_ds.get('tar_files') is not None and cfg.train_ds.get('metadata_file') is None:
raise ValueError('A metadata file is required for tarred dataset but cfg.metadata_file is None.')
elif cfg.train_ds.get('tar_files') is None and cfg.train_ds.get('metadata_file') is not None:

View file

@ -36,6 +36,7 @@ from nemo.collections.nlp.modules.common.lm_utils import get_lm_model
from nemo.collections.nlp.parts.utils_funcs import tensor2list
from nemo.core.classes.common import PretrainedModelInfo, typecheck
from nemo.core.neural_types import NeuralType
from nemo.utils import logging
from nemo.utils.get_rank import is_global_rank_zero
__all__ = ['SGDQAModel']
@ -543,6 +544,21 @@ class SGDQAModel(NLPModel):
self.data_prepared = True
def update_data_dirs(self, data_dir: str, dialogues_example_dir: str):
"""
Update data directories
Args:
data_dir: path to data directory
dialogues_example_dir: path to the preprocessed dialogue examples directory; it will be created if it does not exist.
"""
if not os.path.exists(data_dir):
raise ValueError(f"{data_dir} is not found")
self._cfg.dataset.data_dir = data_dir
self._cfg.dataset.dialogues_example_dir = dialogues_example_dir
logging.info(f'Setting model.dataset.data_dir to {data_dir}.')
logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.')
def setup_training_data(self, train_data_config: Optional[DictConfig] = None):
self.prepare_data()
self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, split=train_data_config.ds_item)
@ -577,4 +593,19 @@ class SGDQAModel(NLPModel):
@classmethod
def list_available_models(cls) -> Optional[PretrainedModelInfo]:
pass
"""
This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud.
Returns:
List of available pre-trained models.
"""
result = []
result.append(
PretrainedModelInfo(
pretrained_model_name="sgdqa_bertbasecased",
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/sgdqa_bertbasecased/versions/1.0.0/files/sgdqa_bertbasecased.nemo",
description="Dialogue State Tracking model finetuned from NeMo BERT Base Cased on Google SGD dataset which has a joint goal accuracy of 59.72% on dev set and 45.85% on test set.",
)
)
return result

View file

@ -25,7 +25,7 @@ from nemo.collections.nlp.modules.common.transformer.transformer import (
NeMoTransformerConfig,
NeMoTransformerEncoderConfig,
)
from nemo.core.config.modelPT import ModelConfig, OptimConfig, SchedConfig
from nemo.core.config.modelPT import OptimConfig, SchedConfig
@dataclass

View file

@ -132,6 +132,7 @@ class MTEncDecModel(EncDecNLPModel):
library = encoder_cfg_dict.pop('library', 'nemo')
model_name = encoder_cfg_dict.pop('model_name', None)
pretrained = encoder_cfg_dict.pop('pretrained', False)
checkpoint_file = encoder_cfg_dict.pop('checkpoint_file', None)
self.encoder = get_transformer(
library=library,
model_name=model_name,
@ -139,6 +140,7 @@ class MTEncDecModel(EncDecNLPModel):
config_dict=encoder_cfg_dict,
encoder=True,
pre_ln_final_layer_norm=encoder_cfg_dict.get('pre_ln_final_layer_norm', False),
checkpoint_file=checkpoint_file,
)
# decoder from NeMo, Megatron-LM, or HuggingFace
@ -383,7 +385,7 @@ class MTEncDecModel(EncDecNLPModel):
decoder_model_name=None,
):
supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece']
supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'megatron']
if (
encoder_tokenizer_library not in supported_tokenizers
or decoder_tokenizer_library not in supported_tokenizers

View file

@ -32,6 +32,7 @@ from transformers import TRANSFORMERS_CACHE
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.nlp.modules import BertModule, MegatronBertEncoder
from nemo.collections.nlp.modules.common.megatron.megatron_encoder import MegatronEncoderModule
from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank
from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer
from nemo.collections.nlp.parts.nlp_overrides import NLPCheckpointConnector
@ -430,21 +431,19 @@ class NLPModel(ModelPT, Exportable):
@rank_zero_only
def register_megatron_checkpoint_version(self):
""" Adds checkpoint version to .nemo archive """
if self.bert_model is None:
raise ValueError('Instantiate self.bert_model before registering megatron checkpoint version.')
if self.has_megatron_encoder:
checkpoint_version = get_checkpoint_version()
if checkpoint_version is None:
raise ValueError('Unable to get megatron checkpoint version.')
else:
checkpoint_version_dict = {'checkpoint_version': checkpoint_version}
checkpoint_version_path = 'megatron_checkpoint_version.json'
checkpoint_version_src = os.path.join(NEMO_NLP_TMP, checkpoint_version_path)
with open(checkpoint_version_src, 'w') as f:
f.write(json.dumps(checkpoint_version_dict))
self.register_artifact(checkpoint_version_path, checkpoint_version_src)
else:
# get encoder config and create source for artifact
if isinstance(self.bert_model, MegatronBertEncoder):
checkpoint_version = get_checkpoint_version()
if checkpoint_version is None:
raise ValueError('Unable to get megatron checkpoint version.')
else:
checkpoint_version_dict = {'checkpoint_version': checkpoint_version}
checkpoint_version_path = 'megatron_checkpoint_version.json'
checkpoint_version_src = os.path.join(NEMO_NLP_TMP, checkpoint_version_path)
with open(checkpoint_version_src, 'w') as f:
f.write(json.dumps(checkpoint_version_dict))
self.register_artifact(checkpoint_version_path, checkpoint_version_src)
raise ValueError('Registering Megatron checkpoint version but no Megatron encoder detected.')
@staticmethod
def _unpack_nemo_file(path2file: str, out_folder: str) -> str:
@ -461,3 +460,39 @@ class NLPModel(ModelPT, Exportable):
@property
def output_module(self):
return self.classifier
@property
def has_megatron_encoder(self):
if hasattr(self, 'bert_model'):
if isinstance(self.bert_model, MegatronBertEncoder):
return True
else:
return False
elif hasattr(self, 'encoder'):
if isinstance(self.encoder, MegatronEncoderModule):
return True
else:
return False
else:
return False
@property
def is_model_parallel_initialized(self):
app_state = AppState()
if app_state.model_parallel_group is not None:
return True
else:
return False
def restore_megatron_encoder_weights(self):
""" Model parallel weights need to be restored after DDP is initialized and
model parallel ranks are known.
"""
if hasattr(self, 'bert_model'):
if isinstance(self.bert_model, MegatronBertEncoder):
logging.info(f"Restoring from pretrained model parallel checkpoint: {self.bert_model._restore_path}")
self.bert_model.restore_weights(self.bert_model._restore_path)
elif hasattr(self, 'encoder'):
if isinstance(self.encoder, MegatronEncoderModule):
logging.info(f"Restoring from pretrained model parallel checkpoint: {self.encoder.checkpoint_file}")
self.encoder._encoder.restore_weights(self.encoder.checkpoint_file)

View file

@ -33,6 +33,7 @@ from nemo.collections.nlp.modules.common.megatron.megatron_utils import (
from nemo.collections.nlp.modules.common.transformer.transformer import NeMoTransformerConfig
from nemo.collections.nlp.modules.common.transformer.transformer_utils import (
get_huggingface_transformer,
get_megatron_transformer,
get_nemo_transformer,
)
from nemo.utils import logging
@ -176,4 +177,16 @@ def get_transformer(
model_name=model_name, pretrained=pretrained, config_dict=config_dict, encoder=encoder
)
elif library == 'megatron':
model = get_megatron_transformer(
model_name=model_name,
pretrained=pretrained,
config_dict=config_dict,
encoder=encoder,
checkpoint_file=checkpoint_file,
)
else:
raise ValueError("Libary must be 'nemo', 'huggingface' or 'megatron'")
return model

View file

@ -65,6 +65,13 @@ class MegatronBertEncoder(BertModule):
self._app_state = None
self._model_name = model_name
if 'vocab_size' in config:
self._vocab_size = config.pop('vocab_size')
else:
self._vocab_size = None
self._hidden_size = config.get('hidden_size')
if not os.path.exists(vocab_file):
raise ValueError(f'Vocab file not found at {vocab_file}')
@ -76,6 +83,8 @@ class MegatronBertEncoder(BertModule):
config['lazy_mpu_init'] = True
config['onnx_safe'] = True
num_tokentypes = config.pop('num_tokentypes', 2)
# if 'model_parallel_size' in config:
if self._model_parallel_size is not None:
app_state = AppState()
@ -109,7 +118,7 @@ class MegatronBertEncoder(BertModule):
logging.info(f'Megatron-lm argparse args: {args}')
self.language_model, self._language_model_key = get_language_model(
attention_mask_func=bert_attention_mask_func, num_tokentypes=2, add_pooler=False
attention_mask_func=bert_attention_mask_func, num_tokentypes=num_tokentypes, add_pooler=False
)
self.config = OmegaConf.create(config)
@ -151,8 +160,18 @@ class MegatronBertEncoder(BertModule):
"""
return self._hidden_size
@property
def vocab_size(self):
"""
Property returning vocab size.
Returns:
vocab size.
"""
return self._vocab_size
@typecheck()
def forward(self, input_ids, attention_mask, token_type_ids):
def forward(self, input_ids, attention_mask, token_type_ids=None):
app_state = AppState()
if app_state.model_parallel_size is None:
self.complete_lazy_init()

View file

@ -0,0 +1,101 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
from nemo.collections.nlp.modules.common.encoder_module import EncoderModule
from nemo.collections.nlp.modules.common.megatron.megatron_utils import get_megatron_lm_model
from nemo.core.classes.common import typecheck
from nemo.utils import logging
class MegatronEncoderModule(EncoderModule):
""" Class for using Megatron encoders in NeMo NLP."""
def __init__(
self,
model_name: Optional[str] = None,
pretrained: bool = True,
config_dict: Optional[dict] = None,
checkpoint_file: Optional[str] = None,
vocab_file: Optional[str] = None,
):
"""Gets Megatron BERT based model to be used as an Encoder in NeMo NLP.
Use the model_name arg to get a named model architecture.
Available model names can be found with get_megatron_lm_models_list().
Use the pretrained arg to get the named model architecture with or without pretrained weights.
Use config_dict to pass in arguments needed for Megatron-LM.
For example, to instantiate a Megatron BERT large model we would do:
config_dict={
'hidden_size': 1024,
'num_attention_heads': 16,
'num_layers': 24,
'max_position_embeddings': 512,
}
Args:
model_name (Optional[str]): Named model Megatron architecture from NeMo. Defaults to None.
pretrained (bool): Use True to get pretrained weights.
False will use the same architecture but with randomly initialized weights.
Not implemented yet for Megatron encoders.
Defaults to True.
config_dict (Optional[dict], optional): Use for configuration of the Megatron model. Defaults to None.
checkpoint_file (Optional[str], optional): Provide weights for the transformer from a local checkpoint.
If using model parallel then this should be a directory. Defaults to None.
vocab_file (Optional[str], optional): Path to vocab file that was used when pretraining the Megatron model.
"""
super().__init__()
if not pretrained:
raise ValueError('We currently only support pretrained Megatron models. Please set pretrained=True')
if not checkpoint_file and not model_name:
raise ValueError(
'Currently Megatron models must be loaded from a pretrained model name or a pretrained checkpoint.'
)
if model_name or checkpoint_file:
model, checkpoint_file = get_megatron_lm_model(
pretrained_model_name=model_name,
config_dict=config_dict,
checkpoint_file=checkpoint_file,
vocab_file=vocab_file,
)
self._checkpoint_file = checkpoint_file
self._hidden_size = model.hidden_size
self._vocab_size = model.vocab_size
self._encoder = model
@typecheck()
def forward(self, input_ids, encoder_mask):
encoder_hidden_states = self._encoder.forward(
input_ids=input_ids, attention_mask=encoder_mask, token_type_ids=None
)
return encoder_hidden_states
@property
def checkpoint_file(self) -> Optional[str]:
return self._checkpoint_file
@property
def hidden_size(self) -> Optional[int]:
return self._hidden_size
@property
def vocab_size(self) -> Optional[int]:
return self._vocab_size

View file

@ -111,7 +111,7 @@ def get_nmt_tokenizer(
):
"""
Args:
model_name: if using a pretrained model from NeMo or HuggingFace
model_name: if using a pretrained model from NeMo, HuggingFace, or Megatron
tokenizer_model: tokenizer model file of sentencepiece or youtokentome
special_tokens: dict of special tokens
vocab_file: path to vocab file
@ -138,7 +138,12 @@ def get_nmt_tokenizer(
return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
model_path=tokenizer_model, special_tokens=special_tokens_dict
)
elif library == 'megatron':
logging.info(
f'Getting Megatron tokenizer with pretrained model name: {model_name} and custom vocab file: {vocab_file}'
)
return get_tokenizer(tokenizer_name=model_name, vocab_file=vocab_file)
else:
raise NotImplementedError(
'Currently we only support "yttm", "huggingface", and "sentencepiece" tokenizer library.'
'Currently we only support "yttm", "huggingface", "megatron", and "sentencepiece" tokenizer library.'
)

View file

@ -17,8 +17,10 @@ from typing import Optional, Union
from omegaconf.dictconfig import DictConfig
from nemo.collections.nlp.modules.common.encoder_module import EncoderModule
from nemo.collections.nlp.modules.common.huggingface.huggingface_decoder import HuggingFaceDecoderModule
from nemo.collections.nlp.modules.common.huggingface.huggingface_encoder import HuggingFaceEncoderModule
from nemo.collections.nlp.modules.common.megatron.megatron_encoder import MegatronEncoderModule
from nemo.collections.nlp.modules.common.transformer.transformer import TransformerDecoderNM, TransformerEncoderNM
@ -110,9 +112,33 @@ def get_huggingface_transformer(
config_dict: Optional[Union[dict, DictConfig]] = None,
encoder: bool = True,
) -> Union[HuggingFaceEncoderModule, HuggingFaceDecoderModule]:
if encoder:
model = HuggingFaceEncoderModule(model_name, pretrained, config_dict)
else:
model = HuggingFaceDecoderModule(model_name, pretrained, config_dict)
return model
def get_megatron_transformer(
model_name: Optional[str] = None,
pretrained: bool = True,
config_dict: Optional[Union[dict, DictConfig]] = None,
encoder: bool = True,
checkpoint_file: str = None,
) -> MegatronEncoderModule:
vocab_file = config_dict.pop('vocab_file', None)
if encoder:
model = MegatronEncoderModule(
model_name=model_name,
pretrained=pretrained,
config_dict=config_dict,
checkpoint_file=checkpoint_file,
vocab_file=vocab_file,
)
else:
raise ValueError('Megatron decoders are not currently supported.')
return model

View file

@ -29,6 +29,7 @@ from pytorch_lightning.utilities.cloud_io import atomic_save
from torch.nn.parallel import DistributedDataParallel
from nemo.collections.nlp.modules.common.megatron.megatron_bert import MegatronBertEncoder
from nemo.collections.nlp.modules.common.megatron.megatron_encoder import MegatronEncoderModule
from nemo.utils import AppState, logging
@ -52,20 +53,18 @@ class NLPDDPPlugin(DDPPlugin):
# call PTL init ddp
super().init_ddp_connection()
# init model parallel
# init model parallel if needed
app_state = AppState()
if app_state.model_parallel_size is not None:
if isinstance(self.lightning_module.bert_model, MegatronBertEncoder):
if app_state.model_parallel_group is None:
self.init_model_parallel(app_state.global_rank, app_state.world_size)
if self.lightning_module.has_megatron_encoder and not self.lightning_module.is_model_parallel_initialized:
self.init_model_parallel(app_state.global_rank, app_state.world_size)
def start_training(self, trainer: 'Trainer') -> None:
""" PTL Hook that is called after DPP is initialized. """
if isinstance(self.lightning_module.bert_model, MegatronBertEncoder):
if self.lightning_module.has_megatron_encoder:
app_state = AppState()
if app_state.model_parallel_size is not None:
# mpu grad clipping needs parameters to have the attribute model_parallel
@ -74,12 +73,8 @@ class NLPDDPPlugin(DDPPlugin):
if not hasattr(p, 'model_parallel'):
p.model_parallel = False
# TODO: figure out how to override clip gradients again
# Update PTL trainer to use our _clip_gradients
# self._trainer.accelerator_backend._clip_gradients = self._clip_gradients
if get_checkpoint_version():
# Restored from .nemo, checkpoint_version will already be set
if get_checkpoint_version() is not None:
# megatron checkpoint already restored
pass
elif trainer.resume_from_checkpoint is not None:
# PTL auto-resuming, need to update checkpoint name
@ -98,10 +93,13 @@ class NLPDDPPlugin(DDPPlugin):
logging.warning('Megatron-lm checkpoint version not found. Setting checkpoint_version to 0.')
set_checkpoint_version(0)
else:
logging.info(
f"Restoring from pretrained model parallel checkpoint: {self.lightning_module.bert_model._restore_path}"
)
self.lightning_module.bert_model.restore_weights(self.lightning_module.bert_model._restore_path)
self.lightning_module.restore_megatron_encoder_weights()
else:
if get_checkpoint_version() is not None:
# megatron checkpoint already restored
pass
else:
self.lightning_module.restore_megatron_encoder_weights()
self.lightning_module.register_megatron_checkpoint_version()
@ -113,7 +111,7 @@ class NLPDDPPlugin(DDPPlugin):
if app_state.model_parallel_size is not None:
if isinstance(self.lightning_module.bert_model, MegatronBertEncoder):
if self.has_megatron_encoder:
# check megatron checkpoint version
checkpoint_version = get_checkpoint_version()
if checkpoint_version is None:
@ -140,6 +138,7 @@ class NLPDDPPlugin(DDPPlugin):
device_ids=device_ids,
output_device=device_ids[0],
process_group=app_state.data_parallel_group,
find_unused_parameters=True,
**self._ddp_kwargs,
)
@ -168,7 +167,6 @@ class NLPDDPPlugin(DDPPlugin):
app_state.data_parallel_size = mpu.get_data_parallel_world_size()
logging.info(f'mp_rank: {app_state.model_parallel_rank}')
logging.info(f'dp_rank: {app_state.data_parallel_rank}')
# TODO: get random seed from PTL
seed = os.environ.get("PL_GLOBAL_SEED", 1234)
# random seed must be set for megatron model parallel init
_set_random_seed(seed)

View file

@ -30,6 +30,7 @@ from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import LoggerCollection as _LoggerCollection
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from pytorch_lightning.utilities import rank_zero_only
from pytorch_lightning.utilities.types import _METRIC
from nemo.constants import NEMO_ENV_VARNAME_VERSION
from nemo.utils import app_state, logging
@ -645,6 +646,73 @@ class NeMoModelCheckpoint(ModelCheckpoint):
trainer.checkpoint_connector.restore(self.best_model_path, on_gpu=trainer.on_gpu)
pl_module.save_to(save_path=os.path.join(self.dirpath, self.prefix + self.postfix))
def _del_model(self, filepath: str) -> None:
""" Overrides PTL method to account for model parallel checkpoints.
Updates checkpoint path based on model parallel rank.
"""
app_state = AppState()
if app_state.model_parallel_size is not None:
# filepath needs to be updated to include mp_rank
dirname = os.path.dirname(filepath)
basename = os.path.basename(filepath)
filepath = f'{dirname}/mp_rank_{app_state.model_parallel_rank:02d}/{basename}'
# each model parallel rank needs to remove its model
if app_state.data_parallel_rank == 0:
if self._fs.exists(filepath):
self._fs.rm(filepath)
logging.info(f"Removed model parallel checkpoint: {filepath}")
else:
return super()._del_model(filepath)
def _save_last_checkpoint(self, trainer: 'pl.Trainer', monitor_candidates: Dict[str, _METRIC]) -> None:
""" Overrides PTL method to account for model parallel checkpoints.
Checks for data parallel rank 0 rather than global rank 0.
"""
app_state = AppState()
if app_state.model_parallel_size is not None:
if not self.save_last:
return
filepath = self._format_checkpoint_name(self.CHECKPOINT_NAME_LAST, monitor_candidates)
filepath = os.path.join(self.dirpath, f"{filepath}{self.FILE_EXTENSION}")
self._save_model(trainer, filepath)
# for model parallel we need to delete models for each model parallel rank
if self.last_model_path and self.last_model_path != filepath and app_state.data_parallel_rank == 0:
self._del_model(self.last_model_path)
self.last_model_path = filepath
else:
return super()._save_last_checkpoint(trainer, monitor_candidates)
def _save_none_monitor_checkpoint(self, trainer: 'pl.Trainer', monitor_candidates: Dict[str, _METRIC]) -> None:
""" Overrides PTL method to account for model parallel checkpoints.
Checks for data parallel rank 0 rather than global rank 0.
"""
app_state = AppState()
if app_state.model_parallel_size is not None:
if self.monitor is not None or self.save_top_k == 0:
return
filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, trainer)
self._save_model(trainer, filepath)
if (
self.save_top_k is None
and self.best_model_path
and self.best_model_path != filepath
and app_state.data_parallel_rank == 0
):
self._del_model(self.best_model_path)
self.best_model_path = filepath
else:
return super()._save_none_monitor_checkpoint(trainer, monitor_candidates)
def configure_checkpointing(trainer: 'pytorch_lightning.Trainer', log_dir: Path, name: str, params: 'DictConfig'):
""" Adds ModelCheckpoint to trainer. Raises CheckpointMisconfigurationError if trainer already has a ModelCheckpoint

File diff suppressed because it is too large.