Merge branch 'asr_telephony' of github.com:NVIDIA/NeMo into asr_telephony
commit e101445a06

Jenkinsfile (vendored) | 33
@ -1265,6 +1265,39 @@ pipeline {
|
|||
}
|
||||
}
|
||||
|
||||
stage('L2: NMT Megatron Model Parallel Size 2 Encoder') {
|
||||
when {
|
||||
anyOf{
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
steps{
|
||||
sh 'cd examples/nlp/machine_translation && \
|
||||
python enc_dec_nmt.py \
|
||||
--config-path=conf \
|
||||
--config-name=megatron \
|
||||
model.encoder.model_name=megatron-bert-uncased \
|
||||
model.encoder.checkpoint_file=/home/TestData/nlp/mp_2_bert_toy/iter_2000000 \
|
||||
model.encoder.hidden_size=1024 \
|
||||
model.encoder.num_attention_heads=16 \
|
||||
model.encoder.num_layers=24 \
|
||||
model.encoder.max_position_embeddings=512 \
|
||||
model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
|
||||
model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \
|
||||
model.validation_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
|
||||
model.validation_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
|
||||
model.test_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
|
||||
model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \
|
||||
model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \
|
||||
trainer.gpus=[0,1] \
|
||||
+trainer.fast_dev_run=true \
|
||||
exp_manager=null \
|
||||
'
|
||||
}
|
||||
}
|
||||
|
||||
stage('L2: NMT Tarred Dataset Creation') {
|
||||
when {
|
||||
anyOf {
|
||||
|
|
|
@ -42,7 +42,7 @@ a
|
|||
|
||||
a:visited
|
||||
{
|
||||
color: #b6b6b6;
|
||||
color: #218219;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
Model Name,Model Base Class,Model Card
|
||||
QuartzNet15x5Base-En,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels"
|
||||
stt_en_jasper10x5dr,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr"
|
||||
stt_en_citrinet_256,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256"
|
||||
stt_en_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512"
|
||||
stt_en_citrinet_1024,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024"
|
||||
stt_ca_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ca_quartznet15x5"
|
||||
stt_it_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_it_quartznet15x5"
|
||||
stt_fr_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_fr_quartznet15x5"
|
||||
stt_es_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5"
|
||||
stt_de_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_de_quartznet15x5"
|
||||
stt_pl_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_pl_quartznet15x5"
|
||||
stt_ru_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_ru_quartznet15x5"
|
||||
stt_zh_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5"
|
||||
stt_zh_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512"
|
||||
stt_en_conformer_ctc_small,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small"
|
||||
stt_en_conformer_ctc_medium,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium"
|
||||
stt_en_conformer_ctc_large,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large"
|
||||
stt_en_conformer_ctc_small_ls,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small_ls"
|
||||
stt_en_conformer_ctc_medium_ls,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium_ls"
|
||||
stt_en_conformer_ctc_large_ls,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large_ls"
|
|
|
@ -1,11 +1,12 @@
|
|||
Model Name,Model Base Class,Model Card
|
||||
QuartzNet15x5Base-En,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemospeechmodels"
|
||||
stt_zh_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5"
|
||||
stt_en_jasper10x5dr,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr"
|
||||
stt_en_citrinet_256,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256"
|
||||
stt_en_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512"
|
||||
stt_en_citrinet_1024,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024"
|
||||
stt_zh_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512"
|
||||
stt_en_citrinet_256_gamma_0_25,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25"
|
||||
stt_en_citrinet_512_gamma_0_25,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25"
|
||||
stt_en_citrinet_1024_gamma_0_25,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25"
|
||||
stt_en_conformer_ctc_small,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_small"
|
||||
stt_en_conformer_ctc_medium,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_medium"
|
||||
stt_en_conformer_ctc_large,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_ctc_large"
|
||||
|
|
|
|
@ -1,3 +1,3 @@
|
|||
Model,Model Base Class,Model Card
|
||||
stt_es_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_quartznet15x5"
|
||||
|
||||
stt_es_citrinet_512,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_es_citrinet_512"
|
||||
|
|
|
|
@ -1,3 +1,2 @@
|
|||
Model,Model Base Class,Model Card
|
||||
stt_zh_quartznet15x5,EncDecCTCModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5"
|
||||
stt_zh_citrinet_512,EncDecCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_citrinet_512"
|
||||
|
|
|
|
@ -71,14 +71,16 @@ To perform inference and transcribe a sample of speech after loading the model,
|
|||
Setting the argument ``logprobs`` to ``True`` returns the log probabilities instead of transcriptions. For more information, see :doc:`./api.html#modules`.
|
||||
The audio files should be 16 kHz, mono-channel WAV files.
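For example, a minimal transcription sketch (the model name is illustrative; ``transcribe()`` and its ``logprobs`` argument are the NeMo APIs referred to above):

.. code-block:: Python

    import nemo.collections.asr as nemo_asr

    # Load a pre-trained CTC model from NGC (any name from the tables below works).
    asr_model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

    # Text transcriptions of 16 kHz mono WAV files.
    texts = asr_model.transcribe(paths2audio_files=["sample.wav"])

    # Per-file log-probability tensors instead of text.
    logprobs = asr_model.transcribe(paths2audio_files=["sample.wav"], logprobs=True)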
|
||||
|
||||
Automatic Speech Recognition Models
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
Fine-tuning on Different Datasets
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
.. csv-table::
|
||||
:file: data/asr_results.csv
|
||||
:align: left
|
||||
:widths: 30, 30, 40
|
||||
:header-rows: 1
|
||||
There are multiple ASR tutorials provided in the :ref:`Tutorials <tutorials>` section. As a demonstration, most of these tutorials explain how to instantiate a pre-trained model and prepare it for fine-tuning on a dataset in the same language.
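The core pattern those tutorials follow looks roughly like this (a sketch; the model name and tokenizer path are placeholders, and ``change_vocabulary`` is the BPE-model method used when switching to a new vocabulary):

.. code-block:: Python

    import nemo.collections.asr as nemo_asr

    # Start from a pre-trained English checkpoint.
    asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_en_citrinet_512")

    # Swap in a tokenizer built on the target-language corpus before fine-tuning.
    asr_model.change_vocabulary(new_tokenizer_dir="/path/to/new_tokenizer", new_tokenizer_type="bpe")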
|
||||
|
||||
|
||||
Automatic Speech Recognition Models
|
||||
-----------------------------------
|
||||
|
||||
Below is a list of all the ASR models that are available in NeMo for specific languages, as well as auxiliary language models for certain languages.
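The same information is also available programmatically from each model class (a short sketch; the exact entries depend on the installed NeMo version):

.. code-block:: Python

    import nemo.collections.asr as nemo_asr

    # Each entry is a PretrainedModelInfo with a model name and an NGC location.
    for info in nemo_asr.models.EncDecCTCModelBPE.list_available_models():
        print(info.pretrained_model_name, info.location)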
|
||||
|
||||
Language Models for ASR
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
@ -89,7 +91,8 @@ Language Models for ASR
|
|||
:widths: 30, 30, 40
|
||||
:header-rows: 1
|
||||
|
||||
|
||||
|
|
||||
|
||||
Speech Recognition (Languages)
|
||||
------------------------------
|
||||
|
||||
|
|
docs/source/core/export.rst (new file) | 185
|
@ -0,0 +1,185 @@
|
|||
Exporting NeMo Models
|
||||
=====================
|
||||
|
||||
Exporting Models
|
||||
----------------
|
||||
|
||||
Most of the NeMo models can be exported to ONNX or TorchScript to be deployed for inference in optimized execution environments, such as Jarvis or Triton Inference Server.
|
||||
The export interface is provided by the ``Exportable`` mix-in class. If a model extends ``Exportable``, it can be exported as follows:
|
||||
|
||||
.. code-block:: Python
|
||||
|
||||
from nemo.core.classes import ModelPT, Exportable
|
||||
# deriving from Exportable
|
||||
class MyExportableModel(ModelPT, Exportable):
|
||||
...
|
||||
|
||||
mymodel = MyExportableModel.from_pretrained(model_name="MyModelName")
|
||||
|
||||
# exporting pre-trained model to ONNX file for deployment.
|
||||
mymodel.export('mymodel.onnx', [options])
|
||||
|
||||
|
||||
How to Use Model Export
|
||||
-----------------------
|
||||
The following arguments are for ``Exportable.export()``. In most cases, you should only supply the name of the output file and use all defaults:
|
||||
.. code-block:: Python
|
||||
def export(
|
||||
self,
|
||||
output: str,
|
||||
input_example=None,
|
||||
output_example=None,
|
||||
verbose=False,
|
||||
export_params=True,
|
||||
do_constant_folding=True,
|
||||
keep_initializers_as_inputs=False,
|
||||
onnx_opset_version: int = 13,
|
||||
try_script: bool = False,
|
||||
set_eval: bool = True,
|
||||
check_trace: bool = False,
|
||||
use_dynamic_axes: bool = True,
|
||||
dynamic_axes=None,
|
||||
check_tolerance=0.01,
|
||||
):
|
||||
|
||||
The ``output``, ``input_example``, ``output_example``, ``verbose``, ``export_params``, ``do_constant_folding``, ``keep_initializers_as_inputs``, ``onnx_opset_version``, and ``set_eval`` options have the same semantics as in PyTorch's ``onnx.export()`` and ``jit.trace()`` functions and are passed through. For more information about PyTorch's ``onnx.export()``, refer to the `torch.onnx functions documentation
|
||||
<https://pytorch.org/docs/stable/onnx.html#functions>`_.
|
||||
|
||||
The file extension of the ``output`` parameter determines the export format: ``.onnx`` -> ONNX; ``.pt`` or ``.ts`` -> TorchScript. If ``input_example`` is ``None``, ``Exportable.input_example()`` is called.
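For example (a sketch using the documented defaults):

.. code-block:: Python

    # The same model, exported twice; the format is chosen by the file extension.
    mymodel.export('mymodel.onnx')  # ONNX
    mymodel.export('mymodel.ts')    # TorchScript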
|
||||
|
||||
**TorchScript-specific**: If ``try_script`` is ``True``, ``export()`` tries ``jit.script()`` before ``jit.trace()``.
|
||||
The ``check_trace`` arg is passed through to ``jit.trace()``.
|
||||
**ONNX-specific**: If ``use_dynamic_axes`` is ``True``, ``onnx.export()`` is called with dynamic axes. If ``dynamic_axes`` is ``None``, they are inferred from the model's ``input_types`` definition (the batch dimension is dynamic, as are duration-like dimensions).
|
||||
|
||||
If ``check_trace`` is ``True``, the resulting ONNX model is also run on ``input_example`` and the results are compared to ``output_example`` using the ``check_tolerance`` argument. Note the relatively high default tolerance.
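For instance (a sketch; every argument comes from the signature above):

.. code-block:: Python

    mymodel.export(
        'mymodel.onnx',
        check_trace=True,        # run the exported model on input_example and compare outputs
        check_tolerance=0.01,
        use_dynamic_axes=True,   # infer dynamic axes from input_types
    )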
|
||||
|
||||
|
||||
How to Make Model Exportable
|
||||
----------------------------
|
||||
|
||||
If you are simply using NeMo models, the previous example is all you need to know.
|
||||
If you write your own models, this section highlights what you need to be aware of when extending ``Exportable``.
|
||||
|
||||
Exportable Hooks and Overrides
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You should not normally need to override the ``Exportable`` default methods. However, ``Exportable.export()`` relies on the assumption that certain methods are available in your class.
|
||||
|
||||
.. code-block:: Python
|
||||
|
||||
@property
|
||||
def input_example(self):  # => Tuple(input, [(input, ...)], [Dict])
|
||||
"""
|
||||
Generates input examples for tracing etc.
|
||||
Returns:
|
||||
A tuple of input examples.
|
||||
"""
|
||||
This function should return a tuple of (normally) Tensors, one for each model input (i.e., the args of ``forward()``). The last element may be a ``Dict`` that specifies non-positional arguments by name, as per the Torch ``export()`` convention. For more information, refer to `Using dictionaries to handle Named Arguments as model inputs
|
||||
<https://pytorch.org/docs/stable/onnx.html#using-dictionaries-to-handle-named-arguments-as-model-inputs>`_.
|
||||
Note: ``Dict`` currently does not work with TorchScript ``trace()``.
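As an illustration, a minimal ``input_example`` for a hypothetical model whose ``forward()`` takes ``(audio_signal, length)`` might look like this (shapes and names are assumptions for the sketch, not part of any NeMo model):

.. code-block:: Python

    import torch

    @property
    def input_example(self):
        # Two random 1-second, 16 kHz signals plus their lengths.
        device = next(self.parameters()).device
        audio_signal = torch.randn(2, 16000, device=device)
        length = torch.full((2,), 16000, dtype=torch.int64, device=device)
        return (audio_signal, length)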
|
||||
.. code-block:: Python
|
||||
|
||||
@property
|
||||
def input_types(self):
|
||||
@property
|
||||
def output_types(self):
|
||||
|
||||
These are needed for inferring input/output names and dynamic axes. If your model derives from ``ModelPT``, they are already there. Another common scenario is that your model contains one or more modules that process input and generate output. In that case, override the ``Exportable`` methods ``input_module()`` and ``output_module()`` to point to them, as in this example:
|
||||
|
||||
.. code-block:: Python
|
||||
|
||||
@property
|
||||
def input_module(self):
|
||||
return self.fastpitch
|
||||
|
||||
@property
|
||||
def output_module(self):
|
||||
return self.fastpitch
|
||||
|
||||
Your model should also have an export-friendly ``forward()`` method - that can mean different things for ONNX and TorchScript. For ONNX, you can't have forced named parameters without defaults, like ``forward(self, *, text)``. For TorchScript, you should avoid ``None`` and use ``Optional`` instead. The criteria are fairly volatile and may change with every PyTorch version, so it's a trial-and-error process. There is also the general issue that, in many cases, ``forward()`` for inference can be simplified to use fewer inputs and outputs. To address this, ``Exportable`` looks for a ``forward_for_export()`` method in your model and uses it instead of ``forward()`` for export:
|
||||
|
||||
.. code-block:: Python
|
||||
# Uses forced named args, many default parameters.
|
||||
def forward(
|
||||
self,
|
||||
*,
|
||||
text,
|
||||
durs=None,
|
||||
pitch=None,
|
||||
speaker=0,
|
||||
pace=1.0,
|
||||
spec=None,
|
||||
attn_prior=None,
|
||||
mel_lens=None,
|
||||
input_lens=None,
|
||||
):
|
||||
# Passes through all self.fastpitch outputs
|
||||
return self.fastpitch(
|
||||
text=text,
|
||||
durs=durs,
|
||||
pitch=pitch,
|
||||
speaker=speaker,
|
||||
pace=pace,
|
||||
spec=spec,
|
||||
attn_prior=attn_prior,
|
||||
mel_lens=mel_lens,
|
||||
input_lens=input_lens,
|
||||
)
|
||||
|
||||
|
||||
# Uses less inputs, no '*', returns less outputs:
|
||||
def forward_for_export(self, text):
|
||||
(
|
||||
spect,
|
||||
durs_predicted,
|
||||
log_durs_predicted,
|
||||
pitch_predicted,
|
||||
attn_soft,
|
||||
attn_logprob,
|
||||
attn_hard,
|
||||
attn_hard_dur,
|
||||
pitch,
|
||||
) = self.fastpitch(text=text)
|
||||
return spect, durs_predicted, log_durs_predicted, pitch_predicted
|
||||
|
||||
To stay consistent with ``input_types()``/``output_types()``, ``Exportable`` also provides hooks that let you exclude particular inputs/outputs from the export process:
|
||||
|
||||
.. code-block:: Python
|
||||
|
||||
@property
|
||||
def disabled_deployment_input_names(self):
|
||||
"""Implement this method to return a set of input names disabled for export"""
|
||||
return set(["durs", "pitch", "speaker", "pace", "spec", "attn_prior", "mel_lens", "input_lens"])
|
||||
|
||||
@property
|
||||
def disabled_deployment_output_names(self):
|
||||
|
||||
|
||||
Another common requirement for models that are being exported is to apply certain network modifications for inference efficiency before exporting - for example, disabling masks in some convolutions or removing batch normalizations. A better style is to make these happen on ``ModelPT.eval()`` (and be reversed on ``.train()``), but that is not always feasible, so ``Exportable`` provides the following hook to run them:
|
||||
|
||||
.. code-block:: Python
|
||||
|
||||
def _prepare_for_export(self, **kwargs):
|
||||
"""
|
||||
Override this method to prepare module for export. This is in-place operation.
|
||||
Base version does common necessary module replacements (Apex etc)
|
||||
"""
|
||||
# do graph modifications specific for this model
|
||||
replace_1D_2D = kwargs.get('replace_1D_2D', False)
|
||||
replace_for_export(self, replace_1D_2D)
|
||||
# call base method for common set of modifications
|
||||
Exportable._prepare_for_export(self, **kwargs)
|
||||
|
||||
|
||||
Exportable Model Code
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Most importantly, the actual Torch code in your model should be ONNX- or TorchScript-compatible (ideally, both).
|
||||
#. Ensure the code is written in Torch - avoid bare `NumPy or Python operands <https://pytorch.org/docs/stable/onnx.html#write-pytorch-model-in-torch-way>`_.
|
||||
#. Make your model ``Exportable`` and add an export unit test so that any operation or construct not supported in ONNX/TorchScript is caught immediately (a minimal sketch follows).
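A minimal sketch of such a test (pytest style; the class and model name come from the earlier example and are placeholders):

.. code-block:: Python

    import os
    import tempfile

    def test_my_model_onnx_export():
        model = MyExportableModel.from_pretrained(model_name="MyModelName")
        with tempfile.TemporaryDirectory() as tmpdir:
            out = os.path.join(tmpdir, "model.onnx")
            model.export(out)
            assert os.path.exists(out)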
|
||||
|
||||
For more information, refer to the PyTorch documentation:
|
||||
- `List of supported operators <https://pytorch.org/docs/stable/onnx.html#supported-operators>`_
|
||||
- `Tracing vs. scripting <https://pytorch.org/docs/stable/onnx.html#tracing-vs-scripting>`_
|
||||
- `AlexNet example <https://pytorch.org/docs/stable/onnx.html#example-end-to-end-alexnet-from-pytorch-to-onnx>`_
|
||||
|
|
@ -17,6 +17,7 @@ NVIDIA NeMo User Guide
|
|||
:name: core
|
||||
|
||||
core/core
|
||||
core/export
|
||||
|
||||
|
||||
.. toctree::
|
||||
|
|
|
@ -457,6 +457,111 @@ can be used to compute sacreBLEU scores.
|
|||
|
||||
cat test.en-es.translations | sacrebleu test.es
|
||||
|
||||
Pretrained Encoders
|
||||
-------------------
|
||||
|
||||
Pretrained BERT encoders from either `HuggingFace Transformers <https://huggingface.co/models>`__
|
||||
or `Megatron-LM <https://github.com/NVIDIA/Megatron-LM>`__
|
||||
can be used to train NeMo NMT models.
|
||||
|
||||
The ``library`` flag takes values: ``huggingface``, ``megatron``, and ``nemo``.
|
||||
|
||||
The ``model_name`` flag is used to indicate a *named* model architecture.
|
||||
For example, we can use ``bert_base_cased`` from HuggingFace or ``megatron-bert-345m-cased`` from Megatron-LM.
|
||||
|
||||
The ``pretrained`` flag indicates whether or not to download the pretrained weights (``pretrained=True``) or
|
||||
instantiate the same model architecture with random weights (``pretrained=False``).
|
||||
|
||||
To use a custom model architecture from a specific library, use ``model_name=null`` and then add the
|
||||
custom configuration under the ``encoder`` configuration.
|
||||
|
||||
HuggingFace
|
||||
^^^^^^^^^^^
|
||||
|
||||
We have provided a `HuggingFace config file <https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/machine_translation/conf/huggingface.yaml>`__
|
||||
to use with HuggingFace encoders.
|
||||
|
||||
To use the config file from CLI:
|
||||
|
||||
.. code ::
|
||||
|
||||
--config-path=conf \
|
||||
--config-name=huggingface \
|
||||
|
||||
As an example, we can configure the NeMo NMT encoder to use ``bert-base-cased`` from HuggingFace
|
||||
by using the ``huggingface`` config file and setting
|
||||
|
||||
.. code ::
|
||||
|
||||
model.encoder.pretrained=true \
|
||||
model.encoder.model_name=bert-base-cased \
|
||||
|
||||
To use a custom architecture from HuggingFace we can use
|
||||
|
||||
.. code ::
|
||||
|
||||
+model.encoder._target_=transformers.BertConfig \
|
||||
+model.encoder.hidden_size=1536 \
|
||||
|
||||
Note that the ``+`` symbol is needed when an argument is not already present in the YAML config file.
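The same kind of override can also be expressed programmatically with OmegaConf when composing the config outside of Hydra (a sketch; the keys mirror the CLI overrides above):

.. code-block:: Python

    from omegaconf import OmegaConf

    base_cfg = OmegaConf.load('conf/huggingface.yaml')
    # Equivalent of the +model.encoder._target_ / +model.encoder.hidden_size CLI overrides.
    overrides = OmegaConf.create(
        {'model': {'encoder': {'_target_': 'transformers.BertConfig', 'hidden_size': 1536}}}
    )
    cfg = OmegaConf.merge(base_cfg, overrides)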
|
||||
|
||||
Megatron
|
||||
^^^^^^^^
|
||||
|
||||
We have provided a `Megatron config file <https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/machine_translation/conf/megatron.yaml>`__
|
||||
to use with Megatron encoders.
|
||||
|
||||
To use the config file from CLI:
|
||||
|
||||
.. code ::
|
||||
|
||||
--config-path=conf \
|
||||
--config-name=megatron \
|
||||
|
||||
The ``checkpoint_file`` should be the path to Megatron-LM checkpoint:
|
||||
|
||||
.. code ::
|
||||
|
||||
/path/to/your/megatron/checkpoint/model_optim_rng.pt
|
||||
|
||||
If your Megatron model requires model parallelism, then ``checkpoint_file`` should point to the directory containing the
|
||||
standard Megatron-LM checkpoint format:
|
||||
|
||||
.. code ::
|
||||
|
||||
3.9b_bert_no_rng
|
||||
├── mp_rank_00
|
||||
│ └── model_optim_rng.pt
|
||||
├── mp_rank_01
|
||||
│ └── model_optim_rng.pt
|
||||
├── mp_rank_02
|
||||
│ └── model_optim_rng.pt
|
||||
└── mp_rank_03
|
||||
└── model_optim_rng.pt
|
||||
|
||||
As an example, to train a NeMo NMT model with a 3.9B Megatron BERT encoder,
|
||||
we would use the following encoder configuration:
|
||||
|
||||
.. code ::
|
||||
|
||||
model.encoder.checkpoint_file=/path/to/megatron/checkpoint/3.9b_bert_no_rng \
|
||||
model.encoder.hidden_size=2560 \
|
||||
model.encoder.num_attention_heads=40 \
|
||||
model.encoder.num_layers=48 \
|
||||
model.encoder.max_position_embeddings=512 \
|
||||
|
||||
To train with the Megatron 345M BERT encoder, we would use
|
||||
|
||||
.. code ::
|
||||
|
||||
model.encoder.model_name=megatron-bert-cased \
|
||||
model.encoder.checkpoint_file=/path/to/your/megatron/checkpoint/model_optim_rng.pt \
|
||||
model.encoder.hidden_size=1024 \
|
||||
model.encoder.num_attention_heads=16 \
|
||||
model.encoder.num_layers=24 \
|
||||
model.encoder.max_position_embeddings=512 \
|
||||
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
|
|
|
@ -43,6 +43,9 @@ To run a tutorial:
|
|||
* - ASR
|
||||
- Online ASR inference with Microphone
|
||||
- `Online ASR Microphone <https://github.com/NVIDIA/NeMo/blob/v1.0.2/tutorials/asr/02_Online_ASR_Microphone_Demo.ipynb>`_
|
||||
* - ASR
|
||||
- Fine-tuning CTC Models on New Languages
|
||||
- `ASR CTC Language Fine-Tuning <https://colab.research.google.com/github/NVIDIA/NeMo/blob/main/tutorials/asr/10_ASR_CTC_Language_Finetuning.ipynb>`_
|
||||
* - ASR
|
||||
- Speech Commands
|
||||
- `Speech Commands <https://colab.research.google.com/github/NVIDIA/NeMo/blob/v1.0.2/tutorials/asr/03_Speech_Commands.ipynb>`_
|
||||
|
|
|
@ -132,6 +132,17 @@ def main(cfg: DictConfig) -> None:
|
|||
trainer.fit(model)
|
||||
if cfg.model.nemo_path:
|
||||
model.save_to(cfg.model.nemo_path)
|
||||
else:
|
||||
data_dir = cfg.model.dataset.get('data_dir', None)
|
||||
dialogues_example_dir = cfg.model.dataset.get('dialogues_example_dir', None)
|
||||
|
||||
if data_dir is None or dialogues_example_dir is None:
|
||||
raise ValueError('No dataset directory provided. Skipping evaluation. ')
|
||||
elif not os.path.exists(data_dir):
|
||||
raise ValueError(f'{data_dir} is not found, skipping evaluation on the test set.')
|
||||
else:
|
||||
model.update_data_dirs(data_dir=data_dir, dialogues_example_dir=dialogues_example_dir)
|
||||
model._cfg.dataset = cfg.model.dataset
|
||||
|
||||
if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.ds_item is not None:
|
||||
gpu = 1 if cfg.trainer.gpus != 0 else 0
|
||||
|
|
examples/nlp/machine_translation/conf/megatron.yaml (new file) | 160
|
@ -0,0 +1,160 @@
|
|||
name: MegatronEncoder
|
||||
do_training: True # set to False if only preprocessing data
|
||||
do_testing: False # set to True to run evaluation on test data after training
|
||||
|
||||
model:
|
||||
beam_size: 4
|
||||
len_pen: 0.6
|
||||
max_generation_delta: 5
|
||||
label_smoothing: 0.1
|
||||
shared_tokenizer: false
|
||||
preproc_out_dir: null
|
||||
src_language: 'en'
|
||||
tgt_language: 'de'
|
||||
|
||||
train_ds:
|
||||
src_file_name: null
|
||||
tgt_file_name: null
|
||||
use_tarred_dataset: False # if true tar_file_name and meta_file_name will be used (or created automatically)
|
||||
# config for preprocessing training data and creating a tarred dataset automatically
|
||||
tar_file_prefix: parallel # prefix for tar file names
|
||||
tar_files: null # if data has already been preprocessed (rest of config ignored)
|
||||
metadata_file: null # metadata for tarred dataset
|
||||
lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding
|
||||
num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile
|
||||
tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled
|
||||
shard_strategy: scatter # tarred dataset shard distribution strategy
|
||||
n_preproc_jobs: -2 # number of processes to use for data preprocessing (-2 means all but 2)
|
||||
tokens_in_batch: 512
|
||||
clean: true
|
||||
max_seq_length: 512
|
||||
shuffle: true
|
||||
num_samples: -1
|
||||
drop_last: false
|
||||
pin_memory: false
|
||||
num_workers: 8
|
||||
|
||||
validation_ds:
|
||||
src_file_name: null
|
||||
tgt_file_name: null
|
||||
tokens_in_batch: 512
|
||||
clean: false
|
||||
max_seq_length: 512
|
||||
shuffle: false
|
||||
num_samples: -1
|
||||
drop_last: false
|
||||
pin_memory: false
|
||||
num_workers: 8
|
||||
|
||||
test_ds:
|
||||
src_file_name: null
|
||||
tgt_file_name: null
|
||||
tokens_in_batch: 512
|
||||
clean: false
|
||||
max_seq_length: 512
|
||||
shuffle: false
|
||||
num_samples: -1
|
||||
drop_last: false
|
||||
pin_memory: false
|
||||
num_workers: 8
|
||||
|
||||
optim:
|
||||
name: adam
|
||||
lr: 0.001
|
||||
betas:
|
||||
- 0.9
|
||||
- 0.98
|
||||
weight_decay: 0.0
|
||||
sched:
|
||||
name: InverseSquareRootAnnealing
|
||||
min_lr: 0.0
|
||||
last_epoch: -1
|
||||
warmup_ratio: 0.1
|
||||
|
||||
encoder_tokenizer:
|
||||
library: megatron
|
||||
tokenizer_model: null
|
||||
vocab_file: null
|
||||
special_tokens: null
|
||||
vocab_size: null
|
||||
model_name: null
|
||||
|
||||
decoder_tokenizer:
|
||||
library: yttm
|
||||
tokenizer_model: null
|
||||
vocab_file: null
|
||||
special_tokens: null
|
||||
vocab_size: null
|
||||
|
||||
encoder:
|
||||
library: megatron
|
||||
|
||||
# If using a pretrained megatron bert model from NGC, then use the corresponding model name
|
||||
# For example, 'megatron-bert-345m-uncased'.
|
||||
# If restoring from a local checkpoint, then use either 'megatron-bert-uncased' or 'megatron-bert-cased'
|
||||
model_name: megatron-bert-uncased # or megatron-bert-cased
|
||||
|
||||
# If restoring from a model parallel checkpoint, then checkpoint_file should be a path to
|
||||
# the directory containing the megatron-lm checkpoints. The directory will have the structure:
|
||||
|
||||
# /path/to/my/checkpoint/
|
||||
# ├── mp_rank_00
|
||||
# │ └── model_optim_rng.pt
|
||||
# └── mp_rank_01
|
||||
# └── model_optim_rng.pt
|
||||
|
||||
# If not using a model parallel checkpoint, then use the full path to the checkpoint:
|
||||
|
||||
# /path/to/my/checkpoint/model_optim_rng.pt
|
||||
checkpoint_file: null
|
||||
vocab_file : null
|
||||
|
||||
pretrained: true # only pretrained=true supported for now
|
||||
|
||||
# model architecture configuration
|
||||
hidden_size: 1024
|
||||
num_attention_heads: 16
|
||||
num_layers: 24
|
||||
max_position_embeddings: 512
|
||||
num_tokentypes: 0
|
||||
|
||||
decoder:
|
||||
library: nemo
|
||||
model_name: null
|
||||
pretrained: false
|
||||
max_sequence_length: 512
|
||||
num_token_types: 2
|
||||
embedding_dropout: 0.1
|
||||
learn_positional_encodings: false
|
||||
hidden_size: 512
|
||||
inner_size: 2048
|
||||
num_layers: 6
|
||||
num_attention_heads: 8
|
||||
ffn_dropout: 0.1
|
||||
attn_score_dropout: 0.1
|
||||
attn_layer_dropout: 0.1
|
||||
hidden_act: relu
|
||||
pre_ln: false
|
||||
|
||||
head:
|
||||
num_layers: 1
|
||||
activation: relu
|
||||
log_softmax: true
|
||||
dropout: 0.0
|
||||
use_transformer_init: true
|
||||
|
||||
trainer:
|
||||
gpus: 4
|
||||
num_nodes: 1
|
||||
max_epochs: 200
|
||||
amp_level: O2 # O1/O2 for mixed precision
|
||||
precision: 16 # Should be set to 16 for O1 and O2; default is 16 as PT ignores it when amp_level is O0
|
||||
accelerator: ddp
|
||||
checkpoint_callback: False
|
||||
logger: False
|
||||
log_every_n_steps: 50 # Interval of logging.
|
||||
check_val_every_n_epoch: 1
|
||||
|
||||
exp_manager:
|
||||
name: ${name}
|
||||
files_to_copy: []
|
|
@ -21,6 +21,7 @@ from pytorch_lightning import Trainer
|
|||
from nemo.collections.nlp.data.machine_translation.preproc_mt_data import MTDataPreproc
|
||||
from nemo.collections.nlp.models.machine_translation.mt_enc_dec_config import MTEncDecModelConfig
|
||||
from nemo.collections.nlp.models.machine_translation.mt_enc_dec_model import MTEncDecModel
|
||||
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPPlugin
|
||||
from nemo.core.config import hydra_runner
|
||||
from nemo.core.config.modelPT import NemoConfig
|
||||
from nemo.core.config.pytorch_lightning import TrainerConfig
|
||||
|
@ -108,7 +109,9 @@ def main(cfg: MTEncDecConfig) -> None:
|
|||
logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
|
||||
|
||||
# training is managed by PyTorch Lightning
|
||||
trainer = Trainer(**cfg.trainer)
|
||||
trainer_cfg = OmegaConf.to_container(cfg.trainer)
|
||||
trainer_cfg.pop('plugins', None)
|
||||
trainer = Trainer(plugins=[NLPDDPPlugin()], **trainer_cfg)
|
||||
|
||||
# tokenizers will be trained and tarred training data will be created if needed
|
||||
# model config is then updated
|
||||
|
|
|
@ -63,6 +63,27 @@ class EncDecCTCModelBPE(EncDecCTCModel, ASRBPEMixin):
|
|||
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024",
|
||||
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024/versions/1.0.0rc1/files/stt_en_citrinet_1024.nemo",
|
||||
)
|
||||
results.append(model)
|
||||
|
||||
model = PretrainedModelInfo(
|
||||
pretrained_model_name="stt_en_citrinet_256_gamma_0_25",
|
||||
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_256_gamma_0_25",
|
||||
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_256_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_256_gamma_0_25.nemo",
|
||||
)
|
||||
results.append(model)
|
||||
|
||||
model = PretrainedModelInfo(
|
||||
pretrained_model_name="stt_en_citrinet_512_gamma_0_25",
|
||||
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_512_gamma_0_25",
|
||||
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_512_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_512_gamma_0_25.nemo",
|
||||
)
|
||||
results.append(model)
|
||||
|
||||
model = PretrainedModelInfo(
|
||||
pretrained_model_name="stt_en_citrinet_1024_gamma_0_25",
|
||||
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_citrinet_1024_gamma_0_25",
|
||||
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_citrinet_1024_gamma_0_25/versions/1.0.0/files/stt_en_citrinet_1024_gamma_0_25.nemo",
|
||||
)
|
||||
|
||||
results.append(model)
|
||||
|
||||
|
|
|
@ -64,13 +64,6 @@ class EncDecCTCModel(ASRModel, ExportableEncDecModel, ASRModuleMixin):
|
|||
)
|
||||
results.append(model)
|
||||
|
||||
model = PretrainedModelInfo(
|
||||
pretrained_model_name="stt_zh_quartznet15x5",
|
||||
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_zh_quartznet15x5",
|
||||
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_zh_quartznet15x5/versions/1.0.0rc1/files/stt_zh_quartznet15x5.nemo",
|
||||
)
|
||||
results.append(model)
|
||||
|
||||
model = PretrainedModelInfo(
|
||||
pretrained_model_name="stt_en_jasper10x5dr",
|
||||
description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_jasper10x5dr",
|
||||
|
|
|
@ -64,7 +64,7 @@ class MTDataPreproc:
|
|||
self.world_size = trainer.num_nodes * trainer.num_gpus
|
||||
|
||||
if hasattr(cfg, 'train_ds'):
|
||||
supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece']
|
||||
supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'megatron']
|
||||
supported_train_tokenizers = ['yttm', 'sentencepiece']
|
||||
|
||||
if (
|
||||
|
@ -182,7 +182,7 @@ class MTDataPreproc:
|
|||
# Preprocess data and cache for use during training
|
||||
if self.global_rank == 0:
|
||||
logging.info(
|
||||
f"Using tarred dataset for src: {cfg.train_ds.get('src_file_name')} and tgt: {cfg.train_ds.get('tgt_file_name')}"
|
||||
f"Creating tarred dataset for src: {cfg.train_ds.get('src_file_name')} and tgt: {cfg.train_ds.get('tgt_file_name')}"
|
||||
)
|
||||
|
||||
if not cfg.get('multilingual'):
|
||||
|
@ -247,6 +247,7 @@ class MTDataPreproc:
|
|||
logging.info(
|
||||
f"Using tarred dataset created in folder(s) {outdir_list} and metadata created at {self._cfg.train_ds.metadata_file}"
|
||||
)
|
||||
|
||||
elif cfg.train_ds.get('tar_files') is not None and cfg.train_ds.get('metadata_file') is None:
|
||||
raise ValueError('A metadata file is required for tarred dataset but cfg.metadata_file is None.')
|
||||
elif cfg.train_ds.get('tar_files') is None and cfg.train_ds.get('metadata_file') is not None:
|
||||
|
|
|
@ -36,6 +36,7 @@ from nemo.collections.nlp.modules.common.lm_utils import get_lm_model
|
|||
from nemo.collections.nlp.parts.utils_funcs import tensor2list
|
||||
from nemo.core.classes.common import PretrainedModelInfo, typecheck
|
||||
from nemo.core.neural_types import NeuralType
|
||||
from nemo.utils import logging
|
||||
from nemo.utils.get_rank import is_global_rank_zero
|
||||
|
||||
__all__ = ['SGDQAModel']
|
||||
|
@ -543,6 +544,21 @@ class SGDQAModel(NLPModel):
|
|||
|
||||
self.data_prepared = True
|
||||
|
||||
def update_data_dirs(self, data_dir: str, dialogues_example_dir: str):
|
||||
"""
|
||||
Update data directories
|
||||
|
||||
Args:
|
||||
data_dir: path to data directory
|
||||
dialogues_example_dir: path to the preprocessed dialogue examples directory; it will be created if it does not exist.
|
||||
"""
|
||||
if not os.path.exists(data_dir):
|
||||
raise ValueError(f"{data_dir} is not found")
|
||||
self._cfg.dataset.data_dir = data_dir
|
||||
self._cfg.dataset.dialogues_example_dir = dialogues_example_dir
|
||||
logging.info(f'Setting model.dataset.data_dir to {data_dir}.')
|
||||
logging.info(f'Setting model.dataset.dialogues_example_dir to {dialogues_example_dir}.')
|
||||
|
||||
def setup_training_data(self, train_data_config: Optional[DictConfig] = None):
|
||||
self.prepare_data()
|
||||
self._train_dl = self._setup_dataloader_from_config(cfg=train_data_config, split=train_data_config.ds_item)
|
||||
|
@ -577,4 +593,19 @@ class SGDQAModel(NLPModel):
|
|||
|
||||
@classmethod
|
||||
def list_available_models(cls) -> Optional[PretrainedModelInfo]:
|
||||
pass
|
||||
"""
|
||||
This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud.
|
||||
|
||||
Returns:
|
||||
List of available pre-trained models.
|
||||
"""
|
||||
result = []
|
||||
|
||||
result.append(
|
||||
PretrainedModelInfo(
|
||||
pretrained_model_name="sgdqa_bertbasecased",
|
||||
location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/sgdqa_bertbasecased/versions/1.0.0/files/sgdqa_bertbasecased.nemo",
|
||||
description="Dialogue State Tracking model finetuned from NeMo BERT Base Cased on Google SGD dataset which has a joint goal accuracy of 59.72% on dev set and 45.85% on test set.",
|
||||
)
|
||||
)
|
||||
return result
|
||||
|
|
|
@ -25,7 +25,7 @@ from nemo.collections.nlp.modules.common.transformer.transformer import (
|
|||
NeMoTransformerConfig,
|
||||
NeMoTransformerEncoderConfig,
|
||||
)
|
||||
from nemo.core.config.modelPT import ModelConfig, OptimConfig, SchedConfig
|
||||
from nemo.core.config.modelPT import OptimConfig, SchedConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
|
@ -132,6 +132,7 @@ class MTEncDecModel(EncDecNLPModel):
|
|||
library = encoder_cfg_dict.pop('library', 'nemo')
|
||||
model_name = encoder_cfg_dict.pop('model_name', None)
|
||||
pretrained = encoder_cfg_dict.pop('pretrained', False)
|
||||
checkpoint_file = encoder_cfg_dict.pop('checkpoint_file', None)
|
||||
self.encoder = get_transformer(
|
||||
library=library,
|
||||
model_name=model_name,
|
||||
|
@ -139,6 +140,7 @@ class MTEncDecModel(EncDecNLPModel):
|
|||
config_dict=encoder_cfg_dict,
|
||||
encoder=True,
|
||||
pre_ln_final_layer_norm=encoder_cfg_dict.get('pre_ln_final_layer_norm', False),
|
||||
checkpoint_file=checkpoint_file,
|
||||
)
|
||||
|
||||
# decoder from NeMo, Megatron-LM, or HuggingFace
|
||||
|
@ -383,7 +385,7 @@ class MTEncDecModel(EncDecNLPModel):
|
|||
decoder_model_name=None,
|
||||
):
|
||||
|
||||
supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece']
|
||||
supported_tokenizers = ['yttm', 'huggingface', 'sentencepiece', 'megatron']
|
||||
if (
|
||||
encoder_tokenizer_library not in supported_tokenizers
|
||||
or decoder_tokenizer_library not in supported_tokenizers
|
||||
|
|
|
@ -32,6 +32,7 @@ from transformers import TRANSFORMERS_CACHE
|
|||
|
||||
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
|
||||
from nemo.collections.nlp.modules import BertModule, MegatronBertEncoder
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_encoder import MegatronEncoderModule
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank
|
||||
from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer
|
||||
from nemo.collections.nlp.parts.nlp_overrides import NLPCheckpointConnector
|
||||
|
@ -430,21 +431,19 @@ class NLPModel(ModelPT, Exportable):
|
|||
@rank_zero_only
|
||||
def register_megatron_checkpoint_version(self):
|
||||
""" Adds checkpoint version to .nemo archive """
|
||||
if self.bert_model is None:
|
||||
raise ValueError('Instantiate self.bert_model before registering megatron checkpoint version.')
|
||||
if self.has_megatron_encoder:
|
||||
checkpoint_version = get_checkpoint_version()
|
||||
if checkpoint_version is None:
|
||||
raise ValueError('Unable to get megatron checkpoint version.')
|
||||
else:
|
||||
checkpoint_version_dict = {'checkpoint_version': checkpoint_version}
|
||||
checkpoint_version_path = 'megatron_checkpoint_version.json'
|
||||
checkpoint_version_src = os.path.join(NEMO_NLP_TMP, checkpoint_version_path)
|
||||
with open(checkpoint_version_src, 'w') as f:
|
||||
f.write(json.dumps(checkpoint_version_dict))
|
||||
self.register_artifact(checkpoint_version_path, checkpoint_version_src)
|
||||
else:
|
||||
# get encoder config and create source for artifact
|
||||
if isinstance(self.bert_model, MegatronBertEncoder):
|
||||
checkpoint_version = get_checkpoint_version()
|
||||
if checkpoint_version is None:
|
||||
raise ValueError('Unable to get megatron checkpoint version.')
|
||||
else:
|
||||
checkpoint_version_dict = {'checkpoint_version': checkpoint_version}
|
||||
checkpoint_version_path = 'megatron_checkpoint_version.json'
|
||||
checkpoint_version_src = os.path.join(NEMO_NLP_TMP, checkpoint_version_path)
|
||||
with open(checkpoint_version_src, 'w') as f:
|
||||
f.write(json.dumps(checkpoint_version_dict))
|
||||
self.register_artifact(checkpoint_version_path, checkpoint_version_src)
|
||||
raise ValueError('Registering Megatron checkpoint version but no Megatron encoder detected.')
|
||||
|
||||
@staticmethod
|
||||
def _unpack_nemo_file(path2file: str, out_folder: str) -> str:
|
||||
|
@ -461,3 +460,39 @@ class NLPModel(ModelPT, Exportable):
|
|||
@property
|
||||
def output_module(self):
|
||||
return self.classifier
|
||||
|
||||
@property
|
||||
def has_megatron_encoder(self):
|
||||
if hasattr(self, 'bert_model'):
|
||||
if isinstance(self.bert_model, MegatronBertEncoder):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
elif hasattr(self, 'encoder'):
|
||||
if isinstance(self.encoder, MegatronEncoderModule):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
|
||||
@property
|
||||
def is_model_parallel_initialized(self):
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_group is not None:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def restore_megatron_encoder_weights(self):
|
||||
""" Model parallel weights need to be restored after DDP is initialized and
|
||||
model parallel ranks are known.
|
||||
"""
|
||||
if hasattr(self, 'bert_model'):
|
||||
if isinstance(self.bert_model, MegatronBertEncoder):
|
||||
logging.info(f"Restoring from pretrained model parallel checkpoint: {self.bert_model._restore_path}")
|
||||
self.bert_model.restore_weights(self.bert_model._restore_path)
|
||||
elif hasattr(self, 'encoder'):
|
||||
if isinstance(self.encoder, MegatronEncoderModule):
|
||||
logging.info(f"Restoring from pretrained model parallel checkpoint: {self.encoder.checkpoint_file}")
|
||||
self.encoder._encoder.restore_weights(self.encoder.checkpoint_file)
|
||||
|
|
|
@ -33,6 +33,7 @@ from nemo.collections.nlp.modules.common.megatron.megatron_utils import (
|
|||
from nemo.collections.nlp.modules.common.transformer.transformer import NeMoTransformerConfig
|
||||
from nemo.collections.nlp.modules.common.transformer.transformer_utils import (
|
||||
get_huggingface_transformer,
|
||||
get_megatron_transformer,
|
||||
get_nemo_transformer,
|
||||
)
|
||||
from nemo.utils import logging
|
||||
|
@ -176,4 +177,16 @@ def get_transformer(
|
|||
model_name=model_name, pretrained=pretrained, config_dict=config_dict, encoder=encoder
|
||||
)
|
||||
|
||||
elif library == 'megatron':
|
||||
model = get_megatron_transformer(
|
||||
model_name=model_name,
|
||||
pretrained=pretrained,
|
||||
config_dict=config_dict,
|
||||
encoder=encoder,
|
||||
checkpoint_file=checkpoint_file,
|
||||
)
|
||||
|
||||
else:
|
||||
raise ValueError("Libary must be 'nemo', 'huggingface' or 'megatron'")
|
||||
|
||||
return model
|
||||
|
|
|
@ -65,6 +65,13 @@ class MegatronBertEncoder(BertModule):
|
|||
self._app_state = None
|
||||
self._model_name = model_name
|
||||
|
||||
if 'vocab_size' in config:
|
||||
self._vocab_size = config.pop('vocab_size')
|
||||
else:
|
||||
self._vocab_size = None
|
||||
|
||||
self._hidden_size = config.get('hidden_size')
|
||||
|
||||
if not os.path.exists(vocab_file):
|
||||
raise ValueError(f'Vocab file not found at {vocab_file}')
|
||||
|
||||
|
@ -76,6 +83,8 @@ class MegatronBertEncoder(BertModule):
|
|||
config['lazy_mpu_init'] = True
|
||||
config['onnx_safe'] = True
|
||||
|
||||
num_tokentypes = config.pop('num_tokentypes', 2)
|
||||
|
||||
# if 'model_parallel_size' in config:
|
||||
if self._model_parallel_size is not None:
|
||||
app_state = AppState()
|
||||
|
@ -109,7 +118,7 @@ class MegatronBertEncoder(BertModule):
|
|||
logging.info(f'Megatron-lm argparse args: {args}')
|
||||
|
||||
self.language_model, self._language_model_key = get_language_model(
|
||||
attention_mask_func=bert_attention_mask_func, num_tokentypes=2, add_pooler=False
|
||||
attention_mask_func=bert_attention_mask_func, num_tokentypes=num_tokentypes, add_pooler=False
|
||||
)
|
||||
|
||||
self.config = OmegaConf.create(config)
|
||||
|
@ -151,8 +160,18 @@ class MegatronBertEncoder(BertModule):
|
|||
"""
|
||||
return self._hidden_size
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
"""
|
||||
Property returning vocab size.
|
||||
|
||||
Returns:
|
||||
vocab size.
|
||||
"""
|
||||
return self._vocab_size
|
||||
|
||||
@typecheck()
|
||||
def forward(self, input_ids, attention_mask, token_type_ids):
|
||||
def forward(self, input_ids, attention_mask, token_type_ids=None):
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_size is None:
|
||||
self.complete_lazy_init()
|
||||
|
|
nemo/collections/nlp/modules/common/megatron/megatron_encoder.py (new file) | 101
|
@ -0,0 +1,101 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from nemo.collections.nlp.modules.common.encoder_module import EncoderModule
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_utils import get_megatron_lm_model
|
||||
from nemo.core.classes.common import typecheck
|
||||
from nemo.utils import logging
|
||||
|
||||
|
||||
class MegatronEncoderModule(EncoderModule):
|
||||
""" Class for using Megatron encoders in NeMo NLP."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_name: Optional[str] = None,
|
||||
pretrained: bool = True,
|
||||
config_dict: Optional[dict] = None,
|
||||
checkpoint_file: Optional[str] = None,
|
||||
vocab_file: Optional[str] = None,
|
||||
):
|
||||
"""Gets Megatron BERT based model to be used as an Encoder in NeMo NLP.
|
||||
Use the model_name arg to get a named model architecture.
|
||||
Available model names can be found with get_megatron_lm_models_list().
|
||||
Use the pretrained arg to get the named model architecture with or without pretrained weights.
|
||||
|
||||
Use config_dict to pass in arguments needed for Megatron-LM.
|
||||
For example, to instantiate a Megatron BERT large model we would do:
|
||||
config_dict={
|
||||
'hidden_size': 1024,
|
||||
'num_attention_heads': 16,
|
||||
'num_layers': 24,
|
||||
'max_position_embeddings': 512,
|
||||
}
|
||||
|
||||
|
||||
Args:
|
||||
model_name (Optional[str]): Named model Megatron architecture from NeMo. Defaults to None.
|
||||
pretrained (bool): Use True to get pretrained weights.
|
||||
False will use the same architecture but with randomly initialized weights.
|
||||
Not implemented yet for Megatron encoders.
|
||||
Defaults to True.
|
||||
config_dict (Optional[dict], optional): Use for configuration of the Megatron model. Defaults to None.
|
||||
checkpoint_file (Optional[str], optional): Provide weights for the transformer from a local checkpoint.
|
||||
If using model parallel then this should be a directory. Defaults to None.
|
||||
vocab_file (Optional[str], optional): Path to vocab file that was used when pretraining the Megatron model.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
if not pretrained:
|
||||
raise ValueError('We currently only support pretrained Megatron models. Please set pretrained=True')
|
||||
|
||||
if not checkpoint_file and not model_name:
|
||||
raise ValueError(
|
||||
'Currently Megatron models must be loaded from a pretrained model name or a pretrained checkpoint.'
|
||||
)
|
||||
|
||||
if model_name or checkpoint_file:
|
||||
model, checkpoint_file = get_megatron_lm_model(
|
||||
pretrained_model_name=model_name,
|
||||
config_dict=config_dict,
|
||||
checkpoint_file=checkpoint_file,
|
||||
vocab_file=vocab_file,
|
||||
)
|
||||
|
||||
self._checkpoint_file = checkpoint_file
|
||||
self._hidden_size = model.hidden_size
|
||||
self._vocab_size = model.vocab_size
|
||||
|
||||
self._encoder = model
|
||||
|
||||
@typecheck()
|
||||
def forward(self, input_ids, encoder_mask):
|
||||
encoder_hidden_states = self._encoder.forward(
|
||||
input_ids=input_ids, attention_mask=encoder_mask, token_type_ids=None
|
||||
)
|
||||
return encoder_hidden_states
|
||||
|
||||
@property
|
||||
def checkpoint_file(self) -> Optional[str]:
|
||||
return self._checkpoint_file
|
||||
|
||||
@property
|
||||
def hidden_size(self) -> Optional[int]:
|
||||
return self._hidden_size
|
||||
|
||||
@property
|
||||
def vocab_size(self) -> Optional[int]:
|
||||
return self._vocab_size
|
|
@ -111,7 +111,7 @@ def get_nmt_tokenizer(
|
|||
):
|
||||
"""
|
||||
Args:
|
||||
model_name: if using a pretrained model from NeMo or HuggingFace
|
||||
model_name: if using a pretrained model from NeMo, HuggingFace, or Megatron
|
||||
tokenizer_model: tokenizer model file of sentencepiece or youtokentome
|
||||
special_tokens: dict of special tokens
|
||||
vocab_file: path to vocab file
|
||||
|
@ -138,7 +138,12 @@ def get_nmt_tokenizer(
|
|||
return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
|
||||
model_path=tokenizer_model, special_tokens=special_tokens_dict
|
||||
)
|
||||
elif library == 'megatron':
|
||||
logging.info(
|
||||
f'Getting Megatron tokenizer with pretrained model name: {model_name} and custom vocab file: {vocab_file}'
|
||||
)
|
||||
return get_tokenizer(tokenizer_name=model_name, vocab_file=vocab_file)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
'Currently we only support "yttm", "huggingface", and "sentencepiece" tokenizer library.'
|
||||
'Currently we only support "yttm", "huggingface", "megatron", and "sentencepiece" tokenizer library.'
|
||||
)
|
||||
|
|
|
@ -17,8 +17,10 @@ from typing import Optional, Union
|
|||
|
||||
from omegaconf.dictconfig import DictConfig
|
||||
|
||||
from nemo.collections.nlp.modules.common.encoder_module import EncoderModule
|
||||
from nemo.collections.nlp.modules.common.huggingface.huggingface_decoder import HuggingFaceDecoderModule
|
||||
from nemo.collections.nlp.modules.common.huggingface.huggingface_encoder import HuggingFaceEncoderModule
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_encoder import MegatronEncoderModule
|
||||
from nemo.collections.nlp.modules.common.transformer.transformer import TransformerDecoderNM, TransformerEncoderNM
|
||||
|
||||
|
||||
|
@ -110,9 +112,33 @@ def get_huggingface_transformer(
|
|||
config_dict: Optional[Union[dict, DictConfig]] = None,
|
||||
encoder: bool = True,
|
||||
) -> Union[HuggingFaceEncoderModule, HuggingFaceDecoderModule]:
|
||||
|
||||
if encoder:
|
||||
model = HuggingFaceEncoderModule(model_name, pretrained, config_dict)
|
||||
else:
|
||||
model = HuggingFaceDecoderModule(model_name, pretrained, config_dict)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def get_megatron_transformer(
|
||||
model_name: Optional[str] = None,
|
||||
pretrained: bool = True,
|
||||
config_dict: Optional[Union[dict, DictConfig]] = None,
|
||||
encoder: bool = True,
|
||||
checkpoint_file: str = None,
|
||||
) -> MegatronEncoderModule:
|
||||
|
||||
vocab_file = config_dict.pop('vocab_file', None)
|
||||
if encoder:
|
||||
model = MegatronEncoderModule(
|
||||
model_name=model_name,
|
||||
pretrained=pretrained,
|
||||
config_dict=config_dict,
|
||||
checkpoint_file=checkpoint_file,
|
||||
vocab_file=vocab_file,
|
||||
)
|
||||
else:
|
||||
raise ValueError('Megatron decoders are not currently supported.')
|
||||
|
||||
return model
|
||||
|
|
|
@ -29,6 +29,7 @@ from pytorch_lightning.utilities.cloud_io import atomic_save
|
|||
from torch.nn.parallel import DistributedDataParallel
|
||||
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_bert import MegatronBertEncoder
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_encoder import MegatronEncoderModule
|
||||
from nemo.utils import AppState, logging
|
||||
|
||||
|
||||
|
@ -52,20 +53,18 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
# call PTL init ddp
|
||||
super().init_ddp_connection()
|
||||
|
||||
# init model parallel
|
||||
# init model parallel if needed
|
||||
app_state = AppState()
|
||||
|
||||
if app_state.model_parallel_size is not None:
|
||||
|
||||
if isinstance(self.lightning_module.bert_model, MegatronBertEncoder):
|
||||
|
||||
if app_state.model_parallel_group is None:
|
||||
self.init_model_parallel(app_state.global_rank, app_state.world_size)
|
||||
if self.lightning_module.has_megatron_encoder and not self.lightning_module.is_model_parallel_initialized:
|
||||
self.init_model_parallel(app_state.global_rank, app_state.world_size)
|
||||
|
||||
def start_training(self, trainer: 'Trainer') -> None:
|
||||
""" PTL Hook that is called after DPP is initialized. """
|
||||
|
||||
if isinstance(self.lightning_module.bert_model, MegatronBertEncoder):
|
||||
if self.lightning_module.has_megatron_encoder:
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_size is not None:
|
||||
# mpu grad clipping needs parameters to have the attribute model_parallel
|
||||
|
@ -74,12 +73,8 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
if not hasattr(p, 'model_parallel'):
|
||||
p.model_parallel = False
|
||||
|
||||
# TODO: figure out how to override clip gradients again
|
||||
# Update PTL trainer to use our _clip_gradients
|
||||
# self._trainer.accelerator_backend._clip_gradients = self._clip_gradients
|
||||
|
||||
if get_checkpoint_version():
|
||||
# Restored from .nemo, checkpoint_version will already be set
|
||||
if get_checkpoint_version() is not None:
|
||||
# megatron checkpoint already restored
|
||||
pass
|
||||
elif trainer.resume_from_checkpoint is not None:
|
||||
# PTL auto-resuming, need to update checkpoint name
|
||||
|
@ -98,10 +93,13 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
logging.warning('Megatron-lm checkpoint version not found. Setting checkpoint_version to 0.')
|
||||
set_checkpoint_version(0)
|
||||
else:
|
||||
logging.info(
|
||||
f"Restoring from pretrained model parallel checkpoint: {self.lightning_module.bert_model._restore_path}"
|
||||
)
|
||||
self.lightning_module.bert_model.restore_weights(self.lightning_module.bert_model._restore_path)
|
||||
self.lightning_module.restore_megatron_encoder_weights()
|
||||
else:
|
||||
if get_checkpoint_version() is not None:
|
||||
# megatron checkpoint already restored
|
||||
pass
|
||||
else:
|
||||
self.lightning_module.restore_megatron_encoder_weights()
|
||||
|
||||
self.lightning_module.register_megatron_checkpoint_version()
|
||||
|
||||
|
@ -113,7 +111,7 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
|
||||
if app_state.model_parallel_size is not None:
|
||||
|
||||
if isinstance(self.lightning_module.bert_model, MegatronBertEncoder):
|
||||
if self.lightning_module.has_megatron_encoder:
|
||||
# check megatron checkpoint version
|
||||
checkpoint_version = get_checkpoint_version()
|
||||
if checkpoint_version is None:
|
||||
|
@ -140,6 +138,7 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
device_ids=device_ids,
|
||||
output_device=device_ids[0],
|
||||
process_group=app_state.data_parallel_group,
|
||||
find_unused_parameters=True,
|
||||
**self._ddp_kwargs,
|
||||
)
|
||||
|
||||
|
@ -168,7 +167,6 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
app_state.data_parallel_size = mpu.get_data_parallel_world_size()
|
||||
logging.info(f'mp_rank: {app_state.model_parallel_rank}')
|
||||
logging.info(f'dp_rank: {app_state.data_parallel_rank}')
|
||||
# TODO: get random seed from PTL
|
||||
seed = os.environ.get("PL_GLOBAL_SEED", 1234)
|
||||
# random seed must be set for megatron model parallel init
|
||||
_set_random_seed(seed)
|
||||
|
|
|
@ -30,6 +30,7 @@ from pytorch_lightning.callbacks import ModelCheckpoint
|
|||
from pytorch_lightning.loggers import LoggerCollection as _LoggerCollection
|
||||
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
|
||||
from pytorch_lightning.utilities import rank_zero_only
|
||||
from pytorch_lightning.utilities.types import _METRIC
|
||||
|
||||
from nemo.constants import NEMO_ENV_VARNAME_VERSION
|
||||
from nemo.utils import app_state, logging
|
||||
|
@ -645,6 +646,73 @@ class NeMoModelCheckpoint(ModelCheckpoint):
|
|||
trainer.checkpoint_connector.restore(self.best_model_path, on_gpu=trainer.on_gpu)
|
||||
pl_module.save_to(save_path=os.path.join(self.dirpath, self.prefix + self.postfix))
|
||||
|
||||
def _del_model(self, filepath: str) -> None:
|
||||
""" Overrides PTL method to account for model parallel checkpoints.
|
||||
Updates checkpoint path based on model parallel rank.
|
||||
"""
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_size is not None:
|
||||
# filepath needs to be updated to include mp_rank
|
||||
dirname = os.path.dirname(filepath)
|
||||
basename = os.path.basename(filepath)
|
||||
filepath = f'{dirname}/mp_rank_{app_state.model_parallel_rank:02d}/{basename}'
|
||||
|
||||
# each model parallel rank needs to remove its model
|
||||
if app_state.data_parallel_rank == 0:
|
||||
if self._fs.exists(filepath):
|
||||
self._fs.rm(filepath)
|
||||
logging.info(f"Removed model parallel checkpoint: {filepath}")
|
||||
|
||||
else:
|
||||
return super()._del_model(filepath)
|
||||
|
||||
def _save_last_checkpoint(self, trainer: 'pl.Trainer', monitor_candidates: Dict[str, _METRIC]) -> None:
|
||||
""" Overrides PTL method to account for model parallel checkpoints.
|
||||
Checks for data parallel rank 0 rather than global rank 0.
|
||||
"""
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_size is not None:
|
||||
if not self.save_last:
|
||||
return
|
||||
|
||||
filepath = self._format_checkpoint_name(self.CHECKPOINT_NAME_LAST, monitor_candidates)
|
||||
filepath = os.path.join(self.dirpath, f"{filepath}{self.FILE_EXTENSION}")
|
||||
|
||||
self._save_model(trainer, filepath)
|
||||
|
||||
# for model parallel we need to delete models for each model parallel rank
|
||||
if self.last_model_path and self.last_model_path != filepath and app_state.data_parallel_rank == 0:
|
||||
self._del_model(self.last_model_path)
|
||||
|
||||
self.last_model_path = filepath
|
||||
|
||||
else:
|
||||
return super()._save_last_checkpoint(trainer, monitor_candidates)
|
||||
|
||||
def _save_none_monitor_checkpoint(self, trainer: 'pl.Trainer', monitor_candidates: Dict[str, _METRIC]) -> None:
|
||||
""" Overrides PTL method to account for model parallel checkpoints.
|
||||
Checks for data parallel rank 0 rather than global rank 0.
|
||||
"""
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_size is not None:
|
||||
if self.monitor is not None or self.save_top_k == 0:
|
||||
return
|
||||
|
||||
filepath = self._get_metric_interpolated_filepath_name(monitor_candidates, trainer)
|
||||
self._save_model(trainer, filepath)
|
||||
|
||||
if (
|
||||
self.save_top_k is None
|
||||
and self.best_model_path
|
||||
and self.best_model_path != filepath
|
||||
and app_state.data_parallel_rank == 0
|
||||
):
|
||||
self._del_model(self.best_model_path)
|
||||
|
||||
self.best_model_path = filepath
|
||||
else:
|
||||
return super()._save_none_monitor_checkpoint(trainer, monitor_candidates)
|
||||
|
||||
|
||||
def configure_checkpointing(trainer: 'pytorch_lightning.Trainer', log_dir: Path, name: str, params: 'DictConfig'):
|
||||
""" Adds ModelCheckpoint to trainer. Raises CheckpointMisconfigurationError if trainer already has a ModelCheckpoint
|
||||
|
|
tutorials/asr/10_ASR_CTC_Language_Finetuning.ipynb (new file) | 2163
(File diff suppressed because it is too large.)