diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8d58130f4..394ed9278 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -5,20 +5,31 @@ We do all of NeMo's development in the open. Contributions from NeMo community a
 
 # Pull Requests (PR) Guidelines
 
+**Send your PRs to the `main` branch**
+
 1) Make sure your PR does one thing. Have a clear answer to "What does this PR do?".
 2) Read General Principles and style guide below
-3) Make sure unittest pass on your machine
-4) Make sure you sign your commits. E.g. use ``git commit -s`` when before your commit
-5) Make sure all unittests finish successfully before sending PR ``pytest`` or (if yor dev box does not have GPU) ``pytest --cpu`` from NeMo's root folder
-6) Send your PR and request a review
+3) Make sure you sign your commits. E.g. use ``git commit -s`` when committing
+4) Make sure all unittests finish successfully before sending your PR: run ``pytest`` or (if your dev box does not have a GPU) ``pytest --cpu`` from NeMo's root folder
+5) Send your PR and request a review
 
-Send your PR to the `main` branch
+## Unit tests
+Quick tests (locally, while developing)
+```
+pytest
+# If you don't have an NVIDIA GPU do:
+# pytest --cpu
+```
+Full tests, including pre-trained model downloads
+```
+pytest --with_downloads
+```
 
-Whom should you ask for review:
-1. For changes to NeMo's core: @okuchaiev, @blisc, @titu1994, @tkornuta-nvidia, or @ericharper
-1. For changes to NeMo's ASR collection: @okuchaiev, @titu1994, @redoctopus, @blisc, or @vsl9
-1. For changes to NeMo's NLP collection: @ekmb, @yzhang123, @VahidooX, @vladgets, or @ericharper
-1. For changes to NeMo's TTS collection: @blisc or @stasbel
+## Whom should you ask for review:
+1. For changes to NeMo's core: @ericharper, @titu1994, @blisc, or @okuchaiev
+1. For changes to NeMo's ASR collection: @titu1994, @redoctopus, @jbalam-nv, or @okuchaiev
+1. For changes to NeMo's NLP collection: @MaximumEntropy, @ericharper, @ekmb, @yzhang123, @VahidooX, @vladgets, or @okuchaiev
+1. For changes to NeMo's TTS collection: @blisc, @stasbel, or @okuchaiev
 
 Note that some people may self-assign to review your PR - in which case, please wait for them to add a review.
 
@@ -27,7 +38,7 @@ Your pull requests must pass all checks and peer-review before they can be merg
 # General principles
 1. **User-oriented**: make it easy for end users, even at the cost of writing more code in the background
 1. **Robust**: make it hard for users to make mistakes.
-1. **Supporting of both training and inferencing**: if a module can only be used for training, write a companion module to be used during inference.
+1. **Well-tested**: please add simple, fast unittests. Consider adding CI tests for end-to-end functionality.
 1. **Reusable**: for every piece of code, think about how it can be reused in the future and make it easy to be reused.
 1. **Readable**: code should be easier to read.
 1. **Legal**: if you copy even one line of code from the Internet, make sure that the code allows the license that NeMo supports. Give credit and link back to the code.
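The `--with_downloads` flow documented above is wired up as a pytest command-line option plus a matching test marker (see the `tests/conftest.py` hunk later in this diff). As a quick illustration of how a test opts in, here is a minimal sketch; the test name and body are illustrative, not part of this diff:

```python
import pytest

from nemo.collections.asr.models import EncDecCTCModel


@pytest.mark.with_downloads()  # skipped unless pytest is invoked with --with_downloads
@pytest.mark.unit
def test_pretrained_quartznet_loads():
    # Pulls (and caches) weights from the cloud, so this only runs when
    # the --with_downloads option is passed on the command line.
    model = EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")
    assert model is not None
```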
diff --git a/Jenkinsfile b/Jenkinsfile
index 6c0921675..89e402e8b 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -54,7 +54,7 @@ pipeline {
 
     stage('L0: Unit Tests GPU') {
       steps {
-        sh 'pytest -m "unit and not skipduringci and not pleasefixme"'
+        sh 'pytest -m "not pleasefixme" --with_downloads'
      }
    }
@@ -66,7 +66,7 @@
        }
      }
      steps {
-        sh 'CUDA_VISIBLE_DEVICES="" pytest -m "unit and not pleasefixme" --cpu'
+        sh 'CUDA_VISIBLE_DEVICES="" pytest -m "not pleasefixme" --cpu --with_downloads'
      }
    }
diff --git a/tests/collections/nlp/test_huggingface.py b/tests/collections/nlp/test_huggingface.py
index 5c5fff136..8656c0a15 100644
--- a/tests/collections/nlp/test_huggingface.py
+++ b/tests/collections/nlp/test_huggingface.py
@@ -40,30 +40,35 @@ class TestHuggingFace(TestCase):
         pretrained_lm_models = nemo_nlp.modules.get_pretrained_lm_models_list()
         self.assertTrue(len(pretrained_lm_models) > 0)
 
+    @pytest.mark.with_downloads()
     @pytest.mark.unit
     def test_get_pretrained_bert_model(self):
         model = nemo_nlp.modules.get_lm_model(pretrained_model_name='bert-base-uncased')
         assert isinstance(model, nemo_nlp.modules.BertEncoder)
         do_export(model, "bert-base-uncased")
 
+    @pytest.mark.with_downloads()
     @pytest.mark.unit
     def test_get_pretrained_distilbert_model(self):
         model = nemo_nlp.modules.get_lm_model(pretrained_model_name='distilbert-base-uncased')
         assert isinstance(model, nemo_nlp.modules.DistilBertEncoder)
         do_export(model, "distilbert-base-uncased")
 
+    @pytest.mark.with_downloads()
     @pytest.mark.unit
     def test_get_pretrained_roberta_model(self):
         model = nemo_nlp.modules.get_lm_model(pretrained_model_name='roberta-base')
         assert isinstance(model, nemo_nlp.modules.RobertaEncoder)
         do_export(model, "roberta-base-uncased")
 
+    @pytest.mark.with_downloads()
     @pytest.mark.unit
     def test_get_pretrained_albert_model(self):
         model = nemo_nlp.modules.get_lm_model(pretrained_model_name='albert-base-v1')
         assert isinstance(model, nemo_nlp.modules.AlbertEncoder)
         do_export(model, "albert-base-v1")
 
+    @pytest.mark.with_downloads()
     @pytest.mark.unit
     def test_get_pretrained_chinese_bert_wwm_model(self):
         model_name = 'hfl/chinese-bert-wwm'
@@ -72,6 +77,7 @@ class TestHuggingFace(TestCase):
         tokenizer = get_tokenizer(tokenizer_name=model_name)
         assert isinstance(tokenizer, AutoTokenizer)
 
+    @pytest.mark.with_downloads()
     @pytest.mark.unit
     def test_get_pretrained_arabic_model(self):
         model_name = 'asafaya/bert-base-arabic'
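Each of the tests above ends with a `do_export` call. That helper is defined earlier in `test_huggingface.py`, outside this hunk. Purely for context, here is a plausible sketch of such a helper; the body is an assumption, not NeMo's actual implementation:

```python
import os
import tempfile


def do_export(model, name: str):
    # Assumed behavior: smoke-test ONNX export of the module into a
    # throwaway directory, mirroring the model.export(...) usage seen
    # in test_megatron.py below.
    with tempfile.TemporaryDirectory() as tmpdir:
        model.export(os.path.join(tmpdir, f"{name}.onnx"))
```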
diff --git a/tests/collections/nlp/test_megatron.py b/tests/collections/nlp/test_megatron.py
index 868c5add9..ad6d531bc 100644
--- a/tests/collections/nlp/test_megatron.py
+++ b/tests/collections/nlp/test_megatron.py
@@ -31,6 +31,14 @@
 import nemo.collections.nlp as nemo_nlp
 from nemo.core.classes import typecheck
 
+def get_pretrained_bert_345m_uncased_model():
+    model_name = "megatron-bert-345m-uncased"
+    model = nemo_nlp.modules.get_lm_model(pretrained_model_name=model_name)
+    if torch.cuda.is_available():
+        model = model.cuda()
+    return model
+
+
 class TestMegatron(TestCase):
     @pytest.mark.run_only_on('GPU')
     @pytest.mark.unit
@@ -38,33 +46,31 @@ class TestMegatron(TestCase):
         pretrained_lm_models = nemo_nlp.modules.get_pretrained_lm_models_list()
         self.assertTrue(len(pretrained_lm_models) > 0)
 
+    @pytest.mark.skipif(not os.path.exists('/home/TestData/nlp'), reason='Not a Jenkins machine')
+    @pytest.mark.with_downloads()
     @pytest.mark.run_only_on('GPU')
     @pytest.mark.unit
-    def test_get_pretrained_bert_345m_uncased_model(self):
-        model_name = "megatron-bert-345m-uncased"
-        model = nemo_nlp.modules.get_lm_model(pretrained_model_name=model_name)
-        if torch.cuda.is_available():
-            model = model.cuda()
-
+    def test_get_model(self):
+        model = get_pretrained_bert_345m_uncased_model()
         assert isinstance(model, nemo_nlp.modules.MegatronBertEncoder)
         typecheck.set_typecheck_enabled(enabled=False)
         inp = model.input_example()
         out = model.forward(*inp)
         typecheck.set_typecheck_enabled(enabled=True)
-        self.model = model
 
     @pytest.mark.run_only_on('GPU')
     @pytest.mark.unit
     @pytest.mark.skip('ONNX export is broken in PyTorch')
     def test_onnx_export(self):
-        assert self.model
+        model = get_pretrained_bert_345m_uncased_model()
+        assert model
         with tempfile.TemporaryDirectory() as tmpdir:
             # Generate filename in the temporary directory.
             # Test export.
-            self.model.export(os.path.join(tmpdir, "megatron.onnx"))
+            model.export(os.path.join(tmpdir, "megatron.onnx"))
 
 
 if __name__ == "__main__":
     t = TestMegatron()
-    t.test_get_pretrained_bert_345m_uncased_model()
+    t.test_get_model()
 
diff --git a/tests/collections/nlp/test_pretrained_models_performance.py b/tests/collections/nlp/test_pretrained_models_performance.py
index d3359c64c..f771193ee 100644
--- a/tests/collections/nlp/test_pretrained_models_performance.py
+++ b/tests/collections/nlp/test_pretrained_models_performance.py
@@ -55,6 +55,7 @@ def data_exists(data_dir):
 
 
 class TestPretrainedModelPerformance(TestCase):
+    @pytest.mark.with_downloads()
     @pytest.mark.unit
     @pytest.mark.run_only_on('GPU')
     @pytest.mark.skipif(
@@ -83,6 +84,7 @@
     @pytest.mark.skipif(
         not data_exists('/home/TestData/nlp/token_classification_punctuation/fisher'), reason='Not a Jenkins machine'
     )
+    @pytest.mark.with_downloads()
     def test_punct_capit_with_distilbert(self):
         data_dir = '/home/TestData/nlp/token_classification_punctuation/fisher'
         model = models.PunctuationCapitalizationModel.from_pretrained("punctuation_en_distilbert")
@@ -93,6 +95,7 @@
         assert abs(metrics['punct_f1'] - 52.4225) < 0.001
         assert int(model.punct_class_report.total_examples) == 128
 
+    @pytest.mark.with_downloads()
     @pytest.mark.unit
     @pytest.mark.run_only_on('GPU')
     @pytest.mark.skipif(
diff --git a/tests/conftest.py b/tests/conftest.py
index 8b5ec47ff..75a8b6ddc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -42,6 +42,11 @@ def pytest_addoption(parser):
         action='store_true',
         help="pass that argument to use local test data/skip downloading from URL/GitHub (DEFAULT: False)",
     )
+    parser.addoption(
+        '--with_downloads',
+        action='store_true',
+        help="pass this argument to activate tests which download models from the cloud.",
+    )
 
 
 @pytest.fixture
@@ -60,6 +65,15 @@ def run_only_on_device_fixture(request, device):
         pytest.skip('skipped on this device: {}'.format(device))
 
 
+@pytest.fixture(autouse=True)
+def downloads_weights(request, device):
+    if request.node.get_closest_marker('with_downloads'):
+        if not request.config.getoption("--with_downloads"):
+            pytest.skip(
+                'To run this test, pass --with_downloads option. It will download (and cache) models from cloud.'
+            )
+
+
 @pytest.fixture
 def cleanup_local_folder():
     # Asserts in fixture are not recommended, but I'd rather stop users from deleting expensive training runs
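One thing the `conftest.py` hunk does not show is registration of the new `with_downloads` mark. If it is not declared elsewhere (e.g. in `pytest.ini` or `setup.cfg`, which is an assumption here), recent pytest versions emit `PytestUnknownMarkWarning` for every marked test. A sketch of what registration could look like in the same `conftest.py`:

```python
def pytest_configure(config):
    # Register the custom mark so pytest does not warn about it.
    # (Assumption: the mark is not already declared in pytest.ini/setup.cfg.)
    config.addinivalue_line(
        "markers", "with_downloads: the test downloads models from the cloud"
    )
```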
diff --git a/tests/core/test_save_restore.py b/tests/core/test_save_restore.py
index a4c8e4fd3..c23f2ca0b 100644
--- a/tests/core/test_save_restore.py
+++ b/tests/core/test_save_restore.py
@@ -120,24 +120,28 @@ class TestSaveRestore:
         return model_copy
 
+    @pytest.mark.with_downloads()
     @pytest.mark.unit
     def test_EncDecCTCModel(self):
         # TODO: Switch to using named configs because here we don't really care about weights
         qn = EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")
         self.__test_restore_elsewhere(model=qn, attr_for_eq_check=set(["decoder._feat_in", "decoder._num_classes"]))
 
+    @pytest.mark.with_downloads()
     @pytest.mark.unit
     def test_EncDecCTCModelBPE(self):
         # TODO: Switch to using named configs because here we don't really care about weights
         cn = EncDecCTCModelBPE.from_pretrained(model_name="stt_en_citrinet_256")
         self.__test_restore_elsewhere(model=cn, attr_for_eq_check=set(["decoder._feat_in", "decoder._num_classes"]))
 
+    @pytest.mark.with_downloads()
     @pytest.mark.unit
     def test_EncDecCTCModelBPE_v2(self):
         # TODO: Switch to using named configs because here we don't really care about weights
         cn = EncDecCTCModelBPE.from_pretrained(model_name="stt_en_conformer_ctc_small")
         self.__test_restore_elsewhere(model=cn, attr_for_eq_check=set(["decoder._feat_in", "decoder._num_classes"]))
 
+    @pytest.mark.with_downloads()
     @pytest.mark.unit
     def test_PunctuationCapitalization(self):
         # TODO: Switch to using named configs because here we don't really care about weights
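For context on what `__test_restore_elsewhere` exercises in the tests above: the core operation is NeMo's `save_to`/`restore_from` round trip on a `.nemo` archive. A minimal sketch under that assumption (paths are illustrative; the exact helper logic lives in `test_save_restore.py`):

```python
import os
import tempfile

from nemo.collections.asr.models import EncDecCTCModel

# Download a pretrained checkpoint, pack it into a .nemo archive,
# then restore it from that archive as an independent copy.
model = EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")
with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, "quartznet.nemo")
    model.save_to(path)
    restored = EncDecCTCModel.restore_from(path)
    assert isinstance(restored, EncDecCTCModel)
```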