Compare commits
6 commits
main
...
PTL_upgrad
Author | SHA1 | Date | |
---|---|---|---|
cf90279444 | |||
19d3b7d73c | |||
629201e81b | |||
4b3616106e | |||
dd85012beb | |||
2596571904 |
138
Jenkinsfile
vendored
138
Jenkinsfile
vendored
|
@ -45,8 +45,8 @@ pipeline {
|
|||
stage('Torch TTS unit tests') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
steps {
|
||||
|
@ -101,8 +101,8 @@ pipeline {
|
|||
stage('L0: Unit Tests CPU') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
steps {
|
||||
|
@ -113,8 +113,8 @@ pipeline {
|
|||
stage('L0: TN/ITN Tests CPU') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -159,8 +159,8 @@ pipeline {
|
|||
stage('L2: NeMo text processing') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -214,8 +214,8 @@ pipeline {
|
|||
stage('L0: Computer Vision Integration') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -242,8 +242,8 @@ pipeline {
|
|||
// stage('L0: Integration Tests CPU') {
|
||||
// when {
|
||||
// anyOf{
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// steps {
|
||||
|
@ -262,7 +262,7 @@ pipeline {
|
|||
// when {
|
||||
// anyOf{
|
||||
// branch 'dev'
|
||||
// changeRequest target: 'main'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// steps {
|
||||
|
@ -273,8 +273,8 @@ pipeline {
|
|||
stage('L2: ASR dev run') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -377,8 +377,8 @@ pipeline {
|
|||
// stage('L2: ASR DALI dev run') {
|
||||
// when {
|
||||
// anyOf {
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// failFast true
|
||||
|
@ -442,8 +442,8 @@ pipeline {
|
|||
// stage('L2: ASR RNNT dev run') {
|
||||
// when {
|
||||
// anyOf {
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// failFast true
|
||||
|
@ -484,8 +484,8 @@ pipeline {
|
|||
stage('L2: ASR Multi-dataloader dev run') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -530,8 +530,8 @@ pipeline {
|
|||
stage('L2: Speech Transcription') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -553,8 +553,8 @@ pipeline {
|
|||
stage('L2: Segmentation Tool') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
stages {
|
||||
|
@ -617,8 +617,8 @@ pipeline {
|
|||
// stage('L2: Multi-GPU Megatron finetuning') {
|
||||
// when {
|
||||
// anyOf {
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// failFast true
|
||||
|
@ -642,8 +642,8 @@ pipeline {
|
|||
stage('L2: SGD-QA') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -708,8 +708,8 @@ pipeline {
|
|||
stage('L2: Parallel BERT SQUAD v1.1 / v2.0') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -788,8 +788,8 @@ pipeline {
|
|||
// stage('L2: MegaBERT Token Classification') {
|
||||
// when {
|
||||
// anyOf {
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// failFast true
|
||||
|
@ -812,8 +812,8 @@ pipeline {
|
|||
stage('L2: Parallel SQUAD v1.1 & v2.0') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -902,8 +902,8 @@ pipeline {
|
|||
// stage('L2: Model Parallel Size 2 Megatron Text Classification') {
|
||||
// when {
|
||||
// anyOf{
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// failFast true
|
||||
|
@ -930,8 +930,8 @@ pipeline {
|
|||
// stage('L2: Model Parallel Size 2 Megatron Autoresume') {
|
||||
// when {
|
||||
// anyOf{
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// failFast true
|
||||
|
@ -960,8 +960,8 @@ pipeline {
|
|||
// stage('L2: Model Parallel Size 2 Megatron Evaluation from .nemo') {
|
||||
// when {
|
||||
// anyOf{
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// failFast true
|
||||
|
@ -980,8 +980,8 @@ pipeline {
|
|||
// stage('L2: Model Parallel Size 2 Megatron Train from .nemo') {
|
||||
// when {
|
||||
// anyOf{
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// failFast true
|
||||
|
@ -1002,8 +1002,8 @@ pipeline {
|
|||
stage('L2: Parallel NLP Examples 2') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -1084,8 +1084,8 @@ pipeline {
|
|||
stage('L2: Parallel Pretraining BERT pretraining from Text/Preprocessed') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -1144,8 +1144,8 @@ pipeline {
|
|||
stage('L2: Entity Linking') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -1170,8 +1170,8 @@ pipeline {
|
|||
stage('L2: NMT Attention is All You Need Training') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -1251,8 +1251,8 @@ pipeline {
|
|||
stage('L2: NMT Attention is All You Need Inference') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -1286,8 +1286,8 @@ pipeline {
|
|||
stage('L2: NMT with HuggingFace') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -1361,8 +1361,8 @@ pipeline {
|
|||
// stage('L2: NMT Megatron BERT Model Parallel Size 2 Encoder') {
|
||||
// when {
|
||||
// anyOf{
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// failFast true
|
||||
|
@ -1395,8 +1395,8 @@ pipeline {
|
|||
stage('L2: NMT Tarred Dataset Creation') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
@ -1449,8 +1449,8 @@ pipeline {
|
|||
// stage('L2: NMT Bottleneck Fallback') {
|
||||
// when {
|
||||
// anyOf {
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// failFast true
|
||||
|
@ -1495,8 +1495,8 @@ pipeline {
|
|||
// stage('L2: NMT Bottleneck Architecture') {
|
||||
// when {
|
||||
// anyOf {
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// failFast true
|
||||
|
@ -1576,8 +1576,8 @@ pipeline {
|
|||
// stage('L2: NMT Bottleneck LVM') {
|
||||
// when {
|
||||
// anyOf {
|
||||
// branch 'main'
|
||||
// changeRequest target: 'main'
|
||||
// branch 'r1.5.0'
|
||||
// changeRequest target: 'r1.5.0'
|
||||
// }
|
||||
// }
|
||||
// failFast true
|
||||
|
@ -1658,8 +1658,8 @@ pipeline {
|
|||
stage('L2: TTS Fast dev runs 1') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
parallel {
|
||||
|
@ -1741,8 +1741,8 @@ pipeline {
|
|||
stage('L??: Speech Checkpoints tests') {
|
||||
when {
|
||||
anyOf {
|
||||
branch 'main'
|
||||
changeRequest target: 'main'
|
||||
branch 'r1.5.0'
|
||||
changeRequest target: 'r1.5.0'
|
||||
}
|
||||
}
|
||||
failFast true
|
||||
|
|
|
@ -228,7 +228,8 @@ trainer:
|
|||
max_epochs: 100
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: false # Provided by exp_manager
|
||||
logger: false # Provided by exp_manager
|
||||
|
|
|
@ -392,7 +392,8 @@ trainer:
|
|||
max_epochs: 100
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: false # Provided by exp_manager
|
||||
logger: false # Provided by exp_manager
|
||||
|
|
|
@ -392,7 +392,8 @@ trainer:
|
|||
max_epochs: 100
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: false # Provided by exp_manager
|
||||
logger: false # Provided by exp_manager
|
||||
|
|
|
@ -158,7 +158,8 @@ trainer:
|
|||
max_epochs: 5
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -162,7 +162,8 @@ trainer:
|
|||
max_epochs: 5
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -153,7 +153,8 @@ trainer:
|
|||
max_epochs: 1000
|
||||
max_steps: null # computed at runtime if not set
|
||||
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
gradient_clip_val: 0.0
|
||||
precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
|
||||
|
|
|
@ -128,7 +128,8 @@ trainer:
|
|||
max_epochs: 1000
|
||||
max_steps: null # computed at runtime if not set
|
||||
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
gradient_clip_val: 0.0
|
||||
precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
|
||||
|
|
|
@ -203,7 +203,8 @@ trainer:
|
|||
max_epochs: 1000
|
||||
max_steps: null # computed at runtime if not set
|
||||
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
gradient_clip_val: 0.0
|
||||
precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
|
||||
|
|
|
@ -198,7 +198,8 @@ trainer:
|
|||
max_epochs: 1000
|
||||
max_steps: null # computed at runtime if not set
|
||||
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
gradient_clip_val: 0.0
|
||||
precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
|
||||
|
|
|
@ -225,7 +225,8 @@ trainer:
|
|||
max_epochs: 5
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
precision: 32
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
|
|
|
@ -226,7 +226,8 @@ trainer:
|
|||
max_epochs: 5
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
precision: 32
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
|
|
|
@ -471,7 +471,8 @@ trainer:
|
|||
max_epochs: 100
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1 # Should be set via SLURM variable `SLURM_JOB_NUM_NODES`
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: false # Provided by exp_manager
|
||||
logger: false # Provided by exp_manager
|
||||
|
|
|
@ -472,7 +472,8 @@ trainer:
|
|||
max_epochs: 100
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1 # Should be set via SLURM variable `SLURM_JOB_NUM_NODES`
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: false # Provided by exp_manager
|
||||
logger: false # Provided by exp_manager
|
||||
|
|
|
@ -183,7 +183,8 @@ trainer:
|
|||
max_epochs: 5
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -155,7 +155,8 @@ trainer:
|
|||
max_epochs: 150
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -168,7 +168,8 @@ trainer:
|
|||
max_epochs: 200
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -168,7 +168,8 @@ trainer:
|
|||
max_epochs: 200
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -253,7 +253,8 @@ trainer:
|
|||
max_epochs: 5
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -455,7 +455,8 @@ trainer:
|
|||
max_epochs: 5
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -265,7 +265,8 @@ trainer:
|
|||
max_epochs: 5
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -226,7 +226,8 @@ trainer:
|
|||
gpus: 0 # number of gpus
|
||||
max_epochs: 5
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
log_every_n_steps: 1 # Interval of logging.
|
||||
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
|
|
|
@ -97,7 +97,8 @@ trainer:
|
|||
max_epochs: 100
|
||||
max_steps: null # computed at runtime if not set
|
||||
val_check_interval: 0.5 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
gradient_clip_val: 0.0
|
||||
precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
|
||||
|
|
|
@ -97,7 +97,8 @@ trainer:
|
|||
max_epochs: 100
|
||||
max_steps: null # computed at runtime if not set
|
||||
val_check_interval: 0.5 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
gradient_clip_val: 0.0
|
||||
precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
|
||||
|
|
|
@ -34,7 +34,8 @@ python speech_to_label.py \
|
|||
model.train_ds.manifest_filepath="<path to train manifest>" \
|
||||
model.validation_ds.manifest_filepath=["<path to val manifest>","<path to test manifest>"] \
|
||||
trainer.gpus=2 \
|
||||
trainer.accelerator="ddp" \
|
||||
trainer.accelerator="gpu" \
|
||||
trainer.strategy="ddp" \
|
||||
trainer.max_epochs=200 \
|
||||
exp_manager.create_wandb_logger=True \
|
||||
exp_manager.wandb_logger_kwargs.name="MatchboxNet-3x1x64-v1" \
|
||||
|
@ -67,7 +68,8 @@ python speech_to_label.py \
|
|||
model.train_ds.manifest_filepath="<path to train manifest>" \
|
||||
model.validation_ds.manifest_filepath=["<path to val manifest>","<path to test manifest>"] \
|
||||
trainer.gpus=2 \
|
||||
trainer.accelerator="ddp" \
|
||||
trainer.accelerator="gpu" \
|
||||
trainer.strategy="ddp" \
|
||||
trainer.max_epochs=200 \
|
||||
exp_manager.create_wandb_logger=True \
|
||||
exp_manager.wandb_logger_kwargs.name="MatchboxNet-3x1x64-vad" \
|
||||
|
@ -94,7 +96,8 @@ python speech_to_label.py \
|
|||
+model.train_ds.num_worker=<num_shards used generating tarred dataset> \
|
||||
model.validation_ds.manifest_filepath=<path to validation audio_manifest.json>\
|
||||
trainer.gpus=2 \
|
||||
trainer.accelerator="ddp" \
|
||||
trainer.accelerator="gpu" \
|
||||
trainer.strategy="ddp" \
|
||||
trainer.max_epochs=200 \
|
||||
exp_manager.create_wandb_logger=True \
|
||||
exp_manager.wandb_logger_kwargs.name="MatchboxNet-3x1x64-vad" \
|
||||
|
|
|
@ -39,7 +39,8 @@ python speech_to_text_bpe.py \
|
|||
model.tokenizer.dir=<path to directory of tokenizer (not full path to the vocab file!)> \
|
||||
model.tokenizer.type=<either bpe or wpe> \
|
||||
trainer.gpus=-1 \
|
||||
trainer.accelerator="ddp" \
|
||||
trainer.accelerator="gpu" \
|
||||
trainer.strategy="ddp" \
|
||||
trainer.max_epochs=100 \
|
||||
model.optim.name="adamw" \
|
||||
model.optim.lr=0.001 \
|
||||
|
|
|
@ -39,7 +39,7 @@ python speech_to_text_rnnt_bpe.py \
|
|||
model.tokenizer.dir=<path to directory of tokenizer (not full path to the vocab file!)> \
|
||||
model.tokenizer.type=<either bpe or wpe> \
|
||||
trainer.gpus=-1 \
|
||||
trainer.accelerator="ddp" \
|
||||
trainer.accelerator="gpu" \
|
||||
trainer.max_epochs=100 \
|
||||
model.optim.name="adamw" \
|
||||
model.optim.lr=0.001 \
|
||||
|
|
|
@ -35,7 +35,7 @@ class AppConfig(Config):
|
|||
"""
|
||||
|
||||
name: str = "Training of a LeNet-5 Model using a pure PyTorchLightning approach - using DDP on 2 GPUs."
|
||||
trainer: TrainerConfig = TrainerConfig(gpus=2, accelerator="ddp")
|
||||
trainer: TrainerConfig = TrainerConfig(gpus=2, accelerator="gpu", strategy="ddp")
|
||||
model: MNISTLeNet5Config = MNISTLeNet5Config()
|
||||
|
||||
|
||||
|
|
|
@ -23,7 +23,8 @@ trainer:
|
|||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
gradient_clip_val: 1.0
|
||||
precision: 16 # Should be set to 16 for O1 and O2 to enable the AMP.
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
log_every_n_steps: 5 # Interval of logging.
|
||||
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
|
||||
|
|
|
@ -16,7 +16,8 @@ tagger_trainer:
|
|||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
gradient_clip_val: 0.0
|
||||
precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
|
||||
tagger_model:
|
||||
do_training: true
|
||||
|
@ -66,7 +67,8 @@ decoder_trainer:
|
|||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
gradient_clip_val: 0.0
|
||||
precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
log_every_n_steps: 1 # Interval of logging.
|
||||
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
|
||||
|
|
|
@ -7,7 +7,8 @@ trainer:
|
|||
max_steps: null
|
||||
accumulate_grad_batches: 1
|
||||
precision: 16
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
gradient_clip_val: 0.0
|
||||
log_every_n_steps: 1
|
||||
val_check_interval: 2
|
||||
|
|
|
@ -7,7 +7,8 @@ trainer:
|
|||
max_steps: null
|
||||
accumulate_grad_batches: 1
|
||||
precision: 16
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
gradient_clip_val: 0.0
|
||||
log_every_n_steps: 1
|
||||
val_check_interval: 1000
|
||||
|
|
|
@ -8,7 +8,8 @@ trainer:
|
|||
max_steps: null # precedence over max_epochs
|
||||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
precision: 16
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
||||
|
|
|
@ -7,7 +7,8 @@ trainer:
|
|||
max_steps: null # precedence over max_epochs
|
||||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
precision: 16 # 16 to use AMP
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
log_every_n_steps: 1 # Interval of logging.
|
||||
val_check_interval: 0.05 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
checkpoint_callback: false # provided by exp_manager
|
||||
|
|
|
@ -7,7 +7,8 @@ trainer:
|
|||
max_steps: null # precedence over max_epochs
|
||||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
precision: 32 # Should be set to 16 for O1 and O2 amp_level to enable the AMP.
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
log_every_n_steps: 1 # Interval of logging.
|
||||
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
|
||||
|
|
|
@ -8,7 +8,8 @@ trainer:
|
|||
replace_sampler_ddp: false # needed for bert pretraining from preproc
|
||||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
precision: 16 # 16 to use AMP
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
gradient_clip_val: 1.0
|
||||
log_every_n_steps: 1
|
||||
val_check_interval: 1.0 # check once per epoch .25 for 4 times per epoch
|
||||
|
|
|
@ -7,7 +7,8 @@ trainer:
|
|||
max_steps: null # precedence over max_epochs
|
||||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
precision: 16 # 16 to use AMP
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
gradient_clip_val: 0.0
|
||||
log_every_n_steps: 1
|
||||
val_check_interval: 1.0 # check once per epoch .25 for 4 times per epoch
|
||||
|
|
|
@ -4,7 +4,8 @@ restore_from_path: null # used when starting from a .nemo file
|
|||
trainer:
|
||||
gpus: 1
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
precision: 16
|
||||
logger: False # logger provided by exp_manager
|
||||
checkpoint_callback: False
|
||||
|
|
|
@ -89,7 +89,8 @@ trainer:
|
|||
num_nodes: 1
|
||||
max_epochs: 200
|
||||
precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
checkpoint_callback: False
|
||||
logger: False
|
||||
log_every_n_steps: 50 # Interval of logging.
|
||||
|
|
|
@ -146,7 +146,8 @@ trainer:
|
|||
num_nodes: 1
|
||||
max_epochs: 200
|
||||
precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
checkpoint_callback: False
|
||||
logger: False
|
||||
log_every_n_steps: 50 # Interval of logging.
|
||||
|
|
|
@ -173,7 +173,8 @@ trainer:
|
|||
num_nodes: 1
|
||||
max_epochs: 200
|
||||
precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
checkpoint_callback: False
|
||||
logger: False
|
||||
log_every_n_steps: 50 # Interval of logging.
|
||||
|
|
|
@ -120,7 +120,8 @@ trainer:
|
|||
num_nodes: 1
|
||||
max_epochs: 200
|
||||
precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
checkpoint_callback: False
|
||||
logger: False
|
||||
log_every_n_steps: 50 # Interval of logging.
|
||||
|
|
|
@ -148,7 +148,8 @@ trainer:
|
|||
num_nodes: 1
|
||||
max_epochs: 200
|
||||
precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
strategy: ddp
|
||||
checkpoint_callback: False
|
||||
logger: False
|
||||
log_every_n_steps: 50 # Interval of logging.
|
||||
|
|
|
@ -22,6 +22,7 @@ USAGE Example:
|
|||
"""
|
||||
|
||||
|
||||
import json
|
||||
from argparse import ArgumentParser
|
||||
|
||||
import torch
|
||||
|
@ -35,6 +36,62 @@ from nemo.collections.nlp.modules.common.transformer import (
|
|||
from nemo.utils import logging
|
||||
|
||||
|
||||
def translate_text(
|
||||
models, args, src_text, tgt_text, tgt_text_all, src_texts, all_scores, all_timing, ensemble_generator
|
||||
):
|
||||
if len(models) > 1:
|
||||
src_ids, src_mask = models[0].prepare_inference_batch(src_text)
|
||||
best_translations = ensemble_generator(src_ids, src_mask, return_beam_scores=args.write_scores)
|
||||
if args.write_scores:
|
||||
all_results, scores, best_translations = (
|
||||
best_translations[0],
|
||||
best_translations[1],
|
||||
best_translations[2],
|
||||
)
|
||||
scores = scores.view(-1).data.cpu().numpy().tolist()
|
||||
all_scores += scores
|
||||
src_texts += [item for item in src_text for i in range(args.beam_size)]
|
||||
all_results = models[0].ids_to_postprocessed_text(
|
||||
all_results, models[0].decoder_tokenizer, models[0].target_processor
|
||||
)
|
||||
tgt_text_all += all_results
|
||||
best_translations = models[0].ids_to_postprocessed_text(
|
||||
best_translations, models[0].decoder_tokenizer, models[0].target_processor
|
||||
)
|
||||
tgt_text += best_translations
|
||||
else:
|
||||
model = models[0]
|
||||
best_translations = model.translate(
|
||||
text=src_text,
|
||||
source_lang=args.source_lang,
|
||||
target_lang=args.target_lang,
|
||||
return_beam_scores=args.write_scores,
|
||||
log_timing=args.write_timing,
|
||||
)
|
||||
|
||||
if args.write_timing:
|
||||
*best_translations, timing_dict = best_translations
|
||||
all_timing.append(timing_dict)
|
||||
else:
|
||||
best_translations = (best_translations,)
|
||||
|
||||
if args.write_scores:
|
||||
all_results, scores, best_translations = (
|
||||
best_translations[0],
|
||||
best_translations[1],
|
||||
best_translations[2],
|
||||
)
|
||||
all_scores += scores
|
||||
src_texts += [item for item in src_text for i in range(args.beam_size)]
|
||||
tgt_text_all += all_results
|
||||
else:
|
||||
best_translations = best_translations[0]
|
||||
|
||||
tgt_text += best_translations
|
||||
|
||||
print(f"Translated {len(tgt_text)} sentences")
|
||||
|
||||
|
||||
def main():
|
||||
parser = ArgumentParser()
|
||||
parser.add_argument(
|
||||
|
@ -71,6 +128,11 @@ def main():
|
|||
action="store_true",
|
||||
help="Whether to write a separate file with scores not including length penalties corresponding to each beam hypothesis (.score suffix)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--write_timing",
|
||||
action="store_true",
|
||||
help="Whether to write a separate file with detailed timing info (.timing.json suffix)",
|
||||
)
|
||||
# shallow fusion specific parameters
|
||||
parser.add_argument(
|
||||
"--lm_model",
|
||||
|
@ -92,11 +154,15 @@ def main():
|
|||
model = nemo_nlp.models.machine_translation.MTEncDecModel.restore_from(restore_path=model_path).eval()
|
||||
models.append(model)
|
||||
|
||||
if (len(models) > 1) and (args.write_timing):
|
||||
raise RuntimeError("Cannot measure timing when more than 1 model is used")
|
||||
|
||||
src_text = []
|
||||
tgt_text = []
|
||||
tgt_text_all = []
|
||||
src_texts = []
|
||||
all_scores = []
|
||||
all_timing = []
|
||||
|
||||
if torch.cuda.is_available():
|
||||
models = [model.cuda() for model in models]
|
||||
|
@ -124,6 +190,7 @@ def main():
|
|||
)
|
||||
else:
|
||||
model = models[0]
|
||||
ensemble_generator = None
|
||||
if lm_model is not None:
|
||||
model.beam_search = BeamSearchSequenceGeneratorWithLanguageModel(
|
||||
embedding=model.decoder.embedding,
|
||||
|
@ -155,91 +222,49 @@ def main():
|
|||
|
||||
logging.info(f"Translating: {args.srctext}")
|
||||
|
||||
count = 0
|
||||
with open(args.srctext, 'r') as src_f:
|
||||
for line in src_f:
|
||||
src_text.append(line.strip())
|
||||
if len(src_text) == args.batch_size:
|
||||
if len(models) > 1:
|
||||
src_ids, src_mask = models[0].prepare_inference_batch(src_text)
|
||||
best_translations = ensemble_generator(src_ids, src_mask, return_beam_scores=args.write_scores)
|
||||
if args.write_scores:
|
||||
all_results, scores, best_translations = (
|
||||
best_translations[0],
|
||||
best_translations[1],
|
||||
best_translations[2],
|
||||
)
|
||||
scores = scores.view(-1).data.cpu().numpy().tolist()
|
||||
all_scores += scores
|
||||
src_texts += [item for item in src_text for i in range(args.beam_size)]
|
||||
all_results = models[0].ids_to_postprocessed_text(
|
||||
all_results, models[0].decoder_tokenizer, models[0].target_processor
|
||||
)
|
||||
tgt_text_all += all_results
|
||||
best_translations = models[0].ids_to_postprocessed_text(
|
||||
best_translations, models[0].decoder_tokenizer, models[0].target_processor
|
||||
# warmup when measuring timing
|
||||
if not all_timing:
|
||||
print("running a warmup batch")
|
||||
translate_text(
|
||||
models=models,
|
||||
args=args,
|
||||
src_text=src_text,
|
||||
tgt_text=[],
|
||||
tgt_text_all=[],
|
||||
src_texts=[],
|
||||
all_scores=[],
|
||||
all_timing=[],
|
||||
ensemble_generator=ensemble_generator,
|
||||
)
|
||||
tgt_text += best_translations
|
||||
else:
|
||||
best_translations = model.translate(
|
||||
text=src_text,
|
||||
source_lang=args.source_lang,
|
||||
target_lang=args.target_lang,
|
||||
return_beam_scores=args.write_scores,
|
||||
)
|
||||
if args.write_scores:
|
||||
all_results, scores, best_translations = (
|
||||
best_translations[0],
|
||||
best_translations[1],
|
||||
best_translations[2],
|
||||
)
|
||||
all_scores += scores
|
||||
src_texts += [item for item in src_text for i in range(args.beam_size)]
|
||||
tgt_text_all += all_results
|
||||
tgt_text += best_translations
|
||||
translate_text(
|
||||
models=models,
|
||||
args=args,
|
||||
src_text=src_text,
|
||||
tgt_text=tgt_text,
|
||||
tgt_text_all=tgt_text_all,
|
||||
src_texts=src_texts,
|
||||
all_scores=all_scores,
|
||||
all_timing=all_timing,
|
||||
ensemble_generator=ensemble_generator,
|
||||
)
|
||||
src_text = []
|
||||
print(f"Translated {count + 1} sentences")
|
||||
count += 1
|
||||
|
||||
if len(src_text) > 0:
|
||||
if len(models) > 1:
|
||||
src_ids, src_mask = models[0].prepare_inference_batch(src_text)
|
||||
best_translations = ensemble_generator(src_ids, src_mask, return_beam_scores=args.write_scores)
|
||||
if args.write_scores:
|
||||
all_results, scores, best_translations = (
|
||||
best_translations[0],
|
||||
best_translations[1],
|
||||
best_translations[2],
|
||||
)
|
||||
scores = scores.view(-1).data.cpu().numpy().tolist()
|
||||
all_scores += scores
|
||||
src_texts += [item for item in src_text for i in range(args.beam_size)]
|
||||
all_results = models[0].ids_to_postprocessed_text(
|
||||
all_results, models[0].decoder_tokenizer, models[0].target_processor
|
||||
)
|
||||
tgt_text_all += all_results
|
||||
best_translations = models[0].ids_to_postprocessed_text(
|
||||
best_translations, models[0].decoder_tokenizer, models[0].target_processor
|
||||
)
|
||||
tgt_text += best_translations
|
||||
else:
|
||||
best_translations = model.translate(
|
||||
text=src_text,
|
||||
source_lang=args.source_lang,
|
||||
target_lang=args.target_lang,
|
||||
return_beam_scores=args.write_scores,
|
||||
)
|
||||
if args.write_scores:
|
||||
all_results, scores, best_translations = (
|
||||
best_translations[0],
|
||||
best_translations[1],
|
||||
best_translations[2],
|
||||
)
|
||||
all_scores += scores
|
||||
src_texts += [item for item in src_text for i in range(args.beam_size)]
|
||||
tgt_text_all += all_results
|
||||
tgt_text += best_translations
|
||||
src_text = []
|
||||
print(f"Translated {count} sentences")
|
||||
translate_text(
|
||||
models=models,
|
||||
args=args,
|
||||
src_text=src_text,
|
||||
tgt_text=tgt_text,
|
||||
tgt_text_all=tgt_text_all,
|
||||
src_texts=src_texts,
|
||||
all_scores=all_scores,
|
||||
all_timing=all_timing,
|
||||
ensemble_generator=ensemble_generator,
|
||||
)
|
||||
|
||||
with open(args.tgtout, 'w') as tgt_f:
|
||||
for line in tgt_text:
|
||||
|
@ -250,6 +275,16 @@ def main():
|
|||
for line, score, inp in zip(tgt_text_all, all_scores, src_texts):
|
||||
tgt_f_scores.write(inp + "\t" + line + "\t" + str(score) + "\n")
|
||||
|
||||
if args.write_timing:
|
||||
# collect list of dicts to a dict of lists
|
||||
timing_dict = {}
|
||||
if len(all_timing):
|
||||
for k in all_timing[0].keys():
|
||||
timing_dict[k] = [t[k] for t in all_timing]
|
||||
|
||||
with open(args.tgtout + '.timing.json', 'w') as timing_fh:
|
||||
json.dump(timing_dict, timing_fh)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main() # noqa pylint: disable=no-value-for-parameter
|
||||
|
|
|
@ -10,7 +10,8 @@ trainer:
|
|||
max_steps: null # precedence over max_epochs
|
||||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
precision: 16 # 16 to use AMP
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
gradient_clip_val: 0.0
|
||||
val_check_interval: 1.0 # check once per epoch .25 for 4 times per epoch
|
||||
checkpoint_callback: false # provided by exp_manager
|
||||
|
|
|
@ -8,7 +8,8 @@ trainer:
|
|||
max_epochs: 2 # the number of training epochs
|
||||
max_steps: null # precedence over max_epochs
|
||||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
gradient_clip_val: 0.0
|
||||
log_every_n_steps: 1
|
||||
val_check_interval: 1.0 # check once per epoch .25 for 4 times per epoch
|
||||
|
|
|
@ -22,7 +22,8 @@ trainer:
|
|||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
gradient_clip_val: 0.0
|
||||
precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
log_every_n_steps: 1 # Interval of logging.
|
||||
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
|
||||
|
|
|
@ -25,7 +25,8 @@ trainer:
|
|||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
gradient_clip_val: 0.0
|
||||
precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
checkpoint_callback: false # Provided by exp_manager
|
||||
logger: false # Provided by exp_manager
|
||||
log_every_n_steps: 1 # Interval of logging.
|
||||
|
|
|
@ -24,7 +24,8 @@ trainer:
|
|||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
gradient_clip_val: 0.0
|
||||
precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when am_level is O0
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
log_every_n_steps: 1 # Interval of logging.
|
||||
|
|
|
@ -20,7 +20,8 @@ trainer:
|
|||
max_steps: null # precedence over max_epochs
|
||||
accumulate_grad_batches: 1 # accumulates grads every k batches
|
||||
precision: 16
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
log_every_n_steps: 1 # Interval of logging.
|
||||
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
|
||||
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
|
||||
|
|
|
@ -141,7 +141,8 @@ trainer:
|
|||
max_epochs: 200
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
deterministic: True
|
||||
checkpoint_callback: False
|
||||
|
|
|
@ -129,7 +129,8 @@ trainer:
|
|||
max_epochs: 200
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
deterministic: True
|
||||
checkpoint_callback: False
|
||||
|
|
|
@ -91,7 +91,8 @@ trainer:
|
|||
max_epochs: 250
|
||||
max_steps: null # computed at runtime if not set
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
deterministic: False
|
||||
checkpoint_callback: False
|
||||
|
|
|
@ -104,7 +104,8 @@ trainer:
|
|||
gpus: 1
|
||||
max_epochs: ???
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False
|
||||
logger: False
|
||||
|
|
|
@ -90,7 +90,8 @@ trainer:
|
|||
gpus: 1 # number of gpus
|
||||
max_epochs: 200
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -86,7 +86,8 @@ trainer:
|
|||
gpus: 1 # number of gpus
|
||||
max_epochs: 200
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -128,7 +128,8 @@ trainer:
|
|||
gpus: -1 # number of gpus
|
||||
max_epochs: 1500
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -167,7 +167,8 @@ trainer:
|
|||
gpus: -1 # number of gpus
|
||||
max_epochs: 1500
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -170,7 +170,8 @@ trainer:
|
|||
gpus: -1 # number of gpus
|
||||
max_epochs: 1500
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -121,7 +121,8 @@ trainer:
|
|||
gpus: -1 # number of gpus
|
||||
max_epochs: 1500
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -129,7 +129,8 @@ trainer:
|
|||
gpus: 1 # number of gpus
|
||||
max_epochs: ???
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -120,7 +120,8 @@ trainer:
|
|||
gpus: 1 # number of gpus
|
||||
max_epochs: ???
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -127,7 +127,8 @@ trainer:
|
|||
gpus: -1 # number of gpus
|
||||
max_epochs: 350
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -49,7 +49,8 @@ trainer:
|
|||
gpus: -1 # number of gpus
|
||||
max_steps: ${model.max_steps}
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -49,7 +49,8 @@ trainer:
|
|||
gpus: -1 # number of gpus
|
||||
max_steps: ${model.max_steps}
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -94,7 +94,8 @@ trainer:
|
|||
gpus: 1 # number of gpus
|
||||
max_epochs: ???
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -87,7 +87,8 @@ trainer:
|
|||
max_epochs: 10000
|
||||
max_steps: 600000
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: false # Provided by exp_manager
|
||||
logger: false # Provided by exp_manager
|
||||
|
|
|
@ -124,7 +124,8 @@ trainer:
|
|||
gpus: 1 # number of gpus
|
||||
max_epochs: ???
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -159,7 +159,8 @@ trainer:
|
|||
gpus: 1 # number of gpus
|
||||
max_epochs: ???
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -156,7 +156,8 @@ trainer:
|
|||
gpus: 1
|
||||
max_epochs: ???
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False
|
||||
logger: False
|
||||
|
|
|
@ -160,7 +160,8 @@ trainer:
|
|||
gpus: 1
|
||||
max_epochs: ???
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False
|
||||
logger: False
|
||||
|
|
|
@ -225,7 +225,8 @@ trainer:
|
|||
gpus: 1
|
||||
max_epochs: ???
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False
|
||||
logger: False
|
||||
|
|
|
@ -80,7 +80,8 @@ trainer:
|
|||
gpus: 1 # number of gpus
|
||||
max_epochs: ???
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -81,7 +81,8 @@ trainer:
|
|||
gpus: 1 # number of gpus
|
||||
max_epochs: ???
|
||||
num_nodes: 1
|
||||
accelerator: ddp
|
||||
accelerator: gpu
|
||||
stratergy: ddp
|
||||
accumulate_grad_batches: 1
|
||||
checkpoint_callback: False # Provided by exp_manager
|
||||
logger: False # Provided by exp_manager
|
||||
|
|
|
@ -395,6 +395,8 @@ class ClusteringDiarizer(Model, DiarizationMixin):
|
|||
save_path: Path to .nemo file where model instance should be saved
|
||||
"""
|
||||
|
||||
# TODO: Why does this override the main save_to?
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
config_yaml = os.path.join(tmpdir, _MODEL_CONFIG_YAML)
|
||||
spkr_model = os.path.join(tmpdir, _SPEAKER_MODEL)
|
||||
|
|
|
@ -17,5 +17,4 @@ from nemo.collections.nlp.data.language_modeling.lm_bert_dataset import (
|
|||
BertPretrainingDataset,
|
||||
BertPretrainingPreprocessedDataloader,
|
||||
)
|
||||
from nemo.collections.nlp.data.language_modeling.megatron import GPTDataset, IndexedDataset, MMapIndexedDataset
|
||||
from nemo.collections.nlp.data.language_modeling.sentence_dataset import SentenceDataset, TarredSentenceDataset
|
||||
|
|
|
@ -317,13 +317,18 @@ class MTBottleneckModel(MTEncDecModel):
|
|||
inputs: a list of string containing detokenized inputs
|
||||
"""
|
||||
mode = self.training
|
||||
timer = cache.get("timer", None)
|
||||
try:
|
||||
self.eval()
|
||||
|
||||
# build posterior distribution q(x|z)
|
||||
if ("z" not in cache) or ("z_mean" not in cache) or ("z_mask" not in cache):
|
||||
if timer is not None:
|
||||
timer.start("encoder")
|
||||
enc_hiddens, enc_mask = self.encoder(input_ids=src, encoder_mask=src_mask, return_mask=True)
|
||||
z, z_mean, _ = self.encode_latent(hidden=enc_hiddens)
|
||||
if timer is not None:
|
||||
timer.stop("encoder")
|
||||
else:
|
||||
enc_mask = cache["z_mask"]
|
||||
z = cache["z"]
|
||||
|
@ -332,8 +337,8 @@ class MTBottleneckModel(MTEncDecModel):
|
|||
if getattr(self, "deterministic_translate", True):
|
||||
z = z_mean
|
||||
|
||||
if cache.get("timer", None) is not None:
|
||||
cache["timer"].start("sampler")
|
||||
if timer is not None:
|
||||
timer.start("sampler")
|
||||
# decoding cross attention context
|
||||
context_hiddens = self.latent2hidden(z)
|
||||
|
||||
|
@ -342,8 +347,8 @@ class MTBottleneckModel(MTEncDecModel):
|
|||
encoder_input_mask=enc_mask,
|
||||
return_beam_scores=return_beam_scores,
|
||||
)
|
||||
if cache.get("timer", None) is not None:
|
||||
cache["timer"].stop("sampler")
|
||||
if timer is not None:
|
||||
timer.stop("sampler")
|
||||
|
||||
if return_beam_scores:
|
||||
all_translations, scores, best_translations = best_translations
|
||||
|
|
|
@ -46,7 +46,7 @@ from nemo.collections.nlp.modules.common.lm_utils import get_transformer
|
|||
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
|
||||
from nemo.collections.nlp.modules.common.transformer import BeamSearchSequenceGenerator, TopKSequenceGenerator
|
||||
from nemo.core.classes.common import PretrainedModelInfo, typecheck
|
||||
from nemo.utils import logging, model_utils
|
||||
from nemo.utils import logging, model_utils, timers
|
||||
|
||||
__all__ = ['MTEncDecModel']
|
||||
|
||||
|
@ -765,7 +765,9 @@ class MTEncDecModel(EncDecNLPModel):
|
|||
return translations
|
||||
|
||||
@torch.no_grad()
|
||||
def batch_translate(self, src: torch.LongTensor, src_mask: torch.LongTensor, return_beam_scores: bool = False):
|
||||
def batch_translate(
|
||||
self, src: torch.LongTensor, src_mask: torch.LongTensor, return_beam_scores: bool = False, cache={}
|
||||
):
|
||||
"""
|
||||
Translates a minibatch of inputs from source language to target language.
|
||||
Args:
|
||||
|
@ -776,12 +778,20 @@ class MTEncDecModel(EncDecNLPModel):
|
|||
inputs: a list of string containing detokenized inputs
|
||||
"""
|
||||
mode = self.training
|
||||
timer = cache.get("timer", None)
|
||||
try:
|
||||
self.eval()
|
||||
if timer is not None:
|
||||
timer.start("encoder")
|
||||
src_hiddens = self.encoder(input_ids=src, encoder_mask=src_mask)
|
||||
if timer is not None:
|
||||
timer.stop("encoder")
|
||||
timer.start("sampler")
|
||||
best_translations = self.beam_search(
|
||||
encoder_hidden_states=src_hiddens, encoder_input_mask=src_mask, return_beam_scores=return_beam_scores
|
||||
)
|
||||
if timer is not None:
|
||||
timer.stop("sampler")
|
||||
if return_beam_scores:
|
||||
all_translations, scores, best_translations = best_translations
|
||||
scores = scores.view(-1)
|
||||
|
@ -827,7 +837,12 @@ class MTEncDecModel(EncDecNLPModel):
|
|||
# TODO: We should drop source/target_lang arguments in favor of using self.src/tgt_language
|
||||
@torch.no_grad()
|
||||
def translate(
|
||||
self, text: List[str], source_lang: str = None, target_lang: str = None, return_beam_scores: bool = False
|
||||
self,
|
||||
text: List[str],
|
||||
source_lang: str = None,
|
||||
target_lang: str = None,
|
||||
return_beam_scores: bool = False,
|
||||
log_timing: bool = False,
|
||||
) -> List[str]:
|
||||
"""
|
||||
Translates list of sentences from source language to target language.
|
||||
|
@ -855,19 +870,41 @@ class MTEncDecModel(EncDecNLPModel):
|
|||
elif tgt_symbol in self.multilingual_ids:
|
||||
prepend_ids = [tgt_symbol]
|
||||
|
||||
if log_timing:
|
||||
timer = timers.NamedTimer()
|
||||
else:
|
||||
timer = None
|
||||
|
||||
cache = {
|
||||
"timer": timer,
|
||||
}
|
||||
|
||||
try:
|
||||
self.eval()
|
||||
src, src_mask = self.prepare_inference_batch(text, prepend_ids)
|
||||
if return_beam_scores:
|
||||
_, all_translations, scores, best_translations = self.batch_translate(
|
||||
src, src_mask, return_beam_scores=True
|
||||
src, src_mask, return_beam_scores=True, cache=cache,
|
||||
)
|
||||
return all_translations, scores, best_translations
|
||||
return_val = all_translations, scores, best_translations
|
||||
else:
|
||||
_, translations = self.batch_translate(src, src_mask, return_beam_scores=False)
|
||||
_, best_translations = self.batch_translate(src, src_mask, return_beam_scores=False, cache=cache)
|
||||
return_val = best_translations
|
||||
finally:
|
||||
self.train(mode=mode)
|
||||
return translations
|
||||
|
||||
if log_timing:
|
||||
timing = timer.export()
|
||||
timing["mean_src_length"] = src_mask.sum().cpu().item() / src_mask.shape[0]
|
||||
tgt, tgt_mask = self.prepare_inference_batch(best_translations, prepend_ids)
|
||||
timing["mean_tgt_length"] = tgt_mask.sum().cpu().item() / tgt_mask.shape[0]
|
||||
|
||||
if type(return_val) is tuple:
|
||||
return_val = return_val + (timing,)
|
||||
else:
|
||||
return_val = (return_val, timing)
|
||||
|
||||
return return_val
|
||||
|
||||
@classmethod
|
||||
def list_available_models(cls) -> Optional[Dict[str, str]]:
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
import hashlib
|
||||
import json
|
||||
import os
|
||||
from typing import Any, Dict, Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
from omegaconf import DictConfig, OmegaConf
|
||||
from pytorch_lightning import Trainer
|
||||
|
@ -26,19 +26,23 @@ from pytorch_lightning.utilities.migration import pl_legacy_patch
|
|||
from transformers import TRANSFORMERS_CACHE
|
||||
|
||||
from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
|
||||
from nemo.collections.nlp.modules import BertModule, MegatronBertEncoder
|
||||
from nemo.collections.nlp.modules import BertModule
|
||||
from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import VOCAB_FILE_NAME
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_bert import (
|
||||
get_megatron_checkpoint_version,
|
||||
set_megatron_checkpoint_version,
|
||||
)
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_encoder import MegatronEncoderModule
|
||||
from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer
|
||||
from nemo.collections.nlp.parts.nlp_overrides import NLPCheckpointConnector, NLPSaveRestoreConnector
|
||||
from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
|
||||
from nemo.core.classes import ModelPT
|
||||
from nemo.core.classes.exportable import Exportable
|
||||
from nemo.utils import AppState, logging
|
||||
|
||||
try:
|
||||
import apex
|
||||
|
||||
HAVE_APEX = True
|
||||
|
||||
except (ImportError, ModuleNotFoundError):
|
||||
HAVE_APEX = False
|
||||
|
||||
|
||||
__all__ = ['NLPModel']
|
||||
|
||||
NEMO_NLP_TMP = os.path.join(os.path.dirname(str(TRANSFORMERS_CACHE)), "nemo_nlp_tmp")
|
||||
|
@ -55,6 +59,8 @@ class NLPModel(ModelPT, Exportable):
|
|||
# handles model parallel save and restore logic
|
||||
self._save_restore_connector = NLPSaveRestoreConnector()
|
||||
self.set_world_size(trainer)
|
||||
if not HAVE_APEX:
|
||||
logging.warning("Apex was not found. Using model parallel or megatron models will error out.")
|
||||
|
||||
def register_artifact(
|
||||
self, config_path: str, src: str, verify_src_exists: bool = False,
|
||||
|
@ -71,15 +77,8 @@ class NLPModel(ModelPT, Exportable):
|
|||
raise ValueError('Instantiate self.bert_model before registering it.')
|
||||
else:
|
||||
# get encoder config and create source for artifact
|
||||
if isinstance(self.bert_model, MegatronBertEncoder):
|
||||
pretrained_model_name = self.bert_model._model_name
|
||||
encoder_config_path = pretrained_model_name + '_encoder_config'
|
||||
encoder_config_src = os.path.join(NEMO_NLP_TMP, encoder_config_path + '.json')
|
||||
config_for_json = OmegaConf.to_container(self.bert_model.config)
|
||||
with open(encoder_config_src, 'w', encoding='utf-8') as f:
|
||||
f.write(json.dumps(config_for_json, indent=2, sort_keys=True) + '\n')
|
||||
self.register_artifact('language_model.config_file', encoder_config_src) # for .nemo
|
||||
elif isinstance(self.bert_model, BertModule):
|
||||
|
||||
if isinstance(self.bert_model, BertModule):
|
||||
# HuggingFace Transformer Config
|
||||
pretrained_model_name = self.bert_model.name_or_path
|
||||
# Some HF names have "/" in them so we replace with _
|
||||
|
@ -179,43 +178,6 @@ class NLPModel(ModelPT, Exportable):
|
|||
f'Registering tokenizer vocab for {self.tokenizer} is not yet supported. Please override this method if needed.'
|
||||
)
|
||||
|
||||
def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
|
||||
""" LightningModule hook that's used to save things in addition to model weights. """
|
||||
|
||||
if hasattr(self, "bert_model") and isinstance(self.bert_model, MegatronBertEncoder):
|
||||
checkpoint['checkpoint_version'] = get_megatron_checkpoint_version()
|
||||
return None
|
||||
|
||||
def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
|
||||
""" LightningModule hook that's used to restore things saved with on_save_checkpoint."""
|
||||
|
||||
if hasattr(self, "bert_model") and isinstance(self.bert_model, MegatronBertEncoder):
|
||||
if get_megatron_checkpoint_version():
|
||||
assert (
|
||||
checkpoint['checkpoint_version'] == get_megatron_checkpoint_version()
|
||||
), 'checkpoint version found on_load_checkpoint different than get_megatron_checkpoint_version'
|
||||
else:
|
||||
set_megatron_checkpoint_version(checkpoint['checkpoint_version'])
|
||||
logging.info(f"Setting Megatron checkpoint version: {checkpoint['checkpoint_version']}")
|
||||
return None
|
||||
|
||||
@rank_zero_only
|
||||
def register_megatron_checkpoint_version(self):
|
||||
""" Adds checkpoint version to .nemo archive """
|
||||
if self.has_megatron_encoder:
|
||||
checkpoint_version = get_megatron_checkpoint_version()
|
||||
if checkpoint_version is None:
|
||||
raise ValueError('Unable to get megatron checkpoint version.')
|
||||
else:
|
||||
checkpoint_version_dict = {'checkpoint_version': checkpoint_version}
|
||||
checkpoint_version_path = 'megatron_checkpoint_version.json'
|
||||
checkpoint_version_src = os.path.join(NEMO_NLP_TMP, checkpoint_version_path)
|
||||
with open(checkpoint_version_src, 'w') as f:
|
||||
f.write(json.dumps(checkpoint_version_dict))
|
||||
self.register_artifact(checkpoint_version_path, checkpoint_version_src)
|
||||
else:
|
||||
raise ValueError('Registering Megatron checkpoint version but no Megatron encoder detected.')
|
||||
|
||||
@staticmethod
|
||||
def _unpack_nemo_file(path2file: str, out_folder: str) -> str:
|
||||
return super(NLPModel, NLPModel)._unpack_nemo_file(path2file, out_folder)
|
||||
|
@ -232,21 +194,6 @@ class NLPModel(ModelPT, Exportable):
|
|||
def output_module(self):
|
||||
return self.classifier
|
||||
|
||||
@property
|
||||
def has_megatron_encoder(self):
|
||||
if hasattr(self, 'bert_model'):
|
||||
if isinstance(self.bert_model, MegatronBertEncoder):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
elif hasattr(self, 'encoder'):
|
||||
if isinstance(self.encoder, MegatronEncoderModule):
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
|
||||
@property
|
||||
def is_model_parallel_initialized(self):
|
||||
app_state = AppState()
|
||||
|
@ -255,19 +202,6 @@ class NLPModel(ModelPT, Exportable):
|
|||
else:
|
||||
return False
|
||||
|
||||
def restore_megatron_encoder_weights(self):
|
||||
""" Model parallel weights need to be restored after DDP is initialized and
|
||||
model parallel ranks are known.
|
||||
"""
|
||||
if hasattr(self, 'bert_model'):
|
||||
if isinstance(self.bert_model, MegatronBertEncoder):
|
||||
logging.info(f"Restoring from pretrained model parallel checkpoint: {self.bert_model._restore_path}")
|
||||
self.bert_model.restore_weights(self.bert_model._restore_path)
|
||||
elif hasattr(self, 'encoder'):
|
||||
if isinstance(self.encoder, MegatronEncoderModule):
|
||||
logging.info(f"Restoring from pretrained model parallel checkpoint: {self.encoder.checkpoint_file}")
|
||||
self.encoder._encoder.restore_weights(self.encoder.checkpoint_file)
|
||||
|
||||
@classmethod
|
||||
def load_from_checkpoint(
|
||||
cls,
|
||||
|
@ -321,18 +255,3 @@ class NLPModel(ModelPT, Exportable):
|
|||
finally:
|
||||
cls._set_model_restore_state(is_being_restored=False)
|
||||
return checkpoint
|
||||
|
||||
def save_to(self, save_path: str):
|
||||
app_state = AppState()
|
||||
# Add NeMo rank check as well
|
||||
if app_state.model_parallel_size is not None:
|
||||
if app_state.model_parallel_size > 1:
|
||||
if not isinstance(self._save_restore_connector, NLPSaveRestoreConnector):
|
||||
logging.warning(
|
||||
f"Using {self._save_restore_connector.__class__} to save a model parallel model. Overriding with NLPSaveRestoreConnector. Make sure to subclass NLPSaveRestoreConnector."
|
||||
)
|
||||
self._save_restore_connector = NLPSaveRestoreConnector()
|
||||
save_path = os.path.abspath(os.path.expanduser(save_path))
|
||||
self._save_restore_connector.save_to(self, save_path)
|
||||
else:
|
||||
super(NLPModel, self).save_to(save_path=save_path)
|
||||
|
|
|
@ -18,13 +18,11 @@ from nemo.collections.nlp.modules.common import (
|
|||
BertEncoder,
|
||||
BertModule,
|
||||
DistilBertEncoder,
|
||||
MegatronBertEncoder,
|
||||
RobertaEncoder,
|
||||
SequenceClassifier,
|
||||
SequenceRegression,
|
||||
SequenceTokenClassifier,
|
||||
get_lm_model,
|
||||
get_megatron_lm_models_list,
|
||||
get_pretrained_lm_models_list,
|
||||
get_tokenizer,
|
||||
get_tokenizer_list,
|
||||
|
|
|
@ -21,12 +21,7 @@ from nemo.collections.nlp.modules.common.huggingface import (
|
|||
DistilBertEncoder,
|
||||
RobertaEncoder,
|
||||
)
|
||||
from nemo.collections.nlp.modules.common.lm_utils import (
|
||||
get_lm_model,
|
||||
get_megatron_lm_models_list,
|
||||
get_pretrained_lm_models_list,
|
||||
)
|
||||
from nemo.collections.nlp.modules.common.megatron import MegatronBertEncoder
|
||||
from nemo.collections.nlp.modules.common.lm_utils import get_lm_model, get_pretrained_lm_models_list
|
||||
from nemo.collections.nlp.modules.common.sequence_classifier import SequenceClassifier
|
||||
from nemo.collections.nlp.modules.common.sequence_regression import SequenceRegression
|
||||
from nemo.collections.nlp.modules.common.sequence_token_classifier import SequenceTokenClassifier
|
||||
|
|
|
@ -25,14 +25,9 @@ from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import (
|
|||
get_huggingface_lm_model,
|
||||
get_huggingface_pretrained_lm_models_list,
|
||||
)
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_utils import (
|
||||
get_megatron_lm_model,
|
||||
get_megatron_lm_models_list,
|
||||
)
|
||||
from nemo.collections.nlp.modules.common.transformer.transformer import NeMoTransformerConfig
|
||||
from nemo.collections.nlp.modules.common.transformer.transformer_utils import (
|
||||
get_huggingface_transformer,
|
||||
get_megatron_transformer,
|
||||
get_nemo_transformer,
|
||||
)
|
||||
from nemo.utils import AppState, logging
|
||||
|
@ -48,7 +43,7 @@ def get_pretrained_lm_models_list(include_external: bool = False) -> List[str]:
|
|||
include_external if true includes all HuggingFace model names, not only those supported language models in NeMo.
|
||||
|
||||
"""
|
||||
return get_megatron_lm_models_list() + get_huggingface_pretrained_lm_models_list(include_external=include_external)
|
||||
return get_huggingface_pretrained_lm_models_list(include_external=include_external)
|
||||
|
||||
|
||||
def get_lm_model(
|
||||
|
|
|
@ -25,9 +25,18 @@ from nemo.collections.common.tokenizers.word_tokenizer import WordTokenizer
|
|||
from nemo.collections.common.tokenizers.youtokentome_tokenizer import YouTokenToMeTokenizer
|
||||
from nemo.collections.nlp.modules.common.huggingface.huggingface_utils import get_huggingface_pretrained_lm_models_list
|
||||
from nemo.collections.nlp.modules.common.lm_utils import get_pretrained_lm_models_list
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_utils import get_megatron_tokenizer
|
||||
from nemo.collections.nlp.parts.nlp_overrides import HAVE_APEX
|
||||
from nemo.utils import logging
|
||||
|
||||
try:
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_utils import get_megatron_tokenizer
|
||||
|
||||
HAVE_APEX = True
|
||||
|
||||
except (ImportError, ModuleNotFoundError):
|
||||
HAVE_APEX = False
|
||||
|
||||
|
||||
__all__ = ['get_tokenizer', 'get_tokenizer_list']
|
||||
|
||||
|
||||
|
@ -88,6 +97,8 @@ def get_tokenizer(
|
|||
special_tokens_dict = special_tokens
|
||||
|
||||
if 'megatron' in tokenizer_name:
|
||||
if not HAVE_APEX:
|
||||
raise RuntimeError("Apex required to use megatron.")
|
||||
if vocab_file is None:
|
||||
vocab_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_vocab_file(
|
||||
tokenizer_name
|
||||
|
|
|
@ -17,10 +17,8 @@ from typing import Optional, Union
|
|||
|
||||
from omegaconf.dictconfig import DictConfig
|
||||
|
||||
from nemo.collections.nlp.modules.common.encoder_module import EncoderModule
|
||||
from nemo.collections.nlp.modules.common.huggingface.huggingface_decoder import HuggingFaceDecoderModule
|
||||
from nemo.collections.nlp.modules.common.huggingface.huggingface_encoder import HuggingFaceEncoderModule
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_encoder import MegatronEncoderModule
|
||||
from nemo.collections.nlp.modules.common.transformer.transformer import TransformerDecoderNM, TransformerEncoderNM
|
||||
from nemo.collections.nlp.modules.common.transformer.transformer_bottleneck import TransformerBottleneckEncoderNM
|
||||
|
||||
|
@ -157,18 +155,22 @@ def get_megatron_transformer(
|
|||
config_dict: Optional[Union[dict, DictConfig]] = None,
|
||||
encoder: bool = True,
|
||||
checkpoint_file: str = None,
|
||||
) -> MegatronEncoderModule:
|
||||
) -> None:
|
||||
|
||||
vocab_file = config_dict.pop('vocab_file', None)
|
||||
if encoder:
|
||||
model = MegatronEncoderModule(
|
||||
model_name=model_name,
|
||||
pretrained=pretrained,
|
||||
config_dict=config_dict,
|
||||
checkpoint_file=checkpoint_file,
|
||||
vocab_file=vocab_file,
|
||||
)
|
||||
else:
|
||||
raise ValueError('Megatron decoders are not currently supported.')
|
||||
raise ValueError(
|
||||
"megatron-lm bert encoders are deprecated in NeMo 1.5.0. Please use NeMo 1.4.0 until megatron bert support is added again."
|
||||
)
|
||||
|
||||
return model
|
||||
# vocab_file = config_dict.pop('vocab_file', None)
|
||||
# if encoder:
|
||||
# model = MegatronEncoderModule(
|
||||
# model_name=model_name,
|
||||
# pretrained=pretrained,
|
||||
# config_dict=config_dict,
|
||||
# checkpoint_file=checkpoint_file,
|
||||
# vocab_file=vocab_file,
|
||||
# )
|
||||
# else:
|
||||
# raise ValueError('Megatron decoders are not currently supported.')
|
||||
|
||||
# return model
|
||||
|
|
|
@ -19,7 +19,6 @@ from typing import Any, Dict, List, Optional, Union
|
|||
|
||||
import pytorch_lightning as pl
|
||||
import torch
|
||||
from apex.transformer import parallel_state
|
||||
from pytorch_lightning.overrides import LightningDistributedModule
|
||||
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
|
||||
from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
|
||||
|
@ -35,14 +34,19 @@ from torch.nn.modules.module import Module
|
|||
from torch.nn.parallel import DistributedDataParallel
|
||||
from torch.optim.optimizer import Optimizer
|
||||
|
||||
from nemo.collections.nlp.modules.common.megatron.clip_grads import clip_grad_norm_fp32
|
||||
from nemo.collections.nlp.modules.common.megatron.megatron_bert import (
|
||||
get_megatron_checkpoint_version,
|
||||
set_megatron_checkpoint_version,
|
||||
)
|
||||
from nemo.core.connectors.save_restore_connector import SaveRestoreConnector
|
||||
from nemo.utils import AppState, logging
|
||||
|
||||
try:
|
||||
from apex.transformer import parallel_state
|
||||
from nemo.collections.nlp.modules.common.megatron.clip_grads import clip_grad_norm_fp32
|
||||
|
||||
HAVE_APEX = True
|
||||
|
||||
except (ImportError, ModuleNotFoundError):
|
||||
|
||||
HAVE_APEX = False
|
||||
|
||||
|
||||
class NLPDDPPlugin(DDPPlugin):
|
||||
""" DDP plugin for Pytorch Lightning. Needed to customize DDP for model parallel models.
|
||||
|
@ -61,6 +65,9 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
) -> None:
|
||||
super().__init__(parallel_devices, num_nodes, cluster_environment, checkpoint_io, sync_batchnorm, **kwargs)
|
||||
|
||||
if not HAVE_APEX:
|
||||
logging.warning("Apex was not found. Using model parallel or megatron models will error out.")
|
||||
|
||||
def setup_distributed(self, global_rank: int = None, world_size: int = None) -> None:
|
||||
# call PTL init ddp
|
||||
super().setup_distributed()
|
||||
|
@ -70,70 +77,6 @@ class NLPDDPPlugin(DDPPlugin):
|
|||
|
||||
if app_state.model_parallel_size is not None:
|
||||
self.init_model_parallel(app_state.global_rank, app_state.world_size)
|
||||
# if self.lightning_module.has_megatron_encoder and not self.lightning_module.is_model_parallel_initialized:
|
||||
# self.init_model_parallel(app_state.global_rank, app_state.world_size)
|
||||
|
||||
def start_training(self, trainer: 'Trainer') -> None:
|
||||
""" PTL Hook that is called after DPP is initialized. """
|
||||
|
||||
if self.lightning_module.has_megatron_encoder:
|
||||
app_state = AppState()
|
||||
if app_state.model_parallel_size is not None:
|
||||
# mpu grad clipping needs parameters to have the attribute model_parallel
|
||||
parameters = self.lightning_module.parameters()
|
||||
for p in parameters:
|
||||
if not hasattr(p, 'model_parallel'):
|
||||
p.model_parallel = False
|
||||
|
||||
if get_megatron_checkpoint_version() is not None:
|
||||
# megatron checkpoint already restored
|
||||
pass
|
||||
elif trainer.checkpoint_connector.resume_checkpoint_path is not None:
|
||||
# PTL auto-resuming, need to update checkpoint name
|
||||
# update path based on model parallel rank
|
||||
filepath = trainer.checkpoint_connector.resume_checkpoint_path
|
||||
dirname = os.path.dirname(os.path.dirname(filepath))
|
||||
basename = os.path.basename(filepath)
|
||||
filepath = f'{dirname}/mp_rank_{app_state.model_parallel_rank:02d}/{basename}'
|
||||
trainer.checkpoint_connector.resume_checkpoint_path = filepath
|
||||
logging.info(
|
||||
f'Resuming training from checkpoint {trainer.checkpoint_connector.resume_checkpoint_path}'
|
||||
)
|
||||
# need to set checkpoint version for megatron-lm
|
||||
checkpoint_version = torch.load(trainer.checkpoint_connector.resume_checkpoint_path).get(
|
||||
'checkpoint_version', None
|
||||
)
|
||||
if checkpoint_version is not None:
|
||||
set_megatron_checkpoint_version(checkpoint_version)
|
||||
else:
|
||||
logging.warning('Megatron-lm checkpoint version not found. Setting checkpoint_version to 0.')
|
||||
set_megatron_checkpoint_version(0)
|
||||
else:
|
||||
self.lightning_module.restore_megatron_encoder_weights()
|
||||
else:
|
||||
if get_megatron_checkpoint_version() is not None:
|
||||
# megatron checkpoint already restored
|
||||
pass
|
||||
else:
|
||||
self.lightning_module.restore_megatron_encoder_weights()
|
||||
|
||||
self.lightning_module.register_megatron_checkpoint_version()
|
||||
|
||||
return super().start_training(trainer)
|
||||
|
||||
def start_testing(self, trainer: 'Trainer') -> None:
|
||||
""" PTL Hook that is called after DPP is initialized. """
|
||||
app_state = AppState()
|
||||
|
||||
if app_state.model_parallel_size is not None:
|
||||
|
||||
if self.has_megatron_encoder:
|
||||
# check megatron checkpoint version
|
||||
checkpoint_version = get_megatron_checkpoint_version()
|
||||
if checkpoint_version is None:
|
||||
raise ValueError("Unable to find megatron checkpoint version.")
|
||||
|
||||
return super().start_testing(trainer)
|
||||
|
||||
def configure_ddp(self):
|
||||
""" Override LightningModule ddp if using model parallel.
|
||||
|
@ -227,6 +170,8 @@ class NLPCheckpointConnector(CheckpointConnector):
|
|||
|
||||
def __init__(self, trainer, resume_from_checkpoint):
|
||||
super().__init__(trainer, resume_from_checkpoint)
|
||||
if not HAVE_APEX:
|
||||
logging.warning("Apex was not found. Using model parallel or megatron models will error out.")
|
||||
|
||||
def save_checkpoint(self, filepath, weights_only: bool = False) -> None:
|
||||
"""Slightly modified version of PyTorch Lightning's save_checkpoint.
|
||||
|
@ -254,6 +199,8 @@ class NLPCheckpointConnector(CheckpointConnector):
|
|||
class NLPSaveRestoreConnector(SaveRestoreConnector):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
if not HAVE_APEX:
|
||||
logging.warning("Apex was not found. Using model parallel or megatron models will error out.")
|
||||
|
||||
def save_to(self, model, save_path: str):
|
||||
app_state = AppState()
|
||||
|
@ -318,6 +265,8 @@ class NLPNativeMixedPrecisionPlugin(NativeMixedPrecisionPlugin):
|
|||
class NLPNativeBfloat16PrecisionPlugin(NativeMixedPrecisionPlugin):
|
||||
def __init__(self) -> None:
|
||||
super().__init__(precision='bf16')
|
||||
if not HAVE_APEX:
|
||||
logging.warning("Apex was not found. Using model parallel or megatron models will error out.")
|
||||
|
||||
def clip_gradients(
|
||||
self,
|
||||
|
|
|
@ -18,7 +18,7 @@ import os
|
|||
import uuid
|
||||
from abc import abstractmethod
|
||||
from os import path
|
||||
from os.path import expanduser
|
||||
from pathlib import Path
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
|
||||
import hydra
|
||||
|
@ -214,24 +214,27 @@ class ModelPT(LightningModule, Model):
|
|||
save_path: Path to .nemo file where model instance should be saved
|
||||
"""
|
||||
|
||||
def maybe_make_save_dir(path: 'pathlib.Path'):
|
||||
if not path.parent.exists():
|
||||
path.parent.mkdir(parents=True)
|
||||
|
||||
save_path = Path(save_path).expanduser().resolve()
|
||||
app_state = AppState()
|
||||
# Add NeMo rank check as well
|
||||
if app_state.model_parallel_size is not None:
|
||||
if app_state.model_parallel_size > 1:
|
||||
if isinstance(self._save_restore_connector, SaveRestoreConnector):
|
||||
if type(self._save_restore_connector) == SaveRestoreConnector:
|
||||
raise ValueError(
|
||||
'Default NeMo SaveRestoreConnector will not work in model parallel mode. You should use a connector which supports model parallel mode, such as NLPSaveRestoreConnector in NLP. You can also you custom one.'
|
||||
'Default NeMo SaveRestoreConnector will not work in model parallel mode. You should use a '
|
||||
'connector which supports model parallel mode, such as NLPSaveRestoreConnector in NLP. You '
|
||||
'can also use a custom one.'
|
||||
)
|
||||
|
||||
save_path = os.path.abspath(os.path.expanduser(save_path))
|
||||
if app_state.data_parallel_rank == 0:
|
||||
maybe_make_save_dir(save_path)
|
||||
# connector checks for ranks properly, no need to check here
|
||||
self._save_restore_connector.save_to(self, save_path)
|
||||
else:
|
||||
if not is_global_rank_zero():
|
||||
return
|
||||
else:
|
||||
save_path = os.path.abspath(os.path.expanduser(save_path))
|
||||
self._save_restore_connector.save_to(self, save_path)
|
||||
self._save_restore_connector.save_to(self, str(save_path)) # downstream tasks expect str, not Path
|
||||
elif is_global_rank_zero():
|
||||
maybe_make_save_dir(save_path)
|
||||
self._save_restore_connector.save_to(self, str(save_path)) # downstream tasks expect str, not Path
|
||||
|
||||
@classmethod
|
||||
def restore_from(
|
||||
|
@ -457,16 +460,16 @@ class ModelPT(LightningModule, Model):
|
|||
optim_config['sched']['t_max_epochs'] = self._trainer.max_epochs
|
||||
optim_config['sched']['t_accumulate_grad_batches'] = self._trainer.accumulate_grad_batches
|
||||
optim_config['sched']['t_limit_train_batches'] = self._trainer.limit_train_batches
|
||||
if self._trainer.distributed_backend is None:
|
||||
if self._trainer.accelerator_connector.strategy is None:
|
||||
optim_config['sched']['t_num_workers'] = self._trainer.num_gpus or 1
|
||||
elif self._trainer.distributed_backend == "ddp_cpu":
|
||||
elif self._trainer.accelerator_connector.strategy == "ddp_cpu":
|
||||
optim_config['sched']['t_num_workers'] = self._trainer.num_processes * self._trainer.num_nodes
|
||||
elif self._trainer.distributed_backend == "ddp":
|
||||
elif self._trainer.accelerator_connector.strategy == "ddp":
|
||||
optim_config['sched']['t_num_workers'] = self._trainer.num_gpus * self._trainer.num_nodes
|
||||
else:
|
||||
logging.warning(
|
||||
f"The lightning trainer received accelerator: {self._trainer.distributed_backend}. We "
|
||||
"recommend to use 'ddp' instead."
|
||||
"recommend to use strategy 'ddp' instead."
|
||||
)
|
||||
optim_config['sched']['t_num_workers'] = self._trainer.num_gpus * self._trainer.num_nodes
|
||||
else:
|
||||
|
|
|
@ -665,6 +665,7 @@ class NeMoModelCheckpoint(ModelCheckpoint):
|
|||
super().__init__(**kwargs)
|
||||
|
||||
if self.save_top_k != -1 and n_resume:
|
||||
logging.debug("Checking previous runs")
|
||||
self.nemo_topk_check_previous_run()
|
||||
|
||||
def nemo_topk_check_previous_run(self):
|
||||
|
@ -753,8 +754,13 @@ class NeMoModelCheckpoint(ModelCheckpoint):
|
|||
|
||||
# Load the best model and then re-save it
|
||||
if self.save_best_model:
|
||||
trainer.checkpoint_connector.restore(self.best_model_path)
|
||||
|
||||
if self.best_model_path is "":
|
||||
logging.warning(
|
||||
f"{self} was told to save the best checkpoint at the end of training, but no saved checkpoints "
|
||||
"were found. Saving latest model instead."
|
||||
)
|
||||
else:
|
||||
trainer.checkpoint_connector.restore(self.best_model_path)
|
||||
pl_module.save_to(save_path=os.path.join(self.dirpath, self.prefix + self.postfix))
|
||||
|
||||
def _del_model(self, trainer: "pl.Trainer", filepath: str) -> None:
|
||||
|
|
|
@ -0,0 +1,147 @@
|
|||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
|
||||
|
||||
# =============================================================================#
|
||||
# Auxiliary methods
|
||||
# =============================================================================#
|
||||
|
||||
|
||||
def read_batch(fh, batch_size):
|
||||
"""
|
||||
Reads a batch (or smaller) chunk of lines.
|
||||
"""
|
||||
lines = []
|
||||
for i in range(batch_size):
|
||||
l = fh.readline()
|
||||
if l is None:
|
||||
break
|
||||
else:
|
||||
lines.append(l.strip())
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def tokenize_line(line, tokenizer):
|
||||
"""
|
||||
Returns a tokenized line
|
||||
"""
|
||||
tokens = tokenizer.text_to_ids(line)
|
||||
|
||||
return tokens
|
||||
|
||||
|
||||
def line_len(line, tokenizer):
|
||||
"""
|
||||
Returns a tokenized length of a text line
|
||||
"""
|
||||
tokens = tokenize_line(line, tokenizer)
|
||||
|
||||
return len(tokens)
|
||||
|
||||
|
||||
# =============================================================================#
|
||||
# Main script
|
||||
# =============================================================================#
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Collects statistics over tokenized dataset')
|
||||
parser.add_argument('input_files', metavar='N', type=str, nargs='+', help='Input files to parse')
|
||||
parser.add_argument(
|
||||
'--tokenizer_library', type=str, required=True, help='Path to pre-trained nemo-supported tokenizer model'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--tokenizer_model', type=str, required=True, help='Path to pre-trained nemo-supported tokenizer model'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--num_workers', type=int, default=mp.cpu_count(), help='Number of workers (default to number of CPUs)'
|
||||
)
|
||||
parser.add_argument('--max_lines', type=int, default=-1, help='Max number of lines to parse')
|
||||
parser.add_argument('--batch_size', type=int, default=10000000, help='Batch size to parse in parallel')
|
||||
parser.add_argument('--out_dir', type=str, default="", help='Path to store data and plots')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
tokenizer = get_nmt_tokenizer(library=args.tokenizer_library, tokenizer_model=args.tokenizer_model,)
|
||||
|
||||
all_len = []
|
||||
|
||||
for fn in args.input_files:
|
||||
print(f"Parsing fn = {fn}")
|
||||
# read file
|
||||
fh = open(fn)
|
||||
|
||||
# read all batches
|
||||
while True:
|
||||
lines = read_batch(fh, args.batch_size)
|
||||
|
||||
# move to next file when no lines are read
|
||||
if not lines:
|
||||
break
|
||||
|
||||
# tokenize lines
|
||||
with mp.Pool(args.num_workers) as p:
|
||||
all_len.extend(p.map(partial(line_len, tokenizer=tokenizer), lines))
|
||||
|
||||
print(f"{fn}: Parsed {len(all_len)} lines")
|
||||
|
||||
# early stop, if required
|
||||
if (args.max_lines > 0) and (len(all_len) >= args.max_lines):
|
||||
lines = lines[: args.max_lines]
|
||||
break
|
||||
|
||||
# early stop, if required
|
||||
if (args.max_lines > 0) and (len(all_len) >= args.max_lines):
|
||||
lines = lines[: args.max_lines]
|
||||
break
|
||||
|
||||
# compute stats
|
||||
|
||||
# save all results
|
||||
if args.out_dir:
|
||||
if not os.path.exists(args.out_dir):
|
||||
os.mkdir(args.out_dir)
|
||||
|
||||
stats = {
|
||||
"samples": int(len(all_len)),
|
||||
"mean": float(np.mean(all_len)),
|
||||
"stdev": float(np.std(all_len)),
|
||||
"min": float(np.min(all_len)),
|
||||
"max": float(np.max(all_len)),
|
||||
"median": float(np.median(all_len)),
|
||||
}
|
||||
|
||||
print(f"stats = \n{stats}")
|
||||
|
||||
# save all results
|
||||
if args.out_dir:
|
||||
if not os.path.exists(args.out_dir):
|
||||
os.makedirs(args.out_dir, exist_ok=True)
|
||||
|
||||
fh = open(os.path.join(args.out_dir, "lengths.txt"), "w")
|
||||
fh.writelines(["{l}\n".format(l=l) for l in all_len])
|
||||
|
||||
json.dump(stats, open(os.path.join(args.out_dir, "stats.json"), "w"))
|
||||
|
||||
fig = plt.hist(all_len)
|
||||
plt.savefig(os.path.join(args.out_dir, "lengths_hist.pdf"))
|
99
scripts/neural_machine_translation/plot_detailed_timing.py
Executable file
99
scripts/neural_machine_translation/plot_detailed_timing.py
Executable file
|
@ -0,0 +1,99 @@
|
|||
#!/usr/bin/env python3
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This script takes as an input XXXX.json files
|
||||
(i.e., the output of nmt_transformer_infer.py --write_timing)
|
||||
and creates plots XXX.PLOT_NAME.png at the same path.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
# =============================================================================#
|
||||
# Control Variables
|
||||
# =============================================================================#
|
||||
|
||||
PLOTS_EXT = "pdf"
|
||||
PLOT_TITLE = False
|
||||
PLOT_XLABEL = True
|
||||
PLOT_YLABEL = True
|
||||
PLOT_LABEL_FONT_SIZE = 16
|
||||
PLOT_GRID = True
|
||||
|
||||
# =============================================================================#
|
||||
# Helper functions
|
||||
# =============================================================================#
|
||||
|
||||
|
||||
def plot_timing(lengths, timings, lengths_name, timings_name, fig=None):
|
||||
if fig is None:
|
||||
fig = plt.figure()
|
||||
|
||||
plt.scatter(lengths, timings, label=timings_name)
|
||||
if PLOT_XLABEL:
|
||||
plt.xlabel(f"{lengths_name} [tokens]", fontsize=PLOT_LABEL_FONT_SIZE)
|
||||
if PLOT_YLABEL:
|
||||
plt.ylabel(f"{timings_name} [sec]", fontsize=PLOT_LABEL_FONT_SIZE)
|
||||
if PLOT_GRID:
|
||||
plt.grid(True)
|
||||
if PLOT_TITLE:
|
||||
plt.title(f"{timings_name} vs. {lengths_name}")
|
||||
|
||||
plt.xticks(fontsize=PLOT_LABEL_FONT_SIZE)
|
||||
plt.yticks(fontsize=PLOT_LABEL_FONT_SIZE)
|
||||
plt.tight_layout()
|
||||
|
||||
return fig
|
||||
|
||||
|
||||
# =============================================================================#
|
||||
# Main script
|
||||
# =============================================================================#
|
||||
if __name__ == "__main__":
|
||||
print("Usage: plot_detailed_timing.py <JSON FILE> <SJON FILE> ...")
|
||||
for timing_fn in sys.argv[1:]:
|
||||
# load data
|
||||
print(f"Parsing file = {timing_fn}")
|
||||
data = json.load(open(timing_fn))
|
||||
|
||||
# plot data
|
||||
gifs_dict = {}
|
||||
gifs_dict["encoder-src_len"] = plot_timing(
|
||||
lengths=data["mean_src_length"],
|
||||
timings=data["encoder"],
|
||||
lengths_name="src length",
|
||||
timings_name="encoder",
|
||||
)
|
||||
gifs_dict["sampler-src_len"] = plot_timing(
|
||||
lengths=data["mean_src_length"],
|
||||
timings=data["sampler"],
|
||||
lengths_name="src length",
|
||||
timings_name="sampler",
|
||||
)
|
||||
gifs_dict["sampler-tgt_len"] = plot_timing(
|
||||
lengths=data["mean_tgt_length"],
|
||||
timings=data["sampler"],
|
||||
lengths_name="tgt length",
|
||||
timings_name="sampler",
|
||||
)
|
||||
|
||||
# save data
|
||||
base_fn = os.path.splitext(timing_fn)[0]
|
||||
for name, fig in gifs_dict.items():
|
||||
plot_fn = f"{base_fn}.{name}.{PLOTS_EXT}"
|
||||
print(f"Saving pot = {plot_fn}")
|
||||
fig.savefig(plot_fn)
|
|
@ -320,7 +320,7 @@ class TestExpManager:
|
|||
@pytest.mark.unit
|
||||
def test_nemo_checkpoint_save_best_model_1(self, tmp_path):
|
||||
test_trainer = pl.Trainer(checkpoint_callback=False, logger=False, max_epochs=4)
|
||||
log_dir = exp_manager(
|
||||
exp_manager(
|
||||
test_trainer,
|
||||
{"checkpoint_callback_params": {"save_best_model": True}, "explicit_log_dir": str(tmp_path / "test")},
|
||||
)
|
||||
|
@ -335,7 +335,9 @@ class TestExpManager:
|
|||
@pytest.mark.unit
|
||||
def test_nemo_checkpoint_save_best_model_2(self, tmp_path):
|
||||
test_trainer = pl.Trainer(checkpoint_callback=False, logger=False, max_epochs=4)
|
||||
log_dir = exp_manager(test_trainer, {"explicit_log_dir": str(tmp_path / "test")},)
|
||||
exp_manager(
|
||||
test_trainer, {"explicit_log_dir": str(tmp_path / "test")},
|
||||
)
|
||||
model = ExampleModel()
|
||||
test_trainer.fit(model)
|
||||
|
||||
|
@ -347,7 +349,7 @@ class TestExpManager:
|
|||
@pytest.mark.unit
|
||||
def test_nemo_checkpoint_always_save_nemo(self, tmp_path):
|
||||
test_trainer = pl.Trainer(checkpoint_callback=False, logger=False, max_epochs=4)
|
||||
log_dir = exp_manager(
|
||||
exp_manager(
|
||||
test_trainer,
|
||||
{
|
||||
"checkpoint_callback_params": {"save_best_model": True, "always_save_nemo": True},
|
||||
|
@ -361,3 +363,18 @@ class TestExpManager:
|
|||
|
||||
model = ExampleModel.restore_from(str(tmp_path / "test" / "checkpoints" / "default.nemo"))
|
||||
assert float(model(torch.tensor([1.0, 1.0], device=model.device))) == 0.0
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_nemo_checkpoint_make_checkpoint_dir(self, tmp_path):
|
||||
test_trainer = pl.Trainer(checkpoint_callback=False, logger=False, max_epochs=4, check_val_every_n_epoch=5)
|
||||
exp_manager(
|
||||
test_trainer,
|
||||
{
|
||||
"checkpoint_callback_params": {"save_best_model": True, "always_save_nemo": True},
|
||||
"explicit_log_dir": str(tmp_path / "test"),
|
||||
},
|
||||
)
|
||||
model = ExampleModel()
|
||||
test_trainer.fit(model)
|
||||
|
||||
assert Path(str(tmp_path / "test" / "checkpoints" / "default.nemo")).exists()
|
||||
|
|
|
@ -733,7 +733,8 @@ class TestOptimizersSchedulers:
|
|||
):
|
||||
trainer = pl.Trainer(
|
||||
max_epochs=max_epochs,
|
||||
accelerator="ddp_cpu",
|
||||
accelerator="cpu",
|
||||
strategy="ddp",
|
||||
num_processes=num_processes,
|
||||
accumulate_grad_batches=accumulate_grad_batches,
|
||||
limit_train_batches=limit_train_batches,
|
||||
|
|
|
@ -78,7 +78,7 @@ class ExampleModel(ModelPT):
|
|||
|
||||
def instantiate_multinode_ddp_if_possible():
|
||||
num_gpus = torch.cuda.device_count()
|
||||
trainer = Trainer(gpus=num_gpus, accelerator='ddp', logger=None, checkpoint_callback=False)
|
||||
trainer = Trainer(gpus=num_gpus, accelerator='gpu', logger=None, checkpoint_callback=False)
|
||||
|
||||
exp_manager_cfg = ExpManagerConfig(exp_dir='./ddp_check/', use_datetime_version=False, version="")
|
||||
exp_manager(trainer, cfg=OmegaConf.structured(exp_manager_cfg))
|
||||
|
|
|
@ -65,7 +65,7 @@
|
|||
"!pip install unidecode\n",
|
||||
"\n",
|
||||
"# ## Install NeMo\n",
|
||||
"BRANCH = 'main'\n",
|
||||
"BRANCH = 'r1.5.0'\n",
|
||||
"!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n",
|
||||
"\n",
|
||||
"## Install TorchAudio\n",
|
||||
|
|
|
@ -37,7 +37,7 @@
|
|||
"!pip install unidecode\n",
|
||||
"\n",
|
||||
"# ## Install NeMo\n",
|
||||
"BRANCH = 'main'\n",
|
||||
"BRANCH = 'r1.5.0'\n",
|
||||
"!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n",
|
||||
"\n",
|
||||
"## Install TorchAudio\n",
|
||||
|
|
|
@ -34,7 +34,7 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"BRANCH = 'main'\n",
|
||||
"BRANCH = 'r1.5.0'\n",
|
||||
"!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]"
|
||||
],
|
||||
"outputs": [],
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"BRANCH = 'main'\n",
|
||||
"BRANCH = 'r1.5.0'\n",
|
||||
"!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n"
|
||||
],
|
||||
"outputs": [],
|
||||
|
@ -44,7 +44,7 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"BRANCH = 'main'\n",
|
||||
"BRANCH = 'r1.5.0'\n",
|
||||
"!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n"
|
||||
]
|
||||
},
|
||||
|
|
|
@ -39,7 +39,7 @@
|
|||
"!pip install matplotlib>=3.3.2\n",
|
||||
"\n",
|
||||
"## Install NeMo\n",
|
||||
"BRANCH = 'main'\n",
|
||||
"BRANCH = 'r1.5.0'\n",
|
||||
"!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
|
|
|
@ -27,7 +27,7 @@
|
|||
"!pip install matplotlib>=3.3.2\n",
|
||||
"\n",
|
||||
"## Install NeMo\n",
|
||||
"BRANCH = 'main'\n",
|
||||
"BRANCH = 'r1.5.0'\n",
|
||||
"!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n",
|
||||
"\n",
|
||||
"## Grab the config we'll use in this example\n",
|
||||
|
|
|
@ -62,7 +62,7 @@
|
|||
"!pip install matplotlib>=3.3.2\n",
|
||||
"\n",
|
||||
"## Install NeMo\n",
|
||||
"BRANCH = 'main'\n",
|
||||
"BRANCH = 'r1.5.0'\n",
|
||||
"!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n",
|
||||
"\n",
|
||||
"## Grab the config we'll use in this example\n",
|
||||
|
@ -967,7 +967,7 @@
|
|||
"trainer = pl.Trainer(amp_level='O1', precision=16)\n",
|
||||
"\n",
|
||||
"# Trainer with a distributed backend:\n",
|
||||
"trainer = pl.Trainer(gpus=2, num_nodes=2, accelerator='ddp')\n",
|
||||
"trainer = pl.Trainer(gpus=2, num_nodes=2, accelerator='gpu', strategy='ddp')\n",
|
||||
"\n",
|
||||
"# Of course, you can combine these flags as well.\n",
|
||||
"```\n",
|
||||
|
|
|
@ -40,7 +40,7 @@
|
|||
"!pip install matplotlib>=3.3.2\n",
|
||||
"\n",
|
||||
"## Install NeMo\n",
|
||||
"BRANCH = 'main'\n",
|
||||
"BRANCH = 'r1.5.0'\n",
|
||||
"!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]\n",
|
||||
"\n",
|
||||
"## Grab the config we'll use in this example\n",
|
||||
|
@ -1373,7 +1373,7 @@
|
|||
"trainer = pl.Trainer(amp_level='O1', precision=16)\r\n",
|
||||
"\r\n",
|
||||
"# Trainer with a distributed backend:\r\n",
|
||||
"trainer = pl.Trainer(gpus=2, num_nodes=2, accelerator='ddp')\r\n",
|
||||
"trainer = pl.Trainer(gpus=2, num_nodes=2, accelerator='gpu', strategy='ddp')\r\n",
|
||||
"\r\n",
|
||||
"# Of course, you can combine these flags as well.\r\n",
|
||||
"```\r\n",
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue