# The bottleneck architecture supports three learning frameworks (i.e., losses)
# via model.model_type:
# 1) nll - conditional cross entropy (the usual NMT loss)
# 2) mim - MIM learning framework. A latent variable model with good
#          reconstruction and a compressed latent representation.
#          https://arxiv.org/pdf/2003.02645.pdf
# 3) vae - VAE learning framework. A latent variable model which learns
#          good probability estimation over observations and
#          a regularized latent representation.
#          https://arxiv.org/pdf/1312.6114.pdf
#
# The bottleneck architecture supports four encoder architectures via
# model.encoder.arch:
# 1) seq2seq - the usual NMT model without a bottleneck
# 2) bridge - a bottleneck which projects the encoder output to a fixed
#             number of steps using an attention bridge (https://arxiv.org/pdf/1703.03130.pdf)
# 3) perceiver - a bottleneck which projects the inputs to a fixed
#                number of steps using the Perceiver architecture (https://arxiv.org/pdf/2103.03206.pdf)
# 4) max_pool / avg_pool - a reduction that halves the number of steps at the end of
#                          every hidden block, using max pooling or average pooling.
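#
# Example usage (a minimal sketch): the loss and the bottleneck are selected purely via
# config overrides. The script name below assumes the Hydra-based NeMo machine
# translation entry point (e.g. enc_dec_nmt-bottleneck.py); adjust the script, config
# path, and data paths to your setup:
#
#   python enc_dec_nmt-bottleneck.py \
#     --config-path=conf --config-name=aayn_bottleneck \
#     model.model_type=mim \
#     model.encoder.arch=perceiver \
#     model.train_ds.src_file_name=/path/to/train.src \
#     model.train_ds.tgt_file_name=/path/to/train.tgt
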
name: AttentionIsAllYouNeedBottleneck

do_training: True # set to False if only preprocessing data
do_testing: False # set to True to run evaluation on test data after training

model:
  beam_size: 4
  len_pen: 0.6
  multilingual: False
  max_generation_delta: -1
  label_smoothing: 0.1
  shared_tokenizer: True # train tokenizer model across src and tgt train data
  preproc_out_dir: null # path to store data preprocessing outputs
  src_language: 'en'
  tgt_language: 'de'
  model_type: 'nll' # learning (i.e., loss) type: nll (i.e., cross-entropy/auto-encoder), mim, vae (see description above)
  min_logv: -6 # minimum allowed log variance for mim
  latent_size: -1 # dimension of the latent space (projected from hidden); -1 uses the hidden size
  non_recon_warmup_batches: 200000 # warm-up steps for the mim and vae losses
  recon_per_token: true # when false, reconstruction is computed per sample rather than per token

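  # A sketch of how the parameters above interact for the latent models (an assumption
  # based on the parameter names, not a statement of the exact implementation): for
  # mim/vae the loss is the reconstruction term plus a latent regularizer whose weight
  # is ramped in over non_recon_warmup_batches, e.g. roughly
  #   weight = min(1.0, global_step / non_recon_warmup_batches)
  #   loss   = reconstruction_nll + weight * latent_regularizer  # KL term for vae, MIM term for mim
  # With model_type: 'nll' only the usual cross-entropy term applies.
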
  train_ds:
    src_file_name: null
    tgt_file_name: null
    use_tarred_dataset: False # if true, tar_files and metadata_file will be used (or created automatically)
    # config for preprocessing training data and creating a tarred dataset automatically
    tar_file_prefix: parallel # prefix for tar file names
    tar_files: null # if data has already been preprocessed (rest of config ignored)
    metadata_file: null # metadata for tarred dataset
    lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding
    num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile
    tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled
    shard_strategy: scatter # tarred dataset shard distribution strategy
    n_preproc_jobs: -2 # number of processes to use for data preprocessing (-2 means all but 2)
    tokens_in_batch: 512
    clean: true
    max_seq_length: 512
    shuffle: true
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8
    concat_sampling_technique: temperature # only used with ConcatTranslationDataset
    concat_sampling_temperature: 5 # only used with ConcatTranslationDataset
    concat_sampling_probabilities: null # only used with ConcatTranslationDataset

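  # Example: pointing a later run at an already-preprocessed tarred dataset (a sketch;
  # the file names below are placeholders, the real ones are whatever the preprocessing
  # step wrote under preproc_out_dir):
  #
  #   model.train_ds.use_tarred_dataset=True \
  #   model.train_ds.tar_files=/path/to/preproc/parallel.*.tar \
  #   model.train_ds.metadata_file=/path/to/preproc/metadata.json
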
  validation_ds:
    src_file_name: ???
    tgt_file_name: ???
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  test_ds:
    src_file_name: ???
    tgt_file_name: ???
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  optim:
    name: adam
    lr: 0.001
    betas:
      - 0.9
      - 0.98
    weight_decay: 0.0
    sched:
      name: InverseSquareRootAnnealing
      min_lr: 0.0
      last_epoch: -1
      warmup_ratio: 0.1

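  # Schedule sketch (assuming the standard inverse-square-root / Noam-style decay; the
  # exact NeMo implementation may differ in detail): with warmup_ratio: 0.1, the LR ramps
  # linearly from 0 to lr over the first 10% of training steps, after which
  #   lr(step) ~ lr * sqrt(warmup_steps / step)
  # i.e. it decays proportionally to 1/sqrt(step), floored at min_lr.
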
  encoder_tokenizer:
    library: yttm
    tokenizer_model: null
    vocab_size: null # vocab size for training bpe
    bpe_dropout: null
    vocab_file: null
    special_tokens: null
    training_sample_size: null # valid for sentencepiece tokenizer

  decoder_tokenizer:
    library: yttm
    tokenizer_model: null
    vocab_size: null # vocab size for training bpe
    bpe_dropout: null
    vocab_file: null
    special_tokens: null
    training_sample_size: null # valid for sentencepiece tokenizer

  encoder:
    library: nemo
    model_name: null
    pretrained: false
    max_sequence_length: 512
    num_token_types: 0
    embedding_dropout: 0.1
    learn_positional_encodings: false
    hidden_size: 512
    num_layers: 6
    inner_size: 2048
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    hidden_act: relu
    mask_future: false
    pre_ln: false
    pre_ln_final_layer_norm: true
    arch: seq2seq # avg_pool, max_pool, seq2seq, bridge, perceiver (see description above)
    hidden_steps: 32 # fixed number of hidden steps
    hidden_blocks: 1 # number of repeat blocks (see classes for description)
    hidden_init_method: default # see classes for available values

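  # Example: a Perceiver-style bottleneck encoder, as described in the header comment
  # (a sketch; the values are illustrative, not recommended settings):
  #
  #   encoder:
  #     arch: perceiver
  #     hidden_steps: 32        # inputs are compressed to a fixed 32 latent steps
  #     hidden_blocks: 2        # repeat count of the bottleneck block
  #     hidden_init_method: default
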
  decoder:
    library: nemo
    model_name: null
    pretrained: false
    max_sequence_length: 512
    num_token_types: 0
    embedding_dropout: 0.1
    learn_positional_encodings: false
    hidden_size: 512
    inner_size: 2048
    num_layers: 6
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    hidden_act: relu
    pre_ln: false
    pre_ln_final_layer_norm: true
    arch: seq2seq # currently only seq2seq is supported

  head:
    num_layers: 1
    activation: relu
    log_softmax: true
    dropout: 0.0
    use_transformer_init: true

trainer:
  gpus: 4
  num_nodes: 1
  max_epochs: 200
  precision: 16 # should be set to 16 for O1 and O2; default is 16, and PT ignores it when amp_level is O0
  accelerator: ddp
  checkpoint_callback: False
  logger: False
  log_every_n_steps: 50 # interval of logging
  check_val_every_n_epoch: 1

exp_manager:
  name: AAYNBase
  files_to_copy: []