# DeepLearningExamples/PyTorch/LanguageModeling/BERT/configurations.yml
# 2019-09-10 17:21:52 +02:00
#
# 183 lines
# 3.3 KiB
# YAML

# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 1 DGX1 node, phase 1
bert--DGX1:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX1 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX1, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX1_VARS; values are quoted to stay strings.
    <<: *DGX1_VARS
    NNODES: "1"
    BATCHSIZE: "8192"
    LR: "6e-3"
    GRADIENT_STEPS: "512"
    PHASE: "1"
# 4 DGX1 nodes, phase 1
bert--DGX1_4x8x16x128:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX1 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX1, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX1_VARS; values are quoted to stay strings.
    <<: *DGX1_VARS
    NNODES: "4"
    BATCHSIZE: "2048"
    LR: "6e-3"
    GRADIENT_STEPS: "128"
    PHASE: "1"
# 16 DGX1 nodes, phase 1
bert--DGX1_16x8x16x32:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX1 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX1, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX1_VARS; values are quoted to stay strings.
    <<: *DGX1_VARS
    NNODES: "16"
    BATCHSIZE: "512"
    LR: "6e-3"
    GRADIENT_STEPS: "32"
    PHASE: "1"
# 1 DGX2 node, phase 1
bert--DGX2:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX2 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX2, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX2_VARS; values are quoted to stay strings.
    <<: *DGX2_VARS
    NNODES: "1"
    BATCHSIZE: "4096"
    LR: "6e-3"
    GRADIENT_STEPS: "64"
    PHASE: "1"
# 4 DGX2 nodes, phase 1
bert--DGX2_4x16x64x16:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX2 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX2, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX2_VARS; values are quoted to stay strings.
    <<: *DGX2_VARS
    NNODES: "4"
    BATCHSIZE: "1024"
    LR: "6e-3"
    GRADIENT_STEPS: "16"
    PHASE: "1"
# 16 DGX2 nodes, phase 1
bert--DGX2_16x16x64x4:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX2 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX2, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX2_VARS; values are quoted to stay strings.
    <<: *DGX2_VARS
    NNODES: "16"
    BATCHSIZE: "256"
    LR: "6e-3"
    GRADIENT_STEPS: "4"
    PHASE: "1"
# 64 DGX2 nodes, phase 1
bert--DGX2_64x16x64:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX2 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX2, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX2_VARS; values are quoted to stay strings.
    <<: *DGX2_VARS
    NNODES: "64"
    BATCHSIZE: "64"
    LR: "6e-3"
    GRADIENT_STEPS: "1"
    PHASE: "1"
# 1 DGX1 node, phase 2
bert--DGX1_1x8x4x1024:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX1 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX1, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX1_VARS; values are quoted to stay strings.
    <<: *DGX1_VARS
    NNODES: "1"
    BATCHSIZE: "4096"
    LR: "4e-3"
    GRADIENT_STEPS: "1024"
    PHASE: "2"
# 4 DGX1 nodes, phase 2
bert--DGX1_4x8x4x256:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX1 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX1, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX1_VARS; values are quoted to stay strings.
    <<: *DGX1_VARS
    NNODES: "4"
    BATCHSIZE: "1024"
    LR: "4e-3"
    GRADIENT_STEPS: "256"
    PHASE: "2"
# 16 DGX1 nodes, phase 2
bert--DGX1_16x8x4x64:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX1 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX1, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX1_VARS; values are quoted to stay strings.
    <<: *DGX1_VARS
    NNODES: "16"
    BATCHSIZE: "256"
    LR: "4e-3"
    GRADIENT_STEPS: "64"
    PHASE: "2"
# 1 DGX2 node, phase 2
bert--DGX2_1x16x8x256:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX2 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX2, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX2_VARS; values are quoted to stay strings.
    <<: *DGX2_VARS
    NNODES: "1"
    BATCHSIZE: "2048"
    LR: "4e-3"
    GRADIENT_STEPS: "256"
    PHASE: "2"
# 4 DGX2 nodes, phase 2
bert--DGX2_4x16x8x64:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX2 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX2, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX2_VARS; values are quoted to stay strings.
    <<: *DGX2_VARS
    NNODES: "4"
    BATCHSIZE: "512"
    LR: "4e-3"
    GRADIENT_STEPS: "64"
    PHASE: "2"
# 16 DGX2 nodes, phase 2
bert--DGX2_16x16x8x16:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX2 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX2, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX2_VARS; values are quoted to stay strings.
    <<: *DGX2_VARS
    NNODES: "16"
    BATCHSIZE: "128"
    LR: "4e-3"
    GRADIENT_STEPS: "16"
    PHASE: "2"
# 64 DGX2 nodes, phase 2
bert--DGX2_64x16x8x4:
  # Single merge key; repeating `<<` in one mapping is a duplicate key.
  # *DGX2 comes first so it still overrides *BERT_ON_CLUSTER on shared keys
  # (earlier entries win in a merge list) - verify against the CI parser.
  <<: [*DGX2, *BERT_ON_CLUSTER]
  variables:
    # Explicit keys override *DGX2_VARS; values are quoted to stay strings.
    <<: *DGX2_VARS
    NNODES: "64"
    BATCHSIZE: "32"
    LR: "4e-3"
    GRADIENT_STEPS: "4"
    PHASE: "2"