updating BERT (single node LAMB support)

Krzysztof Kudrynski 2019-08-13 23:27:54 +02:00
parent 7118f12b8a
commit bae6e931bd
71 changed files with 3302 additions and 32098 deletions

View file

@@ -1,27 +1,28 @@
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.06-py3
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.07-py3
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
ENV BERT_PREP_WORKING_DIR /workspace/bert/data
WORKDIR /opt
RUN rm -rf /opt/pytorch/apex ; \
git clone https://github.com/NVIDIA/apex.git pytorch/apex ; \
cd pytorch/apex ; \
pip uninstall --yes apex; \
git checkout 880ab925bce9f817a93988b021e12db5f67f7787; \
git pull; \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
#WORKDIR /opt
#RUN cd pytorch/apex \
# && git fetch origin pull/182/head:norm_fix \
# && git checkout norm_fix \
# && git fetch origin pull/334/head:multi_tensor_lamb_optimizer \
# && git checkout multi_tensor_lamb_optimizer \
# && python setup.py develop --cuda_ext --cpp_ext
WORKDIR /opt
RUN cd pytorch/apex ; \
pip uninstall apex; \
pip uninstall apex; \
git checkout master; \
git pull; \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
WORKDIR /workspace
RUN git clone https://github.com/attardi/wikiextractor.git
RUN git clone https://github.com/soskek/bookcorpus.git
WORKDIR /workspace/bert
RUN pip install tqdm boto3 requests six ipdb h5py html2text nltk progressbar
COPY . .
RUN pip install tqdm boto3 requests six ipdb h5py html2text nltk progressbar

File diff suppressed because it is too large

View file

@@ -0,0 +1,124 @@
import sys
import subprocess
import os
import socket
from argparse import ArgumentParser, REMAINDER
import torch
def parse_args():
"""
Helper function parsing the command line options
@retval ArgumentParser
"""
parser = ArgumentParser(description="PyTorch distributed training launch "
"helper utilty that will spawn up "
"multiple distributed processes")
# Optional arguments for the launch helper
parser.add_argument("--nnodes", type=int, default=1,
help="The number of nodes to use for distributed "
"training")
parser.add_argument("--node_rank", type=int, default=0,
help="The rank of the node for multi-node distributed "
"training")
parser.add_argument("--nproc_per_node", type=int, default=1,
help="The number of processes to launch on each node, "
"for GPU training, this is recommended to be set "
"to the number of GPUs in your system so that "
"each process can be bound to a single GPU.")
parser.add_argument("--master_addr", default="127.0.0.1", type=str,
help="Master node (rank 0)'s address, should be either "
"the IP address or the hostname of node 0, for "
"single node multi-proc training, the "
"--master_addr can simply be 127.0.0.1")
parser.add_argument("--master_port", default=29500, type=int,
help="Master node (rank 0)'s free port that needs to "
"be used for communciation during distributed "
"training")
parser.add_argument('--no_hyperthreads', action='store_true',
help='Flag to disable binding to hyperthreads')
parser.add_argument('--no_membind', action='store_true',
help='Flag to disable memory binding')
# non-optional arguments for binding
parser.add_argument("--nsockets_per_node", type=int, required=True,
help="Number of CPU sockets on a node")
parser.add_argument("--ncores_per_socket", type=int, required=True,
help="Number of CPU cores per socket")
# positional
parser.add_argument("training_script", type=str,
help="The full path to the single GPU training "
"program/script to be launched in parallel, "
"followed by all the arguments for the "
"training script")
# rest from the training program
parser.add_argument('training_script_args', nargs=REMAINDER)
return parser.parse_args()
def main():
args = parse_args()
    # variables for numactl binding
NSOCKETS = args.nsockets_per_node
NGPUS_PER_SOCKET = (args.nproc_per_node // args.nsockets_per_node) + (1 if (args.nproc_per_node % args.nsockets_per_node) else 0)
NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET
# world size in terms of number of processes
dist_world_size = args.nproc_per_node * args.nnodes
# set PyTorch distributed related environmental variables
current_env = os.environ.copy()
current_env["MASTER_ADDR"] = args.master_addr
current_env["MASTER_PORT"] = str(args.master_port)
current_env["WORLD_SIZE"] = str(dist_world_size)
processes = []
for local_rank in range(0, args.nproc_per_node):
# each process's rank
dist_rank = args.nproc_per_node * args.node_rank + local_rank
current_env["RANK"] = str(dist_rank)
        # form numactl binding command
cpu_ranges = [local_rank * NCORES_PER_GPU,
(local_rank + 1) * NCORES_PER_GPU - 1,
local_rank * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS),
(local_rank + 1) * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1]
numactlargs = []
if args.no_hyperthreads:
numactlargs += [ "--physcpubind={}-{}".format(*cpu_ranges[0:2]) ]
else:
numactlargs += [ "--physcpubind={}-{},{}-{}".format(*cpu_ranges) ]
if not args.no_membind:
memnode = local_rank // NGPUS_PER_SOCKET
numactlargs += [ "--membind={}".format(memnode) ]
# spawn the processes
cmd = [ "/usr/bin/numactl" ] \
+ numactlargs \
+ [ sys.executable,
"-u",
args.training_script,
"--local_rank={}".format(local_rank)
] \
+ args.training_script_args
process = subprocess.Popen(cmd, env=current_env)
processes.append(process)
for process in processes:
process.wait()
if __name__ == "__main__":
main()
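
For reference, a standalone sketch of the binding arithmetic used above, evaluated for a hypothetical node with 2 sockets, 24 cores per socket and 16 GPUs (the shape assumed by the DGX2 configs below). The helper name and example values are illustrative, not part of the launcher:

# Minimal sketch of the numactl ranges the launcher computes per local rank.
# Example values only: 2 sockets, 24 cores/socket, 16 processes (one per GPU).
def cpu_ranges(local_rank, nproc_per_node=16, nsockets=2, ncores_per_socket=24):
    ngpus_per_socket = (nproc_per_node // nsockets) + (1 if nproc_per_node % nsockets else 0)
    ncores_per_gpu = ncores_per_socket // ngpus_per_socket
    ht_offset = ncores_per_gpu * ngpus_per_socket * nsockets  # first hyperthread-sibling core id
    return (local_rank * ncores_per_gpu,
            (local_rank + 1) * ncores_per_gpu - 1,
            local_rank * ncores_per_gpu + ht_offset,
            (local_rank + 1) * ncores_per_gpu + ht_offset - 1)

print(cpu_ranges(0))   # (0, 2, 48, 50): physical cores 0-2 plus HT siblings 48-50
print(cpu_ranges(15))  # (45, 47, 93, 95); --membind would be 15 // 8 = socket 1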

View file

@@ -0,0 +1,21 @@
#!/bin/bash
## DL params
BATCHSIZE=16
LEARNING_RATE=6e-3
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results --fp16 --max_steps=7508 --num_steps_per_checkpoint=200"
## System run parms
DGXNNODES=1
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=00:15:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=8
DGXSOCKETCORES=20
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES=''
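
A quick back-of-the-envelope check of how the schedule values in this config relate, assuming (as in the usual BERT linear-warmup schedulers) that the warmup value is a fraction of max_steps; the variable names are illustrative only:

# Assumed semantics: WARMUP_UPDATES is the fraction of max_steps spent warming up.
max_steps = 7508
warmup_proportion = 0.2843
peak_lr = 6e-3
warmup_steps = int(warmup_proportion * max_steps)
print(warmup_steps)  # ~2134 steps of LR warmup toward 6e-3, then decay for the remainder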

View file

@@ -0,0 +1,21 @@
#!/bin/bash
## DL params
BATCHSIZE=4096
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=7038 --num_steps_per_checkpoint=2500 --log_freq=1 --gradient_accumulation_steps=64 --allreduce_post_accumulation --allreduce_post_accumulation_fp16"
## System run parms
DGXNNODES=1
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
WALLTIME="3-00:00:00"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'
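
A short sketch of how the batch-size knobs in this single-node large-batch config combine, assuming BATCHSIZE is the per-GPU batch that gradient_accumulation_steps splits into micro-batches (the usual wiring of the reference pretraining script); treat the numbers as an illustration of the config above, not a guarantee about the training code:

# Assumed semantics: the per-GPU batch is accumulated over 64 micro-batches.
batch_per_gpu = 4096          # BATCHSIZE
grad_accum_steps = 64         # --gradient_accumulation_steps
gpus = 1 * 16                 # DGXNNODES * DGXNGPU
micro_batch = batch_per_gpu // grad_accum_steps
global_batch = batch_per_gpu * gpus
print(micro_batch, global_batch)  # 64 samples per forward pass, 65536 per optimizer update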

View file

@@ -0,0 +1,19 @@
#!/bin/bash
## DL params
BATCHSIZE=256
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /checkpoints --fp16 --max_steps=7038 --num_steps_per_checkpoint=2500 --log_freq=1 --gradient_accumulation_steps=4 --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_from_checkpoint"
## System run parms
DGXNNODES=16
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,21 @@
#!/bin/bash
## DL params
BATCHSIZE=128
LEARNING_RATE="4e-3"
WARMUP_UPDATES=0.128
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1563 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=16 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=7038"
## System run parms
DGXNNODES=16
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,21 @@
#!/bin/bash
## DL params
BATCHSIZE=2048
LEARNING_RATE="4e-3"
WARMUP_UPDATES=0.128
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1563 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=256 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=7038"
## System run parms
DGXNNODES=1
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME="00:15:00"
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,28 @@
#!/bin/bash
## DL params
BATCHSIZE=64
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data \
--do_train \
--config_file=bert_config.json \
--max_seq_length=128 \
--max_predictions_per_seq=20 \
--output_dir=/results/output \
--fp16 \
--max_steps=7508 \
--num_steps_per_checkpoint=200 \
--log_freq=1"
## System run parms
DGXNNODES=2
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=00:30:00
DEADLINE=$(date -d '+72 hours' '+%FT%T')
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,22 @@
#!/bin/bash
## DL params
BATCHSIZE=96
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
SEED=23448
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --max_steps=7038 --num_steps_per_checkpoint=10000 --log_freq=1 --gradient_accumulation_steps=3"
## System run parms
DGXNNODES=46
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=05:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,19 @@
#!/bin/bash
## DL params
BATCHSIZE=96
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=7038 --num_steps_per_checkpoint=10000 --log_freq=1 --gradient_accumulation_steps=2 --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --accumulate_into_fp16"
## System run parms
DGXNNODES=46
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,21 @@
#!/bin/bash
## DL params
BATCHSIZE=48
LEARNING_RATE="4.12e-3"
WARMUP_UPDATES=0.138
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --max_steps=1450 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=12 --resume_from_checkpoint"
## System run parms
DGXNNODES=46
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=05:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,21 @@
#!/bin/bash
## DL params
BATCHSIZE=48
LEARNING_RATE="4.12e-3"
WARMUP_UPDATES=0.138
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1450 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=6 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --accumulate_into_fp16 --resume_step=7038"
## System run parms
DGXNNODES=46
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,19 @@
#!/bin/bash
## DL params
BATCHSIZE=128
LEARNING_RATE="6.5e-3"
WARMUP_UPDATES=0.5328
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=4692 --num_steps_per_checkpoint=10000 --log_freq=1 --gradient_accumulation_steps=2"
## System run parms
DGXNNODES=48
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,19 @@
#!/bin/bash
## DL params
BATCHSIZE=64
LEARNING_RATE="5e-3"
WARMUP_UPDATES=0.192
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1042 --num_steps_per_checkpoint=1000 --log_freq=1 --gradient_accumulation_steps=8 --resume_from_checkpoint --phase2 --allreduce_post_accumulation --allreduce_post_accumulation_fp16"
## System run parms
DGXNNODES=48
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,21 @@
#!/bin/bash
## DL params
BATCHSIZE=64
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --max_steps=7038 --num_steps_per_checkpoint=10000 --log_freq=1 --gradient_accumulation_steps=2"
## System run parms
DGXNNODES=64
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=04:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,21 @@
#!/bin/bash
## DL params
BATCHSIZE=32
LEARNING_RATE="4e-3"
WARMUP_UPDATES=0.128
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --max_steps=1563 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=8 --resume_from_checkpoint"
## System run parms
DGXNNODES=64
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=04:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,19 @@
#!/bin/bash
## DL params
BATCHSIZE=64
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=7038 --num_steps_per_checkpoint=10000 --log_freq=1"
## System run parms
DGXNNODES=64
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,21 @@
#!/bin/bash
## DL params
BATCHSIZE=32
LEARNING_RATE="4e-3"
WARMUP_UPDATES=0.128
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1563 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=4 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=7038"
## System run parms
DGXNNODES=64
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,19 @@
#!/bin/bash
## DL params
BATCHSIZE=64
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.256
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=782 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=8 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --accumulate_into_fp16 --resume_step=7038"
## System run parms
DGXNNODES=64
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,21 @@
#!/bin/bash
## DL params
BATCHSIZE=48
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=7038 --num_steps_per_checkpoint=10000 --log_freq=1"
## System run parms
DGXNNODES=92
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,21 @@
#!/bin/bash
## DL params
BATCHSIZE=64
LEARNING_RATE="6.5e-3"
WARMUP_UPDATES=0.5107
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=4896 --num_steps_per_checkpoint=10000 --log_freq=1"
## System run parms
DGXNNODES=92
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,22 @@
#!/bin/bash
## DL params
BATCHSIZE=24
LEARNING_RATE="4.12e-3"
WARMUP_UPDATES=0.138
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1450 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=3 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=7038"
## System run parms
DGXNNODES=92
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,21 @@
#!/bin/bash
## DL params
BATCHSIZE=32
LEARNING_RATE="5e-3"
WARMUP_UPDATES=0.192
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1088 --num_steps_per_checkpoint=1000 --log_freq=1 --gradient_accumulation_steps=4 --resume_from_checkpoint --phase2 --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=4896 --phase1_end_step=4896"
## System run parms
DGXNNODES=92
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,19 @@
#!/bin/bash
## DL params
BATCHSIZE=40
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=7508 --num_steps_per_checkpoint=10000 --log_freq=1"
## System run parms
DGXNNODES=96
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,19 @@
#!/bin/bash
## DL params
BATCHSIZE=64
LEARNING_RATE="6.5e-3"
WARMUP_UPDATES=0.5328
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=4692 --num_steps_per_checkpoint=10000 --log_freq=1"
## System run parms
DGXNNODES=96
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,19 @@
#!/bin/bash
## DL params
BATCHSIZE=24
LEARNING_RATE="4e-3"
WARMUP_UPDATES=0.144
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1390 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=3 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=7038"
## System run parms
DGXNNODES=96
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,19 @@
#!/bin/bash
## DL params
BATCHSIZE=32
LEARNING_RATE="5e-3"
WARMUP_UPDATES=0.192
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1042 --num_steps_per_checkpoint=1000 --log_freq=1 --gradient_accumulation_steps=4 --resume_from_checkpoint --phase2 --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=4896"
## System run parms
DGXNNODES=96
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,19 @@
#!/bin/bash
## DL params
BATCHSIZE=24
LEARNING_RATE="4e-3"
WARMUP_UPDATES=0.144
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1390 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=3 --resume_from_checkpoint"
## System run parms
DGXNNODES=1
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2 # 2 if hyperthreading (HT) is on, 1 if off
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

View file

@@ -0,0 +1,16 @@
# NVIDIA
import subprocess
class BooksDownloader:
def __init__(self, save_path):
self.save_path = save_path
pass
def download(self):
bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out'
bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus'
bookscorpus_download_command += ' --trash-bad-count'
        # subprocess.run blocks until the download script finishes; check=True raises on failure
        subprocess.run(bookscorpus_download_command, shell=True, check=True)
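
A minimal usage sketch for the class above; the save path is only an example (it matches BERT_PREP_WORKING_DIR from the Dockerfile), and the bookcorpus download script is expected at the location cloned there:

# Illustrative only: fetches BookCorpus text into <save_path>/bookscorpus.
BooksDownloader('/workspace/bert/data').download()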

View file

@@ -0,0 +1,21 @@
# NVIDIA
import glob
import os
class BookscorpusTextFormatting:
def __init__(self, books_path, output_filename, recursive = False):
self.books_path = books_path
self.recursive = recursive
self.output_filename = output_filename
# This puts one book per line
def merge(self):
with open(self.output_filename, mode='w', newline='\n') as ofile:
            for filename in glob.glob(self.books_path + ('/**/*.txt' if self.recursive else '/*.txt'), recursive=self.recursive):
with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file:
for line in file:
if line.strip() != '':
ofile.write(line.strip() + ' ')
ofile.write("\n\n")
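
A hedged usage sketch for the formatter above (paths are examples only): it writes each book as a single line, with a blank line between books, as the in-code comment describes:

# Illustrative only.
formatter = BookscorpusTextFormatting('/workspace/bert/data/bookscorpus',
                                      '/workspace/bert/data/bookscorpus_one_book_per_line.txt')
formatter.merge()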

View file

@@ -0,0 +1,80 @@
# NVIDIA
from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader
from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader
from WikiDownloader import WikiDownloader
from BooksDownloader import BooksDownloader
from MRPCDownloader import MRPCDownloader
from SquadDownloader import SquadDownloader
class Downloader:
def __init__(self, dataset_name, save_path):
self.dataset_name = dataset_name
self.save_path = save_path
def download(self):
if self.dataset_name == 'bookscorpus':
self.download_bookscorpus()
elif self.dataset_name == 'wikicorpus_en':
self.download_wikicorpus('en')
elif self.dataset_name == 'wikicorpus_zh':
self.download_wikicorpus('zh')
elif self.dataset_name == 'google_pretrained_weights':
self.download_google_pretrained_weights()
elif self.dataset_name == 'nvidia_pretrained_weights':
self.download_nvidia_pretrained_weights()
elif self.dataset_name == 'mrpc':
self.download_mrpc()
elif self.dataset_name == 'squad':
self.download_squad()
        elif self.dataset_name == 'all':
            # the individual helpers read save_path from self and take no argument
            self.download_bookscorpus()
            self.download_wikicorpus('en')
            self.download_wikicorpus('zh')
            self.download_google_pretrained_weights()
            self.download_nvidia_pretrained_weights()
            self.download_mrpc()
            self.download_squad()
else:
print(self.dataset_name)
assert False, 'Unknown dataset_name provided to downloader'
def download_bookscorpus(self):
downloader = BooksDownloader(self.save_path)
downloader.download()
def download_wikicorpus(self, language):
downloader = WikiDownloader(language, self.save_path)
downloader.download()
def download_google_pretrained_weights(self):
downloader = GooglePretrainedWeightDownloader(self.save_path)
downloader.download()
def download_nvidia_pretrained_weights(self):
downloader = NVIDIAPretrainedWeightDownloader(self.save_path)
downloader.download()
def download_mrpc(self):
downloader = MRPCDownloader(self.save_path)
downloader.download()
def download_squad(self):
downloader = SquadDownloader(self.save_path)
downloader.download()
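
A minimal sketch of how the dispatcher above might be driven; the dataset names are among the keys handled in download(), and the save path is only an example matching BERT_PREP_WORKING_DIR from the Dockerfile:

# Illustrative only: fetch a few datasets one by one, or pass 'all' for everything.
for name in ('wikicorpus_en', 'bookscorpus', 'squad'):
    Downloader(name, '/workspace/bert/data').download()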

View file

@@ -0,0 +1,147 @@
# NVIDIA
import hashlib
import os
import urllib.request
import zipfile
class GooglePretrainedWeightDownloader:
def __init__(self, save_path):
self.save_path = save_path + '/google_pretrained_weights'
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
# Download urls
self.model_urls = {
'bert_base_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'),
'bert_large_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'),
'bert_base_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'),
'bert_large_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'),
'bert_base_multilingual_cased': ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'),
'bert_large_multilingual_uncased': ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'),
'bert_base_chinese': ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip')
}
# SHA256sum verification for file download integrity (and checking for changes from the download source over time)
self.bert_base_uncased_sha = {
'bert_config.json': '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc',
'bert_model.ckpt.data-00000-of-00001': '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84',
'bert_model.ckpt.index': '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b',
'bert_model.ckpt.meta': 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e',
'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}
self.bert_large_uncased_sha = {
'bert_config.json': 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb',
'bert_model.ckpt.data-00000-of-00001': 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1',
'bert_model.ckpt.index': '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093',
'bert_model.ckpt.meta': '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1',
'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}
self.bert_base_cased_sha = {
'bert_config.json': 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc',
'bert_model.ckpt.data-00000-of-00001': '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea',
'bert_model.ckpt.index': '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1',
'bert_model.ckpt.meta': '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98',
'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}
self.bert_large_cased_sha = {
'bert_config.json': '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57',
'bert_model.ckpt.data-00000-of-00001': '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0',
'bert_model.ckpt.index': 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf',
'bert_model.ckpt.meta': 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1',
'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}
self.bert_base_multilingual_cased_sha = {
'bert_config.json': 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0',
'bert_model.ckpt.data-00000-of-00001': '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5',
'bert_model.ckpt.index': '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37',
'bert_model.ckpt.meta': '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa',
'vocab.txt': 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c',
}
self.bert_large_multilingual_uncased_sha = {
'bert_config.json': '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624',
'bert_model.ckpt.data-00000-of-00001': '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429',
'bert_model.ckpt.index': '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7',
'bert_model.ckpt.meta': '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29',
'vocab.txt': '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f',
}
self.bert_base_chinese_sha = {
'bert_config.json': '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015',
'bert_model.ckpt.data-00000-of-00001': '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba',
'bert_model.ckpt.index': '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e',
'bert_model.ckpt.meta': 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047',
'vocab.txt': '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c',
}
# Relate SHA to urls for loop below
self.model_sha = {
'bert_base_uncased': self.bert_base_uncased_sha,
'bert_large_uncased': self.bert_large_uncased_sha,
'bert_base_cased': self.bert_base_cased_sha,
'bert_large_cased': self.bert_large_cased_sha,
'bert_base_multilingual_cased': self.bert_base_multilingual_cased_sha,
'bert_large_multilingual_uncased': self.bert_large_multilingual_uncased_sha,
'bert_base_chinese': self.bert_base_chinese_sha
}
# Helper to get sha256sum of a file
def sha256sum(self, filename):
h = hashlib.sha256()
b = bytearray(128*1024)
mv = memoryview(b)
with open(filename, 'rb', buffering=0) as f:
for n in iter(lambda : f.readinto(mv), 0):
h.update(mv[:n])
return h.hexdigest()
def download(self):
# Iterate over urls: download, unzip, verify sha256sum
found_mismatch_sha = False
for model in self.model_urls:
url = self.model_urls[model][0]
file = self.save_path + '/' + self.model_urls[model][1]
print('Downloading', url)
response = urllib.request.urlopen(url)
with open(file, 'wb') as handle:
handle.write(response.read())
print('Unzipping', file)
zip = zipfile.ZipFile(file, 'r')
zip.extractall(self.save_path)
zip.close()
sha_dict = self.model_sha[model]
for extracted_file in sha_dict:
sha = sha_dict[extracted_file]
if sha != self.sha256sum(file[:-4] + '/' + extracted_file):
found_mismatch_sha = True
print('SHA256sum does not match on file:', extracted_file, 'from download url:', url)
else:
print(file[:-4] + '/' + extracted_file, '\t', 'verified')
if not found_mismatch_sha:
print("All downloads pass sha256sum verification.")
def serialize(self):
pass
def deserialize(self):
pass
def listAvailableWeights(self):
print("Available Weight Datasets")
for item in self.model_urls:
print(item)
def listLocallyStoredWeights(self):
pass
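A small hedged sketch of the verification pattern implemented in download() above: hash one extracted file with sha256sum() and compare it against the stored digest (the save path and extracted layout are assumed to match what download() produces).
dl = GooglePretrainedWeightDownloader('/workspace/bert/data')
expected = dl.bert_base_uncased_sha['vocab.txt']
actual = dl.sha256sum(dl.save_path + '/uncased_L-12_H-768_A-12/vocab.txt')
print('vocab.txt verified' if actual == expected else 'vocab.txt mismatch')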

View file

@ -0,0 +1,33 @@
# NVIDIA
import bz2
import os
import urllib.request
import sys
class MRPCDownloader:
def __init__(self, save_path):
self.save_path = save_path + '/mrpc'
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
# Documentation - Download link obtained from here: https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py
self.download_urls = {
'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc' : 'mrpc_dev_ids.tsv'
}
def download(self):
for item in self.download_urls:
url = item
file = self.download_urls[item]
print('Downloading:', url)
if os.path.isfile(self.save_path + '/' + file):
print('** Download file already exists, skipping download')
else:
response = urllib.request.urlopen(url)
with open(self.save_path + '/' + file, "wb") as handle:
handle.write(response.read())

View file

@ -0,0 +1,16 @@
# NVIDIA
import os
class NVIDIAPretrainedWeightDownloader:
def __init__(self, save_path):
self.save_path = save_path + '/nvidia_pretrained_weights'
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
pass
def download(self):
assert False, 'NVIDIAPretrainedWeightDownloader not implemented yet.'

View file

@ -0,0 +1,43 @@
# NVIDIA
import bz2
import os
import urllib.request
import sys
class SquadDownloader:
def __init__(self, save_path):
self.save_path = save_path + '/squad'
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
if not os.path.exists(self.save_path + '/v1.1'):
os.makedirs(self.save_path + '/v1.1')
if not os.path.exists(self.save_path + '/v2.0'):
os.makedirs(self.save_path + '/v2.0')
self.download_urls = {
'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json',
'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json',
'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py',
'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json',
'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json',
'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py',
}
def download(self):
for item in self.download_urls:
url = item
file = self.download_urls[item]
print('Downloading:', url)
if os.path.isfile(self.save_path + '/' + file):
print('** Download file already exists, skipping download')
else:
response = urllib.request.urlopen(url)
with open(self.save_path + '/' + file, "wb") as handle:
handle.write(response.read())

View file

@ -0,0 +1,316 @@
# NVIDIA
from collections import defaultdict
from itertools import islice
import multiprocessing
import statistics
class Sharding:
def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards, fraction_test_set):
assert len(input_files) > 0, 'The input file list must contain at least one file.'
assert n_training_shards > 0, 'There must be at least one training shard.'
assert n_test_shards > 0, 'There must be at least one test shard.'
self.n_training_shards = n_training_shards
self.n_test_shards = n_test_shards
self.fraction_test_set = fraction_test_set
self.input_files = input_files
self.output_name_prefix = output_name_prefix
self.output_training_identifier = '_training'
self.output_test_identifier = '_test'
self.output_file_extension = '.txt'
self.articles = {} # key: integer identifier, value: article text
self.sentences = {} # key: integer identifier, value: list of sentences
self.output_training_files = {} # key: filename, value: list of article ids to go into file
self.output_test_files = {} # key: filename, value: list of article ids to go into file
self.init_output_files()
# Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines)
def load_articles(self):
print('Start: Loading Articles')
global_article_count = 0
for input_file in self.input_files:
print('input file:', input_file)
with open(input_file, mode='r', newline='\n') as f:
for i, line in enumerate(f):
if line.strip():
self.articles[global_article_count] = line.rstrip()
global_article_count += 1
print('End: Loading Articles: There are', len(self.articles), 'articles.')
def segment_articles_into_sentences(self, segmenter):
print('Start: Sentence Segmentation')
if len(self.articles) == 0:
self.load_articles()
assert len(self.articles) != 0, 'Please check that input files are present and contain data.'
# TODO: WIP: multiprocessing (create independent ranges and spawn processes)
use_multiprocessing = 'serial'
def chunks(data, size=len(self.articles)):
it = iter(data)
for i in range(0, len(data), size):
yield {k: data[k] for k in islice(it, size)}
if use_multiprocessing == 'manager':
manager = multiprocessing.Manager()
return_dict = manager.dict()
jobs = []
n_processes = 7 # in addition to the main process, total = n_proc+1
def work(articles, return_dict):
sentences = {}
for i, article in enumerate(articles):
sentences[i] = segmenter.segment_string(articles[article])
if i % 5000 == 0:
print('Segmenting article', i)
return_dict.update(sentences)
for item in chunks(self.articles, len(self.articles)):
p = multiprocessing.Process(target=work, args=(item, return_dict))
# Busy wait
while len(jobs) >= n_processes:
pass
jobs.append(p)
p.start()
for proc in jobs:
proc.join()
elif use_multiprocessing == 'queue':
work_queue = multiprocessing.Queue()
jobs = []
for item in chunks(self.articles, len(self.articles)):
pass
else: # serial option
for i, article in enumerate(self.articles):
self.sentences[i] = segmenter.segment_string(self.articles[article])
if i % 5000 == 0:
print('Segmenting article', i)
print('End: Sentence Segmentation')
def init_output_files(self):
print('Start: Init Output Files')
assert len(self.output_training_files) == 0, 'Internal storage self.output_training_files already contains data. This function is intended to be used by the constructor only.'
assert len(self.output_test_files) == 0, 'Internal storage self.output_test_files already contains data. This function is intended to be used by the constructor only.'
for i in range(self.n_training_shards):
name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension
self.output_training_files[name] = []
for i in range(self.n_test_shards):
name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension
self.output_test_files[name] = []
print('End: Init Output Files')
def get_sentences_per_shard(self, shard):
result = 0
for article_id in shard:
result += len(self.sentences[article_id])
return result
def distribute_articles_over_shards(self):
print('Start: Distribute Articles Over Shards')
assert len(self.articles) >= self.n_training_shards + self.n_test_shards, 'There are fewer articles than shards. Please add more data or reduce the number of shards requested.'
# Create dictionary with key: sentence count per article, value: list of article ids with that count
sentence_counts = defaultdict(lambda: [])
max_sentences = 0
total_sentences = 0
for article_id in self.sentences:
current_length = len(self.sentences[article_id])
sentence_counts[current_length].append(article_id)
max_sentences = max(max_sentences, current_length)
total_sentences += current_length
n_sentences_assigned_to_training = int((1 - self.fraction_test_set) * total_sentences)
nominal_sentences_per_training_shard = n_sentences_assigned_to_training // self.n_training_shards
nominal_sentences_per_test_shard = (total_sentences - n_sentences_assigned_to_training) // self.n_test_shards
consumed_article_set = set({})
unused_article_set = set(self.articles.keys())
# Make first pass and add one article worth of lines per file
for file in self.output_training_files:
current_article_id = sentence_counts[max_sentences][-1]
sentence_counts[max_sentences].pop(-1)
self.output_training_files[file].append(current_article_id)
consumed_article_set.add(current_article_id)
unused_article_set.remove(current_article_id)
# Maintain the max sentence count
while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
max_sentences -= 1
if len(self.sentences[current_article_id]) > nominal_sentences_per_training_shard:
nominal_sentences_per_training_shard = len(self.sentences[current_article_id])
print('Warning: A single article contains more than the nominal number of sentences per training shard.')
for file in self.output_test_files:
current_article_id = sentence_counts[max_sentences][-1]
sentence_counts[max_sentences].pop(-1)
self.output_test_files[file].append(current_article_id)
consumed_article_set.add(current_article_id)
unused_article_set.remove(current_article_id)
# Maintain the max sentence count
while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
max_sentences -= 1
if len(self.sentences[current_article_id]) > nominal_sentences_per_test_shard:
nominal_sentences_per_test_shard = len(self.sentences[current_article_id])
print('Warning: A single article contains more than the nominal number of sentences per test shard.')
training_counts = []
test_counts = []
for shard in self.output_training_files:
training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))
for shard in self.output_test_files:
test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))
training_median = statistics.median(training_counts)
test_median = statistics.median(test_counts)
# Make subsequent passes over files to find articles to add without going over limit
history_remaining = []
n_history_remaining = 4
while len(consumed_article_set) < len(self.articles):
for fidx, file in enumerate(self.output_training_files):
nominal_next_article_size = min(nominal_sentences_per_training_shard - training_counts[fidx], max_sentences)
# Maintain the max sentence count
while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
max_sentences -= 1
while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
nominal_next_article_size -= 1
if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or training_counts[fidx] > training_median:
continue # skip adding to this file, will come back later if no file can accept unused articles
current_article_id = sentence_counts[nominal_next_article_size][-1]
sentence_counts[nominal_next_article_size].pop(-1)
self.output_training_files[file].append(current_article_id)
consumed_article_set.add(current_article_id)
unused_article_set.remove(current_article_id)
for fidx, file in enumerate(self.output_test_files):
nominal_next_article_size = min(nominal_sentences_per_test_shard - test_counts[fidx], max_sentences)
# Maintain the max sentence count
while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
max_sentences -= 1
while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
nominal_next_article_size -= 1
if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or test_counts[fidx] > test_median:
continue # skip adding to this file, will come back later if no file can accept unused articles
current_article_id = sentence_counts[nominal_next_article_size][-1]
sentence_counts[nominal_next_article_size].pop(-1)
self.output_test_files[file].append(current_article_id)
consumed_article_set.add(current_article_id)
unused_article_set.remove(current_article_id)
# If no articles could be placed for a few consecutive passes, bump up the nominal shard size until the remaining articles get placed
if len(history_remaining) == n_history_remaining:
history_remaining.pop(0)
history_remaining.append(len(unused_article_set))
history_same = True
for i in range(1, len(history_remaining)):
history_same = history_same and (history_remaining[i-1] == history_remaining[i])
if history_same:
nominal_sentences_per_training_shard += 1
# nominal_sentences_per_test_shard += 1
training_counts = []
test_counts = []
for shard in self.output_training_files:
training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))
for shard in self.output_test_files:
test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))
training_median = statistics.median(training_counts)
test_median = statistics.median(test_counts)
print('Distributing data over shards:', len(unused_article_set), 'articles remaining.')
if len(unused_article_set) != 0:
print('Warning: Some articles did not make it into output files.')
for shard in self.output_training_files:
print('Training shard:', self.get_sentences_per_shard(self.output_training_files[shard]))
for shard in self.output_test_files:
print('Test shard:', self.get_sentences_per_shard(self.output_test_files[shard]))
print('End: Distribute Articles Over Shards')
def write_shards_to_disk(self):
print('Start: Write Shards to Disk')
for shard in self.output_training_files:
self.write_single_shard(shard, self.output_training_files[shard])
for shard in self.output_test_files:
self.write_single_shard(shard, self.output_test_files[shard])
print('End: Write Shards to Disk')
def write_single_shard(self, shard_name, shard):
with open(shard_name, mode='w', newline='\n') as f:
for article_id in shard:
for line in self.sentences[article_id]:
f.write(line + '\n')
f.write('\n') # Line break between articles
import nltk
nltk.download('punkt')
class NLTKSegmenter:
def __init__(self):
pass
def segment_string(self, article):
return nltk.tokenize.sent_tokenize(article)
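A minimal end-to-end sketch of this module (a hedged illustration mirroring how bertPrep.py drives it below; paths and shard counts are only examples):
segmenter = NLTKSegmenter()
sharding = Sharding(
    ['/workspace/bert/data/formatted_one_article_per_line/wikicorpus_en_one_article_per_line.txt'],
    '/workspace/bert/data/sharded/wikicorpus_en/wikicorpus_en',
    n_training_shards=256, n_test_shards=256, fraction_test_set=0.2)
sharding.load_articles()                              # expects one article per input line
sharding.segment_articles_into_sentences(segmenter)
sharding.distribute_articles_over_shards()
sharding.write_shards_to_disk()                       # one sentence per line, blank line between articles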

View file

@ -0,0 +1,58 @@
# NVIDIA
import bz2
import os
import urllib.request
import sys
class WikiDownloader:
def __init__(self, language, save_path):
self.save_path = save_path + '/wikicorpus_' + language
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
self.language = language
self.download_urls = {
'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
}
self.output_files = {
'en' : 'wikicorpus_en.xml.bz2',
'zh' : 'wikicorpus_zh.xml.bz2'
}
def download(self):
if self.language in self.download_urls:
url = self.download_urls[self.language]
file = self.output_files[self.language]
print('Downloading:', url)
if os.path.isfile(self.save_path + '/' + file):
print('** Download file already exists, skipping download')
else:
response = urllib.request.urlopen(url)
with open(self.save_path + '/' + file, "wb") as handle:
handle.write(response.read())
# Always unzipping since this is relatively fast and will overwrite
print('Unzipping:', self.output_files[self.language])
#with open(self.save_path + '/' + file, mode='rb', buffering=131072) as f:
# it = iter(lambda: f.read(131072), b'')
# self.decompression(it, sys.stdout.buffer)
zip = bz2.BZ2File(self.save_path + '/' + file)
open(self.save_path + '/wikicorpus_' + self.language + '.xml', mode='wb', buffering=131072).write(zip.read())
else:
assert False, 'WikiDownloader not implemented for this language yet.'
def decompression(self, input, output):
decomp = bz2.BZ2Decompressor()
for chunk in input:
dc = decomp.decompress(chunk)
output.write(dc)
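The zip.read() call above decompresses the whole dump into memory at once; a streaming alternative that reuses the decompression() helper might look like the following sketch (buffer size is illustrative, matching the commented-out code above):
def unzip_streaming(self):
    src = self.save_path + '/' + self.output_files[self.language]
    dst = self.save_path + '/wikicorpus_' + self.language + '.xml'
    with open(src, mode='rb', buffering=131072) as fin, open(dst, mode='wb') as fout:
        chunks = iter(lambda: fin.read(131072), b'')   # yields fixed-size chunks until EOF
        self.decompression(chunks, fout)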

View file

@ -0,0 +1,35 @@
# NVIDIA
import glob
import os
class WikicorpusTextFormatting:
def __init__(self, wiki_path, output_filename, recursive = False):
self.wiki_path = wiki_path
self.recursive = recursive
self.output_filename = output_filename
# This puts one article per line
def merge(self):
with open(self.output_filename, mode='w', newline='\n') as ofile:
for dirname in glob.glob(self.wiki_path + '/*/', recursive=False):
for filename in glob.glob(dirname + 'wiki_*', recursive=self.recursive):
print(filename)
article_lines = []
article_open = False
with open(filename, mode='r', newline='\n') as file:
for line in file:
if '<doc id=' in line:
article_open = True
elif '</doc>' in line:
article_open = False
for oline in article_lines[1:]:
if oline != '\n':
ofile.write(oline.rstrip() + " ")
ofile.write("\n\n")
article_lines = []
else:
if article_open:
article_lines.append(line)
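A hedged toy example (invented content, temporary paths) showing the wikiextractor 'doc format' this merge step consumes and the one-article-per-line output it produces:
import os
os.makedirs('/tmp/wiki_demo/AA', exist_ok=True)
with open('/tmp/wiki_demo/AA/wiki_00', 'w') as f:
    f.write('<doc id="1" title="Example">\n')
    f.write('Example\n')                               # title line, dropped by article_lines[1:]
    f.write('First sentence. Second sentence.\n')
    f.write('</doc>\n')
WikicorpusTextFormatting('/tmp/wiki_demo', '/tmp/wiki_demo/out.txt').merge()
print(open('/tmp/wiki_demo/out.txt').read())           # -> 'First sentence. Second sentence. \n\n'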

View file

@ -0,0 +1,345 @@
# NVIDIA
import BookscorpusTextFormatting
import Downloader
import TextSharding
import WikicorpusTextFormatting
import argparse
import itertools
import multiprocessing
import os
import pprint
import subprocess
def main(args):
working_dir = os.environ['BERT_PREP_WORKING_DIR']
print('Working Directory:', working_dir)
print('Action:', args.action)
print('Dataset Name:', args.dataset)
if args.input_files:
args.input_files = args.input_files.split(',')
directory_structure = {
'download' : working_dir + '/download', # Downloaded and decompressed
'extracted' : working_dir +'/extracted', # Extracted from whatever the initial format is (e.g., wikiextractor)
'formatted' : working_dir + '/formatted_one_article_per_line', # This is the level where all sources should look the same
'sharded' : working_dir + '/sharded',
'tfrecord' : working_dir + '/tfrecord',
'hdf5': working_dir + '/hdf5'
}
print('\nDirectory Structure:')
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(directory_structure)
print('')
if args.action == 'download':
if not os.path.exists(directory_structure['download']):
os.makedirs(directory_structure['download'])
downloader = Downloader.Downloader(args.dataset, directory_structure['download'])
downloader.download()
elif args.action == 'text_formatting':
assert args.dataset not in ('google_pretrained_weights', 'nvidia_pretrained_weights', 'squad', 'mrpc'), 'Cannot perform text_formatting on pretrained weights or fine-tuning datasets'
if not os.path.exists(directory_structure['extracted']):
os.makedirs(directory_structure['extracted'])
if not os.path.exists(directory_structure['formatted']):
os.makedirs(directory_structure['formatted'])
if args.dataset == 'bookscorpus':
books_path = directory_structure['download'] + '/bookscorpus'
#books_path = directory_structure['download']
output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt'
books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True)
books_formatter.merge()
elif args.dataset == 'wikicorpus_en':
if args.skip_wikiextractor == 0:
path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
#wikiextractor_process.communicate()
wiki_path = directory_structure['extracted'] + '/wikicorpus_en'  # already rooted at working_dir
output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
wiki_formatter.merge()
elif args.dataset == 'wikicorpus_zh':
assert False, 'wikicorpus_zh is not fully supported at this time. The simplified/traditional Chinese data still needs to be translated and properly segmented; this path should work once that step is added.'
if args.skip_wikiextractor == 0:
path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
#wikiextractor_process.communicate()
wiki_path = directory_structure['extracted'] + '/wikicorpus_zh'  # already rooted at working_dir
output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'
wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
wiki_formatter.merge()
elif args.action == 'sharding':
# Note: books+wiki requires user to provide list of input_files (comma-separated with no spaces)
if args.dataset == 'bookscorpus' or 'wikicorpus' in args.dataset or 'books_wiki' in args.dataset:
if args.input_files is None:
if args.dataset == 'bookscorpus':
args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt']
elif args.dataset == 'wikicorpus_en':
args.input_files = [directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
elif args.dataset == 'wikicorpus_zh':
args.input_files = [directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt']
elif args.dataset == 'books_wiki_en_corpus':
args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
if args.output_file_prefix is None:
args.output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset
if not os.path.exists(directory_structure['sharded']):
os.makedirs(directory_structure['sharded'])
if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset):
os.makedirs(directory_structure['sharded'] + '/' + args.dataset)
# Segmentation is here because all datasets look the same in one article/book/whatever per line format, and
# it seemed unnecessarily complicated to add an additional preprocessing step to call just for this.
# Different languages (e.g., Chinese simplified/traditional) may require translation and
# other packages to be called from here -- just add a conditional branch for those extra steps
segmenter = TextSharding.NLTKSegmenter()
sharding = TextSharding.Sharding(args.input_files, args.output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set)
sharding.load_articles()
sharding.segment_articles_into_sentences(segmenter)
sharding.distribute_articles_over_shards()
sharding.write_shards_to_disk()
else:
assert False, 'Unsupported dataset for sharding'
elif args.action == 'create_tfrecord_files':
assert False, 'TFRecord creation is not supported in this PyTorch model example release.'
if not os.path.exists(directory_structure['tfrecord']):
os.makedirs(directory_structure['tfrecord'])
def create_record_worker(filename_prefix, shard_id, output_format='tfrecord'):
bert_preprocessing_command = 'python /workspace/bert/create_pretraining_data.py'
bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
bert_preprocessing_command += ' --do_lower_case=' + ('true' if args.do_lower_case else 'false')
bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
bert_preprocessing_process.communicate()
last_process = bert_preprocessing_process
# This could be better optimized (fine if all take equal time)
if shard_id % args.n_processes == 0 and shard_id > 0:
bert_preprocessing_process.wait()
return last_process
for i in range(args.n_training_shards):
last_process = create_record_worker(args.output_file_prefix + '_training', i)
last_process.wait()
for i in range(args.n_test_shards):
last_process = create_record_worker(args.output_file_prefix + '_test', i)
last_process.wait()
elif args.action == 'create_hdf5_files':
last_process = None
def create_record_worker(filename_prefix, shard_id, output_format='hdf5'):
bert_preprocessing_command = 'python /workspace/bert/create_pretraining_data.py'
bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
bert_preprocessing_process.communicate()
last_process = bert_preprocessing_process
# This could be better optimized (fine if all take equal time)
if shard_id % args.n_processes == 0 and shard_id > 0:
bert_preprocessing_process.wait()
return last_process
for i in range(args.n_training_shards):
last_process = create_record_worker(args.output_file_prefix + '_training', i)
last_process.wait()
for i in range(args.n_test_shards):
last_process = create_record_worker(args.output_file_prefix + '_test', i)
last_process.wait()
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Preprocessing Application for Everything BERT-related'
)
parser.add_argument(
'--action',
type=str,
help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords',
choices={
'download', # Download and verify md5/sha sums
'text_formatting', # Convert into a file that contains one article/book per line
'sharding', # Convert previous formatted text into shards containing one sentence per line
'create_tfrecord_files', # Turn each shard into a TFrecord with masking and next sentence prediction info
'create_hdf5_files' # Turn each shard into an HDF5 file with masking and next sentence prediction info
}
)
parser.add_argument(
'--dataset',
type=str,
help='Specify the dataset to perform --action on',
choices={
'bookscorpus',
'wikicorpus_en',
'wikicorpus_zh',
'books_wiki_en_corpus',
'google_pretrained_weights',
'nvidia_pretrained_weights',
'mrpc',
'squad',
'all'
}
)
parser.add_argument(
'--input_files',
type=str,
help='Specify the input files in a comma-separated list (no spaces)'
)
parser.add_argument(
'--output_file_prefix',
type=str,
help='Specify the naming convention (prefix) of the output files'
)
parser.add_argument(
'--n_training_shards',
type=int,
help='Specify the number of training shards to generate',
default=256
)
parser.add_argument(
'--n_test_shards',
type=int,
help='Specify the number of test shards to generate',
default=256
)
parser.add_argument(
'--fraction_test_set',
type=float,
help='Specify the fraction (0..1) of the data to withhold for the test data split (based on number of sequences)',
default=0.2
)
parser.add_argument(
'--segmentation_method',
type=str,
help='Specify your choice of sentence segmentation',
choices={
'nltk'
},
default='nltk'
)
parser.add_argument(
'--n_processes',
type=int,
help='Specify the max number of processes to allow at one time',
default=4
)
parser.add_argument(
'--random_seed',
type=int,
help='Specify the base seed to use for any random number generation',
default=12345
)
parser.add_argument(
'--dupe_factor',
type=int,
help='Specify the duplication factor',
default=5
)
parser.add_argument(
'--masked_lm_prob',
type=float,
help='Specify the probability for masked lm',
default=0.15
)
parser.add_argument(
'--max_seq_length',
type=int,
help='Specify the maximum sequence length',
default=512
)
parser.add_argument(
'--max_predictions_per_seq',
type=int,
help='Specify the maximum number of masked words per sequence',
default=20
)
parser.add_argument(
'--do_lower_case',
type=int,
help='Specify whether it is cased (0) or uncased (1) (any number greater than 0 will be treated as uncased)',
default=1
)
parser.add_argument(
'--vocab_file',
type=str,
help='Specify the absolute path to the vocab file to use'
)
parser.add_argument(
'--skip_wikiextractor',
type=int,
help='Specify whether to skip the wikiextractor step (0=False, 1=True)',
default=0
)
parser.add_argument(
'--interactive_json_config_generator',
type=str,
help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords'
)
args = parser.parse_args()
main(args)

View file

@ -1,23 +0,0 @@
# NVIDIA
import glob
import os
import argparse
parser = argparse.ArgumentParser(description='Cleaning and merge downloaded bookcorpus files')
parser.add_argument('download_path', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
download_path = args.download_path
output_file = args.output_file
with open(output_file, "w") as ofile:
for filename in glob.glob('{}/*.txt'.format(download_path), recursive=True):
with open(filename, mode='r', encoding="utf-8-sig") as file:
for line in file:
if line.strip() != "":
ofile.write(line.strip() + " ")
ofile.write("\n\n")

View file

@ -1,9 +0,0 @@
#! /bin/bash
# Download books
mkdir -p ./download
python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out ./download --trash-bad-count
# Clean and prep (one book per line)
python3 ./clean_and_merge_text.py ./download bookcorpus.txt

View file

@ -1,38 +1,27 @@
#!/bin/bash
# Note: There are several directories created to make it clear what has been performed at each stage of preprocessing. The intermediate files may be useful if you want to further clean/prepare/augment the data for your own applications.
# NLTK was chosen as the default over spaCy simply due to speed of sentence segmentation on the large files.
# Download
python3 /workspace/bert/data/bertPrep.py --action download --dataset bookscorpus
python3 /workspace/bert/data/bertPrep.py --action download --dataset wikicorpus_en
MERGED_DIR=$1
args="${*:2}"
python3 /workspace/bert/data/bertPrep.py --action download --dataset google_pretrained_weights # Includes vocab
source utils/config.sh
python3 /workspace/bert/data/bertPrep.py --action download --dataset squad
#python3 /workspace/bert/data/bertPrep.py --action download --dataset mrpc
mkdir -p ${MERGED_DIR}
corpus_file=${MERGED_DIR}/corpus.txt
## Shuffle the full corpus texts
if [ ! -z $3 ]
then
echo "Merging $args"
cat $args | sed "/^$/d" | shuf > $corpus_file
else
corpus_file=$2
fi
# Properly format the text files
python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset bookscorpus
python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset wikicorpus_en
# Split articles into one-sentence-per-line format for use with BERT scripts
echo "Applying sentence segmentation to get one sentence per line"
mkdir -p ${MERGED_DIR}/final_text_file_single
python3 utils/sentence_segmentation_nltk.py $corpus_file ${MERGED_DIR}/final_text_file_single/corpus.segmented.nltk.txt
## Shard finalized text so that it has a chance of fitting in memory when creating pretraining data into hdf5 (choose appropriate number of shards for distributed training)
echo "Shard text files - size is approximate to prevent splitting an article across shards"
mkdir -p ${MERGED_DIR}/final_text_files_sharded
python3 utils/shard_text_input_file.py ${MERGED_DIR}/final_text_file_single/corpus.segmented.nltk.txt ${MERGED_DIR}/final_text_files_sharded/corpus.segmented.part.
# Shard the text files (group wiki+books then shard)
python3 /workspace/bert/data/bertPrep.py --action sharding --dataset books_wiki_en_corpus
# Convert sharded text files into hdf5 that are ready for BERT pretraining
echo "Creating hdf5 for each text shard"
mkdir -p ${MERGED_DIR}/hdf5_shards
export TARGET_DIR=${MERGED_DIR}
. utils/preprocessing_xargs_wrapper.sh ${N_PROCS_PREPROCESS}
# Create HDF5 files Phase 1
python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 128 --max_predictions_per_seq 20
# Create HDF5 files Phase 2
python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 512 --max_predictions_per_seq 80

View file

@ -1,29 +0,0 @@
#!/bin/bash
MERGED_DIR=$1 # e.g wikipedia+bookcorpus
INPUTFILES=$2 # directories with hdf5 files separated by comma
NUM_SHARDS=$3
source utils/config.sh
META_DIR=$MERGED_DIR/meta
mkdir -p ${MERGED_DIR}
mkdir -p ${META_DIR}
echo "create mixed dataset ids"
echo "python utils/create_mixed_dataset_ids.py --input_files=${INPUTFILES} --num_output_shards=${NUM_SHARDS} --output_dir=${META_DIR} --random_seed=${SEED}"
python utils/create_mixed_dataset_ids.py --input_files=${INPUTFILES} --num_output_shards=${NUM_SHARDS} --output_dir=${META_DIR} --random_seed=${SEED}
echo "Creating hdf5 for each text shard"
mkdir -p ${MERGED_DIR}/hdf5_shards
echo "create mixed datasets with hdf5 files"
echo "python utils/create_mixed_dataset.py --input_files=${INPUTFILES} --output_dir=${MERGED_DIR}/hdf5_shards --lookup=${META_DIR}/lookup_table.pkl --indices_dir=${META_DIR} --index_range=0-${NUM_SHARDS} --random_seed=${SEED}"
python utils/create_mixed_dataset.py --input_files=${INPUTFILES} --output_dir=${MERGED_DIR}/hdf5_shards --lookup=${META_DIR}/lookup_table.pkl --indices_dir=${META_DIR} --index_range=0-$((NUM_SHARDS-1)) --random_seed=${SEED}
rm -rf ${META_DIR}

View file

@ -1,24 +0,0 @@
#! /bin/bash
set -e
USE_BERT_LARGE=true
MAX_SEQUENCE_LENGTH=512
MAX_PREDICTIONS_PER_SEQUENCE=80
MASKED_LM_PROB=0.15
SEED=12345
DUPE_FACTOR=5
DO_LOWER_CASE="True"
N_LINES_PER_SHARD_APPROX=396000 # Default=396000 creates 256 shards
N_PROCS_PREPROCESS=4 # Adjust this based on memory requirements and available number of cores
BERT_BASE_DIR="/workspace/bert/vocab/uncased_L-12_H-768_A-12"
BERT_LARGE_DIR="/workspace/bert/vocab/uncased_L-24_H-1024_A-16"
if [ "$USE_BERT_LARGE" = true ] ; then
VOCAB_FILE="${BERT_LARGE_DIR}/vocab.txt"
else
VOCAB_FILE="${BERT_BASE_DIR}/vocab.txt"
fi

View file

@ -1,160 +0,0 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import os
import random
from io import open
import h5py
import numpy as np
from tqdm import tqdm, trange
import random
import collections
import math
import multiprocessing as mp
"""
mixing hdf5 shards with each other
"""
def shard_files(output_files, l_instance_ids, lookuptable, files):
l_input_ids = []
l_input_masks = []
l_segment_ids = []
l_masked_lm_positions = []
l_masked_lm_ids = []
l_next_sentence_labels = []
seq_len = 0
pred_len = 0
with h5py.File(files[0], 'r') as f:
seq_len = f['input_ids'].shape[1]
pred_len = f['masked_lm_positions'].shape[1]
assert(seq_len > 0 and pred_len > 0)
for i, output_file in enumerate(output_files):
output_length = len(l_instance_ids[i])
print("preparing to write {} instances to {}".format(output_length, output_file))
input_ids = np.ones([output_length, seq_len], dtype=np.int32)
input_masks = np.ones([output_length, seq_len], dtype=np.int8)
segment_ids = np.ones([output_length, seq_len], dtype=np.int8)
masked_lm_positions = np.ones([output_length, pred_len], dtype=np.int32)
masked_lm_ids= np.ones([output_length, pred_len], dtype=np.int32)
next_sentence_labels = np.ones(output_length, dtype=np.int8)
l_input_ids.append(input_ids)
l_input_masks.append(input_masks)
l_segment_ids.append(segment_ids)
l_masked_lm_positions.append(masked_lm_positions)
l_masked_lm_ids.append(masked_lm_ids)
l_next_sentence_labels.append(next_sentence_labels)
for did, f in enumerate(tqdm(files)):
h5_f = h5py.File(f, 'r')
f_input_ids = h5_f['input_ids'][:]
f_input_masks = h5_f['input_mask'][:]
f_segment_ids = h5_f['segment_ids'][:]
f_masked_lm_positions = h5_f['masked_lm_positions'][:]
f_masked_lm_ids = h5_f['masked_lm_ids'][:]
f_next_sentence_labels = h5_f['next_sentence_labels'][:]
h5_f.close()
for out_i, out_file in enumerate(output_files):
instance_ids = l_instance_ids[out_i]
for l, idx in enumerate(instance_ids):
doc_id, line_id = lookuptable[idx]
if doc_id == did:
l_input_ids[out_i][l] = f_input_ids[line_id]
l_input_masks[out_i][l] = f_input_masks[line_id]
l_segment_ids[out_i][l] = f_segment_ids[line_id]
l_masked_lm_positions[out_i][l] = f_masked_lm_positions[line_id]
l_masked_lm_ids[out_i][l] = f_masked_lm_ids[line_id]
l_next_sentence_labels[out_i][l] = f_next_sentence_labels[line_id]
for out_i, out_file in enumerate(output_files):
output_length = len(l_input_ids[out_i])
print("writing {} instances to {}".format(output_length, out_file))
with h5py.File(out_file, 'w') as f:
f.create_dataset("input_ids", data=l_input_ids[out_i], dtype='i4', compression='gzip')
f.create_dataset("input_mask", data=l_input_masks[out_i], dtype='i1', compression='gzip')
f.create_dataset("segment_ids", data=l_segment_ids[out_i], dtype='i1', compression='gzip')
f.create_dataset("masked_lm_positions", data=l_masked_lm_positions[out_i], dtype='i4', compression='gzip')
f.create_dataset("masked_lm_ids", data=l_masked_lm_ids[out_i], dtype='i4', compression='gzip')
f.create_dataset("next_sentence_labels", data=l_next_sentence_labels[out_i], dtype='i1', compression='gzip')
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_files",
default=None,
type=str,
required=True,
help="comma seperated list of file paths, each path can be either file or directory of files")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="directory for output shards")
parser.add_argument("--lookup",
default=None,
type=str,
required=True,
help="path to lookup table")
parser.add_argument("--indices_dir",
default=None,
type=str,
required=True,
help="path to shuffled instance indices")
parser.add_argument("--index_range",
default=None,
type=str,
required=True,
help="index range of output files to be written out, e.g specify '0-100' for writing out 0.hdf5 , ..., 100.hdf5")
parser.add_argument('--random_seed',
type=int,
default=12345,
help="random seed for initialization")
args = parser.parse_args()
rng = random.Random(args.random_seed)
np.random.seed(args.random_seed)
input_paths = args.input_files.strip().split(',')
input_paths = [f for f in input_paths if f]
input_files = []
for path in input_paths:
if os.path.isfile(path):
assert (path.endswith('.hdf5')), "file must be hdf5 file"
input_files.append(path)
else:
assert os.path.isdir(path)
hdf5_files = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith('.hdf5')]
input_files.extend(hdf5_files)
input_files.sort()
assert(os.path.isdir(args.output_dir))
print("loading indices file")
start_idx, end_idx= int(args.index_range.split('-')[0]), int(args.index_range.split('-')[1])
index_files = []
instance_ids = []
for i in range(start_idx, end_idx + 1):
index_files.append(os.path.join(args.indices_dir, "indices_" + str(i) + ".npy"))
instance_ids.append( np.load(index_files[-1]))
output_files = [os.path.join(args.output_dir, indices_file.split('.')[0].split('_')[-1] + ".hdf5") for indices_file in index_files]
print("output_files", output_files)
print("loading lookup table")
lookup_table = np.load(args.lookup)
shard_files(output_files, instance_ids, lookup_table, input_files)
if __name__ == "__main__":
main()

View file

@ -1,134 +0,0 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import os
import random
from io import open
import h5py
import numpy as np
from tqdm import tqdm, trange
import random
import collections
import math
from tqdm import tqdm
import multiprocessing as mp
import pickle
import json
"""
mixing hdf5 shards with each other
"""
def load_and_prepare(input_files, num_shards):
seq_len = None
pred_len = None
input_lengths = []
for input_file in input_files:
with h5py.File(input_file, 'r') as f:
input_lengths.append(len(f['input_ids']))
if seq_len is None:
seq_len = f['input_ids'].shape[1]
pred_len = f['masked_lm_ids'].shape[1]
assert (isinstance(seq_len, int) and isinstance(pred_len, int))
total_instances = sum(input_lengths)
n_inst_per_file = math.ceil(total_instances * 1.0 / num_shards)
permutation = np.random.permutation(total_instances)
instance_indices = []
for i in range(0, num_shards):
start_pos = i * n_inst_per_file
end_pos = min((i+1) * n_inst_per_file, total_instances)
instance_indices.append(permutation[start_pos:end_pos])
return seq_len, pred_len, input_lengths, instance_indices
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--input_files",
default=None,
type=str,
required=True,
help="comma seperated list of file paths, each path can be either file or directory of hdf5 files")
parser.add_argument("--num_output_shards",
default=None,
type=int,
required=True,
help="number of shards to be created. shards will be created as even as possible.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="directory for meta files")
parser.add_argument('--random_seed',
type=int,
default=12345,
help="random seed for initialization")
args = parser.parse_args()
rng = random.Random(args.random_seed)
np.random.seed(args.random_seed)
input_paths = args.input_files.strip().split(',')
input_paths = [f for f in input_paths if f]
input_files = []
for path in input_paths:
if os.path.isfile(path):
assert (path.endswith('.hdf5')), "file must be hdf5 file"
input_files.append(path)
else:
assert os.path.isdir(path)
hdf5_files = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith('.hdf5')]
input_files.extend(hdf5_files)
input_files.sort()
assert(os.path.isdir(args.output_dir))
print("load and prepare")
seq_len, pred_len, input_lengths, output_inst_indices = load_and_prepare(input_files, args.num_output_shards)
print("preparing lookup table")
total_num_instances = sum(input_lengths)
out_2_in = dict()
length_so_far = 0
for i, l in enumerate(input_lengths):
for j in range(l):
out_2_in[length_so_far + j] = (i, j)
length_so_far += input_lengths[i]
output_files = [os.path.join(args.output_dir, "indices_" + str(i) + ".npy") for i in range(args.num_output_shards)]
print("save data")
with open(os.path.join(args.output_dir, 'lookup_table.pkl'), 'wb') as f:
pickle.dump(out_2_in, f)
for i, out_file in enumerate(output_files):
np.save(out_file, output_inst_indices[i])
meta = {'seq_len': seq_len, 'pred_len':pred_len}
with open(os.path.join(args.output_dir, 'meta_data.pkl'), 'wb') as f:
pickle.dump(meta, f)
if __name__ == "__main__":
main()

View file

@ -1,23 +0,0 @@
#! /bin/bash
SHARD_INDEX=${1}
INPUT_FILE="${TARGET_DIR}/final_text_files_sharded/corpus.segmented.part.${SHARD_INDEX}.txt"
source /workspace/bert/data/utils/config.sh
OUTPUT_DIR=${TARGET_DIR}/hdf5_shards
mkdir -p ${OUTPUT_DIR}
OUTPUT_FILE="${OUTPUT_DIR}/${SHARD_INDEX}.hdf5"
python /workspace/bert/create_pretraining_data.py \
--input_file=${INPUT_FILE} \
--output_file=${OUTPUT_FILE} \
--vocab_file=${VOCAB_FILE} \
--do_lower_case \
--max_seq_length=${MAX_SEQUENCE_LENGTH} \
--max_predictions_per_seq=${MAX_PREDICTIONS_PER_SEQUENCE} \
--masked_lm_prob=${MASKED_LM_PROB} \
--random_seed=${SEED} \
--dupe_factor=${DUPE_FACTOR}

View file

@ -1,15 +0,0 @@
#! /bin/bash
source /workspace/bert/data/utils/config.sh
SHARD_COUNT=0
rm -rf ${TARGET_DIR}/xarg_list.txt
touch ${TARGET_DIR}/xarg_list.txt
for file in ${TARGET_DIR}/final_text_files_sharded/*; do
echo ${SHARD_COUNT} >> ${TARGET_DIR}/xarg_list.txt
SHARD_COUNT=$((SHARD_COUNT+1))
done
xargs -n 1 --max-procs=${N_PROCS_PREPROCESS} --arg-file=${TARGET_DIR}/xarg_list.txt /workspace/bert/data/utils/preprocessing.sh
rm ${TARGET_DIR}/xarg_list.txt

View file

@ -1,28 +0,0 @@
# NVIDIA
import argparse
import nltk
import os
nltk.download('punkt')
parser = argparse.ArgumentParser(description='Sentence Segmentation')
parser.add_argument('input_file', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
input_file = args.input_file
output_file = args.output_file
doc_seperator = "\n"
with open(input_file) as ifile:
with open(output_file, "w") as ofile:
for line in ifile:
if line != "\n":
sent_list = nltk.tokenize.sent_tokenize(line)
for sent in sent_list:
ofile.write(sent + "\n")
ofile.write(doc_seperator)

View file

@ -1,47 +0,0 @@
# NVIDIA
import os
import argparse
parser = argparse.ArgumentParser(description='Dataset sharding')
parser.add_argument('input_file', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
input_file = args.input_file
output_file = args.output_file
doc_seperator = "\n"
line_buffer = []
shard_size = 396000 # Approximate, will split at next article break
line_counter = 0
shard_index = 0
ifile_lines = 0
with open(input_file) as ifile:
for line in ifile:
ifile_lines += 1
print("Input file contains", ifile_lines, "lines.")
iline_counter = 1
with open(input_file) as ifile:
for line in ifile:
if line_counter < shard_size and iline_counter < ifile_lines:
line_buffer.append(line)
line_counter += 1
iline_counter += 1
elif line_counter >= shard_size and line != "\n" and iline_counter < ifile_lines:
line_buffer.append(line)
line_counter += 1
iline_counter += 1
else:
with open(output_file + str(shard_index) + ".txt", "w") as ofile:
for oline in line_buffer:
ofile.write(oline)
line_buffer = []
line_counter = 0
shard_index += 1

View file

@ -1,30 +0,0 @@
#! /bin/bash
WIKI_DUMP="https://dumps.wikimedia.org/enwiki/20190320/enwiki-20190320-pages-articles-multistream.xml.bz2"
N_PROCS_PREPROCESS=$(nproc) # Adjust this based on memory requirements and available number of cores
# Download Wikipedia dump file
mkdir -p ./download
# Not using --noclobber since it emits an error if exists (incompatible with bash 'set -e')
echo "Downloading Wikidump"
if [ ! -f ./download/wikidump.xml.bz2 ]; then
wget -O ./download/wikidump.xml.bz2 ${WIKI_DUMP}
fi
# Extract dump
echo "Extracting Wikidump"
mkdir -p ./raw_data
if [ ! -f ./raw_data/wikidump.xml ]; then
pv ./download/wikidump.xml.bz2 | bunzip2 -kdc > ./raw_data/wikidump.xml
fi
# Wikiextractor.py - Creates lots of folders/files in "doc format"
echo "Running Wikiextractor"
mkdir -p ./extracted_articles
/workspace/wikiextractor/WikiExtractor.py ./raw_data/wikidump.xml -b 1000M --processes ${N_PROCS_PREPROCESS} -o ./extracted_articles
# Remove XML Tags and extraneous titles (since they are not sentences)
# Also clean to remove lines between paragraphs within article and use space-separated articles
echo "Cleaning and formatting files (one article per line)"
python3 ./remove_tags_and_clean.py ./extracted_articles ./wikipedia_corpus.txt

View file

@ -1,39 +0,0 @@
# NVIDIA
import glob
import os
import argparse
parser = argparse.ArgumentParser(description='Cleaning and merge downloaded bookcorpus files')
parser.add_argument('extracted_articles_path', type=str)
parser.add_argument('output_file', type=str)
args = parser.parse_args()
extracted_articles_path = args.extracted_articles_path
output_file = args.output_file
with open(output_file, "w") as ofile:
for dirname in glob.glob('{}/*/'.format(extracted_articles_path), recursive=False):
for filename in glob.glob(dirname + 'wiki_*', recursive=True):
print(filename)
article_lines = []
article_open = False
with open(filename, "r") as file:
for line in file:
if "<doc id=" in line:
article_open = True
elif "</doc>" in line:
article_open = False
for oline in article_lines[1:]:
if oline != "\n":
ofile.write(oline.rstrip() + " ")
ofile.write("\n\n")
article_lines = []
else:
if article_open:
article_lines.append(line)

View file

@ -1,205 +0,0 @@
import types
import importlib
import math
import torch
def warmup_cosine(x, warmup=0.002):
if x < warmup:
return x/warmup
return 0.5 * (1.0 + torch.cos(math.pi * x))
def warmup_constant(x, warmup=0.002):
if x < warmup:
return x/warmup
return 1.0
def warmup_linear(x, warmup=0.002):
if x < warmup:
return x/warmup
return 1.0 - x
SCHEDULES = {
'warmup_cosine':warmup_cosine,
'warmup_constant':warmup_constant,
'warmup_linear':warmup_linear,
}
class FusedAdamBert(torch.optim.Optimizer):
"""Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
It has been proposed in `Adam: A Method for Stochastic Optimization`_.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in FusedAdam!
eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
adds eps to the bias-corrected second moment estimate before
evaluating square root instead of adding it to the square root of
second moment estimate as in the original paper. (default: False)
.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
# def __init__(self, params,
# lr=1e-3, bias_correction = True,
# betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False,
# weight_decay=0., max_grad_norm=0., amsgrad=False):
def __init__(self, params, lr=1e-3, warmup=-1, t_total=-1, bias_correction=False, betas=(0.9, 0.999), schedule='warmup_linear',
eps=1e-6, eps_inside_sqrt = False, weight_decay=0., max_grad_norm=1.0, amsgrad=False):
global fused_adam_cuda
fused_adam_cuda = importlib.import_module("fused_adam_cuda")
if amsgrad:
raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
super(FusedAdamBert, self).__init__(params, defaults)
print("LOCAL FUSED ADAM")
self.eps_mode = 0 if eps_inside_sqrt else 1
self.schedule = schedule
self.t_total = t_total
self.warmup = warmup
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
return [0]
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
lr.append(lr_scheduled)
print("LR {}".format(lr_scheduled))
return lr
def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
grads (list of tensors, optional): weight gradient to use for the
optimizer update. If gradients have type torch.half, parameters
are expected to be in type torch.float. (default: None)
output params (list of tensors, optional): A reduced precision copy
of the updated weights written out in addition to the regular
updated weights. Have to be of same type as gradients. (default: None)
scale (float, optional): factor to divide gradient tensor values
by before applying to weights. (default: 1)
"""
loss = None
if closure is not None:
loss = closure()
if grads is None:
grads_group = [None]*len(self.param_groups)
# backward compatibility
# assuming a list/generator of parameter means single group
elif isinstance(grads, types.GeneratorType):
grads_group = [grads]
elif type(grads[0])!=list:
grads_group = [grads]
else:
grads_group = grads
if output_params is None:
output_params_group = [None]*len(self.param_groups)
elif isinstance(output_params, types.GeneratorType):
output_params_group = [output_params]
elif type(output_params[0])!=list:
output_params_group = [output_params]
else:
output_params_group = output_params
if grad_norms is None:
grad_norms = [None]*len(self.param_groups)
#Compute global norm
global_norm = 0.0
for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group,
output_params_group, grad_norms):
global_norm = (global_norm ** 2 + grad_norm ** 2) ** 0.5
for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group, output_params_group, grad_norms):
if grads_this_group is None:
grads_this_group = [None]*len(group['params'])
if output_params_this_group is None:
output_params_this_group = [None]*len(group['params'])
# compute combined scale factor for this group
combined_scale = scale
if group['max_grad_norm'] > 0:
# norm is in fact norm*scale
clip = ((global_norm / scale) + 1e-6) / group['max_grad_norm']
if clip > 1:
combined_scale = clip * scale
bias_correction = 1 if group['bias_correction'] else 0
for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group):
#note: p.grad should not ever be set for correct operation of mixed precision optimizer that sometimes sends None gradients
if p.grad is None and grad is None:
continue
if grad is None:
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
out_p = torch.tensor([], dtype = torch.float) if output_param is None else output_param
#Changes sharath
schedule_fct = SCHEDULES[self.schedule]
#schedule_fct(state['step']/self.t_total, self.warmup)
#step_lr = group['lr'] * schedule_fct(state['step']/self.t_total, self.warmup)
#step_lr = group['lr'] * scale#schedule_fct(state['step']/self.t_total, self.warmup)# schedule_fct(state['step']/group['t_total'], group['warmup'])
#print(scale, step_lr)
#print(group['lr'])
fused_adam_cuda.adam(p.data,
out_p,
exp_avg,
exp_avg_sq,
grad,
group['lr'], #step_lr,#group['lr'],
beta1,
beta2,
group['eps'],
combined_scale,
state['step'],
self.eps_mode,
bias_correction,
group['weight_decay'])
return loss
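A minimal usage sketch for the optimizer above, assuming apex's fused_adam_cuda extension is importable; master_params, half_params, scaled_loss and loss_scale are illustrative names standing in for the fp32 master weights, the fp16 model parameters, the scaled training loss and the current loss scale:
optimizer = FusedAdamBert(master_params, lr=2e-5, warmup=0.01, t_total=100000)
scaled_loss.backward()                                 # gradients carry a factor of loss_scale
optimizer.step(grads=[p.grad for p in half_params],    # fp16 gradients
               output_params=half_params,              # fp16 copies rewritten after the update
               scale=loss_scale)                       # the kernel divides gradients by this factor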

View file

@@ -35,6 +35,11 @@ from torch.utils import checkpoint
from file_utils import cached_path
from torch.nn import Module
from torch.nn.parameter import Parameter
import torch.nn.functional as F
import torch.nn.init as init
logger = logging.getLogger(__name__)
PRETRAINED_MODEL_ARCHIVE_MAP = {
@@ -111,14 +116,27 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
return model
@torch.jit.script
def f_gelu(x):
return x * 0.5 * (1.0 + torch.erf(x / 1.41421))
@torch.jit.script
def bias_gelu(bias, y):
x = bias + y
return x * 0.5 * (1.0 + torch.erf(x / 1.41421))
@torch.jit.script
def bias_tanh(bias, y):
x = bias + y
return torch.tanh(x)
def gelu(x):
"""Implementation of the gelu activation function.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
return f_gelu(x)
def swish(x):
return x * torch.sigmoid(x)
@@ -126,6 +144,53 @@ def swish(x):
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
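A quick standalone check of the docstring note above that the erf-based GELU and OpenAI's tanh approximation differ only slightly (this snippet just evaluates the two formulas, nothing here comes from the checkpoint code):
import math
import torch
x = torch.linspace(-3.0, 3.0, steps=7)
erf_gelu = x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
tanh_gelu = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
print((erf_gelu - tanh_gelu).abs().max())  # the two variants agree to within about 1e-3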
class LinearActivation(Module):
r"""Fused Linear and activation Module.
"""
__constants__ = ['bias']
def __init__(self, in_features, out_features, act='gelu', bias=True):
super(LinearActivation, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.fused_gelu = False
self.fused_tanh = False
if isinstance(act, str) or (sys.version_info[0] == 2 and isinstance(act, unicode)):
if bias and act == 'gelu':
self.fused_gelu = True
elif bias and act == 'tanh':
self.fused_tanh = True
else:
self.act_fn = ACT2FN[act]
else:
self.act_fn = act
self.weight = Parameter(torch.Tensor(out_features, in_features))
if bias:
self.bias = Parameter(torch.Tensor(out_features))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound)
def forward(self, input):
if self.fused_gelu:
return bias_gelu(self.bias, F.linear(input, self.weight, None))
elif self.fused_tanh:
return bias_tanh(self.bias, F.linear(input, self.weight, None))
else:
return self.act_fn(F.linear(input, self.weight, self.bias))
def extra_repr(self):
return 'in_features={}, out_features={}, bias={}'.format(
self.in_features, self.out_features, self.bias is not None
)
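A small usage sketch of the fused module above, with BERT-large-like sizes for illustration: when a bias is present and act='gelu' the bias add and activation run through the jitted bias_gelu, otherwise the module falls back to the ACT2FN lookup.
fused = LinearActivation(1024, 4096, act='gelu')    # takes the fused bias_gelu path
hidden = fused(torch.randn(8, 1024))                # -> shape [8, 4096]
plain = LinearActivation(1024, 4096, act='relu')    # unfused: relu(F.linear(x, W, b))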
class BertConfig(object):
"""Configuration class to store the configuration of a `BertModel`.
@@ -216,7 +281,11 @@ class BertConfig(object):
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
try:
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
import apex
#apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm')
import apex.normalization
#apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward')
BertLayerNorm = apex.normalization.FusedLayerNorm
except ImportError:
print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")
class BertLayerNorm(nn.Module):
@@ -281,29 +350,35 @@ class BertSelfAttention(nn.Module):
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.softmax = nn.Softmax(dim=-1)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def transpose_key_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 3, 1)
def forward(self, hidden_states, attention_mask):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
key_layer = self.transpose_key_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = torch.matmul(query_layer, key_layer)
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Apply the attention mask (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)
attention_probs = self.softmax(attention_scores)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
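A standalone sanity check of the transpose_key_for_scores change above (shapes are arbitrary): storing the key as [batch, heads, head_dim, seq] lets the scores be computed without the extra transpose, and the result matches the original formulation.
B, S, H, d = 2, 4, 3, 5
mixed_key = torch.randn(B, S, H * d)
key_old = mixed_key.view(B, S, H, d).permute(0, 2, 1, 3)   # [B, H, S, d]
key_new = mixed_key.view(B, S, H, d).permute(0, 2, 3, 1)   # [B, H, d, S]
query = torch.randn(B, H, S, d)
assert torch.allclose(torch.matmul(query, key_old.transpose(-1, -2)),
                      torch.matmul(query, key_new))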
@@ -345,15 +420,10 @@ class BertAttention(nn.Module):
class BertIntermediate(nn.Module):
def __init__(self, config):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
self.dense_act = LinearActivation(config.hidden_size, config.intermediate_size, act=config.hidden_act)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
hidden_states = self.dense_act(hidden_states)
return hidden_states
@@ -449,31 +519,24 @@ class BertEncoder(nn.Module):
class BertPooler(nn.Module):
def __init__(self, config):
super(BertPooler, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act="tanh")
def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
pooled_output = self.dense_act(first_token_tensor)
return pooled_output
class BertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super(BertPredictionHeadTransform, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act=config.hidden_act)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.dense_act(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
@@ -493,7 +556,9 @@ class BertLMPredictionHead(nn.Module):
def forward(self, hidden_states):
hidden_states = self.transform(hidden_states)
torch.cuda.nvtx.range_push("decoder input.size() = {}, weight.size() = {}".format(hidden_states.size(), self.decoder.weight.size()))
hidden_states = self.decoder(hidden_states) + self.bias
torch.cuda.nvtx.range_pop()
return hidden_states
@@ -1247,3 +1312,4 @@ class BertForQuestionAnswering(BertPreTrainedModel):
return total_loss
else:
return start_logits, end_logits

View file

@@ -21,6 +21,13 @@ from torch.optim.optimizer import required
from torch.nn.utils import clip_grad_norm_
#from fused_adam_local import FusedAdam
from apex.optimizers import FusedAdam
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
multi_tensor_l2norm = amp_C.multi_tensor_l2norm
lamb_compute_update = amp_C.multi_tensor_lamb_stage1_cuda
lamb_apply_update = amp_C.multi_tensor_lamb_stage2_cuda
scale = amp_C.multi_tensor_scale
def warmup_cosine(x, warmup=0.002):
if x < warmup:
@@ -35,17 +42,235 @@ def warmup_constant(x, warmup=0.002):
def warmup_linear(x, warmup=0.002):
if x < warmup:
return x/warmup
# return (1.0 - x)
return max((x - 1. )/ (warmup - 1.), 0.)
def warmup_poly(x, warmup=0.002, degree=0.5):
if x < warmup:
return x/warmup
return (1.0 - x)**degree
return max((x - 1. )/ (warmup - 1.), 0.)
SCHEDULES = {
'warmup_cosine':warmup_cosine,
'warmup_constant':warmup_constant,
'warmup_linear':warmup_linear,
'warmup_poly':warmup_poly,
}
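For intuition, a tiny standalone illustration of the schedules above, using the phase 1 warmup proportion from the launch scripts (0.2843) and a few values of training progress x = step / t_total; both ramp linearly from 0 to 1 during warmup and then decay toward 0 as x approaches 1.
for x in (0.1, 0.2843, 0.5, 1.0):
    print(x, warmup_linear(x, warmup=0.2843), warmup_poly(x, warmup=0.2843))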
class BertLAMB(Optimizer):
"""Implements BERT version of LAMB algorithm.
Params:
lr: learning rate
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
t_total: total number of training steps for the learning
rate schedule, -1 means constant learning rate. Default: -1
schedule: schedule to use for the warmup (see above). Default: 'warmup_poly'
b1: LAMB's beta1. Default: 0.9
b2: LAMB's beta2. Default: 0.999
e: LAMB's epsilon. Default: 1e-6
weight_decay: Weight decay. Default: 0.01
max_grad_norm: Maximum global norm for the gradients. Default: 1.0
"""
def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_poly',
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
max_grad_norm=1.0):
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0.0 <= warmup < 1.0 and not warmup == -1:
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
if not e >= 0.0:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
b1=b1, b2=b2, e=e, weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
super(BertLAMB, self).__init__(params, defaults)
self.step_count = 0
self.b1 = b1
self.b2 = b2
self.epsilon = e
self.max_global_grad_norm = max_grad_norm
self.learning_rate = lr
self.schedule = schedule
self.warmup = warmup
self.max_steps = t_total
self.updates_created=False
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
return [0]
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
lr.append(lr_scheduled)
return lr
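# The helper below runs LAMB in two fused stages: stage 1 produces an
# Adam-style update per tensor (bias-corrected moments plus decoupled weight
# decay, with the global gradient norm computed first and used for clipping),
# and stage 2 rescales each tensor's update by its trust ratio
# ||param|| / ||update|| before applying the scheduled learning rate.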
def apply_gradients(self, dummy_overflow_buf, lr_scheduled, per_param_decay, grad_list, param_list, momentum, velocity, update):
# Compute global gradient norm
global_grad_norm = multi_tensor_applier(
multi_tensor_l2norm,
dummy_overflow_buf,
[grad_list],
False)[0].item()
# Compute per parameter norm
param_norms = multi_tensor_applier(
multi_tensor_l2norm,
dummy_overflow_buf,
[param_list],
True)[1]
# Compute LAMB update
multi_tensor_applier(
lamb_compute_update,
dummy_overflow_buf,
[grad_list, param_list, momentum, velocity, update],
torch.cuda.FloatTensor(per_param_decay),
self.step_count,
self.b1,
self.b2,
self.epsilon,
global_grad_norm,
self.max_global_grad_norm,
)
# Compute per parameter update norm
update_norms = multi_tensor_applier(
multi_tensor_l2norm,
dummy_overflow_buf,
[update],
True)[1]
# Apply LAMB update on parameters
multi_tensor_applier(
lamb_apply_update,
dummy_overflow_buf,
[param_list, update],
param_norms,
update_norms,
lr_scheduled,
)
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
check = 1#torch.norm(all_grads, 2)
grad_list = []
param_list = []
per_param_decay = []
momentum = []
velocity = []
fp16_grad_list = []
fp16_from_fp32_param_list = []
fp32_param_list = []
fp16_per_param_decay = []
fp16_momentum = []
fp16_velocity = []
if not self.updates_created:
self.update = []
self.fp16_update = []
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('LAMB does not support sparse gradients')
state = self.state[p]
# State initialization
if len(state) == 0:
# Keep step here for compatibility with earlier resume from checkpoint
state['step'] = 0
# Exponential moving average of gradient values
state['momentum'] = torch.zeros_like(p.data, dtype=torch.float32)
# Exponential moving average of squared gradient values
state['velocity'] = torch.zeros_like(p.data, dtype=torch.float32)
# fp32 master weights
if 'master_param' not in state.keys() and p.type() == 'torch.cuda.HalfTensor':
state['master_param'] = p.detach().clone().float()
# ensure these 3 are float tensors
if state['momentum'].type() != 'torch.cuda.FloatTensor':
state['momentum'] = state['momentum'].float()
if state['velocity'].type() != 'torch.cuda.FloatTensor':
state['velocity'] = state['velocity'].float()
if 'master_param' in state.keys() and state['master_param'].type() != 'torch.cuda.FloatTensor':
state['master_param'] = state['master_param'].float()
# Append all params, gradients, decays, velocity, momentum and updates to a list
if p.type() == 'torch.cuda.HalfTensor':
fp16_grad_list.append(grad)
fp32_param_list.append(state['master_param'])
fp16_from_fp32_param_list.append(p.data)
fp16_per_param_decay.append(group['weight_decay'])
fp16_momentum.append(state["momentum"])
fp16_velocity.append(state["velocity"])
if not self.updates_created:
#self.fp16_update.append(torch.empty_like(p.data, dtype=torch.float32))
# Use fp16 weights as temporary buffer for update term.
# This is safe because fp16 weights are overwritten after apply_gradients
self.fp16_update.append(p.data)
else:
grad_list.append(grad)
param_list.append(p.data)
per_param_decay.append(group['weight_decay'])
momentum.append(state["momentum"])
velocity.append(state["velocity"])
if not self.updates_created:
self.update.append(torch.empty_like(p.data))
state['step'] += 1
self.updates_created=True
update = self.update
fp16_update = self.fp16_update
self.step_count = state['step']
# Calculate learning rate from input schedule
# if self.max_steps != -1:
schedule_fct = SCHEDULES[self.schedule]
lr_scheduled = self.learning_rate * schedule_fct(self.step_count / self.max_steps, self.warmup)
if torch.distributed.get_rank() == 0:
print("Step {} LR {}".format(self.step_count, lr_scheduled))
# else:
# lr_scheduled = self.learning_rate
overflow_buf = torch.cuda.IntTensor([0])
if len(grad_list) > 0:
self.apply_gradients(overflow_buf, lr_scheduled, per_param_decay, grad_list, param_list, momentum, velocity, update)
if len(fp16_grad_list) > 0:
self.apply_gradients(overflow_buf, lr_scheduled, fp16_per_param_decay, fp16_grad_list, fp32_param_list, fp16_momentum, fp16_velocity, fp16_update)
multi_tensor_applier(
scale,
overflow_buf,
[fp32_param_list, fp16_from_fp32_param_list],
1.)
return loss
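A minimal construction sketch matching the defaults documented above; step() queries torch.distributed, so this assumes a process group has already been initialized, and the hyperparameter values are the phase 1 settings used elsewhere in this change.
optimizer = BertLAMB(optimizer_grouped_parameters,  # per-parameter dicts carrying 'weight_decay'
                     lr=6e-3,
                     warmup=0.2843,
                     t_total=7038)
loss.backward()
optimizer.step()   # fused stage 1 / stage 2 LAMB update with the default warmup_poly schedule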
class BertAdam(Optimizer):
"""Implements BERT version of Adam algorithm with weight decay fix.
Params:
@@ -165,54 +390,3 @@ class BertAdam(Optimizer):
return loss
# =======================================================================
class BertAdam_FP16(FusedAdam):
"""Implements BERT version of Adam algorithm with weight decay fix.
Params:
lr: learning rate
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
t_total: total number of training steps for the learning
rate schedule, -1 means constant learning rate. Default: -1
schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
b1: Adams b1. Default: 0.9
b2: Adams b2. Default: 0.999
e: Adams epsilon. Default: 1e-6
weight_decay: Weight decay. Default: 0.01
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
"""
def __init__(self, params, lr, warmup=-1, t_total=-1, bias_correction=False, schedule='warmup_linear',
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
max_grad_norm=1.0):
if not lr >= 0.0:
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
if schedule not in SCHEDULES:
raise ValueError("Invalid schedule parameter: {}".format(schedule))
if not 0.0 <= warmup < 1.0 and not warmup == -1:
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
if not 0.0 <= b1 < 1.0:
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
if not 0.0 <= b2 < 1.0:
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
if not e >= 0.0:
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
# defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
# b1=b1, b2=b2, e=e, weight_decay=weight_decay,
# max_grad_norm=max_grad_norm)
super(BertAdam_FP16, self).__init__(params, lr=lr, bias_correction=bias_correction, betas=(b1, b2), eps=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm)#defaults)
def get_lr(self):
lr = []
for group in self.param_groups:
for p in group['params']:
state = self.state[p]
if len(state) == 0:
print("returning", state)
return [0]
if group['t_total'] != -1:
schedule_fct = SCHEDULES[group['schedule']]
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
else:
lr_scheduled = group['lr']
lr.append(lr_scheduled)
print("LR {}".format(lr_scheduled))
return lr

View file

@@ -0,0 +1,203 @@
#!/bin/bash
#SBATCH -p batch # partition
#SBATCH -N 1 # number of nodes
#SBATCH -t 1:30:00 # wall time
#SBATCH -J "bert_pyt_lamb" # job name
#SBATCH --exclusive # exclusive node access
#SBATCH --mem=0 # all mem avail
#SBATCH --ntasks-per-node=16 # max 8 tasks per machine (one task per gpu) - Exception for pytorch// srun launch with -n1
#SBATCH --threads-per-core=2 # HT is on
#SBATCH --cpus-per-task=40 # Not used yet (to reach perf pytorch might need overcommit)
#SBATCH --overcommit # Needed for pytorch
#SBATCH --mail-user=sharatht@nvidia.com
#SBATCH --mail-type=END
##SBATCH --deadline=$(date -d '+72 hours' '+%FT%T')
##SBATCH --reservation mlperf # reservation name
##SBATCH --output=./logs/pytorch_%j.out
##SBATCH --exclude=sc-sdgx-[394,397] # targeting nodes with mask until the constraints are implemented
##SBATCH -w sc-sdgx-[377-388],sc-sdgx-[394-408] # avail pod12
##SBATCH -C pod14 # constraint (not implemented yet)
##SBATCH --ntasks-per-socket=4 # Not used (our slurm does not have sockets defined)
## Your data, your container and its volumes
set -x
DATESTAMP=${DATESTAMP:-`date +'%y-%m-%d-%H-%M-%S-%N'`}
BENCHMARK=${BENCHMARK:-"bert"}
FRAMEWORK=${FRAMEWORK:-"pytorch"}
BENCHMARK_NAME=${BENCHMARK_NAME:-"bert"}
JOBNAME=${JOBNAME:-"bert_lamb_phase1_96n_wiki+books_only_fast_lamb_O1_run_1337"}
#.$DATESTAMP
# Create results directory
#DATADIR=${DATADIR:-"/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"}
#DATADIR_PHASE2=${DATADIR_PHASE2:-"/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_512_pred_80_dupe_5/training"}
DATADIR="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"
DATADIR_PHASE2="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_512_pred_80_dupe_5/training"
#BOOKS_DIR=/raid/datasets/seq_512_pred_80_dupe_5_shard_256
VOCAB_PATH=${VOCAB_PATH:-"/raid/datasets/bert_vocab/vocab.txt"}
DATASET=${DATASET:-"coco/coco-2014"}
CODEDIR=${CODEDIR:="bert_pyt/tree/sharatht/fast_lamb_ci_runs"}
CONT=${CONT:-"gitlab-master.nvidia.com/dl/JoC/bert_pyt:bert_pyt"}
LOGDIR=${LOGDIR:-"/raid/results/$BENCHMARK"}
NEXP=${NEXP:-1}
SEED=${SEED:-$(od -A n -t d -N 3 /dev/urandom)}
#CHECKPOINT_DIR=${CHECKPOINT_DIR:-"/gpfs/fs1/svcnvdlfw/7108495/results/output"}
CHECKPOINT_DIR="/gpfs/fs1/svcnvdlfw/7588296/results/output"
## Load system-specific parameters for benchmark
DGXSYSTEM=${DGXSYSTEM:-"DGX1"}
if [[ ! -f "config_${DGXSYSTEM}.sh" ]]; then
echo "Unknown system, assuming DGX1"
DGXSYSTEM="DGX1"
fi
source config_${DGXSYSTEM}.sh
IBDEVICES=${IBDEVICES:-$DGXIBDEVICES}
## Check whether we are running in a slurm env
INSLURM=1
if [[ -z "$SLURM_JOB_ID" ]]; then
INSLURM=0
export SLURM_JOB_ID="${DATESTAMP}"
export SLURM_NNODES=1
else
env | grep SLURM
fi
if [[ -z "SLURM_JOB_ID" || $SLURM_NNODES -eq 1 ]]; then
# don't need IB if not multi-node
export IBDEVICES=""
fi
# Create results directory
LOGFILE_BASE="${LOGDIR}/${DATESTAMP}"
mkdir -p $(dirname "${LOGFILE_BASE}")
## Docker params
CONTVOLS="-v $DATADIR:/workspace/data -v $LOGDIR:/results -v $CHECKPOINT_DIR:/checkpoints -v $DATADIR_PHASE2:/workspace/data_phase2"
NV_GPU="${NVIDIA_VISIBLE_DEVICES:-$(seq 0 $((${SLURM_NTASKS_PER_NODE:-${DGXNGPU}}-1)) | tr '\n' ',' | sed 's/,$//')}"
DOCKEREXEC="env NV_GPU=${NV_GPU} nvidia-docker run --init --rm --net=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --name=cont_${SLURM_JOB_ID} --security-opt seccomp=unconfined $IBDEVICES"
## Get version of the OS
export MLPERF_HOST_OS="$(cat /etc/issue | head -1 | cut -f1-3 -d" ") / $(cat /etc/dgx-release | grep -E "DGX_PRETTY_NAME|DGX_OTA_VERSION" |cut -f2 -d= |cut -f2 -d '"' |paste -sd' ')"
## Prep run and launch
MASTER_IP=`getent hosts \`hostname\` | cut -d ' ' -f1`
PORT=$((4242 + RANDOM%1000))
SSH=''
SRUN=''
if [[ $INSLURM -eq 1 ]]; then
hosts=( `scontrol show hostname |tr "\n" " "` )
SSH='ssh -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $hostn'
SRUN='srun --mem=0 -N 1 -n 1 -w $hostn'
else
hosts=( `hostname` )
fi
# Pull latest image
if [[ "${PULL}" != "0" ]]; then
DOCKERPULL="docker pull $CONT"
pids=();
for hostn in ${hosts[@]}; do
timeout -k 600s 600s \
$(eval echo $SRUN) $DOCKERPULL &
pids+=($!);
done
wait "${pids[@]}"
success=$? ; if [ $success -ne 0 ]; then echo "ERR: Image pull failed."; exit $success ; fi
fi
# Test the base container launch
pids=();
for hostn in ${hosts[@]}; do
timeout -k 600s 600s \
$(eval echo $SRUN) $DOCKEREXEC $CONT python -c 'import torch; print("Found",torch.cuda.device_count(),"CUDA GPUs")' &
pids+=($!);
done
wait "${pids[@]}"
success=$? ; if [ $success -ne 0 ]; then echo "ERR: Base container launch failed."; exit $success ; fi
# Launch containers
pids=(); rets=()
for hostn in ${hosts[@]}; do
$(eval echo $SSH) $DOCKEREXEC $CONTVOLS $CONT sleep infinity &
pids+=($!); rets+=($?);
done
success=0; for s in ${rets[@]}; do ((success+=s)); done ; if [ $success -ne 0 ]; then echo "ERR: Container launch failed."; exit $success ; fi
sleep 30 # Making sure containers have time to launch
# Disable compat check from further running
pids=(); rets=()
for hostn in ${hosts[@]}; do
$(eval echo $SSH) docker exec cont_${SLURM_JOB_ID} rm -f /etc/shinit &
pids+=($!);
done
wait "${pids[@]}"
# Run benchmarks
export SEED
export NEXP
for nrun in `seq 1 $NEXP`; do
(
echo "Beginning trial $nrun of $NEXP"
export VARS=(
"-e" "SLURM_NNODES=$SLURM_NNODES"
"-e" "MLPERF_HOST_OS"
)
## Clear RAM cache dentries and inodes
echo "Clearing caches"
pids=(); rets=()
for hostn in ${hosts[@]}; do
if [[ $INSLURM -eq 1 ]]; then
$(eval echo $SSH) bash -c 'sync && sudo /sbin/sysctl vm.drop_caches=3' &
else
docker run --init --rm --privileged --entrypoint bash $CONT -c "sync && echo 3 > /proc/sys/vm/drop_caches || exit 1" &
fi
pids+=($!); rets+=($?);
done
wait "${pids[@]}"
success=0; for s in ${rets[@]}; do ((success+=s)); done ; if [ $success -ne 0 ]; then echo "ERR: Cache clearing failed."; exit $success ; fi
## Launching benchmark
pids=();
export MULTI_NODE=''
for h in `seq 0 $((SLURM_NNODES-1))`; do
hostn="${hosts[$h]}"
echo "Launching on node $hostn"
if [[ $SLURM_NNODES -gt 1 ]]; then
export MULTI_NODE=" --nnodes=$SLURM_NNODES --node_rank=$h --master_addr=$MASTER_IP --master_port=$PORT"
else
export MULTI_NODE=" --master_port=$PORT"
fi
export DOCKERENV=(
"-e" "DGXSYSTEM=$DGXSYSTEM"
"-e" "MULTI_NODE=$MULTI_NODE"
"-e" "SEED=$SEED"
"-e" "SLURM_JOB_ID=$SLURM_JOB_ID"
"-e" "SLURM_NTASKS_PER_NODE=$SLURM_NTASKS_PER_NODE"
"-e" "SLURM_NNODES=$SLURM_NNODES"
)
# Execute command
set -x
$(eval echo $SRUN) docker exec "${DOCKERENV[@]}" -e MODE=TRAIN cont_${SLURM_JOB_ID} ./run_and_time.sh &
pids+=($!);
set +x
done
wait "${pids[@]}"
) |& tee ${LOGFILE_BASE}_$nrun.log
## SEED update
export SEED=$(od -A n -t d -N 3 /dev/urandom)
done
# Clean up (note: on SLURM we skip this, as the epilogue will take care of it)
if [[ $INSLURM -eq 0 ]]; then
docker rm -f cont_${SLURM_JOB_ID}
fi

View file

@@ -0,0 +1,39 @@
#!/bin/bash
#echo "Multi-node $MULTI_NODE"
#echo "Dataset $DATASET"
## DL vars -- Change your parameters below
# To change the number of GPUs per node, change the sbatch param --ntasks-per-node in the launching script
## Need to avoid virtualenv and do python directly
# train.py --data=/dev/shm/$DATASET \
# train.py --data=/raid/datasets/$DATASET \
DGXSYSTEM=${DGXSYSTEM:-"DGX1"}
if [[ -f config_${DGXSYSTEM}.sh ]]; then
source config_${DGXSYSTEM}.sh
else
source config_DGX1.sh
echo "Unknown system, assuming DGX1"
fi
SLURM_NTASKS_PER_NODE=${SLURM_NTASKS_PER_NODE:-$DGXNGPU}
SLURM_JOB_ID=${SLURM_JOB_ID:-$RANDOM}
MULTI_NODE=${MULTI_NODE:-''}
echo "Run vars: id $SLURM_JOB_ID gpus $SLURM_NTASKS_PER_NODE mparams $MULTI_NODE"
# run training
BIND_LAUNCH=1 ## should be the default
if [[ $BIND_LAUNCH -eq 1 ]]; then
LAUNCH_OPT="bind_pyt --nsockets_per_node 2 --ncores_per_socket ${DGXSOCKETCORES} --nproc_per_node ${SLURM_NTASKS_PER_NODE} ${MULTI_NODE}"
else
LAUNCH_OPT="torch.distributed.launch --nproc_per_node ${SLURM_NTASKS_PER_NODE} ${MULTI_NODE}"
fi
# Options
python -m $LAUNCH_OPT \
run_pretraining.py --seed=${SEED} \
--train_batch_size=${BATCHSIZE} \
--learning_rate=${LEARNING_RATE} \
--warmup_proportion=${WARMUP_UPDATES} \
$EXTRA_PARAMS

View file

@@ -1,6 +1,7 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -18,10 +19,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
#==================
# ==================
import csv
import os
import time
import logging
import argparse
import random
@@ -34,65 +35,73 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Datas
from torch.utils.data.distributed import DistributedSampler
import math
from apex import amp
import multiprocessing
from tokenization import BertTokenizer
from modeling import BertForPreTraining, BertConfig
from optimization import BertAdam, BertAdam_FP16
from optimization import BertLAMB
# from fused_adam_local import FusedAdamBert
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from apex.optimizers import FusedAdam #, FP16_Optimizer
#from apex.optimizers import FusedAdam
from utils import is_main_process
from apex.parallel import DistributedDataParallel as DDP
from schedulers import LinearWarmUpScheduler
from apex.parallel.distributed import flat_dist_call
import amp_C
import apex_C
from apex.amp import _amp_state
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
from concurrent.futures import ProcessPoolExecutor
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)
def create_pretraining_dataset(input_file, max_pred_length, shared_list, args):
train_data = pretraining_dataset(input_file=input_file, max_pred_length=max_pred_length)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler,
batch_size=args.train_batch_size * args.n_gpu, num_workers=4,
pin_memory=True)
# shared_list["0"] = (train_dataloader, input_file)
return train_dataloader, input_file
class pretraining_dataset(Dataset):
def __init__(self, input_file, max_pred_length):
self.input_file = input_file
self.max_pred_length = max_pred_length
f = h5py.File(input_file, "r")
self.input_ids = np.asarray(f["input_ids"][:]).astype(np.int64)#[num_instances x max_seq_length])
self.input_masks = np.asarray(f["input_mask"][:]).astype(np.int64) #[num_instances x max_seq_length]
self.segment_ids = np.asarray(f["segment_ids"][:]).astype(np.int64) #[num_instances x max_seq_length]
self.masked_lm_positions = np.asarray(f["masked_lm_positions"][:]).astype(np.int64) #[num_instances x max_pred_length]
self.masked_lm_ids= np.asarray(f["masked_lm_ids"][:]).astype(np.int64) #[num_instances x max_pred_length]
self.next_sentence_labels = np.asarray(f["next_sentence_labels"][:]).astype(np.int64) # [num_instances]
keys = ['input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions', 'masked_lm_ids',
'next_sentence_labels']
self.inputs = [np.asarray(f[key][:]) for key in keys]
f.close()
def __len__(self):
'Denotes the total number of samples'
return len(self.input_ids)
return len(self.inputs[0])
def __getitem__(self, index):
input_ids= torch.from_numpy(self.input_ids[index]) # [max_seq_length]
input_mask = torch.from_numpy(self.input_masks[index]) #[max_seq_length]
segment_ids = torch.from_numpy(self.segment_ids[index])# [max_seq_length]
masked_lm_positions = torch.from_numpy(self.masked_lm_positions[index]) #[max_pred_length]
masked_lm_ids = torch.from_numpy(self.masked_lm_ids[index]) #[max_pred_length]
next_sentence_labels = torch.from_numpy(np.asarray(self.next_sentence_labels[index])) #[1]
[input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels] = [
torch.from_numpy(input[index].astype(np.int64)) if indice < 5 else torch.from_numpy(
np.asarray(input[index].astype(np.int64))) for indice, input in enumerate(self.inputs)]
masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
index = self.max_pred_length
# store number of masked tokens in index
if len((masked_lm_positions == 0).nonzero()) != 0:
index = (masked_lm_positions == 0).nonzero()[0].item()
padded_mask_indices = (masked_lm_positions == 0).nonzero()
if len(padded_mask_indices) != 0:
index = padded_mask_indices[0].item()
masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels]
return [input_ids, segment_ids, input_mask,
masked_lm_labels, next_sentence_labels]
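A small sketch of reading one training shard with the dataset above (the HDF5 path and batch size are illustrative); each item comes back as the five tensors consumed by BertForPreTraining.
ds = pretraining_dataset(input_file="/workspace/data/training_shard_00.hdf5", max_pred_length=20)
input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = ds[0]
loader = DataLoader(ds, sampler=RandomSampler(ds), batch_size=64, pin_memory=True)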
def main():
def parse_arguments():
print("IN NEW MAIN XD\n")
parser = argparse.ArgumentParser()
## Required parameters
@@ -186,233 +195,407 @@ def main():
help="Step to resume training from.")
parser.add_argument('--num_steps_per_checkpoint',
type=int,
default=2000,
default=100,
help="Number of update steps until a model checkpoint is saved to disk.")
parser.add_argument('--phase2',
default=False,
action='store_true',
help="Whether to train with seq len 512")
parser.add_argument('--allreduce_post_accumulation',
default=False,
action='store_true',
help="Whether to do allreduces during gradient accumulation steps.")
parser.add_argument('--allreduce_post_accumulation_fp16',
default=False,
action='store_true',
help="Whether to do fp16 allreduce post accumulation.")
parser.add_argument('--accumulate_into_fp16',
default=False,
action='store_true',
help="Whether to use fp16 gradient accumulators.")
parser.add_argument('--phase1_end_step',
type=int,
default=7038,
help="Number of training steps in Phase1 - seq len 128")
parser.add_argument("--do_train",
default=False,
action='store_true',
help="Whether to run training.")
args = parser.parse_args()
return args
def setup_training(args):
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
assert(torch.cuda.is_available())
assert (torch.cuda.is_available())
if args.local_rank == -1:
device = torch.device("cuda")
n_gpu = torch.cuda.device_count()
args.n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
args.n_gpu = 1
# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl', init_method='env://')
logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))
logger.info("device %s n_gpu %d distributed training %r", device, args.n_gpu, bool(args.local_rank != -1))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.gradient_accumulation_steps))
if args.train_batch_size % args.gradient_accumulation_steps != 0:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format(
args.gradient_accumulation_steps, args.train_batch_size))
args.gradient_accumulation_steps, args.train_batch_size))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
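# e.g. with the defaults in run_pretraining.sh (train_batch_size=8192,
# gradient_accumulation_steps=128) each forward/backward pass uses a
# micro-batch of 64 per GPU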
if not args.do_train:
raise ValueError(" `do_train` must be True.")
if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (os.listdir(args.output_dir) and os.listdir(args.output_dir)!=['logfile.txt']):
if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (
os.listdir(args.output_dir) and os.listdir(args.output_dir) != ['logfile.txt']):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not args.resume_from_checkpoint:
os.makedirs(args.output_dir, exist_ok=True)
return device, args
def prepare_model_and_optimizer(args, device):
# Prepare model
config = BertConfig.from_json_file(args.config_file)
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
model = BertForPreTraining(config)
checkpoint = None
if not args.resume_from_checkpoint:
global_step = 0
else:
if args.resume_step == -1:
model_names = [f for f in os.listdir(args.output_dir) if f.endswith(".pt")]
args.resume_step = max([int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names])
global_step = args.resume_step
checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu")
model.load_state_dict(checkpoint['model'], strict=False)
print("resume step from ", args.resume_step)
if args.phase2:
global_step -= args.phase1_end_step
if is_main_process():
print("resume step from ", args.resume_step)
model.to(device)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']
optimizer_grouped_parameters = []
names = []
count = 1
for n, p in param_optimizer:
count += 1
if not any(nd in n for nd in no_decay):
optimizer_grouped_parameters.append({'params': [p], 'weight_decay': 0.01, 'name': n})
names.append({'params': [n], 'weight_decay': 0.01})
if any(nd in n for nd in no_decay):
optimizer_grouped_parameters.append({'params': [p], 'weight_decay': 0.00, 'name': n})
names.append({'params': [n], 'weight_decay': 0.00})
optimizer = BertLAMB(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=args.max_steps)
if args.fp16:
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
#warmup=args.warmup_proportion,
#t_total=args.max_steps,
bias_correction=False,
weight_decay=0.01,
max_grad_norm=1.0)
if args.loss_scale == 0:
# optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale="dynamic")
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic",
master_weights=False if args.accumulate_into_fp16 else True)
else:
# optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale)
scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=args.max_steps)
else:
optimizer = BertAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
warmup=args.warmup_proportion,
t_total=args.max_steps)
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale,
master_weights=False if args.accumulate_into_fp16 else True)
amp._amp_state.loss_scalers[0]._loss_scale = 2**20
if args.resume_from_checkpoint:
if args.phase2:
keys = list(checkpoint['optimizer']['state'].keys())
#Override hyperparameters from Phase 1
for key in keys:
checkpoint['optimizer']['state'][key]['step'] = global_step
for iter, item in enumerate(checkpoint['optimizer']['param_groups']):
checkpoint['optimizer']['param_groups'][iter]['t_total'] = args.max_steps
checkpoint['optimizer']['param_groups'][iter]['warmup'] = args.warmup_proportion
checkpoint['optimizer']['param_groups'][iter]['lr'] = args.learning_rate
optimizer.load_state_dict(checkpoint['optimizer']) # , strict=False)
# Restore AMP master parameters
if args.fp16:
optimizer._lazy_init_maybe_master_weights()
optimizer._amp_stash.lazy_init_called = True
optimizer.load_state_dict(checkpoint['optimizer'])
for param, saved_param in zip(amp.master_params(optimizer), checkpoint['master params']):
param.data.copy_(saved_param.data)
if args.local_rank != -1:
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f))]
files.sort()
num_files = len(files)
logger.info("***** Running training *****")
# logger.info(" Num examples = %d", len(train_data))
logger.info(" Batch size = %d", args.train_batch_size)
print(" LR = ", args.learning_rate)
model.train()
print("Training. . .")
most_recent_ckpts_paths = []
print("Training. . .")
tr_loss = 0.0 # total added training loss
average_loss = 0.0 # averaged loss every args.log_freq steps
epoch = 0
training_steps = 0
while True:
if not args.resume_from_checkpoint:
random.shuffle(files)
f_start_id = 0
if not args.allreduce_post_accumulation:
model = DDP(model, message_size=250000000, gradient_predivide_factor=torch.distributed.get_world_size())
else:
f_start_id = checkpoint['files'][0]
files = checkpoint['files'][1:]
args.resume_from_checkpoint = False
for f_id in range(f_start_id, len(files)):
data_file = files[f_id]
logger.info("file no %s file %s" %(f_id, data_file))
train_data = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0,) )
elif args.n_gpu > 1:
model = torch.nn.DataParallel(model)
if args.local_rank == -1:
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * n_gpu, num_workers=4, pin_memory=True)
return model, optimizer, checkpoint, global_step
def take_optimizer_step(args, optimizer, model, overflow_buf, global_step):
if args.allreduce_post_accumulation:
# manually allreduce gradients after all accumulation steps
# check for Inf/NaN
# 1. allocate an uninitialized buffer for flattened gradient
scaler = _amp_state.loss_scalers[0]
master_grads = [p.grad for p in amp.master_params(optimizer) if p.grad is not None]
flat_grad_size = sum(p.numel() for p in master_grads)
allreduce_dtype = torch.float16 if args.allreduce_post_accumulation_fp16 else torch.float32
flat_raw = torch.empty(flat_grad_size, device='cuda', dtype=allreduce_dtype)
# 2. combine unflattening and predivision of unscaled 'raw' gradient
allreduced_views = apex_C.unflatten(flat_raw, master_grads)
overflow_buf.zero_()
amp_C.multi_tensor_scale(65536,
overflow_buf,
[master_grads, allreduced_views],
scaler.loss_scale() / (torch.distributed.get_world_size() * args.gradient_accumulation_steps))
# 3. sum gradient across ranks. Because of the predivision, this averages the gradient
torch.distributed.all_reduce(flat_raw)
# 4. combine unscaling and unflattening of allreduced gradient
overflow_buf.zero_()
amp_C.multi_tensor_scale(65536,
overflow_buf,
[allreduced_views, master_grads],
1./scaler.loss_scale())
# 5. update loss scale
scaler = _amp_state.loss_scalers[0]
old_overflow_buf = scaler._overflow_buf
scaler._overflow_buf = overflow_buf
had_overflow = scaler.update_scale()
scaler._overflow_buf = old_overflow_buf
# 6. call optimizer step function
if had_overflow == 0:
optimizer.step()
global_step += 1
else:
# Overflow detected, print message and clear gradients
if is_main_process():
print(("Rank {} :: Gradient overflow. Skipping step, " +
"reducing loss scale to {}").format(
torch.distributed.get_rank(),
scaler.loss_scale()))
if _amp_state.opt_properties.master_weights:
for param in optimizer._amp_stash.all_fp32_from_fp16_params:
param.grad = None
for param in model.parameters():
param.grad = None
else:
optimizer.step()
#optimizer.zero_grad()
for param in model.parameters():
param.grad = None
global_step += 1
return global_step
def main():
args = parse_arguments()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
device, args = setup_training(args)
# Prepare optimizer
model, optimizer, checkpoint, global_step = prepare_model_and_optimizer(args, device)
if is_main_process():
print("SEED {}".format(args.seed))
if args.do_train:
if is_main_process():
logger.info("***** Running training *****")
# logger.info(" Num examples = %d", len(train_data))
logger.info(" Batch size = %d", args.train_batch_size)
print(" LR = ", args.learning_rate)
print("Training. . .")
model.train()
most_recent_ckpts_paths = []
average_loss = 0.0 # averaged loss every args.log_freq steps
epoch = 0
training_steps = 0
pool = ProcessPoolExecutor(1)
# Note: We loop infinitely over epochs, termination is handled via iteration count
while True:
thread = None
if not args.resume_from_checkpoint or epoch > 0 or args.phase2:
files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
os.path.isfile(os.path.join(args.input_dir, f))]
files.sort()
num_files = len(files)
random.shuffle(files)
f_start_id = 0
else:
train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=4, pin_memory=True)
f_start_id = checkpoint['files'][0]
files = checkpoint['files'][1:]
args.resume_from_checkpoint = False
num_files = len(files)
for step, batch in enumerate(tqdm(train_dataloader, desc="File Iteration")):
training_steps += 1
batch = [t.to(device) for t in batch]
input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch#\
loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels, checkpoint_activations=args.checkpoint_activations)
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
shared_file_list = {}
if args.fp16:
# optimizer.backward(loss)
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
tr_loss += loss
average_loss += loss.item()
if torch.distributed.is_initialized() and torch.distributed.get_world_size() > num_files:
remainder = torch.distributed.get_world_size() % num_files
data_file = files[(f_start_id*torch.distributed.get_world_size()+torch.distributed.get_rank() + remainder*f_start_id)%num_files]
else:
data_file = files[(f_start_id*torch.distributed.get_world_size()+torch.distributed.get_rank())%num_files]
if training_steps % args.gradient_accumulation_steps == 0:
if args.fp16:
scheduler.step()
optimizer.step()
optimizer.zero_grad()
global_step += 1
previous_file = data_file
train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler,
batch_size=args.train_batch_size * args.n_gpu, num_workers=4,
pin_memory=True)
# shared_file_list["0"] = (train_dataloader, data_file)
overflow_buf = None
if args.allreduce_post_accumulation:
overflow_buf = torch.cuda.IntTensor([0])
for f_id in range(f_start_id + 1 , len(files)):
# torch.cuda.synchronize()
# f_start = time.time()
if torch.distributed.get_world_size() > num_files:
data_file = files[(f_id*torch.distributed.get_world_size()+torch.distributed.get_rank() + remainder*f_id)%num_files]
else:
data_file = files[(f_id*torch.distributed.get_world_size()+torch.distributed.get_rank())%num_files]
if training_steps == 1 * args.gradient_accumulation_steps:
logger.info("Step:{} Average Loss = {} Step Loss = {} LR {}".format(global_step, average_loss,
loss.item(), optimizer.param_groups[0]['lr']))
logger.info("file no %s file %s" % (f_id, previous_file))
if training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
logger.info("Step:{} Average Loss = {} Step Loss = {} LR {}".format(global_step, average_loss / args.log_freq,
loss.item(), optimizer.param_groups[0]['lr']))
average_loss = 0
previous_file = data_file
if global_step >= args.max_steps or training_steps % (
args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0:
# train_dataloader = shared_file_list["0"][0]
if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)):
# Save a trained model
logger.info("** ** * Saving fine - tuned model ** ** * ")
model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself
output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
torch.save({'model' : model_to_save.state_dict(),
'optimizer' : optimizer.state_dict(),
'files' : [f_id] + files }, output_save_file)
most_recent_ckpts_paths.append(output_save_file)
if len(most_recent_ckpts_paths) > 3:
ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
os.remove(ckpt_to_be_removed)
# thread = multiprocessing.Process(
# name="LOAD DATA:" + str(f_id) + ":" + str(data_file),
# target=create_pretraining_dataset,
# args=(data_file, args.max_predictions_per_seq, shared_file_list, args, n_gpu)
# )
# thread.start()
dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args)
# torch.cuda.synchronize()
# f_end = time.time()
# print('[{}] : shard overhead {}'.format(torch.distributed.get_rank(), f_end - f_start))
train_iter = tqdm(train_dataloader, desc="Iteration") if is_main_process() else train_dataloader
for step, batch in enumerate(train_iter):
# torch.cuda.synchronize()
# iter_start = time.time()
training_steps += 1
batch = [t.to(device) for t in batch]
input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels,
checkpoint_activations=args.checkpoint_activations)
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
divisor = args.gradient_accumulation_steps
if args.gradient_accumulation_steps > 1:
if not args.allreduce_post_accumulation:
# this division was merged into predivision
loss = loss / args.gradient_accumulation_steps
divisor = 1.0
if args.fp16:
with amp.scale_loss(loss, optimizer, delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
average_loss += loss.item()
if training_steps % args.gradient_accumulation_steps == 0:
global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)
if global_step >= args.max_steps:
tr_loss = tr_loss * args.gradient_accumulation_steps / training_steps
last_num_steps = global_step % args.log_freq
last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
average_loss = average_loss / (last_num_steps * divisor)
if (torch.distributed.is_initialized()):
tr_loss /= torch.distributed.get_world_size()
torch.distributed.all_reduce(tr_loss)
logger.info("Total Steps:{} Final Loss = {}".format(training_steps, tr_loss.item()))
return
del train_dataloader
del train_sampler
del train_data
#for obj in gc.get_objects():
# if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
# del obj
average_loss /= torch.distributed.get_world_size()
torch.distributed.all_reduce(average_loss)
if is_main_process():
logger.info("Total Steps:{} Final Loss = {}".format(training_steps, average_loss.item()))
elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
if is_main_process():
print("Step:{} Average Loss = {} Step Loss = {} LR {}".format(global_step, average_loss / (
args.log_freq * divisor),
loss.item() * args.gradient_accumulation_steps / divisor,
optimizer.param_groups[0][
'lr']))
average_loss = 0
torch.cuda.empty_cache()
epoch += 1
if global_step >= args.max_steps or training_steps % (
args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0:
if is_main_process():
# Save a trained model
logger.info("** ** * Saving fine - tuned model ** ** * ")
model_to_save = model.module if hasattr(model,
'module') else model # Only save the model itself
if args.resume_step < 0 or not args.phase2:
output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
else:
output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step))
if args.do_train:
torch.save({'model': model_to_save.state_dict(),
'optimizer': optimizer.state_dict(),
'master params': list(amp.master_params(optimizer)),
'files': [f_id] + files}, output_save_file)
most_recent_ckpts_paths.append(output_save_file)
if len(most_recent_ckpts_paths) > 3:
ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
os.remove(ckpt_to_be_removed)
if global_step >= args.max_steps:
del train_dataloader
# thread.join()
return args
# torch.cuda.synchronize()
# iter_end = time.time()
# if torch.distributed.get_rank() == 0:
# print('step {} : {}'.format(global_step, iter_end - iter_start))
del train_dataloader
# thread.join()
# Make sure pool has finished and switch train_dataloader
# NOTE: Will block until complete
train_dataloader, data_file = dataset_future.result(timeout=None)
epoch += 1
if __name__ == "__main__":
main()
now = time.time()
args = main()
if is_main_process():
print("Total time taken {}".format(time.time() - now))

View file

@@ -101,11 +101,15 @@ def main():
type=str,
required=False,
help="The BERT model config")
parser.add_argument("--ckpt_dir",
ckpt_group = parser.add_mutually_exclusive_group(required=True)
ckpt_group.add_argument("--ckpt_dir",
default=None,
type=str,
required=True,
help="The ckpt directory, e.g. /results")
ckpt_group.add_argument("--ckpt_path",
default=None,
type=str,
help="Path to the specific checkpoint")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--eval', dest='do_eval', action='store_true')
@@ -184,16 +188,21 @@ def main():
# Prepare model
config = BertConfig.from_json_file(args.config_file)
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
model = BertForPreTraining(config)
if args.ckpt_step == -1:
#retrieve latest model
model_names = [f for f in os.listdir(args.ckpt_dir) if f.endswith(".model")]
args.ckpt_step = max([int(x.split('.model')[0].split('_')[1].strip()) for x in model_names])
print("load model saved at iteraton", args.ckpt_step)
model_file = os.path.join(args.ckpt_dir, "ckpt_" + str(args.ckpt_step) + ".model")
state_dict = torch.load(model_file, map_location="cpu")
if args.ckpt_dir:
if args.ckpt_step == -1:
#retrieve latest model
model_names = [f for f in os.listdir(args.ckpt_dir) if f.endswith(".model")]
args.ckpt_step = max([int(x.split('.model')[0].split('_')[1].strip()) for x in model_names])
print("load model saved at iteraton", args.ckpt_step)
model_file = os.path.join(args.ckpt_dir, "ckpt_" + str(args.ckpt_step) + ".pt")
else:
model_file = args.ckpt_path
state_dict = torch.load(model_file, map_location="cpu")["model"]
model.load_state_dict(state_dict, strict=False)
if args.fp16:

View file

@@ -916,11 +916,16 @@ def main():
# Prepare model
config = BertConfig.from_json_file(args.config_file)
# Padding for divisibility by 8
if config.vocab_size % 8 != 0:
config.vocab_size += 8 - (config.vocab_size % 8)
model = BertForQuestionAnswering(config)
# model = BertForQuestionAnswering.from_pretrained(args.bert_model,
# cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
print("USING CHECKOINT")
model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False)
print("USED CHECKPOINT \n\n")
model.to(device)
if args.fp16 and args.old:
model.half()

View file

@@ -1,3 +1,2 @@
#!/bin/bash
docker build . --rm -t bert
docker build . --rm -t bert_pyt

View file

@@ -1,17 +1,18 @@
#!/bin/bash
DATA_DIR=${1:-"/mnt/dldata/bert"}
VOCAB_DIR=${2:-"/mnt/dldata/bert/vocab"}
CHECKPOINT_DIR=${3:-"/mnt/dldata/bert/pretrained_models_nvidia_pytorch"}
DATA_DIR=${1:-"${PWD}/data/hdf5/books_wiki_en_corpus"}
VOCAB_DIR=${2:-"${PWD}/data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16"}
CHECKPOINT_DIR=${3:-"${PWD}/checkpoints"}
RESULTS_DIR=${4:-"${PWD}/results"}
docker run -it --rm \
--runtime=nvidia \
-p 8888:8888 \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
-v $DATA_DIR:/workspace/bert/data \
-v ${PWD}:/workspace/bert \
-v $DATA_DIR:/workspace/bert/data/hdf5/books_wiki_en_corpus \
-v $CHECKPOINT_DIR:/workspace/checkpoints \
-v $VOCAB_DIR:/workspace/bert/vocab \
-v $PWD/results:/results \
bert bash
-v $VOCAB_DIR:/workspace/bert/data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16 \
-v $RESULTS_DIR:/results \
bert_pyt bash

View file

@@ -1,28 +1,38 @@
#!/bin/bash
echo "Container nvidia build = " $NVIDIA_BUILD_ID
train_batch_size=${1:-14}
learning_rate=${2:-"0.4375e-4"}
train_batch_size=${1:-8192}
learning_rate=${2:-"6e-3"}
precision=${3:-"fp16"}
num_gpus=${4:-8}
warmup_proportion=${5:-"0.01"}
train_steps=${6:-2285714}
save_checkpoint_steps=${7:-2000}
warmup_proportion=${5:-"0.2843"}
train_steps=${6:-7038}
save_checkpoint_steps=${7:-200}
resume_training=${8:-"false"}
create_logfile=${9:-"true"}
accumulate_gradients=${10:-"false"}
gradient_accumulation_steps=${11:-1}
seed=${12:-42}
job_name=${13:-"job"}
accumulate_gradients=${10:-"true"}
gradient_accumulation_steps=${11:-128}
seed=${12:-$RANDOM}
job_name=${13:-"bert_lamb_pretraining"}
allreduce_post_accumulation=${14:-"true"}
allreduce_post_accumulation_fp16=${15:-"true"}
accumulate_into_fp16=${16:-"true"}
train_batch_size_phase2=${1:-4096}
learning_rate_phase2=${2:-"4e-3"}
warmup_proportion_phase2=${5:-"0.128"}
train_steps_phase2=${6:-1563}
gradient_accumulation_steps_phase2=${11:-512}
DATASET=wikipedia_corpus # change this for other datasets
DATASET=books_wiki_en_corpus # change this for other datasets
DATA_DIR=data/${DATASET}/hdf5_shards/
DATA_DIR=data/${DATASET}/training/
#DATA_DIR=data/hdf5/wiki+book/bert_pytorch_wikipedia_bookcorpus_interseqmix_seq_128_pred_20/
BERT_CONFIG=bert_config.json
RESULTS_DIR=/results
CHECKPOINTS_DIR=/results/checkpoints
mkdir -p $CHECKPOINTS_DIR
@@ -63,6 +73,21 @@ if [ "$resume_training" == "true" ] ; then
CHECKPOINT="--resume_from_checkpoint"
fi
ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi
ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi
ACCUMULATE_INTO_FP16=""
if [ "$accumulate_into_fp16" == "true" ] ; then
ACCUMULATE_INTO_FP16="--accumulate_into_fp16"
fi
echo $DATA_DIR
INPUT_DIR=$DATA_DIR
CMD=" /workspace/bert/run_pretraining.py"
@@ -71,8 +96,8 @@ CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_seq_length=128"
CMD+=" --max_predictions_per_seq=20"
CMD+=" --max_steps=$train_steps"
CMD+=" --warmup_proportion=$warmup_proportion"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
@@ -81,7 +106,10 @@ CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $ACCUMULATE_INTO_FP16"
CMD+=" --do_train"
if [ "$num_gpus" -gt 1 ] ; then
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
@@ -115,39 +143,107 @@ target_loss=15
THROUGHPUT=10
THRESHOLD=0.9
throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F's/it' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
echo "throughput: $throughput s/it"
train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size')}')
echo " training throughput phase1: $train_perf sequences/second"
echo "average loss: $loss"
echo "final loss: $final_loss"
ACCURACY_TEST_RESULT=$(awk 'BEGIN {print ('${loss}' <= '${target_loss}')}')
#Start Phase2
if [ $ACCURACY_TEST_RESULT == 1 ];
then
echo "&&&& ACCURACY TEST PASSED"
else
echo "&&&& ACCURACY TEST FAILED"
fi
DATASET=merged_wiki+books_phase2 # change this for other datasets
PERFORMANCE_TEST_RESULT=$(awk 'BEGIN {print ('${throughput}' <= ('${THROUGHPUT}' * '${THRESHOLD}'))}')
DATA_DIR=data/${DATASET}/hdf5_shards/
#DATA_DIR=data/hdf5/wiki+book/bert_pytorch_wikipedia_bookcorpus_interseqmix_seq_512_pred_80/
if [ $PERFORMANCE_TEST_RESULT == 1 ];
then
echo "&&&& PERFORMANCE TEST PASSED"
else
echo "&&&& PERFORMANCE TEST FAILED"
fi
PREC=""
if [ "$precision" = "fp16" ] ; then
PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
PREC=""
else
echo "Unknown <precision> argument"
exit -2
fi
if [ $ACCURACY_TEST_RESULT == 1 -a $PERFORMANCE_TEST_RESULT == 1 ];
then
echo "&&&& PASSED"
exit 0
else
echo "&&&& FAILED"
exit 1
fi
ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" == "true" ] ; then
ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2"
fi
ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi
ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi
ACCUMULATE_INTO_FP16=""
if [ "$accumulate_into_fp16" == "true" ] ; then
ACCUMULATE_INTO_FP16="--accumulate_into_fp16"
fi
echo $DATA_DIR
INPUT_DIR=$DATA_DIR
CMD=" /workspace/bert/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size_phase2"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$train_steps_phase2"
CMD+=" --warmup_proportion=$warmup_proportion_phase2"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate_phase2"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $ACCUMULATE_INTO_FP16"
CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"
if [ "$num_gpus" -gt 1 ] ; then
CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
CMD="python3 $CMD"
fi
if [ "$create_logfile" = "true" ] ; then
export GBS=$(expr $train_batch_size \* $num_gpus)
printf -v TAG "pyt_bert_pretraining_%s_gbs%d" "$precision" $GBS
DATESTAMP=`date +'%y%m%d%H%M%S'`
LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
printf "Logs written to %s\n" "$LOGFILE"
fi
set -x
if [ -z "$LOGFILE" ] ; then
$CMD
else
(
$CMD
) |& tee $LOGFILE
fi
set +x
echo "finished phase2"
throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size_phase2')}')
echo " training throughput phase2: $train_perf sequences/second"
echo "average loss: $loss"
echo "final loss: $final_loss"

View file

@@ -96,51 +96,6 @@ else
fi
set +x
target_loss=15
THROUGHPUT=1.0
THRESHOLD=0.9
throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
echo "throughput: $throughput it/s"
PERFORMANCE_TEST_RESULT=$(awk 'BEGIN {print ('${throughput}' >= \
('${THROUGHPUT}' * '${THRESHOLD}'))}')
if [ $PERFORMANCE_TEST_RESULT == 1 ];
then
echo "&&&& PERFORMANCE TEST PASSED"
else
echo "&&&& PERFORMANCE TEST FAILED"
fi
if [ "$inference_mode" = "eval" ] ; then
loss=`cat $LOGFILE | grep Finished | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
echo "final loss: $loss"
ACCURACY_TEST_RESULT=$(awk 'BEGIN {print ('${loss}' <= '${target_loss}')}')
if [ $ACCURACY_TEST_RESULT == 1 ];
then
echo "&&&& ACCURACY TEST PASSED"
else
echo "&&&& ACCURACY TEST FAILED"
fi
if [ $ACCURACY_TEST_RESULT == 1 -a $PERFORMANCE_TEST_RESULT == 1 ];
then
echo "&&&& PASSED"
exit 0
else
echo "&&&& FAILED"
exit 1
fi
fi
inference_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$eval_batch_size')}')
echo " inference throughput : $inference_perf sequences/second"

View file

@@ -13,7 +13,7 @@ precision=${5:-"fp16"}
num_gpu=${6:-"8"}
seed=${7:-"1"}
squad_dir=${8:-"/workspace/bert/data/squad/v1.1"}
vocab_file=${9:-"/workspace/bert/vocab/vocab"}
vocab_file=${9:-"/workspace/bert/data/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"}
OUT_DIR=${10:-"/results/SQuAD"}
mode=${11:-"train eval"}
CONFIG_FILE=${12:-"/workspace/bert/bert_config.json"}

View file

@@ -45,7 +45,7 @@ fi
printf -v EXTRA_PARAMS "%d %d %e %s 1 %d %d %d false" $train_batch_size $eval_batch_size $learning_rate "$precision" $warmup_proportion $train_steps $save_checkpoint_steps
export ROOTDIR=$root_dir
export DATA_DIR=${DATA_DIR:-$CODEDIR/data/wikipedia_corpus/pyt_hdf5_shards}
export DATA_DIR=${DATA_DIR:-$CODEDIR/data/hdf5/books_wiki_en_corpus}
VOLS="-v $ROOTDIR:/workspace/bert"
VOLS+=" -v $DATA_DIR:/workspace/bert/data/wikipedia_corpus/pyt_hdf5_shards"

View file

@@ -0,0 +1,12 @@
import torch
import torch.distributed as dist

def get_rank():
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    return dist.get_rank()

def is_main_process():
    return get_rank() == 0
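A typical use of this helper (illustrative only; the module name utils is assumed) is to restrict logging or checkpoint writes to rank 0 in multi-GPU runs:

from utils import is_main_process

if is_main_process():
    # Only rank 0 prints progress and writes checkpoints; other ranks stay quiet.
    print("logging and checkpointing happen on rank 0 only")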

View file

@@ -1,123 +0,0 @@
# NVIDIA
import hashlib
import urllib.request
import zipfile
# Download urls
model_urls = {
'bert_base_uncased' : ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'),
'bert_large_uncased' : ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'),
'bert_base_cased' : ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'),
'bert_large_cased' : ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'),
'bert_base_multilingual_cased' : ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'),
'bert_large_multilingual_uncased' : ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'),
'bert_base_chinese' : ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip')
}
# SHA256sum verification for file download integrity (and checking for changes from the download source over time)
bert_base_uncased_sha = {
'bert_config.json' : '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc',
'bert_model.ckpt.data-00000-of-00001' : '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84',
'bert_model.ckpt.index' : '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b',
'bert_model.ckpt.meta' : 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e',
'vocab.txt' : '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}
bert_large_uncased_sha = {
'bert_config.json' : 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb',
'bert_model.ckpt.data-00000-of-00001' : 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1',
'bert_model.ckpt.index' : '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093',
'bert_model.ckpt.meta' : '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1',
'vocab.txt' : '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}
bert_base_cased_sha = {
'bert_config.json' : 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc',
'bert_model.ckpt.data-00000-of-00001' : '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea',
'bert_model.ckpt.index' : '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1',
'bert_model.ckpt.meta' : '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98',
'vocab.txt' : 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}
bert_large_cased_sha = {
'bert_config.json' : '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57',
'bert_model.ckpt.data-00000-of-00001' : '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0',
'bert_model.ckpt.index' : 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf',
'bert_model.ckpt.meta' : 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1',
'vocab.txt' : 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}
bert_base_multilingual_cased_sha = {
'bert_config.json' : 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0',
'bert_model.ckpt.data-00000-of-00001' : '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5',
'bert_model.ckpt.index' : '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37',
'bert_model.ckpt.meta' : '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa',
'vocab.txt' : 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c',
}
bert_large_multilingual_uncased_sha = {
'bert_config.json' : '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624',
'bert_model.ckpt.data-00000-of-00001' : '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429',
'bert_model.ckpt.index' : '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7',
'bert_model.ckpt.meta' : '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29',
'vocab.txt' : '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f',
}
bert_base_chinese_sha = {
'bert_config.json' : '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015',
'bert_model.ckpt.data-00000-of-00001' : '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba',
'bert_model.ckpt.index' : '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e',
'bert_model.ckpt.meta' : 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047',
'vocab.txt' : '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c',
}
# Relate SHA to urls for loop below
model_sha = {
'bert_base_uncased' : bert_base_uncased_sha,
'bert_large_uncased' : bert_large_uncased_sha,
'bert_base_cased' : bert_base_cased_sha,
'bert_large_cased' : bert_large_cased_sha,
'bert_base_multilingual_cased' : bert_base_multilingual_cased_sha,
'bert_large_multilingual_uncased' : bert_large_multilingual_uncased_sha,
'bert_base_chinese' : bert_base_chinese_sha
}
# Helper to get sha256sum of a file
def sha256sum(filename):
    h = hashlib.sha256()
    b = bytearray(128*1024)
    mv = memoryview(b)
    with open(filename, 'rb', buffering=0) as f:
        for n in iter(lambda : f.readinto(mv), 0):
            h.update(mv[:n])
    return h.hexdigest()

# Iterate over urls: download, unzip, verify sha256sum
found_mismatch_sha = False
for model in model_urls:
    url = model_urls[model][0]
    file = model_urls[model][1]

    print("Downloading", url)
    response = urllib.request.urlopen(url)
    with open(file, "wb") as handle:
        handle.write(response.read())

    print("Unzipping", file)
    zip = zipfile.ZipFile(file, 'r')
    zip.extractall()
    zip.close()

    sha_dict = model_sha[model]
    for extracted_file in sha_dict:
        sha = sha_dict[extracted_file]
        if sha != sha256sum(file[:-4] + "/" + extracted_file):
            found_mismatch_sha = True
            print("SHA256sum does not match on file:", extracted_file, "from download url:", url)
        else:
            print(file[:-4] + "/" + extracted_file, "\t", "verified")

if not found_mismatch_sha:
    print("All downloads pass sha256sum verification.")

File diff suppressed because it is too large Load diff