updating BERT (single node LAMB support)
parent 7118f12b8a
commit bae6e931bd
@@ -1,27 +1,28 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.06-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.07-py3
FROM ${FROM_IMAGE_NAME}

RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract

ENV BERT_PREP_WORKING_DIR /workspace/bert/data

WORKDIR /opt
RUN rm -rf /opt/pytorch/apex ; \
    git clone https://github.com/NVIDIA/apex.git pytorch/apex ; \
    cd pytorch/apex ; \
    pip uninstall --yes apex; \
    git checkout 880ab925bce9f817a93988b021e12db5f67f7787; \
    git pull; \
    pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .

#WORKDIR /opt
#RUN cd pytorch/apex \
# && git fetch origin pull/182/head:norm_fix \
# && git checkout norm_fix \
# && git fetch origin pull/334/head:multi_tensor_lamb_optimizer \
# && git checkout multi_tensor_lamb_optimizer \
# && python setup.py develop --cuda_ext --cpp_ext


WORKDIR /opt
RUN cd pytorch/apex ; \
    pip uninstall apex; \
    pip uninstall apex; \
    git checkout master; \
    git pull; \
    pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .

WORKDIR /workspace
RUN git clone https://github.com/attardi/wikiextractor.git
RUN git clone https://github.com/soskek/bookcorpus.git

WORKDIR /workspace/bert
RUN pip install tqdm boto3 requests six ipdb h5py html2text nltk progressbar
COPY . .
RUN pip install tqdm boto3 requests six ipdb h5py html2text nltk progressbar
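
For reference, building and entering the image would look roughly like the following; the tag is arbitrary and the exact GPU runtime flags depend on your Docker/NVIDIA setup, so treat this as a sketch rather than the project's official workflow:

docker build -t bert_pyt .
docker run --runtime=nvidia --rm -it bert_pyt
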
File diff suppressed because it is too large
PyTorch/LanguageModeling/BERT/bind_pyt.py (new file, 124 lines)

@@ -0,0 +1,124 @@
import sys
import subprocess
import os
import socket
from argparse import ArgumentParser, REMAINDER

import torch


def parse_args():
    """
    Helper function parsing the command line options
    @retval ArgumentParser
    """
    parser = ArgumentParser(description="PyTorch distributed training launch "
                                        "helper utility that will spawn up "
                                        "multiple distributed processes")

    # Optional arguments for the launch helper
    parser.add_argument("--nnodes", type=int, default=1,
                        help="The number of nodes to use for distributed "
                             "training")
    parser.add_argument("--node_rank", type=int, default=0,
                        help="The rank of the node for multi-node distributed "
                             "training")
    parser.add_argument("--nproc_per_node", type=int, default=1,
                        help="The number of processes to launch on each node, "
                             "for GPU training, this is recommended to be set "
                             "to the number of GPUs in your system so that "
                             "each process can be bound to a single GPU.")
    parser.add_argument("--master_addr", default="127.0.0.1", type=str,
                        help="Master node (rank 0)'s address, should be either "
                             "the IP address or the hostname of node 0, for "
                             "single node multi-proc training, the "
                             "--master_addr can simply be 127.0.0.1")
    parser.add_argument("--master_port", default=29500, type=int,
                        help="Master node (rank 0)'s free port that needs to "
                             "be used for communication during distributed "
                             "training")
    parser.add_argument('--no_hyperthreads', action='store_true',
                        help='Flag to disable binding to hyperthreads')
    parser.add_argument('--no_membind', action='store_true',
                        help='Flag to disable memory binding')

    # non-optional arguments for binding
    parser.add_argument("--nsockets_per_node", type=int, required=True,
                        help="Number of CPU sockets on a node")
    parser.add_argument("--ncores_per_socket", type=int, required=True,
                        help="Number of CPU cores per socket")

    # positional
    parser.add_argument("training_script", type=str,
                        help="The full path to the single GPU training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")

    # rest from the training program
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()


def main():
    args = parse_args()

    # variables for numactl binding
    NSOCKETS = args.nsockets_per_node
    NGPUS_PER_SOCKET = (args.nproc_per_node // args.nsockets_per_node) + (1 if (args.nproc_per_node % args.nsockets_per_node) else 0)
    NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET

    # world size in terms of number of processes
    dist_world_size = args.nproc_per_node * args.nnodes

    # set PyTorch distributed related environment variables
    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = args.master_addr
    current_env["MASTER_PORT"] = str(args.master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)

    processes = []

    for local_rank in range(0, args.nproc_per_node):
        # each process's rank
        dist_rank = args.nproc_per_node * args.node_rank + local_rank
        current_env["RANK"] = str(dist_rank)

        # form numactl binding command: first pair is the physical core
        # range, second pair is the matching hyperthread range
        cpu_ranges = [local_rank * NCORES_PER_GPU,
                      (local_rank + 1) * NCORES_PER_GPU - 1,
                      local_rank * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS),
                      (local_rank + 1) * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1]

        numactlargs = []
        if args.no_hyperthreads:
            numactlargs += ["--physcpubind={}-{}".format(*cpu_ranges[0:2])]
        else:
            numactlargs += ["--physcpubind={}-{},{}-{}".format(*cpu_ranges)]

        if not args.no_membind:
            memnode = local_rank // NGPUS_PER_SOCKET
            numactlargs += ["--membind={}".format(memnode)]

        # spawn the processes
        cmd = ["/usr/bin/numactl"] \
              + numactlargs \
              + [sys.executable,
                 "-u",
                 args.training_script,
                 "--local_rank={}".format(local_rank)] \
              + args.training_script_args

        process = subprocess.Popen(cmd, env=current_env)
        processes.append(process)

    for process in processes:
        process.wait()


if __name__ == "__main__":
    main()
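
For context, a hypothetical single-node invocation on a DGX-1 (2 CPU sockets, 20 cores per socket, 8 GPUs); the launcher flags match the parser above, while the training script name and its arguments are placeholders:

python bind_pyt.py --nnodes=1 --node_rank=0 --nproc_per_node=8 \
    --nsockets_per_node=2 --ncores_per_socket=20 \
    run_pretraining.py <training-script-args>

With these values, NGPUS_PER_SOCKET = 8 // 2 = 4 and NCORES_PER_GPU = 20 // 4 = 5, so local rank 0 is launched under numactl --physcpubind=0-4,40-44 --membind=0 (cores 40-79 being the hyperthread siblings of physical cores 0-39 when hyperthreading is on).
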
PyTorch/LanguageModeling/BERT/config_DGX1.sh (new file, 21 lines)

@@ -0,0 +1,21 @@
#!/bin/bash

## DL params
BATCHSIZE=16
LEARNING_RATE=6e-3
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results --fp16 --max_steps=7508 --num_steps_per_checkpoint=200"

## System run parms
DGXNNODES=1
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=00:15:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"

## System config params
DGXNGPU=8
DGXSOCKETCORES=20
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES=''
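
These config files are plain variable definitions, so a run script can source one and forward the system parameters to the launcher above. A minimal sketch of that glue (the actual run script is not part of this page, so this wiring is an assumption; run_pretraining.py is a placeholder for the training entry point):

#!/bin/bash
# Hypothetical glue: pick a system config, then launch one bound process per GPU.
source ./config_DGX1.sh

python bind_pyt.py \
    --nnodes="${DGXNNODES}" \
    --nproc_per_node="${DGXNGPU}" \
    --nsockets_per_node="${DGXNSOCKET}" \
    --ncores_per_socket="${DGXSOCKETCORES}" \
    run_pretraining.py ${EXTRA_PARAMS}
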
PyTorch/LanguageModeling/BERT/config_DGX2.sh (new file, 21 lines)

@@ -0,0 +1,21 @@
#!/bin/bash

## DL params
BATCHSIZE=4096
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=7038 --num_steps_per_checkpoint=2500 --log_freq=1 --gradient_accumulation_steps=64 --allreduce_post_accumulation --allreduce_post_accumulation_fp16"

## System run parms
DGXNNODES=1
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"
WALLTIME="3-00:00:00"

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'
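
Reading the DGX-2 numbers together (assuming BATCHSIZE is the per-GPU batch size handed to the training script, which gradient_accumulation_steps splits into micro-batches): each of the 16 GPUs runs micro-batches of 4096 / 64 = 64 sequences, and one optimizer step aggregates 4096 * 16 = 65536 sequences, the large-batch regime that the LAMB support named in this commit targets.
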

@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
BATCHSIZE=256
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /checkpoints --fp16 --max_steps=7038 --num_steps_per_checkpoint=2500 --log_freq=1 --gradient_accumulation_steps=4 --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_from_checkpoint"

## System run parms
DGXNNODES=16
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,21 @@
#!/bin/bash

## DL params
BATCHSIZE=128
LEARNING_RATE="4e-3"
WARMUP_UPDATES=0.128
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1563 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=16 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=7038"

## System run parms
DGXNNODES=16
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,21 @@
#!/bin/bash

## DL params
BATCHSIZE=2048
LEARNING_RATE="4e-3"
WARMUP_UPDATES=0.128
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1563 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=256 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=7038"

## System run parms
DGXNNODES=1
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME="00:15:00"
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

PyTorch/LanguageModeling/BERT/config_DGX2_2x16x40.sh (new file, 28 lines)

@@ -0,0 +1,28 @@
#!/bin/bash

## DL params
BATCHSIZE=64
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data \
    --do_train \
    --config_file=bert_config.json \
    --max_seq_length=128 \
    --max_predictions_per_seq=20 \
    --output_dir=/results/output \
    --fp16 \
    --max_steps=7508 \
    --num_steps_per_checkpoint=200 \
    --log_freq=1"

## System run parms
DGXNNODES=2
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=00:30:00
DEADLINE=$(date -d '+72 hours' '+%FT%T')
## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,22 @@
#!/bin/bash

## DL params
BATCHSIZE=96
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
SEED=23448
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --max_steps=7038 --num_steps_per_checkpoint=10000 --log_freq=1 --gradient_accumulation_steps=3"

## System run parms
DGXNNODES=46
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=05:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
BATCHSIZE=96
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=7038 --num_steps_per_checkpoint=10000 --log_freq=1 --gradient_accumulation_steps=2 --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --accumulate_into_fp16"

## System run parms
DGXNNODES=46
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,21 @@
#!/bin/bash

## DL params
BATCHSIZE=48
LEARNING_RATE="4.12e-3"
WARMUP_UPDATES=0.138
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --max_steps=1450 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=12 --resume_from_checkpoint"

## System run parms
DGXNNODES=46
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=05:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,21 @@
#!/bin/bash

## DL params
BATCHSIZE=48
LEARNING_RATE="4.12e-3"
WARMUP_UPDATES=0.138
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1450 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=6 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --accumulate_into_fp16 --resume_step=7038"

## System run parms
DGXNNODES=46
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
BATCHSIZE=128
LEARNING_RATE="6.5e-3"
WARMUP_UPDATES=0.5328
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=4692 --num_steps_per_checkpoint=10000 --log_freq=1 --gradient_accumulation_steps=2"

## System run parms
DGXNNODES=48
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
BATCHSIZE=64
LEARNING_RATE="5e-3"
WARMUP_UPDATES=0.192
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1042 --num_steps_per_checkpoint=1000 --log_freq=1 --gradient_accumulation_steps=8 --resume_from_checkpoint --phase2 --allreduce_post_accumulation --allreduce_post_accumulation_fp16"

## System run parms
DGXNNODES=48
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,21 @@
#!/bin/bash

## DL params
BATCHSIZE=64
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --max_steps=7038 --num_steps_per_checkpoint=10000 --log_freq=1 --gradient_accumulation_steps=2"

## System run parms
DGXNNODES=64
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=04:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,21 @@
#!/bin/bash

## DL params
BATCHSIZE=32
LEARNING_RATE="4e-3"
WARMUP_UPDATES=0.128
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --max_steps=1563 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=8 --resume_from_checkpoint"

## System run parms
DGXNNODES=64
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=04:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
BATCHSIZE=64
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=7038 --num_steps_per_checkpoint=10000 --log_freq=1"

## System run parms
DGXNNODES=64
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,21 @@
#!/bin/bash

## DL params
BATCHSIZE=32
LEARNING_RATE="4e-3"
WARMUP_UPDATES=0.128
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1563 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=4 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=7038"

## System run parms
DGXNNODES=64
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
BATCHSIZE=64
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.256
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=782 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=8 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --accumulate_into_fp16 --resume_step=7038"

## System run parms
DGXNNODES=64
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,21 @@
#!/bin/bash

## DL params
BATCHSIZE=48
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=7038 --num_steps_per_checkpoint=10000 --log_freq=1"

## System run parms
DGXNNODES=92
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,21 @@
#!/bin/bash

## DL params
BATCHSIZE=64
LEARNING_RATE="6.5e-3"
WARMUP_UPDATES=0.5107
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=4896 --num_steps_per_checkpoint=10000 --log_freq=1"

## System run parms
DGXNNODES=92
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,22 @@
#!/bin/bash

## DL params
BATCHSIZE=24
LEARNING_RATE="4.12e-3"
WARMUP_UPDATES=0.138
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1450 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=3 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=7038"

## System run parms
DGXNNODES=92
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"


## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,21 @@
#!/bin/bash

## DL params
BATCHSIZE=32
LEARNING_RATE="5e-3"
WARMUP_UPDATES=0.192
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1088 --num_steps_per_checkpoint=1000 --log_freq=1 --gradient_accumulation_steps=4 --resume_from_checkpoint --phase2 --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=4896 --phase1_end_step=4896"

## System run parms
DGXNNODES=92
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00
DEADLINE=$(date -d '+168 hours' '+%FT%T')
SLURM_EMAIL_TYPE="END"

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
BATCHSIZE=40
LEARNING_RATE="6e-3"
WARMUP_UPDATES=0.2843
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=7508 --num_steps_per_checkpoint=10000 --log_freq=1"

## System run parms
DGXNNODES=96
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
BATCHSIZE=64
LEARNING_RATE="6.5e-3"
WARMUP_UPDATES=0.5328
EXTRA_PARAMS="--input_dir=/workspace/data --do_train --config_file=bert_config.json --max_seq_length=128 --max_predictions_per_seq=20 --output_dir /results/output --fp16 --max_steps=4692 --num_steps_per_checkpoint=10000 --log_freq=1"

## System run parms
DGXNNODES=96
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
BATCHSIZE=24
LEARNING_RATE="4e-3"
WARMUP_UPDATES=0.144
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1390 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=3 --resume_from_checkpoint --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=7038"

## System run parms
DGXNNODES=96
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
BATCHSIZE=32
LEARNING_RATE="5e-3"
WARMUP_UPDATES=0.192
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1042 --num_steps_per_checkpoint=1000 --log_freq=1 --gradient_accumulation_steps=4 --resume_from_checkpoint --phase2 --allreduce_post_accumulation --allreduce_post_accumulation_fp16 --resume_step=4896"

## System run parms
DGXNNODES=96
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

PyTorch/LanguageModeling/BERT/config_DGX2_phase2.sh (new file, 19 lines)

@@ -0,0 +1,19 @@
#!/bin/bash

## DL params
BATCHSIZE=24
LEARNING_RATE="4e-3"
WARMUP_UPDATES=0.144
EXTRA_PARAMS="--input_dir=/workspace/data_phase2 --do_train --config_file=bert_config.json --max_seq_length=512 --max_predictions_per_seq=80 --output_dir /checkpoints --fp16 --max_steps=1390 --num_steps_per_checkpoint=1000 --log_freq=1 --phase2 --gradient_accumulation_steps=3 --resume_from_checkpoint"

## System run parms
DGXNNODES=1
DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
WALLTIME=02:00:00

## System config params
DGXNGPU=16
DGXSOCKETCORES=24
DGXNSOCKET=2
DGXHT=2  # HT is on is 2, HT off is 1
DGXIBDEVICES='--device=/dev/infiniband/rdma_cm --device=/dev/infiniband/ucm10 --device=/dev/infiniband/ucm9 --device=/dev/infiniband/ucm8 --device=/dev/infiniband/ucm7 --device=/dev/infiniband/ucm4 --device=/dev/infiniband/ucm3 --device=/dev/infiniband/ucm2 --device=/dev/infiniband/ucm1 --device=/dev/infiniband/uverbs10 --device=/dev/infiniband/uverbs9 --device=/dev/infiniband/uverbs8 --device=/dev/infiniband/uverbs7 --device=/dev/infiniband/uverbs4 --device=/dev/infiniband/uverbs3 --device=/dev/infiniband/uverbs2 --device=/dev/infiniband/uverbs1 --device=/dev/infiniband/issm10 --device=/dev/infiniband/umad10 --device=/dev/infiniband/issm9 --device=/dev/infiniband/umad9 --device=/dev/infiniband/issm8 --device=/dev/infiniband/umad8 --device=/dev/infiniband/issm7 --device=/dev/infiniband/umad7 --device=/dev/infiniband/issm4 --device=/dev/infiniband/umad4 --device=/dev/infiniband/issm3 --device=/dev/infiniband/umad3 --device=/dev/infiniband/issm2 --device=/dev/infiniband/umad2 --device=/dev/infiniband/issm1 --device=/dev/infiniband/umad1'

PyTorch/LanguageModeling/BERT/data/BooksDownloader.py (new file, 16 lines)

@@ -0,0 +1,16 @@
# NVIDIA

import subprocess

class BooksDownloader:
    def __init__(self, save_path):
        self.save_path = save_path

    def download(self):
        bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out'
        bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus'
        bookscorpus_download_command += ' --trash-bad-count'
        # subprocess.run() blocks until the command finishes and, with
        # check=True, raises CalledProcessError on failure; the original
        # follow-up call to .communicate() would fail because
        # CompletedProcess has no such method, so it is dropped here.
        subprocess.run(bookscorpus_download_command, shell=True, check=True)

@@ -0,0 +1,21 @@
# NVIDIA

import glob
import os

class BookscorpusTextFormatting:
    def __init__(self, books_path, output_filename, recursive=False):
        self.books_path = books_path
        self.recursive = recursive
        self.output_filename = output_filename


    # This puts one book per line
    def merge(self):
        with open(self.output_filename, mode='w', newline='\n') as ofile:
            # honor the recursive flag: glob only descends into
            # subdirectories when the pattern contains '**'
            pattern = '/**/*.txt' if self.recursive else '/*.txt'
            for filename in glob.glob(self.books_path + pattern, recursive=self.recursive):
                with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file:
                    for line in file:
                        if line.strip() != '':
                            ofile.write(line.strip() + ' ')
                    ofile.write("\n\n")
80 PyTorch/LanguageModeling/BERT/data/Downloader.py Normal file
@@ -0,0 +1,80 @@
# NVIDIA

from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader
from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader
from WikiDownloader import WikiDownloader
from BooksDownloader import BooksDownloader
from MRPCDownloader import MRPCDownloader
from SquadDownloader import SquadDownloader


class Downloader:
    def __init__(self, dataset_name, save_path):
        self.dataset_name = dataset_name
        self.save_path = save_path

    def download(self):
        if self.dataset_name == 'bookscorpus':
            self.download_bookscorpus()

        elif self.dataset_name == 'wikicorpus_en':
            self.download_wikicorpus('en')

        elif self.dataset_name == 'wikicorpus_zh':
            self.download_wikicorpus('zh')

        elif self.dataset_name == 'google_pretrained_weights':
            self.download_google_pretrained_weights()

        elif self.dataset_name == 'nvidia_pretrained_weights':
            self.download_nvidia_pretrained_weights()

        elif self.dataset_name == 'mrpc':
            self.download_mrpc()

        elif self.dataset_name == 'squad':
            self.download_squad()

        elif self.dataset_name == 'all':
            # The per-dataset helpers already know save_path; passing it again,
            # as the original did, would raise a TypeError.
            self.download_bookscorpus()
            self.download_wikicorpus('en')
            self.download_wikicorpus('zh')
            self.download_google_pretrained_weights()
            self.download_nvidia_pretrained_weights()
            self.download_mrpc()
            self.download_squad()

        else:
            print(self.dataset_name)
            assert False, 'Unknown dataset_name provided to downloader'

    def download_bookscorpus(self):
        downloader = BooksDownloader(self.save_path)
        downloader.download()

    def download_wikicorpus(self, language):
        downloader = WikiDownloader(language, self.save_path)
        downloader.download()

    def download_google_pretrained_weights(self):
        downloader = GooglePretrainedWeightDownloader(self.save_path)
        downloader.download()

    def download_nvidia_pretrained_weights(self):
        downloader = NVIDIAPretrainedWeightDownloader(self.save_path)
        downloader.download()

    def download_mrpc(self):
        downloader = MRPCDownloader(self.save_path)
        downloader.download()

    def download_squad(self):
        downloader = SquadDownloader(self.save_path)
        downloader.download()
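A minimal standalone sketch of this dispatcher (the save path here is illustrative; bertPrep.py below constructs it the same way from its working directory):

# Illustrative standalone use; bertPrep.py wires this up for real.
import Downloader

downloader = Downloader.Downloader('squad', '/workspace/bert/data/download')
downloader.download()    # fetches SQuAD v1.1/v2.0 JSON plus the evaluate scripts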
147 PyTorch/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py Normal file
@@ -0,0 +1,147 @@
# NVIDIA

import hashlib
import os
import urllib.request
import zipfile

class GooglePretrainedWeightDownloader:
    def __init__(self, save_path):
        self.save_path = save_path + '/google_pretrained_weights'

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        # Download urls
        self.model_urls = {
            'bert_base_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'),
            'bert_large_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'),
            'bert_base_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'),
            'bert_large_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'),
            'bert_base_multilingual_cased': ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'),
            'bert_large_multilingual_uncased': ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'),
            'bert_base_chinese': ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip')
        }

        # SHA256sum verification for file download integrity (and checking for changes from the download source over time)
        self.bert_base_uncased_sha = {
            'bert_config.json': '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc',
            'bert_model.ckpt.data-00000-of-00001': '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84',
            'bert_model.ckpt.index': '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b',
            'bert_model.ckpt.meta': 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e',
            'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
        }

        self.bert_large_uncased_sha = {
            'bert_config.json': 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb',
            'bert_model.ckpt.data-00000-of-00001': 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1',
            'bert_model.ckpt.index': '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093',
            'bert_model.ckpt.meta': '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1',
            'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
        }

        self.bert_base_cased_sha = {
            'bert_config.json': 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc',
            'bert_model.ckpt.data-00000-of-00001': '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea',
            'bert_model.ckpt.index': '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1',
            'bert_model.ckpt.meta': '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98',
            'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
        }

        self.bert_large_cased_sha = {
            'bert_config.json': '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57',
            'bert_model.ckpt.data-00000-of-00001': '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0',
            'bert_model.ckpt.index': 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf',
            'bert_model.ckpt.meta': 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1',
            'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
        }

        self.bert_base_multilingual_cased_sha = {
            'bert_config.json': 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0',
            'bert_model.ckpt.data-00000-of-00001': '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5',
            'bert_model.ckpt.index': '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37',
            'bert_model.ckpt.meta': '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa',
            'vocab.txt': 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c',
        }

        self.bert_large_multilingual_uncased_sha = {
            'bert_config.json': '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624',
            'bert_model.ckpt.data-00000-of-00001': '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429',
            'bert_model.ckpt.index': '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7',
            'bert_model.ckpt.meta': '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29',
            'vocab.txt': '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f',
        }

        self.bert_base_chinese_sha = {
            'bert_config.json': '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015',
            'bert_model.ckpt.data-00000-of-00001': '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba',
            'bert_model.ckpt.index': '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e',
            'bert_model.ckpt.meta': 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047',
            'vocab.txt': '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c',
        }

        # Relate SHA to urls for loop below
        self.model_sha = {
            'bert_base_uncased': self.bert_base_uncased_sha,
            'bert_large_uncased': self.bert_large_uncased_sha,
            'bert_base_cased': self.bert_base_cased_sha,
            'bert_large_cased': self.bert_large_cased_sha,
            'bert_base_multilingual_cased': self.bert_base_multilingual_cased_sha,
            'bert_large_multilingual_uncased': self.bert_large_multilingual_uncased_sha,
            'bert_base_chinese': self.bert_base_chinese_sha
        }

    # Helper to get sha256sum of a file
    def sha256sum(self, filename):
        h = hashlib.sha256()
        b = bytearray(128*1024)
        mv = memoryview(b)
        with open(filename, 'rb', buffering=0) as f:
            for n in iter(lambda : f.readinto(mv), 0):
                h.update(mv[:n])

        return h.hexdigest()

    def download(self):
        # Iterate over urls: download, unzip, verify sha256sum
        found_mismatch_sha = False
        for model in self.model_urls:
            url = self.model_urls[model][0]
            file = self.save_path + '/' + self.model_urls[model][1]

            print('Downloading', url)
            response = urllib.request.urlopen(url)
            with open(file, 'wb') as handle:
                handle.write(response.read())

            print('Unzipping', file)
            zip = zipfile.ZipFile(file, 'r')
            zip.extractall(self.save_path)
            zip.close()

            sha_dict = self.model_sha[model]
            for extracted_file in sha_dict:
                sha = sha_dict[extracted_file]
                if sha != self.sha256sum(file[:-4] + '/' + extracted_file):
                    found_mismatch_sha = True
                    print('SHA256sum does not match on file:', extracted_file, 'from download url:', url)
                else:
                    print(file[:-4] + '/' + extracted_file, '\t', 'verified')

        if not found_mismatch_sha:
            print("All downloads pass sha256sum verification.")

    def serialize(self):
        pass

    def deserialize(self):
        pass

    def listAvailableWeights(self):
        print("Available Weight Datasets")
        for item in self.model_urls:
            print(item)

    def listLocallyStoredWeights(self):
        pass
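Since sha256sum reads in 128 KiB chunks, verification stays cheap even for multi-gigabyte checkpoints. A hypothetical spot check of a single extracted file against the tables above (the path assumes the bert_large_uncased archive has already been downloaded and unzipped):

# Hypothetical spot check against the SHA tables above.
from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader

d = GooglePretrainedWeightDownloader('/workspace/bert/data/download')
digest = d.sha256sum(d.save_path + '/uncased_L-24_H-1024_A-16/vocab.txt')
print(digest == d.bert_large_uncased_sha['vocab.txt'])    # True if the download is intact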
33 PyTorch/LanguageModeling/BERT/data/MRPCDownloader.py Normal file
@@ -0,0 +1,33 @@
# NVIDIA

import bz2
import os
import urllib.request
import sys

class MRPCDownloader:
    def __init__(self, save_path):
        self.save_path = save_path + '/mrpc'

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        # Documentation - Download link obtained from here: https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py
        self.download_urls = {
            'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc' : 'mrpc_dev_ids.tsv'
        }

    def download(self):
        for item in self.download_urls:
            url = item
            file = self.download_urls[item]

            print('Downloading:', url)
            if os.path.isfile(self.save_path + '/' + file):
                print('** Download file already exists, skipping download')
            else:
                response = urllib.request.urlopen(url)
                with open(self.save_path + '/' + file, "wb") as handle:
                    handle.write(response.read())
16 PyTorch/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py Normal file
@@ -0,0 +1,16 @@
# NVIDIA

import os

class NVIDIAPretrainedWeightDownloader:
    def __init__(self, save_path):
        self.save_path = save_path + '/nvidia_pretrained_weights'

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def download(self):
        assert False, 'NVIDIAPretrainedWeightDownloader not implemented yet.'
43 PyTorch/LanguageModeling/BERT/data/SquadDownloader.py Normal file
@@ -0,0 +1,43 @@
# NVIDIA

import bz2
import os
import urllib.request
import sys

class SquadDownloader:
    def __init__(self, save_path):
        self.save_path = save_path + '/squad'

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        if not os.path.exists(self.save_path + '/v1.1'):
            os.makedirs(self.save_path + '/v1.1')

        if not os.path.exists(self.save_path + '/v2.0'):
            os.makedirs(self.save_path + '/v2.0')

        self.download_urls = {
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json',
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json',
            'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py',
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json',
            'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json',
            'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py',
        }

    def download(self):
        for item in self.download_urls:
            url = item
            file = self.download_urls[item]

            print('Downloading:', url)
            if os.path.isfile(self.save_path + '/' + file):
                print('** Download file already exists, skipping download')
            else:
                response = urllib.request.urlopen(url)
                with open(self.save_path + '/' + file, "wb") as handle:
                    handle.write(response.read())
316 PyTorch/LanguageModeling/BERT/data/TextSharding.py Normal file
@@ -0,0 +1,316 @@
# NVIDIA

from collections import defaultdict
from itertools import islice

import multiprocessing
import statistics

class Sharding:
    def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards, fraction_test_set):
        assert len(input_files) > 0, 'The input file list must contain at least one file.'
        assert n_training_shards > 0, 'There must be at least one output shard.'
        assert n_test_shards > 0, 'There must be at least one output shard.'

        self.n_training_shards = n_training_shards
        self.n_test_shards = n_test_shards
        self.fraction_test_set = fraction_test_set

        self.input_files = input_files

        self.output_name_prefix = output_name_prefix
        self.output_training_identifier = '_training'
        self.output_test_identifier = '_test'
        self.output_file_extension = '.txt'

        self.articles = {}    # key: integer identifier, value: article text
        self.sentences = {}   # key: integer identifier, value: list of sentences
        self.output_training_files = {}    # key: filename, value: list of articles to go into file
        self.output_test_files = {}        # key: filename, value: list of articles to go into file

        self.init_output_files()

    # Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines)
    def load_articles(self):
        print('Start: Loading Articles')

        global_article_count = 0
        for input_file in self.input_files:
            print('input file:', input_file)
            with open(input_file, mode='r', newline='\n') as f:
                for i, line in enumerate(f):
                    if line.strip():
                        self.articles[global_article_count] = line.rstrip()
                        global_article_count += 1

        print('End: Loading Articles: There are', len(self.articles), 'articles.')

    def segment_articles_into_sentences(self, segmenter):
        print('Start: Sentence Segmentation')
        # Note: the original used 'is 0'/'is not 0', which only works by accident
        # of CPython's small-int caching; equality is the correct comparison.
        if len(self.articles) == 0:
            self.load_articles()

        assert len(self.articles) != 0, 'Please check that input files are present and contain data.'

        # TODO: WIP: multiprocessing (create independent ranges and spawn processes)
        use_multiprocessing = 'serial'

        def chunks(data, size=len(self.articles)):
            it = iter(data)
            for i in range(0, len(data), size):
                yield {k: data[k] for k in islice(it, size)}

        if use_multiprocessing == 'manager':
            manager = multiprocessing.Manager()
            return_dict = manager.dict()
            jobs = []
            n_processes = 7    # in addition to the main process, total = n_proc+1

            def work(articles, return_dict):
                sentences = {}
                for i, article in enumerate(articles):
                    sentences[i] = segmenter.segment_string(articles[article])

                    if i % 5000 == 0:
                        print('Segmenting article', i)

                return_dict.update(sentences)

            for item in chunks(self.articles, len(self.articles)):
                p = multiprocessing.Process(target=work, args=(item, return_dict))

                # Busy wait
                while len(jobs) >= n_processes:
                    pass

                jobs.append(p)
                p.start()

            for proc in jobs:
                proc.join()

        elif use_multiprocessing == 'queue':
            work_queue = multiprocessing.Queue()
            jobs = []

            for item in chunks(self.articles, len(self.articles)):
                pass

        else:    # serial option
            for i, article in enumerate(self.articles):
                self.sentences[i] = segmenter.segment_string(self.articles[article])

                if i % 5000 == 0:
                    print('Segmenting article', i)

        print('End: Sentence Segmentation')

    def init_output_files(self):
        print('Start: Init Output Files')
        assert len(self.output_training_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'
        assert len(self.output_test_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'

        for i in range(self.n_training_shards):
            name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension
            self.output_training_files[name] = []

        for i in range(self.n_test_shards):
            name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension
            self.output_test_files[name] = []

        print('End: Init Output Files')

    def get_sentences_per_shard(self, shard):
        result = 0
        for article_id in shard:
            result += len(self.sentences[article_id])

        return result

    def distribute_articles_over_shards(self):
        print('Start: Distribute Articles Over Shards')
        assert len(self.articles) >= self.n_training_shards + self.n_test_shards, 'There are fewer articles than shards. Please add more data or reduce the number of shards requested.'

        # Create dictionary with - key: sentence count per article, value: article id number
        sentence_counts = defaultdict(lambda: [])

        max_sentences = 0
        total_sentences = 0

        for article_id in self.sentences:
            current_length = len(self.sentences[article_id])
            sentence_counts[current_length].append(article_id)
            max_sentences = max(max_sentences, current_length)
            total_sentences += current_length

        n_sentences_assigned_to_training = int((1 - self.fraction_test_set) * total_sentences)
        nominal_sentences_per_training_shard = n_sentences_assigned_to_training // self.n_training_shards
        nominal_sentences_per_test_shard = (total_sentences - n_sentences_assigned_to_training) // self.n_test_shards

        consumed_article_set = set({})
        unused_article_set = set(self.articles.keys())

        # Make first pass and add one article worth of lines per file
        for file in self.output_training_files:
            current_article_id = sentence_counts[max_sentences][-1]
            sentence_counts[max_sentences].pop(-1)
            self.output_training_files[file].append(current_article_id)
            consumed_article_set.add(current_article_id)
            unused_article_set.remove(current_article_id)

            # Maintain the max sentence count
            while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
                max_sentences -= 1

            if len(self.sentences[current_article_id]) > nominal_sentences_per_training_shard:
                nominal_sentences_per_training_shard = len(self.sentences[current_article_id])
                print('Warning: A single article contains more than the nominal number of sentences per training shard.')

        for file in self.output_test_files:
            current_article_id = sentence_counts[max_sentences][-1]
            sentence_counts[max_sentences].pop(-1)
            self.output_test_files[file].append(current_article_id)
            consumed_article_set.add(current_article_id)
            unused_article_set.remove(current_article_id)

            # Maintain the max sentence count
            while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
                max_sentences -= 1

            if len(self.sentences[current_article_id]) > nominal_sentences_per_test_shard:
                nominal_sentences_per_test_shard = len(self.sentences[current_article_id])
                print('Warning: A single article contains more than the nominal number of sentences per test shard.')

        training_counts = []
        test_counts = []

        for shard in self.output_training_files:
            training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))

        for shard in self.output_test_files:
            test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))

        training_median = statistics.median(training_counts)
        test_median = statistics.median(test_counts)

        # Make subsequent passes over files to find articles to add without going over limit
        history_remaining = []
        n_history_remaining = 4

        while len(consumed_article_set) < len(self.articles):
            for fidx, file in enumerate(self.output_training_files):
                nominal_next_article_size = min(nominal_sentences_per_training_shard - training_counts[fidx], max_sentences)

                # Maintain the max sentence count
                while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
                    max_sentences -= 1

                while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
                    nominal_next_article_size -= 1

                if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or training_counts[fidx] > training_median:
                    continue    # skip adding to this file, will come back later if no file can accept unused articles

                current_article_id = sentence_counts[nominal_next_article_size][-1]
                sentence_counts[nominal_next_article_size].pop(-1)

                self.output_training_files[file].append(current_article_id)
                consumed_article_set.add(current_article_id)
                unused_article_set.remove(current_article_id)

            for fidx, file in enumerate(self.output_test_files):
                nominal_next_article_size = min(nominal_sentences_per_test_shard - test_counts[fidx], max_sentences)

                # Maintain the max sentence count
                while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
                    max_sentences -= 1

                while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
                    nominal_next_article_size -= 1

                if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or test_counts[fidx] > test_median:
                    continue    # skip adding to this file, will come back later if no file can accept unused articles

                current_article_id = sentence_counts[nominal_next_article_size][-1]
                sentence_counts[nominal_next_article_size].pop(-1)

                self.output_test_files[file].append(current_article_id)
                consumed_article_set.add(current_article_id)
                unused_article_set.remove(current_article_id)

            # If unable to place articles a few times, bump up nominal sizes by fraction until articles get placed
            if len(history_remaining) == n_history_remaining:
                history_remaining.pop(0)
            history_remaining.append(len(unused_article_set))

            history_same = True
            for i in range(1, len(history_remaining)):
                history_same = history_same and (history_remaining[i-1] == history_remaining[i])

            if history_same:
                nominal_sentences_per_training_shard += 1
                # nominal_sentences_per_test_shard += 1

            training_counts = []
            test_counts = []
            for shard in self.output_training_files:
                training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))

            for shard in self.output_test_files:
                test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))

            training_median = statistics.median(training_counts)
            test_median = statistics.median(test_counts)

            print('Distributing data over shards:', len(unused_article_set), 'articles remaining.')

        if len(unused_article_set) != 0:
            print('Warning: Some articles did not make it into output files.')

        for shard in self.output_training_files:
            print('Training shard:', self.get_sentences_per_shard(self.output_training_files[shard]))

        for shard in self.output_test_files:
            print('Test shard:', self.get_sentences_per_shard(self.output_test_files[shard]))

        print('End: Distribute Articles Over Shards')

    def write_shards_to_disk(self):
        print('Start: Write Shards to Disk')
        for shard in self.output_training_files:
            self.write_single_shard(shard, self.output_training_files[shard])

        for shard in self.output_test_files:
            self.write_single_shard(shard, self.output_test_files[shard])

        print('End: Write Shards to Disk')

    def write_single_shard(self, shard_name, shard):
        with open(shard_name, mode='w', newline='\n') as f:
            for article_id in shard:
                for line in self.sentences[article_id]:
                    f.write(line + '\n')

                f.write('\n')    # Line break between articles


import nltk

nltk.download('punkt')

class NLTKSegmenter:
    def __init__(self):    # the original declared '__init' and was never called as a constructor hook
        pass

    def segment_string(self, article):
        return nltk.tokenize.sent_tokenize(article)
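A minimal sketch of the flow these classes implement, assuming an already-formatted one-article-per-line input; the paths and shard counts are illustrative (bertPrep.py below drives this with its real defaults of 256 shards):

# Illustrative only: shard one formatted corpus file into 4 training + 1 test shard.
import TextSharding

segmenter = TextSharding.NLTKSegmenter()
sharding = TextSharding.Sharding(
    ['/workspace/bert/data/formatted_one_article_per_line/wikicorpus_en_one_article_per_line.txt'],
    '/workspace/bert/data/sharded/wikicorpus_en/wikicorpus_en',
    n_training_shards=4, n_test_shards=1, fraction_test_set=0.1)
sharding.load_articles()
sharding.segment_articles_into_sentences(segmenter)
sharding.distribute_articles_over_shards()
sharding.write_shards_to_disk()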
58 PyTorch/LanguageModeling/BERT/data/WikiDownloader.py Normal file
@@ -0,0 +1,58 @@
# NVIDIA

import bz2
import os
import urllib.request
import sys

class WikiDownloader:
    def __init__(self, language, save_path):
        self.save_path = save_path + '/wikicorpus_' + language

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        self.language = language
        self.download_urls = {
            'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
            'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
        }

        self.output_files = {
            'en' : 'wikicorpus_en.xml.bz2',
            'zh' : 'wikicorpus_zh.xml.bz2'
        }

    def download(self):
        if self.language in self.download_urls:
            url = self.download_urls[self.language]
            file = self.output_files[self.language]

            print('Downloading:', url)
            if os.path.isfile(self.save_path + '/' + file):
                print('** Download file already exists, skipping download')
            else:
                response = urllib.request.urlopen(url)
                with open(self.save_path + '/' + file, "wb") as handle:
                    handle.write(response.read())

            # Always unzipping since this is relatively fast and will overwrite
            print('Unzipping:', self.output_files[self.language])
            #with open(self.save_path + '/' + file, mode='rb', buffering=131072) as f:
            #    it = iter(lambda: f.read(131072), b'')
            #    self.decompression(it, sys.stdout.buffer)

            # Note: zip.read() holds the entire decompressed dump in memory;
            # the commented-out streaming path above avoids that.
            zip = bz2.BZ2File(self.save_path + '/' + file)
            with open(self.save_path + '/wikicorpus_' + self.language + '.xml', mode='wb', buffering=131072) as out:
                out.write(zip.read())

        else:
            assert False, 'WikiDownloader not implemented for this language yet.'

    def decompression(self, input, output):
        decomp = bz2.BZ2Decompressor()

        for chunk in input:
            dc = decomp.decompress(chunk)
            output.write(dc)
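The decompression helper is only exercised by the commented-out streaming path; if memory pressure from zip.read() on the full Wikipedia dump becomes a problem, wiring it up would look roughly like this (a sketch under that assumption, not the shipped behavior):

# Sketch of the streaming alternative using WikiDownloader.decompression.
from WikiDownloader import WikiDownloader

wd = WikiDownloader('en', '/workspace/bert/data/download')
src = wd.save_path + '/wikicorpus_en.xml.bz2'
dst = wd.save_path + '/wikicorpus_en.xml'
with open(src, mode='rb', buffering=131072) as f, open(dst, mode='wb') as out:
    chunks = iter(lambda: f.read(131072), b'')    # 128 KiB at a time
    wd.decompression(chunks, out)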
35 PyTorch/LanguageModeling/BERT/data/WikicorpusTextFormatting.py Normal file
@@ -0,0 +1,35 @@
# NVIDIA

import glob
import os

class WikicorpusTextFormatting:
    def __init__(self, wiki_path, output_filename, recursive = False):
        self.wiki_path = wiki_path
        self.recursive = recursive
        self.output_filename = output_filename

    # This puts one article per line
    def merge(self):
        with open(self.output_filename, mode='w', newline='\n') as ofile:
            for dirname in glob.glob(self.wiki_path + '/*/', recursive=False):
                for filename in glob.glob(dirname + 'wiki_*', recursive=self.recursive):
                    print(filename)
                    article_lines = []
                    article_open = False

                    with open(filename, mode='r', newline='\n') as file:
                        for line in file:
                            if '<doc id=' in line:
                                article_open = True
                            elif '</doc>' in line:
                                article_open = False
                                for oline in article_lines[1:]:
                                    if oline != '\n':
                                        ofile.write(oline.rstrip() + " ")
                                ofile.write("\n\n")
                                article_lines = []
                            else:
                                if article_open:
                                    article_lines.append(line)
0 PyTorch/LanguageModeling/BERT/data/__init__.py Normal file
345 PyTorch/LanguageModeling/BERT/data/bertPrep.py Normal file
@@ -0,0 +1,345 @@
# NVIDIA

import BookscorpusTextFormatting
import Downloader
import TextSharding
import WikicorpusTextFormatting

import argparse
import itertools
import multiprocessing
import os
import pprint
import subprocess


def main(args):
    working_dir = os.environ['BERT_PREP_WORKING_DIR']

    print('Working Directory:', working_dir)
    print('Action:', args.action)
    print('Dataset Name:', args.dataset)

    if args.input_files:
        args.input_files = args.input_files.split(',')

    directory_structure = {
        'download' : working_dir + '/download',      # Downloaded and decompressed
        'extracted' : working_dir + '/extracted',    # Extracted from whatever the initial format is (e.g., wikiextractor)
        'formatted' : working_dir + '/formatted_one_article_per_line',    # This is the level where all sources should look the same
        'sharded' : working_dir + '/sharded',
        'tfrecord' : working_dir + '/tfrecord',
        'hdf5': working_dir + '/hdf5'
    }

    print('\nDirectory Structure:')
    pp = pprint.PrettyPrinter(indent=2)
    pp.pprint(directory_structure)
    print('')

    if args.action == 'download':
        if not os.path.exists(directory_structure['download']):
            os.makedirs(directory_structure['download'])

        downloader = Downloader.Downloader(args.dataset, directory_structure['download'])
        downloader.download()

    elif args.action == 'text_formatting':
        assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' and args.dataset != 'squad' and args.dataset != 'mrpc', 'Cannot perform text_formatting on pretrained weights'

        if not os.path.exists(directory_structure['extracted']):
            os.makedirs(directory_structure['extracted'])

        if not os.path.exists(directory_structure['formatted']):
            os.makedirs(directory_structure['formatted'])

        if args.dataset == 'bookscorpus':
            books_path = directory_structure['download'] + '/bookscorpus'
            #books_path = directory_structure['download']
            output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt'
            books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True)
            books_formatter.merge()

        elif args.dataset == 'wikicorpus_en':
            if args.skip_wikiextractor == 0:
                path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
                wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
                print('WikiExtractor Command:', wikiextractor_command)
                wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)

            # directory_structure['extracted'] is already rooted at working_dir;
            # prefixing working_dir again (as the original did) doubled the path.
            wiki_path = directory_structure['extracted'] + '/wikicorpus_en'
            output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
            wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
            wiki_formatter.merge()

        elif args.dataset == 'wikicorpus_zh':
            assert False, 'wikicorpus_zh not fully supported at this time. The simplified/traditional Chinese data needs to be translated and properly segmented still, and should work once this step is added.'
            if args.skip_wikiextractor == 0:
                path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
                wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
                print('WikiExtractor Command:', wikiextractor_command)
                wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)

            wiki_path = directory_structure['extracted'] + '/wikicorpus_zh'
            output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'
            wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
            wiki_formatter.merge()

    elif args.action == 'sharding':
        # Note: books+wiki requires user to provide list of input_files (comma-separated with no spaces)
        if args.dataset == 'bookscorpus' or 'wikicorpus' in args.dataset or 'books_wiki' in args.dataset:
            if args.input_files is None:
                if args.dataset == 'bookscorpus':
                    args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt']
                elif args.dataset == 'wikicorpus_en':
                    args.input_files = [directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
                elif args.dataset == 'wikicorpus_zh':
                    args.input_files = [directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt']
                elif args.dataset == 'books_wiki_en_corpus':
                    args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']

            if args.output_file_prefix is None:
                args.output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset

            if not os.path.exists(directory_structure['sharded']):
                os.makedirs(directory_structure['sharded'])

            if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset):
                os.makedirs(directory_structure['sharded'] + '/' + args.dataset)

            # Segmentation is here because all datasets look the same in one article/book/whatever per line format, and
            # it seemed unnecessarily complicated to add an additional preprocessing step to call just for this.
            # Different languages (e.g., Chinese simplified/traditional) may require translation and
            # other packages to be called from here -- just add a conditional branch for those extra steps
            segmenter = TextSharding.NLTKSegmenter()
            sharding = TextSharding.Sharding(args.input_files, args.output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set)

            sharding.load_articles()
            sharding.segment_articles_into_sentences(segmenter)
            sharding.distribute_articles_over_shards()
            sharding.write_shards_to_disk()

        else:
            assert False, 'Unsupported dataset for sharding'

    elif args.action == 'create_tfrecord_files':
        assert False, 'TFrecord creation not supported in this PyTorch model example release.'

        if not os.path.exists(directory_structure['tfrecord']):
            os.makedirs(directory_structure['tfrecord'])

        def create_record_worker(filename_prefix, shard_id, output_format='tfrecord'):
            bert_preprocessing_command = 'python /workspace/bert/create_pretraining_data.py'
            bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
            bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
            bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
            # Parenthesized: the original relied on precedence and silently dropped
            # the flag name when do_lower_case was false.
            bert_preprocessing_command += ' --do_lower_case=' + ('true' if args.do_lower_case else 'false')
            bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
            bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
            bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
            bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
            bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
            bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
            bert_preprocessing_process.communicate()

            # This could be better optimized (fine if all take equal time)
            if shard_id % args.n_processes == 0 and shard_id > 0:
                bert_preprocessing_process.wait()

            # Returned so the caller can track it; the original rebound a local
            # variable inside this function, which never reached the outer scope.
            return bert_preprocessing_process

        for i in range(args.n_training_shards):
            last_process = create_record_worker(args.output_file_prefix + '_training', i)

        last_process.wait()

        for i in range(args.n_test_shards):
            last_process = create_record_worker(args.output_file_prefix + '_test', i)

        last_process.wait()

    elif args.action == 'create_hdf5_files':
        last_process = None

        # The output directory must exist before create_pretraining_data.py writes
        # into it; the original only created directories in the (dead) tfrecord branch.
        if not os.path.exists(directory_structure['hdf5'] + '/' + args.dataset):
            os.makedirs(directory_structure['hdf5'] + '/' + args.dataset)

        def create_record_worker(filename_prefix, shard_id, output_format='hdf5'):
            bert_preprocessing_command = 'python /workspace/bert/create_pretraining_data.py'
            bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
            # Writes into the hdf5 directory; the original pointed at the tfrecord dir.
            bert_preprocessing_command += ' --output_file=' + directory_structure['hdf5'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
            bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
            bert_preprocessing_command += (' --do_lower_case' if args.do_lower_case else '')
            # str() conversions added: these argparse values are ints/floats and
            # cannot be concatenated to a str directly.
            bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
            bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
            bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
            bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
            bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
            bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
            bert_preprocessing_process.communicate()

            # This could be better optimized (fine if all take equal time)
            if shard_id % args.n_processes == 0 and shard_id > 0:
                bert_preprocessing_process.wait()

            return bert_preprocessing_process

        for i in range(args.n_training_shards):
            last_process = create_record_worker(args.output_file_prefix + '_training', i)

        last_process.wait()

        for i in range(args.n_test_shards):
            last_process = create_record_worker(args.output_file_prefix + '_test', i)

        last_process.wait()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Preprocessing Application for Everything BERT-related'
    )

    parser.add_argument(
        '--action',
        type=str,
        help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords',
        choices={
            'download',               # Download and verify md5/sha sums
            'text_formatting',        # Convert into a file that contains one article/book per line
            'sharding',               # Convert previously formatted text into shards containing one sentence per line
            'create_tfrecord_files',  # Turn each shard into a TFrecord with masking and next sentence prediction info
            'create_hdf5_files'       # Turn each shard into a HDF5 file with masking and next sentence prediction info
        }
    )

    parser.add_argument(
        '--dataset',
        type=str,
        help='Specify the dataset to perform --action on',
        choices={
            'bookscorpus',
            'wikicorpus_en',
            'wikicorpus_zh',
            'books_wiki_en_corpus',
            'google_pretrained_weights',
            'nvidia_pretrained_weights',
            'mrpc',
            'squad',
            'all'
        }
    )

    parser.add_argument(
        '--input_files',
        type=str,
        help='Specify the input files in a comma-separated list (no spaces)'
    )

    parser.add_argument(
        '--output_file_prefix',
        type=str,
        help='Specify the naming convention (prefix) of the output files'
    )

    parser.add_argument(
        '--n_training_shards',
        type=int,
        help='Specify the number of training shards to generate',
        default=256
    )

    parser.add_argument(
        '--n_test_shards',
        type=int,
        help='Specify the number of test shards to generate',
        default=256
    )

    parser.add_argument(
        '--fraction_test_set',
        type=float,
        help='Specify the fraction (0..1) of the data to withhold for the test data split (based on number of sequences)',
        default=0.2
    )

    parser.add_argument(
        '--segmentation_method',
        type=str,
        help='Specify your choice of sentence segmentation',
        choices={
            'nltk'
        },
        default='nltk'
    )

    parser.add_argument(
        '--n_processes',
        type=int,
        help='Specify the max number of processes to allow at one time',
        default=4
    )

    parser.add_argument(
        '--random_seed',
        type=int,
        help='Specify the base seed to use for any random number generation',
        default=12345
    )

    parser.add_argument(
        '--dupe_factor',
        type=int,
        help='Specify the duplication factor',
        default=5
    )

    parser.add_argument(
        '--masked_lm_prob',
        type=float,
        help='Specify the probability for masked lm',
        default=0.15
    )

    parser.add_argument(
        '--max_seq_length',
        type=int,
        help='Specify the maximum sequence length',
        default=512
    )

    parser.add_argument(
        '--max_predictions_per_seq',
        type=int,
        help='Specify the maximum number of masked words per sequence',
        default=20
    )

    parser.add_argument(
        '--do_lower_case',
        type=int,
        help='Specify whether it is cased (0) or uncased (1) (any number greater than 0 will be treated as uncased)',
        default=1
    )

    parser.add_argument(
        '--vocab_file',
        type=str,
        help='Specify absolute path to vocab file to use'
    )

    parser.add_argument(
        '--skip_wikiextractor',
        type=int,
        help='Specify whether to skip wikiextractor step 0=False, 1=True',
        default=0
    )

    parser.add_argument(
        '--interactive_json_config_generator',
        type=str,
        help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords'
    )

    args = parser.parse_args()
    main(args)
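Taken together, the actions above form a fixed pipeline. The refreshed create_datasets_from_start.sh further down runs it in exactly this order; as a sketch, the same sequence driven from Python would be roughly:

# Equivalent ordering of bertPrep.py actions (mirrors the shell script below).
import subprocess

steps = [
    '--action download --dataset bookscorpus',
    '--action download --dataset wikicorpus_en',
    '--action download --dataset google_pretrained_weights',
    '--action download --dataset squad',
    '--action text_formatting --dataset bookscorpus',
    '--action text_formatting --dataset wikicorpus_en',
    '--action sharding --dataset books_wiki_en_corpus',
    '--action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 128 --max_predictions_per_seq 20',
    '--action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 512 --max_predictions_per_seq 80',
]
for step in steps:
    subprocess.run('python3 /workspace/bert/data/bertPrep.py ' + step, shell=True, check=True)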
@@ -1,23 +0,0 @@
# NVIDIA

import glob
import os
import argparse

parser = argparse.ArgumentParser(description='Cleaning and merge downloaded bookcorpus files')

parser.add_argument('download_path', type=str)
parser.add_argument('output_file', type=str)

args = parser.parse_args()

download_path = args.download_path
output_file = args.output_file

with open(output_file, "w") as ofile:
    for filename in glob.glob('{}/*.txt'.format(download_path), recursive=True):
        with open(filename, mode='r', encoding="utf-8-sig") as file:
            for line in file:
                if line.strip() != "":
                    ofile.write(line.strip() + " ")
        ofile.write("\n\n")
@@ -1,9 +0,0 @@
#! /bin/bash

# Download books
mkdir -p ./download
python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out ./download --trash-bad-count

# Clean and prep (one book per line)
python3 ./clean_and_merge_text.py ./download bookcorpus.txt
@@ -1,38 +1,27 @@
 #!/bin/bash
 
+# Note: There are several directories created to make it clear what has been performed at each stage of preprocessing. The intermediate files may be useful if you want to further clean/prepare/augment the data for your own applications.
+# NLTK was chosen as the default over spaCy simply due to speed of sentence segmentation on the large files.
+# Download
+python3 /workspace/bert/data/bertPrep.py --action download --dataset bookscorpus
+python3 /workspace/bert/data/bertPrep.py --action download --dataset wikicorpus_en
 
-MERGED_DIR=$1
-args="${*:2}"
+python3 /workspace/bert/data/bertPrep.py --action download --dataset google_pretrained_weights  # Includes vocab
 
-source utils/config.sh
+python3 /workspace/bert/data/bertPrep.py --action download --dataset squad
+#python3 /workspace/bert/data/bertPrep.py --action download --dataset mrpc
 
-mkdir -p ${MERGED_DIR}
-
-corpus_file=${MERGED_DIR}/corpus.txt
-## Shuffle the full corpus texts
-if [ ! -z $3 ]
-then
-    echo "Merging $args"
-    cat $args | sed "/^$/d" | shuf > $corpus_file
-else
-    corpus_file=$2
-fi
+# Properly format the text files
+python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset bookscorpus
+python3 /workspace/bert/data/bertPrep.py --action text_formatting --dataset wikicorpus_en
 
-# Split articles into one-sentence-per-line format for use with BERT scripts
-echo "Applying sentence segmentation to get one sentence per line"
-mkdir -p ${MERGED_DIR}/final_text_file_single
-python3 utils/sentence_segmentation_nltk.py $corpus_file ${MERGED_DIR}/final_text_file_single/corpus.segmented.nltk.txt
-
-## Shard finalized text so that it has a chance of fitting in memory when creating pretraining data into hdf5 (choose appropriate number of shards for distributed training)
-echo "Shard text files - size is approximate to prevent splitting an article across shards"
-mkdir -p ${MERGED_DIR}/final_text_files_sharded
-python3 utils/shard_text_input_file.py ${MERGED_DIR}/final_text_file_single/corpus.segmented.nltk.txt ${MERGED_DIR}/final_text_files_sharded/corpus.segmented.part.
+# Shard the text files (group wiki+books then shard)
+python3 /workspace/bert/data/bertPrep.py --action sharding --dataset books_wiki_en_corpus
 
-# Convert sharded text files into hdf5 that are ready for BERT pretraining
-echo "Creating hdf5 for each text shard"
-mkdir -p ${MERGED_DIR}/hdf5_shards
-export TARGET_DIR=${MERGED_DIR}
-. utils/preprocessing_xargs_wrapper.sh ${N_PROCS_PREPROCESS}
+# Create HDF5 files Phase 1
+python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 128 --max_predictions_per_seq 20
+
+# Create HDF5 files Phase 2
+python3 /workspace/bert/data/bertPrep.py --action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 512 --max_predictions_per_seq 80
@@ -1,29 +0,0 @@
#!/bin/bash


MERGED_DIR=$1   # e.g wikipedia+bookcorpus
INPUTFILES=$2   # directories with hdf5 files separated by comma
NUM_SHARDS=$3

source utils/config.sh


META_DIR=$MERGED_DIR/meta
mkdir -p ${MERGED_DIR}
mkdir -p ${META_DIR}

echo "create mixed dataset ids"
echo "python utils/create_mixed_dataset_ids.py --input_files=${INPUTFILES} --num_output_shards=${NUM_SHARDS} --output_dir=${META_DIR} --random_seed=${SEED}"
python utils/create_mixed_dataset_ids.py --input_files=${INPUTFILES} --num_output_shards=${NUM_SHARDS} --output_dir=${META_DIR} --random_seed=${SEED}


echo "Creating hdf5 for each text shard"
mkdir -p ${MERGED_DIR}/hdf5_shards
echo "create mixed datasets with hdf5 files"
echo "python utils/create_mixed_dataset.py --input_files=${INPUTFILES} --output_dir=${MERGED_DIR}/hdf5_shards --lookup=${META_DIR}/lookup_table.pkl --indices_dir=${META_DIR} --index_range=0-${NUM_SHARDS} --random_seed=${SEED}"
python utils/create_mixed_dataset.py --input_files=${INPUTFILES} --output_dir=${MERGED_DIR}/hdf5_shards --lookup=${META_DIR}/lookup_table.pkl --indices_dir=${META_DIR} --index_range=0-$((NUM_SHARDS-1)) --random_seed=${SEED}


rm -rf ${META_DIR}
@@ -1,24 +0,0 @@
#! /bin/bash

set -e

USE_BERT_LARGE=true
MAX_SEQUENCE_LENGTH=512
MAX_PREDICTIONS_PER_SEQUENCE=80
MASKED_LM_PROB=0.15
SEED=12345
DUPE_FACTOR=5
DO_LOWER_CASE="True"
N_LINES_PER_SHARD_APPROX=396000   # Default=396000 creates 256 shards

N_PROCS_PREPROCESS=4   # Adjust this based on memory requirements and available number of cores

BERT_BASE_DIR="/workspace/bert/vocab/uncased_L-12_H-768_A-12"
BERT_LARGE_DIR="/workspace/bert/vocab/uncased_L-24_H-1024_A-16"

if [ "$USE_BERT_LARGE" = true ] ; then
    VOCAB_FILE="${BERT_LARGE_DIR}/vocab.txt"
else
    VOCAB_FILE="${BERT_BASE_DIR}/vocab.txt"
fi
|
|
@ -1,160 +0,0 @@
|
|||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from io import open
|
||||
import h5py
|
||||
import numpy as np
|
||||
from tqdm import tqdm, trange
|
||||
import random
|
||||
import collections
|
||||
import math
|
||||
import multiprocessing as mp
|
||||
"""
|
||||
mixing hdf5 shards with each other
|
||||
"""
|
||||
|
||||
|
||||
def shard_files(output_files, l_instance_ids, lookuptable, files):
|
||||
|
||||
l_input_ids = []
|
||||
l_input_masks = []
|
||||
l_segment_ids = []
|
||||
l_masked_lm_positions = []
|
||||
l_masked_lm_ids = []
|
||||
l_next_sentence_labels = []
|
||||
|
||||
seq_len = 0
|
||||
pred_len = 0
|
||||
with h5py.File(files[0], 'r') as f:
|
||||
seq_len = f['input_ids'].shape[1]
|
||||
pred_len = f['masked_lm_positions'].shape[1]
|
||||
|
||||
assert(seq_len > 0 and pred_len > 0)
|
||||
for i, output_file in enumerate(output_files):
|
||||
output_length = len(l_instance_ids[i])
|
||||
print("preparing to write {} instances to {}".format(output_length, output_file))
|
||||
input_ids = np.ones([output_length, seq_len], dtype=np.int32)
|
||||
input_masks = np.ones([output_length, seq_len], dtype=np.int8)
|
||||
segment_ids = np.ones([output_length, seq_len], dtype=np.int8)
|
||||
masked_lm_positions = np.ones([output_length, pred_len], dtype=np.int32)
|
||||
masked_lm_ids= np.ones([output_length, pred_len], dtype=np.int32)
|
||||
next_sentence_labels = np.ones(output_length, dtype=np.int8)
|
||||
l_input_ids.append(input_ids)
|
||||
l_input_masks.append(input_masks)
|
||||
l_segment_ids.append(segment_ids)
|
||||
l_masked_lm_positions.append(masked_lm_positions)
|
||||
l_masked_lm_ids.append(masked_lm_ids)
|
||||
l_next_sentence_labels.append(next_sentence_labels)
|
||||
for did, f in enumerate(tqdm(files)):
|
||||
h5_f = h5py.File(f, 'r')
|
||||
f_input_ids = h5_f['input_ids'][:]
|
||||
f_input_masks = h5_f['input_mask'][:]
|
||||
f_segment_ids = h5_f['segment_ids'][:]
|
||||
f_masked_lm_positions = h5_f['masked_lm_positions'][:]
|
||||
f_masked_lm_ids = h5_f['masked_lm_ids'][:]
|
||||
f_next_sentence_labels = h5_f['next_sentence_labels'][:]
|
||||
h5_f.close()
|
||||
for out_i, out_file in enumerate(output_files):
|
||||
instance_ids = l_instance_ids[out_i]
|
||||
for l, idx in enumerate(instance_ids):
|
||||
doc_id, line_id = lookuptable[idx]
|
||||
if doc_id == did:
|
||||
l_input_ids[out_i][l] = f_input_ids[line_id]
|
||||
l_input_masks[out_i][l] = f_input_masks[line_id]
|
||||
l_segment_ids[out_i][l] = f_segment_ids[line_id]
|
||||
l_masked_lm_positions[out_i][l] = f_masked_lm_positions[line_id]
|
||||
l_masked_lm_ids[out_i][l] = f_masked_lm_ids[line_id]
|
||||
l_next_sentence_labels[out_i][l] = f_next_sentence_labels[line_id]
|
||||
for out_i, out_file in enumerate(output_files):
|
||||
output_length = len(l_input_ids[out_i])
|
||||
print("writing {} instances to {}".format(output_length, out_file))
|
||||
with h5py.File(out_file, 'w') as f:
|
||||
f.create_dataset("input_ids", data=l_input_ids[out_i], dtype='i4', compression='gzip')
|
||||
f.create_dataset("input_mask", data=l_input_masks[out_i], dtype='i1', compression='gzip')
|
||||
f.create_dataset("segment_ids", data=l_segment_ids[out_i], dtype='i1', compression='gzip')
|
||||
f.create_dataset("masked_lm_positions", data=l_masked_lm_positions[out_i], dtype='i4', compression='gzip')
|
||||
f.create_dataset("masked_lm_ids", data=l_masked_lm_ids[out_i], dtype='i4', compression='gzip')
|
||||
f.create_dataset("next_sentence_labels", data=l_next_sentence_labels[out_i], dtype='i1', compression='gzip')
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
## Required parameters
|
||||
parser.add_argument("--input_files",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="comma seperated list of file paths, each path can be either file or directory of files")
|
||||
parser.add_argument("--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="directory for output shards")
|
||||
parser.add_argument("--lookup",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="path to lookup table")
|
||||
parser.add_argument("--indices_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="path to shuffled instance indices")
|
||||
parser.add_argument("--index_range",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="index range of output files to be written out, e.g specify '0-100' for writing out 0.hdf5 , ..., 100.hdf5")
|
||||
parser.add_argument('--random_seed',
|
||||
type=int,
|
||||
default=12345,
|
||||
help="random seed for initialization")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
rng = random.Random(args.random_seed)
|
||||
np.random.seed(args.random_seed)
|
||||
|
||||
|
||||
input_paths = args.input_files.strip().split(',')
|
||||
input_paths = [f for f in input_paths if f]
|
||||
|
||||
input_files = []
|
||||
for path in input_paths:
|
||||
if os.path.isfile(path):
|
||||
assert (path.endswith('.hdf5')), "file must be hdf5 file"
|
||||
input_files.append(path)
|
||||
else:
|
||||
assert os.path.isdir(path)
|
||||
hdf5_files = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith('.hdf5')]
|
||||
input_files.extend(hdf5_files)
|
||||
|
||||
input_files.sort()
|
||||
assert(os.path.isdir(args.output_dir))
|
||||
|
||||
|
||||
|
||||
print("loading indices file")
|
||||
start_idx, end_idx = int(args.index_range.split('-')[0]), int(args.index_range.split('-')[1])
|
||||
index_files = []
|
||||
instance_ids = []
|
||||
for i in range(start_idx, end_idx + 1):
|
||||
index_files.append(os.path.join(args.indices_dir, "indices_" + str(i) + ".npy"))
|
||||
instance_ids.append(np.load(index_files[-1]))
|
||||
|
||||
output_files = [os.path.join(args.output_dir, indices_file.split('.')[0].split('_')[-1] + ".hdf5") for indices_file in index_files]
|
||||
print("output_files", output_files)
|
||||
|
||||
print("loading lookup table")
|
||||
lookup_table = np.load(args.lookup)
|
||||
shard_files(output_files, instance_ids, lookup_table, input_files)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,134 +0,0 @@
|
|||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from io import open
|
||||
import h5py
|
||||
import numpy as np
|
||||
from tqdm import tqdm, trange
|
||||
import random
|
||||
import collections
|
||||
import math
|
||||
from tqdm import tqdm
|
||||
import multiprocessing as mp
|
||||
import pickle
|
||||
import json
|
||||
"""
|
||||
mixing hdf5 shards with each other
|
||||
"""
|
||||
def load_and_prepare(input_files, num_shards):
|
||||
|
||||
seq_len = None
|
||||
pred_len = None
|
||||
|
||||
input_lengths = []
|
||||
for input_file in input_files:
|
||||
with h5py.File(input_file, 'r') as f:
|
||||
input_lengths.append(len(f['input_ids']))
|
||||
if seq_len is None:
|
||||
seq_len = f['input_ids'].shape[1]
|
||||
pred_len = f['masked_lm_ids'].shape[1]
|
||||
|
||||
assert (isinstance(seq_len, int) and isinstance(pred_len, int))
|
||||
|
||||
total_instances = sum(input_lengths)
|
||||
n_inst_per_file = math.ceil(total_instances * 1.0 / num_shards)
|
||||
permutation = np.random.permutation(total_instances)
|
||||
|
||||
|
||||
instance_indices = []
|
||||
for i in range(0, num_shards):
|
||||
start_pos = i * n_inst_per_file
|
||||
end_pos = min((i+1) * n_inst_per_file, total_instances)
|
||||
instance_indices.append(permutation[start_pos:end_pos])
|
||||
|
||||
return seq_len, pred_len, input_lengths, instance_indices
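# Worked example (illustrative): with 10 total instances and num_shards=3,
# n_inst_per_file = ceil(10 / 3) = 4, so the shards receive slices of the
# permutation with 4, 4 and 2 instances respectively.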
|
||||
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
## Required parameters
|
||||
parser.add_argument("--input_files",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="comma seperated list of file paths, each path can be either file or directory of hdf5 files")
|
||||
parser.add_argument("--num_output_shards",
|
||||
default=None,
|
||||
type=int,
|
||||
required=True,
|
||||
help="number of shards to be created. shards will be created as even as possible.")
|
||||
parser.add_argument("--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="directory for meta files")
|
||||
parser.add_argument('--random_seed',
|
||||
type=int,
|
||||
default=12345,
|
||||
help="random seed for initialization")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
rng = random.Random(args.random_seed)
|
||||
np.random.seed(args.random_seed)
|
||||
|
||||
|
||||
input_paths = args.input_files.strip().split(',')
|
||||
input_paths = [f for f in input_paths if f]
|
||||
|
||||
input_files = []
|
||||
for path in input_paths:
|
||||
if os.path.isfile(path):
|
||||
assert (path.endswith('.hdf5')), "file must be hdf5 file"
|
||||
input_files.append(path)
|
||||
else:
|
||||
assert os.path.isdir(path)
|
||||
hdf5_files = [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)) and f.endswith('.hdf5')]
|
||||
input_files.extend(hdf5_files)
|
||||
input_files.sort()
|
||||
|
||||
assert(os.path.isdir(args.output_dir))
|
||||
|
||||
print("load and prepare")
|
||||
seq_len, pred_len, input_lengths, output_inst_indices = load_and_prepare(input_files, args.num_output_shards)
|
||||
print("preparing lookup table")
|
||||
total_num_instances = sum(input_lengths)
|
||||
out_2_in = dict()
|
||||
length_so_far = 0
|
||||
for i, l in enumerate(input_lengths):
|
||||
for j in range(l):
|
||||
out_2_in[length_so_far + j] = (i, j)
|
||||
length_so_far += input_lengths[i]
|
||||
|
||||
|
||||
|
||||
output_files = [os.path.join(args.output_dir, "indices_" + str(i) + ".npy") for i in range(args.num_output_shards)]
|
||||
print("save data")
|
||||
|
||||
|
||||
with open(os.path.join(args.output_dir, 'lookup_table.pkl'), 'wb') as f:
|
||||
pickle.dump(out_2_in, f)
|
||||
|
||||
for i, out_file in enumerate(output_files):
|
||||
np.save(out_file, output_inst_indices[i])
|
||||
|
||||
|
||||
meta = {'seq_len': seq_len, 'pred_len': pred_len}
|
||||
|
||||
with open(os.path.join(args.output_dir, 'meta_data.pkl'), 'wb') as f:
|
||||
pickle.dump(meta, f)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,23 +0,0 @@
|
|||
#! /bin/bash
|
||||
|
||||
SHARD_INDEX=${1}
|
||||
INPUT_FILE="${TARGET_DIR}/final_text_files_sharded/corpus.segmented.part.${SHARD_INDEX}.txt"
|
||||
|
||||
source /workspace/bert/data/utils/config.sh
|
||||
|
||||
OUTPUT_DIR=${TARGET_DIR}/hdf5_shards
|
||||
mkdir -p ${OUTPUT_DIR}
|
||||
|
||||
OUTPUT_FILE="${OUTPUT_DIR}/${SHARD_INDEX}.hdf5"
|
||||
|
||||
python /workspace/bert/create_pretraining_data.py \
|
||||
--input_file=${INPUT_FILE} \
|
||||
--output_file=${OUTPUT_FILE} \
|
||||
--vocab_file=${VOCAB_FILE} \
|
||||
--do_lower_case \
|
||||
--max_seq_length=${MAX_SEQUENCE_LENGTH} \
|
||||
--max_predictions_per_seq=${MAX_PREDICTIONS_PER_SEQUENCE} \
|
||||
--masked_lm_prob=${MASKED_LM_PROB} \
|
||||
--random_seed=${SEED} \
|
||||
--dupe_factor=${DUPE_FACTOR}
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
#! /bin/bash
|
||||
|
||||
source /workspace/bert/data/utils/config.sh
|
||||
|
||||
SHARD_COUNT=0
|
||||
rm -rf ${TARGET_DIR}/xarg_list.txt
|
||||
touch ${TARGET_DIR}/xarg_list.txt
|
||||
for file in ${TARGET_DIR}/final_text_files_sharded/*; do
|
||||
echo ${SHARD_COUNT} >> ${TARGET_DIR}/xarg_list.txt
|
||||
SHARD_COUNT=$((SHARD_COUNT+1))
|
||||
done
|
||||
|
||||
xargs -n 1 --max-procs=${N_PROCS_PREPROCESS} --arg-file=${TARGET_DIR}/xarg_list.txt /workspace/bert/data/utils/preprocessing.sh
|
||||
|
||||
rm ${TARGET_DIR}/xarg_list.txt
|
|
@ -1,28 +0,0 @@
|
|||
# NVIDIA
|
||||
|
||||
import argparse
|
||||
import nltk
|
||||
import os
|
||||
|
||||
nltk.download('punkt')
|
||||
|
||||
parser = argparse.ArgumentParser(description='Sentence Segmentation')
|
||||
|
||||
parser.add_argument('input_file', type=str)
|
||||
parser.add_argument('output_file', type=str)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
input_file = args.input_file
|
||||
output_file = args.output_file
|
||||
|
||||
doc_seperator = "\n"
|
||||
|
||||
with open(input_file) as ifile:
|
||||
with open(output_file, "w") as ofile:
|
||||
for line in ifile:
|
||||
if line != "\n":
|
||||
sent_list = nltk.tokenize.sent_tokenize(line)
|
||||
for sent in sent_list:
|
||||
ofile.write(sent + "\n")
|
||||
ofile.write(doc_separator)
|
|
@ -1,47 +0,0 @@
|
|||
# NVIDIA
|
||||
|
||||
import os
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Dataset sharding')
|
||||
|
||||
parser.add_argument('input_file', type=str)
|
||||
parser.add_argument('output_file', type=str)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
input_file = args.input_file
|
||||
output_file = args.output_file
|
||||
|
||||
doc_seperator = "\n"
|
||||
|
||||
line_buffer = []
|
||||
shard_size = 396000 # Approximate, will split at next article break
|
||||
line_counter = 0
|
||||
shard_index = 0
|
||||
|
||||
ifile_lines = 0
|
||||
with open(input_file) as ifile:
|
||||
for line in ifile:
|
||||
ifile_lines += 1
|
||||
|
||||
print("Input file contains", ifile_lines, "lines.")
|
||||
|
||||
iline_counter = 1
|
||||
with open(input_file) as ifile:
|
||||
for line in ifile:
|
||||
if line_counter < shard_size and iline_counter < ifile_lines:
|
||||
line_buffer.append(line)
|
||||
line_counter += 1
|
||||
iline_counter += 1
|
||||
elif line_counter >= shard_size and line != "\n" and iline_counter < ifile_lines:
|
||||
line_buffer.append(line)
|
||||
line_counter += 1
|
||||
iline_counter += 1
|
||||
else:
|
||||
with open(output_file + str(shard_index) + ".txt", "w") as ofile:
|
||||
for oline in line_buffer:
|
||||
ofile.write(oline)
|
||||
line_buffer = []
|
||||
line_counter = 0
|
||||
shard_index += 1
|
|
@ -1,30 +0,0 @@
|
|||
#! /bin/bash
|
||||
|
||||
WIKI_DUMP="https://dumps.wikimedia.org/enwiki/20190320/enwiki-20190320-pages-articles-multistream.xml.bz2"
|
||||
N_PROCS_PREPROCESS=$(nproc) # Adjust this based on memory requirements and available number of cores
|
||||
|
||||
# Download Wikipedia dump file
|
||||
mkdir -p ./download
|
||||
|
||||
# Not using --noclobber since it emits an error if exists (incompatible with bash 'set -e')
|
||||
echo "Downloading Wikidump"
|
||||
if [ ! -f ./download/wikidump.xml.bz2 ]; then
|
||||
wget -O ./download/wikidump.xml.bz2 ${WIKI_DUMP}
|
||||
fi
|
||||
|
||||
# Extract dump
|
||||
echo "Extracting Wikidump"
|
||||
mkdir -p ./raw_data
|
||||
if [ ! -f ./raw_data/wikidump.xml ]; then
|
||||
pv ./download/wikidump.xml.bz2 | bunzip2 -kdc > ./raw_data/wikidump.xml
|
||||
fi
|
||||
|
||||
# Wikiextractor.py - Creates lots of folders/files in "doc format"
|
||||
echo "Running Wikiextractor"
|
||||
mkdir -p ./extracted_articles
|
||||
/workspace/wikiextractor/WikiExtractor.py ./raw_data/wikidump.xml -b 1000M --processes ${N_PROCS_PREPROCESS} -o ./extracted_articles
|
||||
|
||||
# Remove XML Tags and extraneous titles (since they are not sentences)
|
||||
# Also clean to remove lines between paragraphs within article and use space-separated articles
|
||||
echo "Cleaning and formatting files (one article per line)"
|
||||
python3 ./remove_tags_and_clean.py ./extracted_articles ./wikipedia_corpus.txt
|
|
@ -1,39 +0,0 @@
|
|||
# NVIDIA
|
||||
|
||||
import glob
|
||||
import os
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Cleaning and merge downloaded bookcorpus files')
|
||||
|
||||
parser.add_argument('extracted_articles_path', type=str)
|
||||
parser.add_argument('output_file', type=str)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
extracted_articles_path = args.extracted_articles_path
|
||||
output_file = args.output_file
|
||||
|
||||
with open(output_file, "w") as ofile:
|
||||
for dirname in glob.glob('{}/*/'.format(extracted_articles_path), recursive=False):
|
||||
for filename in glob.glob(dirname + 'wiki_*', recursive=True):
|
||||
print(filename)
|
||||
article_lines = []
|
||||
article_open = False
|
||||
|
||||
with open(filename, "r") as file:
|
||||
for line in file:
|
||||
if "<doc id=" in line:
|
||||
article_open = True
|
||||
elif "</doc>" in line:
|
||||
article_open = False
|
||||
for oline in article_lines[1:]:
|
||||
if oline != "\n":
|
||||
ofile.write(oline.rstrip() + " ")
|
||||
ofile.write("\n\n")
|
||||
article_lines = []
|
||||
else:
|
||||
if article_open:
|
||||
article_lines.append(line)
|
||||
|
||||
|
|
@ -1,205 +0,0 @@
|
|||
import types
|
||||
import importlib
|
||||
|
||||
import math
|
||||
import torch
|
||||
|
||||
def warmup_cosine(x, warmup=0.002):
|
||||
if x < warmup:
|
||||
return x/warmup
|
||||
return 0.5 * (1.0 + torch.cos(math.pi * x))
|
||||
|
||||
def warmup_constant(x, warmup=0.002):
|
||||
if x < warmup:
|
||||
return x/warmup
|
||||
return 1.0
|
||||
|
||||
def warmup_linear(x, warmup=0.002):
|
||||
if x < warmup:
|
||||
return x/warmup
|
||||
return 1.0 - x
|
||||
|
||||
SCHEDULES = {
|
||||
'warmup_cosine':warmup_cosine,
|
||||
'warmup_constant':warmup_constant,
|
||||
'warmup_linear':warmup_linear,
|
||||
}
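# Illustrative example: with schedule='warmup_linear', warmup=0.01 and
# progress x = step / t_total = 0.005 (still inside warmup), the multiplier
# is x / warmup = 0.5, so the scheduled lr is half the peak lr; once
# x >= warmup, the multiplier decays as 1.0 - x.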
|
||||
|
||||
class FusedAdamBert(torch.optim.Optimizer):
|
||||
|
||||
"""Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
|
||||
``python setup.py install --cuda_ext --cpp_ext``.
|
||||
It has been proposed in `Adam: A Method for Stochastic Optimization`_.
|
||||
Arguments:
|
||||
params (iterable): iterable of parameters to optimize or dicts defining
|
||||
parameter groups.
|
||||
lr (float, optional): learning rate. (default: 1e-3)
|
||||
betas (Tuple[float, float], optional): coefficients used for computing
|
||||
running averages of gradient and its square. (default: (0.9, 0.999))
|
||||
eps (float, optional): term added to the denominator to improve
|
||||
numerical stability. (default: 1e-8)
|
||||
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
|
||||
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
|
||||
algorithm from the paper `On the Convergence of Adam and Beyond`_
|
||||
(default: False) NOT SUPPORTED in FusedAdam!
|
||||
eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
|
||||
adds eps to the bias-corrected second moment estimate before
|
||||
evaluating square root instead of adding it to the square root of
|
||||
second moment estimate as in the original paper. (default: False)
|
||||
.. _Adam\: A Method for Stochastic Optimization:
|
||||
https://arxiv.org/abs/1412.6980
|
||||
.. _On the Convergence of Adam and Beyond:
|
||||
https://openreview.net/forum?id=ryQu7f-RZ
|
||||
"""
|
||||
|
||||
# def __init__(self, params,
|
||||
# lr=1e-3, bias_correction = True,
|
||||
# betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False,
|
||||
# weight_decay=0., max_grad_norm=0., amsgrad=False):
|
||||
|
||||
def __init__(self, params, lr=1e-3, warmup=-1, t_total=-1, bias_correction=False, betas=(0.9, 0.999), schedule='warmup_linear',
|
||||
eps=1e-6, eps_inside_sqrt = False, weight_decay=0., max_grad_norm=1.0, amsgrad=False):
|
||||
|
||||
|
||||
global fused_adam_cuda
|
||||
fused_adam_cuda = importlib.import_module("fused_adam_cuda")
|
||||
|
||||
if amsgrad:
|
||||
raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
|
||||
defaults = dict(lr=lr, bias_correction=bias_correction,
|
||||
betas=betas, eps=eps, weight_decay=weight_decay,
|
||||
max_grad_norm=max_grad_norm)
|
||||
super(FusedAdamBert, self).__init__(params, defaults)
|
||||
print("LOCAL FUSED ADAM")
|
||||
self.eps_mode = 0 if eps_inside_sqrt else 1
|
||||
self.schedule = schedule
|
||||
self.t_total = t_total
|
||||
self.warmup = warmup
|
||||
|
||||
def get_lr(self):
|
||||
lr = []
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
state = self.state[p]
|
||||
if len(state) == 0:
|
||||
return [0]
|
||||
if group['t_total'] != -1:
|
||||
schedule_fct = SCHEDULES[group['schedule']]
|
||||
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
|
||||
else:
|
||||
lr_scheduled = group['lr']
|
||||
lr.append(lr_scheduled)
|
||||
print("LR {}".format(lr_scheduled))
|
||||
return lr
|
||||
|
||||
def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None):
|
||||
"""Performs a single optimization step.
|
||||
Arguments:
|
||||
closure (callable, optional): A closure that reevaluates the model
|
||||
and returns the loss.
|
||||
grads (list of tensors, optional): weight gradient to use for the
|
||||
optimizer update. If gradients have type torch.half, parameters
|
||||
are expected to be in type torch.float. (default: None)
|
||||
output params (list of tensors, optional): A reduced precision copy
|
||||
of the updated weights written out in addition to the regular
|
||||
updated weights. Have to be of same type as gradients. (default: None)
|
||||
scale (float, optional): factor to divide gradient tensor values
|
||||
by before applying to weights. (default: 1)
|
||||
"""
|
||||
loss = None
|
||||
if closure is not None:
|
||||
loss = closure()
|
||||
|
||||
if grads is None:
|
||||
grads_group = [None]*len(self.param_groups)
|
||||
# backward compatibility
|
||||
# assuming a list/generator of parameter means single group
|
||||
elif isinstance(grads, types.GeneratorType):
|
||||
grads_group = [grads]
|
||||
elif type(grads[0])!=list:
|
||||
grads_group = [grads]
|
||||
else:
|
||||
grads_group = grads
|
||||
|
||||
if output_params is None:
|
||||
output_params_group = [None]*len(self.param_groups)
|
||||
elif isinstance(output_params, types.GeneratorType):
|
||||
output_params_group = [output_params]
|
||||
elif type(output_params[0])!=list:
|
||||
output_params_group = [output_params]
|
||||
else:
|
||||
output_params_group = output_params
|
||||
|
||||
if grad_norms is None:
|
||||
grad_norms = [None]*len(self.param_groups)
|
||||
|
||||
#Compute global norm
|
||||
global_norm = 0.0
|
||||
for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group,
|
||||
output_params_group, grad_norms):
|
||||
global_norm = (global_norm ** 2 + grad_norm ** 2) ** 0.5
|
||||
|
||||
for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group, output_params_group, grad_norms):
|
||||
if grads_this_group is None:
|
||||
grads_this_group = [None]*len(group['params'])
|
||||
if output_params_this_group is None:
|
||||
output_params_this_group = [None]*len(group['params'])
|
||||
|
||||
# compute combined scale factor for this group
|
||||
combined_scale = scale
|
||||
if group['max_grad_norm'] > 0:
|
||||
# norm is in fact norm*scale
|
||||
clip = ((global_norm / scale) + 1e-6) / group['max_grad_norm']
|
||||
if clip > 1:
|
||||
combined_scale = clip * scale
|
||||
|
||||
bias_correction = 1 if group['bias_correction'] else 0
|
||||
|
||||
for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group):
|
||||
#note: p.grad should not ever be set for correct operation of mixed precision optimizer that sometimes sends None gradients
|
||||
if p.grad is None and grad is None:
|
||||
continue
|
||||
if grad is None:
|
||||
grad = p.grad.data
|
||||
if grad.is_sparse:
|
||||
raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')
|
||||
|
||||
state = self.state[p]
|
||||
|
||||
# State initialization
|
||||
if len(state) == 0:
|
||||
state['step'] = 0
|
||||
# Exponential moving average of gradient values
|
||||
state['exp_avg'] = torch.zeros_like(p.data)
|
||||
# Exponential moving average of squared gradient values
|
||||
state['exp_avg_sq'] = torch.zeros_like(p.data)
|
||||
|
||||
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
|
||||
beta1, beta2 = group['betas']
|
||||
|
||||
state['step'] += 1
|
||||
|
||||
out_p = torch.tensor([], dtype = torch.float) if output_param is None else output_param
|
||||
#Changes sharath
|
||||
|
||||
schedule_fct = SCHEDULES[self.schedule]
|
||||
#schedule_fct(state['step']/self.t_total, self.warmup)
|
||||
#step_lr = group['lr'] * schedule_fct(state['step']/self.t_total, self.warmup)
|
||||
#step_lr = group['lr'] * scale#schedule_fct(state['step']/self.t_total, self.warmup)# schedule_fct(state['step']/group['t_total'], group['warmup'])
|
||||
#print(scale, step_lr)
|
||||
#print(group['lr'])
|
||||
fused_adam_cuda.adam(p.data,
|
||||
out_p,
|
||||
exp_avg,
|
||||
exp_avg_sq,
|
||||
grad,
|
||||
group['lr'], #step_lr,#group['lr'],
|
||||
beta1,
|
||||
beta2,
|
||||
group['eps'],
|
||||
combined_scale,
|
||||
state['step'],
|
||||
self.eps_mode,
|
||||
bias_correction,
|
||||
group['weight_decay'])
|
||||
return loss
|
|
@ -35,6 +35,11 @@ from torch.utils import checkpoint
|
|||
|
||||
from file_utils import cached_path
|
||||
|
||||
from torch.nn import Module
|
||||
from torch.nn.parameter import Parameter
|
||||
import torch.nn.functional as F
|
||||
import torch.nn.init as init
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
|
@ -111,14 +116,27 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
|
|||
return model
|
||||
|
||||
|
||||
@torch.jit.script
|
||||
def f_gelu(x):
|
||||
return x * 0.5 * (1.0 + torch.erf(x / 1.41421))
|
||||
|
||||
@torch.jit.script
|
||||
def bias_gelu(bias, y):
|
||||
x = bias + y
|
||||
return x * 0.5 * (1.0 + torch.erf(x / 1.41421))
|
||||
|
||||
@torch.jit.script
|
||||
def bias_tanh(bias, y):
|
||||
x = bias + y
|
||||
return torch.tanh(x)
|
||||
|
||||
def gelu(x):
|
||||
"""Implementation of the gelu activation function.
|
||||
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
||||
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
Also see https://arxiv.org/abs/1606.08415
|
||||
"""
|
||||
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
||||
|
||||
return f_gelu(x)
|
||||
|
||||
def swish(x):
|
||||
return x * torch.sigmoid(x)
|
||||
|
@ -126,6 +144,53 @@ def swish(x):
|
|||
|
||||
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
|
||||
|
||||
class LinearActivation(Module):
|
||||
r"""Fused Linear and activation Module.
|
||||
"""
|
||||
__constants__ = ['bias']
|
||||
|
||||
def __init__(self, in_features, out_features, act='gelu', bias=True):
|
||||
super(LinearActivation, self).__init__()
|
||||
self.in_features = in_features
|
||||
self.out_features = out_features
|
||||
self.fused_gelu = False
|
||||
self.fused_tanh = False
|
||||
if isinstance(act, str) or (sys.version_info[0] == 2 and isinstance(act, unicode)):
|
||||
if bias and act == 'gelu':
|
||||
self.fused_gelu = True
|
||||
elif bias and act == 'tanh':
|
||||
self.fused_tanh = True
|
||||
else:
|
||||
self.act_fn = ACT2FN[act]
|
||||
else:
|
||||
self.act_fn = act
|
||||
self.weight = Parameter(torch.Tensor(out_features, in_features))
|
||||
if bias:
|
||||
self.bias = Parameter(torch.Tensor(out_features))
|
||||
else:
|
||||
self.register_parameter('bias', None)
|
||||
self.reset_parameters()
|
||||
|
||||
def reset_parameters(self):
|
||||
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
|
||||
if self.bias is not None:
|
||||
fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
|
||||
bound = 1 / math.sqrt(fan_in)
|
||||
init.uniform_(self.bias, -bound, bound)
|
||||
|
||||
def forward(self, input):
|
||||
if self.fused_gelu:
|
||||
return bias_gelu(self.bias, F.linear(input, self.weight, None))
|
||||
elif self.fused_tanh:
|
||||
return bias_tanh(self.bias, F.linear(input, self.weight, None))
|
||||
else:
|
||||
return self.act_fn(F.linear(input, self.weight, self.bias))
|
||||
|
||||
def extra_repr(self):
|
||||
return 'in_features={}, out_features={}, bias={}'.format(
|
||||
self.in_features, self.out_features, self.bias is not None
|
||||
)
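# Minimal usage sketch (shapes are illustrative): with bias=True and
# act='gelu' the fused bias_gelu path is taken in forward():
#   dense_act = LinearActivation(768, 3072, act='gelu')
#   out = dense_act(torch.randn(8, 128, 768))  # -> shape [8, 128, 3072]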
|
||||
|
||||
|
||||
class BertConfig(object):
|
||||
"""Configuration class to store the configuration of a `BertModel`.
|
||||
|
@ -216,7 +281,11 @@ class BertConfig(object):
|
|||
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
|
||||
|
||||
try:
|
||||
from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
|
||||
import apex
|
||||
#apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm')
|
||||
import apex.normalization
|
||||
#apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward')
|
||||
BertLayerNorm = apex.normalization.FusedLayerNorm
|
||||
except ImportError:
|
||||
print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")
|
||||
class BertLayerNorm(nn.Module):
|
||||
|
@ -281,29 +350,35 @@ class BertSelfAttention(nn.Module):
|
|||
self.value = nn.Linear(config.hidden_size, self.all_head_size)
|
||||
|
||||
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
||||
self.softmax = nn.Softmax(dim=-1)
|
||||
|
||||
def transpose_for_scores(self, x):
|
||||
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
||||
x = x.view(*new_x_shape)
|
||||
return x.permute(0, 2, 1, 3)
|
||||
|
||||
def transpose_key_for_scores(self, x):
|
||||
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
|
||||
x = x.view(*new_x_shape)
|
||||
return x.permute(0, 2, 3, 1)
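# With the key pre-transposed to [batch, heads, head_size, seq_len], the
# attention matmul below can consume it directly instead of calling
# key_layer.transpose(-1, -2) on every forward pass.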
|
||||
|
||||
def forward(self, hidden_states, attention_mask):
|
||||
mixed_query_layer = self.query(hidden_states)
|
||||
mixed_key_layer = self.key(hidden_states)
|
||||
mixed_value_layer = self.value(hidden_states)
|
||||
|
||||
query_layer = self.transpose_for_scores(mixed_query_layer)
|
||||
key_layer = self.transpose_for_scores(mixed_key_layer)
|
||||
key_layer = self.transpose_key_for_scores(mixed_key_layer)
|
||||
value_layer = self.transpose_for_scores(mixed_value_layer)
|
||||
|
||||
# Take the dot product between "query" and "key" to get the raw attention scores.
|
||||
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
||||
attention_scores = torch.matmul(query_layer, key_layer)
|
||||
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
||||
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
||||
attention_scores = attention_scores + attention_mask
|
||||
|
||||
# Normalize the attention scores to probabilities.
|
||||
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
||||
attention_probs = self.softmax(attention_scores)
|
||||
|
||||
# This is actually dropping out entire tokens to attend to, which might
|
||||
# seem a bit unusual, but is taken from the original Transformer paper.
|
||||
|
@ -345,15 +420,10 @@ class BertAttention(nn.Module):
|
|||
class BertIntermediate(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(BertIntermediate, self).__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
||||
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
|
||||
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.intermediate_act_fn = config.hidden_act
|
||||
self.dense_act = LinearActivation(config.hidden_size, config.intermediate_size, act=config.hidden_act)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.intermediate_act_fn(hidden_states)
|
||||
hidden_states = self.dense_act(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
|
@ -449,31 +519,24 @@ class BertEncoder(nn.Module):
|
|||
class BertPooler(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(BertPooler, self).__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
self.activation = nn.Tanh()
|
||||
self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act="tanh")
|
||||
|
||||
def forward(self, hidden_states):
|
||||
# We "pool" the model by simply taking the hidden state corresponding
|
||||
# to the first token.
|
||||
first_token_tensor = hidden_states[:, 0]
|
||||
pooled_output = self.dense(first_token_tensor)
|
||||
pooled_output = self.activation(pooled_output)
|
||||
pooled_output = self.dense_act(first_token_tensor)
|
||||
return pooled_output
|
||||
|
||||
|
||||
class BertPredictionHeadTransform(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(BertPredictionHeadTransform, self).__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
|
||||
self.transform_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.transform_act_fn = config.hidden_act
|
||||
self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act=config.hidden_act)
|
||||
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.transform_act_fn(hidden_states)
|
||||
hidden_states = self.dense_act(hidden_states)
|
||||
hidden_states = self.LayerNorm(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
@ -493,7 +556,9 @@ class BertLMPredictionHead(nn.Module):
|
|||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
torch.cuda.nvtx.range_push("decoder input.size() = {}, weight.size() = {}".format(hidden_states.size(), self.decoder.weight.size()))
|
||||
hidden_states = self.decoder(hidden_states) + self.bias
|
||||
torch.cuda.nvtx.range_pop()
|
||||
return hidden_states
|
||||
|
||||
|
||||
|
@ -1247,3 +1312,4 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
|||
return total_loss
|
||||
else:
|
||||
return start_logits, end_logits
|
||||
|
||||
|
|
|
@ -21,6 +21,13 @@ from torch.optim.optimizer import required
|
|||
from torch.nn.utils import clip_grad_norm_
|
||||
#from fused_adam_local import FusedAdam
|
||||
from apex.optimizers import FusedAdam
|
||||
from apex.multi_tensor_apply import multi_tensor_applier
|
||||
import amp_C
|
||||
multi_tensor_l2norm = amp_C.multi_tensor_l2norm
|
||||
lamb_compute_update = amp_C.multi_tensor_lamb_stage1_cuda
|
||||
lamb_apply_update = amp_C.multi_tensor_lamb_stage2_cuda
|
||||
scale = amp_C.multi_tensor_scale
|
||||
|
||||
|
||||
def warmup_cosine(x, warmup=0.002):
|
||||
if x < warmup:
|
||||
|
@ -35,17 +42,235 @@ def warmup_constant(x, warmup=0.002):
|
|||
def warmup_linear(x, warmup=0.002):
|
||||
if x < warmup:
|
||||
return x/warmup
|
||||
# return (1.0 - x)
|
||||
return max((x - 1. )/ (warmup - 1.), 0.)
|
||||
|
||||
def warmup_poly(x, warmup=0.002, degree=0.5):
|
||||
if x < warmup:
|
||||
return x/warmup
|
||||
return (1.0 - x)**degree
|
||||
|
||||
return max((x - 1. )/ (warmup - 1.), 0.)
|
||||
|
||||
SCHEDULES = {
|
||||
'warmup_cosine':warmup_cosine,
|
||||
'warmup_constant':warmup_constant,
|
||||
'warmup_linear':warmup_linear,
|
||||
'warmup_poly':warmup_poly,
|
||||
}
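# Illustrative example: 'warmup_poly' ramps linearly to the peak lr during
# the warmup fraction of training, then decays as (1.0 - x)**0.5 with the
# default degree; at x = 0.75 the multiplier is 0.25**0.5 = 0.5.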
|
||||
|
||||
|
||||
class BertLAMB(Optimizer):
|
||||
"""Implements BERT version of LAMB algorithm.
|
||||
Params:
|
||||
lr: learning rate
|
||||
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
|
||||
t_total: total number of training steps for the learning
|
||||
rate schedule, -1 means constant learning rate. Default: -1
|
||||
schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
|
||||
b1: LAMB's b1. Default: 0.9
|
||||
b2: LAMB's b2. Default: 0.999
|
||||
e: LAMB's epsilon. Default: 1e-6
|
||||
weight_decay: Weight decay. Default: 0.01
|
||||
max_grad_norm: Maximum global norm for the gradients. Default: 1.0
|
||||
"""
|
||||
def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_poly',
|
||||
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
|
||||
max_grad_norm=1.0):
|
||||
if lr is not required and lr < 0.0:
|
||||
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
|
||||
if schedule not in SCHEDULES:
|
||||
raise ValueError("Invalid schedule parameter: {}".format(schedule))
|
||||
if not 0.0 <= warmup < 1.0 and not warmup == -1:
|
||||
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
|
||||
if not 0.0 <= b1 < 1.0:
|
||||
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
|
||||
if not 0.0 <= b2 < 1.0:
|
||||
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
|
||||
if not e >= 0.0:
|
||||
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
|
||||
defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
|
||||
b1=b1, b2=b2, e=e, weight_decay=weight_decay,
|
||||
max_grad_norm=max_grad_norm)
|
||||
super(BertLAMB, self).__init__(params, defaults)
|
||||
self.step_count = 0
|
||||
self.b1 = b1
|
||||
self.b2 = b2
|
||||
self.epsilon = e
|
||||
self.max_global_grad_norm = max_grad_norm
|
||||
self.learning_rate = lr
|
||||
self.schedule = schedule
|
||||
self.warmup = warmup
|
||||
self.max_steps = t_total
|
||||
self.updates_created=False
|
||||
|
||||
def get_lr(self):
|
||||
lr = []
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
state = self.state[p]
|
||||
if len(state) == 0:
|
||||
return [0]
|
||||
if group['t_total'] != -1:
|
||||
schedule_fct = SCHEDULES[group['schedule']]
|
||||
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
|
||||
else:
|
||||
lr_scheduled = group['lr']
|
||||
lr.append(lr_scheduled)
|
||||
return lr
|
||||
|
||||
def apply_gradients(self, dummy_overflow_buf, lr_scheduled, per_param_decay, grad_list, param_list, momentum, velocity, update):
|
||||
# Compute global gradient norm
|
||||
global_grad_norm = multi_tensor_applier(
|
||||
multi_tensor_l2norm,
|
||||
dummy_overflow_buf,
|
||||
[grad_list],
|
||||
False)[0].item()
|
||||
|
||||
# Compute per parameter norm
|
||||
param_norms = multi_tensor_applier(
|
||||
multi_tensor_l2norm,
|
||||
dummy_overflow_buf,
|
||||
[param_list],
|
||||
True)[1]
|
||||
|
||||
# Compute LAMB update
|
||||
multi_tensor_applier(
|
||||
lamb_compute_update,
|
||||
dummy_overflow_buf,
|
||||
[grad_list, param_list, momentum, velocity, update],
|
||||
torch.cuda.FloatTensor(per_param_decay),
|
||||
self.step_count,
|
||||
self.b1,
|
||||
self.b2,
|
||||
self.epsilon,
|
||||
global_grad_norm,
|
||||
self.max_global_grad_norm,
|
||||
)
|
||||
|
||||
# Compute per-parameter update norm
|
||||
update_norms = multi_tensor_applier(
|
||||
multi_tensor_l2norm,
|
||||
dummy_overflow_buf,
|
||||
[update],
|
||||
True)[1]
|
||||
|
||||
# Apply LAMB update on parameters
|
||||
multi_tensor_applier(
|
||||
lamb_apply_update,
|
||||
dummy_overflow_buf,
|
||||
[param_list, update],
|
||||
param_norms,
|
||||
update_norms,
|
||||
lr_scheduled,
|
||||
)
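# Taken together: stage 1 (lamb_compute_update) forms the Adam-style update
# with weight decay, and stage 2 (lamb_apply_update) rescales each tensor's
# update by its trust ratio (roughly param_norm / update_norm) before
# applying lr_scheduled to the parameters.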
|
||||
|
||||
def step(self, closure=None):
|
||||
"""Performs a single optimization step.
|
||||
|
||||
Arguments:
|
||||
closure (callable, optional): A closure that reevaluates the model
|
||||
and returns the loss.
|
||||
"""
|
||||
loss = None
|
||||
if closure is not None:
|
||||
loss = closure()
|
||||
check = 1  # torch.norm(all_grads, 2)
|
||||
|
||||
grad_list = []
|
||||
param_list = []
|
||||
per_param_decay = []
|
||||
momentum = []
|
||||
velocity = []
|
||||
|
||||
fp16_grad_list = []
|
||||
fp16_from_fp32_param_list = []
|
||||
fp32_param_list = []
|
||||
fp16_per_param_decay = []
|
||||
fp16_momentum = []
|
||||
fp16_velocity = []
|
||||
|
||||
if not self.updates_created:
|
||||
self.update = []
|
||||
self.fp16_update = []
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
if p.grad is None:
|
||||
continue
|
||||
grad = p.grad.data
|
||||
if grad.is_sparse:
|
||||
raise RuntimeError('LAMB does not support sparse gradients, please consider SparseAdam instead')
|
||||
|
||||
state = self.state[p]
|
||||
|
||||
# State initialization
|
||||
if len(state) == 0:
|
||||
# Keep step here for compatibility with earlier resume from checkpoint
|
||||
state['step'] = 0
|
||||
# Exponential moving average of gradient values
|
||||
state['momentum'] = torch.zeros_like(p.data, dtype=torch.float32)
|
||||
# Exponential moving average of squared gradient values
|
||||
state['velocity'] = torch.zeros_like(p.data, dtype=torch.float32)
|
||||
# fp32 master weights
|
||||
if 'master_param' not in state.keys() and p.type() == 'torch.cuda.HalfTensor':
|
||||
state['master_param'] = p.detach().clone().float()
|
||||
|
||||
# ensure these 3 are float tensors
|
||||
if state['momentum'].type() != 'torch.cuda.FloatTensor':
|
||||
state['momentum'] = state['momentum'].float()
|
||||
if state['velocity'].type() != 'torch.cuda.FloatTensor':
|
||||
state['velocity'] = state['velocity'].float()
|
||||
if 'master_param' in state.keys() and state['master_param'].type() != 'torch.cuda.FloatTensor':
|
||||
state['master_param'] = state['master_param'].float()
|
||||
|
||||
# Append all params, gradients, decays, velocity, momentum and updates to a list
|
||||
if p.type() == 'torch.cuda.HalfTensor':
|
||||
fp16_grad_list.append(grad)
|
||||
fp32_param_list.append(state['master_param'])
|
||||
fp16_from_fp32_param_list.append(p.data)
|
||||
fp16_per_param_decay.append(group['weight_decay'])
|
||||
fp16_momentum.append(state["momentum"])
|
||||
fp16_velocity.append(state["velocity"])
|
||||
if not self.updates_created:
|
||||
#self.fp16_update.append(torch.empty_like(p.data, dtype=torch.float32))
|
||||
# Use fp16 weights as temporary buffer for update term.
|
||||
# This is safe because fp16 weights are overwritten after apply_gradients
|
||||
self.fp16_update.append(p.data)
|
||||
else:
|
||||
grad_list.append(grad)
|
||||
param_list.append(p.data)
|
||||
per_param_decay.append(group['weight_decay'])
|
||||
momentum.append(state["momentum"])
|
||||
velocity.append(state["velocity"])
|
||||
if not self.updates_created:
|
||||
self.update.append(torch.empty_like(p.data))
|
||||
state['step'] += 1
|
||||
self.updates_created=True
|
||||
update = self.update
|
||||
fp16_update = self.fp16_update
|
||||
|
||||
self.step_count = state['step']
|
||||
# Calculate learning rate from input schedule
|
||||
# if self.max_steps != -1:
|
||||
schedule_fct = SCHEDULES[self.schedule]
|
||||
lr_scheduled = self.learning_rate * schedule_fct(self.step_count / self.max_steps, self.warmup)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print("Step {} LR {}".format(self.step_count, lr_scheduled))
|
||||
# else:
|
||||
# lr_scheduled = self.learning_rate
|
||||
|
||||
overflow_buf = torch.cuda.IntTensor([0])
|
||||
|
||||
if len(grad_list) > 0:
|
||||
self.apply_gradients(overflow_buf, lr_scheduled, per_param_decay, grad_list, param_list, momentum, velocity, update)
|
||||
if len(fp16_grad_list) > 0:
|
||||
self.apply_gradients(overflow_buf, lr_scheduled, fp16_per_param_decay, fp16_grad_list, fp32_param_list, fp16_momentum, fp16_velocity, fp16_update)
|
||||
multi_tensor_applier(
|
||||
scale,
|
||||
overflow_buf,
|
||||
[fp32_param_list, fp16_from_fp32_param_list],
|
||||
1.)
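# The multi_tensor_scale call above (scale factor 1.0) copies the updated
# fp32 master weights back into the fp16 model parameters, restoring the
# fp16 buffers that were reused as temporary update storage.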
|
||||
|
||||
return loss
|
||||
|
||||
class BertAdam(Optimizer):
|
||||
"""Implements BERT version of Adam algorithm with weight decay fix.
|
||||
Params:
|
||||
|
@ -165,54 +390,3 @@ class BertAdam(Optimizer):
|
|||
|
||||
return loss
|
||||
|
||||
# =======================================================================
|
||||
class BertAdam_FP16(FusedAdam):
|
||||
"""Implements BERT version of Adam algorithm with weight decay fix.
|
||||
Params:
|
||||
lr: learning rate
|
||||
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
|
||||
t_total: total number of training steps for the learning
|
||||
rate schedule, -1 means constant learning rate. Default: -1
|
||||
schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
|
||||
b1: Adam's b1. Default: 0.9
|
||||
b2: Adam's b2. Default: 0.999
|
||||
e: Adam's epsilon. Default: 1e-6
|
||||
weight_decay: Weight decay. Default: 0.01
|
||||
max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
|
||||
"""
|
||||
def __init__(self, params, lr, warmup=-1, t_total=-1, bias_correction=False, schedule='warmup_linear',
|
||||
b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
|
||||
max_grad_norm=1.0):
|
||||
if not lr >= 0.0:
|
||||
raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
|
||||
if schedule not in SCHEDULES:
|
||||
raise ValueError("Invalid schedule parameter: {}".format(schedule))
|
||||
if not 0.0 <= warmup < 1.0 and not warmup == -1:
|
||||
raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
|
||||
if not 0.0 <= b1 < 1.0:
|
||||
raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
|
||||
if not 0.0 <= b2 < 1.0:
|
||||
raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
|
||||
if not e >= 0.0:
|
||||
raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
|
||||
# defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
|
||||
# b1=b1, b2=b2, e=e, weight_decay=weight_decay,
|
||||
# max_grad_norm=max_grad_norm)
|
||||
super(BertAdam_FP16, self).__init__(params, lr=lr, bias_correction=bias_correction, betas=(b1, b2), eps=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm)#defaults)
|
||||
|
||||
def get_lr(self):
|
||||
lr = []
|
||||
for group in self.param_groups:
|
||||
for p in group['params']:
|
||||
state = self.state[p]
|
||||
if len(state) == 0:
|
||||
print("returning", state)
|
||||
return [0]
|
||||
if group['t_total'] != -1:
|
||||
schedule_fct = SCHEDULES[group['schedule']]
|
||||
lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
|
||||
else:
|
||||
lr_scheduled = group['lr']
|
||||
lr.append(lr_scheduled)
|
||||
print("LR {}".format(lr_scheduled))
|
||||
return lr
|
||||
|
|
203
PyTorch/LanguageModeling/BERT/run.sub
Normal file
|
@ -0,0 +1,203 @@
|
|||
#!/bin/bash
|
||||
#SBATCH -p batch # partition
|
||||
#SBATCH -N 1 # number of nodes
|
||||
#SBATCH -t 1:30:00 # wall time
|
||||
#SBATCH -J "bert_pyt_lamb" # job name
|
||||
#SBATCH --exclusive # exclusive node access
|
||||
#SBATCH --mem=0 # all mem avail
|
||||
#SBATCH --ntasks-per-node=16 # max 8 tasks per machine (one task per gpu) - Exception for pytorch// srun launch with -n1
|
||||
#SBATCH --threads-per-core=2 # HT is on
|
||||
#SBATCH --cpus-per-task=40 # Not used yet (to reach perf pytorch might need overcommit)
|
||||
#SBATCH --overcommit # Needed for pytorch
|
||||
#SBATCH --mail-user=sharatht@nvidia.com
|
||||
#SBATCH --mail-type=END
|
||||
##SBATCH --deadline=$(date -d '+72 hours' '+%FT%T')
|
||||
|
||||
##SBATCH --reservation mlperf # reservation name
|
||||
##SBATCH --output=./logs/pytorch_%j.out
|
||||
##SBATCH --exclude=sc-sdgx-[394,397] # targeting nodes with mask until the constraints are implemented
|
||||
##SBATCH -w sc-sdgx-[377-388],sc-sdgx-[394-408] # avail pod12
|
||||
##SBATCH -C pod14 # constraint (not implemented yet)
|
||||
##SBATCH --ntasks-per-socket=4 # Not used (our slurm does not have sockets defined)
|
||||
|
||||
|
||||
## Your data, your container and its volumes
|
||||
set -x
|
||||
DATESTAMP=${DATESTAMP:-`date +'%y-%m-%d-%H-%M-%S-%N'`}
|
||||
BENCHMARK=${BENCHMARK:-"bert"}
|
||||
FRAMEWORK=${FRAMEWORK:-"pytorch"}
|
||||
BENCHMARK_NAME=${BENCHMARK_NAME:-"bert"}
|
||||
JOBNAME=${JOBNAME:-"bert_lamb_phase1_96n_wiki+books_only_fast_lamb_O1_run_1337"}
|
||||
#.$DATESTAMP
|
||||
# Create results directory
|
||||
|
||||
#DATADIR=${DATADIR:-"/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"}
|
||||
#DATADIR_PHASE2=${DATADIR_PHASE2:-"/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_512_pred_80_dupe_5/training"}
|
||||
DATADIR="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_128_pred_20_dupe_5/training"
|
||||
DATADIR_PHASE2="/raid/datasets/bert/hdf5/shard_1472_test_split_10/seq_512_pred_80_dupe_5/training"
|
||||
#BOOKS_DIR=/raid/datasets/seq_512_pred_80_dupe_5_shard_256
|
||||
VOCAB_PATH=${VOCAB_PATH:-"/raid/datasets/bert_vocab/vocab.txt"}
|
||||
DATASET=${DATASET:-"coco/coco-2014"}
|
||||
CODEDIR=${CODEDIR:="bert_pyt/tree/sharatht/fast_lamb_ci_runs"}
|
||||
CONT=${CONT:-"gitlab-master.nvidia.com/dl/JoC/bert_pyt:bert_pyt"}
|
||||
LOGDIR=${LOGDIR:-"/raid/results/$BENCHMARK"}
|
||||
NEXP=${NEXP:-1}
|
||||
SEED=${SEED:-$(od -A n -t d -N 3 /dev/urandom)}
|
||||
#CHECKPOINT_DIR=${CHECKPOINT_DIR:-"/gpfs/fs1/svcnvdlfw/7108495/results/output"}
|
||||
CHECKPOINT_DIR="/gpfs/fs1/svcnvdlfw/7588296/results/output"
|
||||
## Load system-specific parameters for benchmark
|
||||
DGXSYSTEM=${DGXSYSTEM:-"DGX1"}
|
||||
if [[ ! -f "config_${DGXSYSTEM}.sh" ]]; then
|
||||
echo "Unknown system, assuming DGX1"
|
||||
DGXSYSTEM="DGX1"
|
||||
fi
|
||||
source config_${DGXSYSTEM}.sh
|
||||
|
||||
IBDEVICES=${IBDEVICES:-$DGXIBDEVICES}
|
||||
|
||||
## Check whether we are running in a slurm env
|
||||
INSLURM=1
|
||||
if [[ -z "$SLURM_JOB_ID" ]]; then
|
||||
INSLURM=0
|
||||
export SLURM_JOB_ID="${DATESTAMP}"
|
||||
export SLURM_NNODES=1
|
||||
else
|
||||
env | grep SLURM
|
||||
fi
|
||||
if [[ -z "SLURM_JOB_ID" || $SLURM_NNODES -eq 1 ]]; then
|
||||
# don't need IB if not multi-node
|
||||
export IBDEVICES=""
|
||||
fi
|
||||
|
||||
# Create results directory
|
||||
LOGFILE_BASE="${LOGDIR}/${DATESTAMP}"
|
||||
mkdir -p $(dirname "${LOGFILE_BASE}")
|
||||
|
||||
## Docker params
|
||||
CONTVOLS="-v $DATADIR:/workspace/data -v $LOGDIR:/results -v $CHECKPOINT_DIR:/checkpoints -v $DATADIR_PHASE2:/workspace/data_phase2"
|
||||
NV_GPU="${NVIDIA_VISIBLE_DEVICES:-$(seq 0 $((${SLURM_NTASKS_PER_NODE:-${DGXNGPU}}-1)) | tr '\n' ',' | sed 's/,$//')}"
|
||||
DOCKEREXEC="env NV_GPU=${NV_GPU} nvidia-docker run --init --rm --net=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --name=cont_${SLURM_JOB_ID} --security-opt seccomp=unconfined $IBDEVICES"
|
||||
|
||||
## Get version of the OS
|
||||
export MLPERF_HOST_OS="$(cat /etc/issue | head -1 | cut -f1-3 -d" ") / $(cat /etc/dgx-release | grep -E "DGX_PRETTY_NAME|DGX_OTA_VERSION" |cut -f2 -d= |cut -f2 -d '"' |paste -sd' ')"
|
||||
|
||||
## Prep run and launch
|
||||
MASTER_IP=`getent hosts \`hostname\` | cut -d ' ' -f1`
|
||||
PORT=$((4242 + RANDOM%1000))
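# The master port is drawn from [4242, 5241] so that concurrent jobs on the
# same host are unlikely to collide.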
|
||||
SSH=''
|
||||
SRUN=''
|
||||
if [[ $INSLURM -eq 1 ]]; then
|
||||
hosts=( `scontrol show hostname |tr "\n" " "` )
|
||||
SSH='ssh -q -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no $hostn'
|
||||
SRUN='srun --mem=0 -N 1 -n 1 -w $hostn'
|
||||
else
|
||||
hosts=( `hostname` )
|
||||
fi
|
||||
|
||||
# Pull latest image
|
||||
if [[ "${PULL}" != "0" ]]; then
|
||||
DOCKERPULL="docker pull $CONT"
|
||||
pids=();
|
||||
for hostn in ${hosts[@]}; do
|
||||
timeout -k 600s 600s \
|
||||
$(eval echo $SRUN) $DOCKERPULL &
|
||||
pids+=($!);
|
||||
done
|
||||
wait "${pids[@]}"
|
||||
success=$? ; if [ $success -ne 0 ]; then echo "ERR: Image pull failed."; exit $success ; fi
|
||||
fi
|
||||
|
||||
# Test the base container launch
|
||||
pids=();
|
||||
for hostn in ${hosts[@]}; do
|
||||
timeout -k 600s 600s \
|
||||
$(eval echo $SRUN) $DOCKEREXEC $CONT python -c 'import torch; print("Found",torch.cuda.device_count(),"CUDA GPUs")' &
|
||||
pids+=($!);
|
||||
done
|
||||
wait "${pids[@]}"
|
||||
success=$? ; if [ $success -ne 0 ]; then echo "ERR: Base container launch failed."; exit $success ; fi
|
||||
|
||||
# Launch containers
|
||||
pids=(); rets=()
|
||||
for hostn in ${hosts[@]}; do
|
||||
$(eval echo $SSH) $DOCKEREXEC $CONTVOLS $CONT sleep infinity &
|
||||
pids+=($!); rets+=($?);
|
||||
done
|
||||
success=0; for s in ${rets[@]}; do ((success+=s)); done ; if [ $success -ne 0 ]; then echo "ERR: Container launch failed."; exit $success ; fi
|
||||
sleep 30 # Making sure containers have time to launch
|
||||
|
||||
# Disable compat check from further running
|
||||
pids=(); rets=()
|
||||
for hostn in ${hosts[@]}; do
|
||||
$(eval echo $SSH) docker exec cont_${SLURM_JOB_ID} rm -f /etc/shinit &
|
||||
pids+=($!);
|
||||
done
|
||||
wait "${pids[@]}"
|
||||
|
||||
# Run benchmarks
|
||||
|
||||
export SEED
|
||||
export NEXP
|
||||
for nrun in `seq 1 $NEXP`; do
|
||||
(
|
||||
echo "Beginning trial $nrun of $NEXP"
|
||||
|
||||
export VARS=(
|
||||
"-e" "SLURM_NNODES=$SLURM_NNODES"
|
||||
"-e" "MLPERF_HOST_OS"
|
||||
)
|
||||
|
||||
|
||||
## Clear RAM cache dentries and inodes
|
||||
echo "Clearing caches"
|
||||
pids=(); rets=()
|
||||
for hostn in ${hosts[@]}; do
|
||||
if [[ $INSLURM -eq 1 ]]; then
|
||||
$(eval echo $SSH) bash -c 'sync && sudo /sbin/sysctl vm.drop_caches=3' &
|
||||
else
|
||||
docker run --init --rm --privileged --entrypoint bash $CONT -c "sync && echo 3 > /proc/sys/vm/drop_caches || exit 1" &
|
||||
fi
|
||||
pids+=($!); rets+=($?);
|
||||
done
|
||||
wait "${pids[@]}"
|
||||
success=0; for s in ${rets[@]}; do ((success+=s)); done ; if [ $success -ne 0 ]; then echo "ERR: Cache clearing failed."; exit $success ; fi
|
||||
|
||||
## Launching benchmark
|
||||
pids=();
|
||||
export MULTI_NODE=''
|
||||
for h in `seq 0 $((SLURM_NNODES-1))`; do
|
||||
hostn="${hosts[$h]}"
|
||||
echo "Launching on node $hostn"
|
||||
if [[ $SLURM_NNODES -gt 1 ]]; then
|
||||
export MULTI_NODE=" --nnodes=$SLURM_NNODES --node_rank=$h --master_addr=$MASTER_IP --master_port=$PORT"
|
||||
else
|
||||
export MULTI_NODE=" --master_port=$PORT"
|
||||
fi
|
||||
export DOCKERENV=(
|
||||
"-e" "DGXSYSTEM=$DGXSYSTEM"
|
||||
"-e" "MULTI_NODE=$MULTI_NODE"
|
||||
"-e" "SEED=$SEED"
|
||||
"-e" "SLURM_JOB_ID=$SLURM_JOB_ID"
|
||||
"-e" "SLURM_NTASKS_PER_NODE=$SLURM_NTASKS_PER_NODE"
|
||||
"-e" "SLURM_NNODES=$SLURM_NNODES"
|
||||
)
|
||||
# Execute command
|
||||
set -x
|
||||
$(eval echo $SRUN) docker exec "${DOCKERENV[@]}" -e MODE=TRAIN cont_${SLURM_JOB_ID} ./run_and_time.sh &
|
||||
pids+=($!);
|
||||
set +x
|
||||
done
|
||||
wait "${pids[@]}"
|
||||
|
||||
) |& tee ${LOGFILE_BASE}_$nrun.log
|
||||
|
||||
## SEED update
|
||||
export SEED=$(od -A n -t d -N 3 /dev/urandom)
|
||||
|
||||
done
|
||||
|
||||
# Clean up (note: on SLURM we skip this, as the epilogue will take care of it)
|
||||
if [[ $INSLURM -eq 0 ]]; then
|
||||
docker rm -f cont_${SLURM_JOB_ID}
|
||||
fi
|
39
PyTorch/LanguageModeling/BERT/run_and_time.sh
Executable file
|
@ -0,0 +1,39 @@
|
|||
#!/bin/bash
|
||||
|
||||
#echo "Multi-node $MULTI_NODE"
|
||||
#echo "Dataset $DATASET"
|
||||
## DL vars -- Change your parameters below
|
||||
# To change the number of GPUs per node, change the sbatch param --ntasks-per-node in the launching script
|
||||
|
||||
## Need to avoid virtualenv and do python directly
|
||||
# train.py --data=/dev/shm/$DATASET \
|
||||
# train.py --data=/raid/datasets/$DATASET \
|
||||
|
||||
DGXSYSTEM=${DGXSYSTEM:-"DGX1"}
|
||||
if [[ -f config_${DGXSYSTEM}.sh ]]; then
|
||||
source config_${DGXSYSTEM}.sh
|
||||
else
|
||||
source config_DGX1.sh
|
||||
echo "Unknown system, assuming DGX1"
|
||||
fi
|
||||
SLURM_NTASKS_PER_NODE=${SLURM_NTASKS_PER_NODE:-$DGXNGPU}
|
||||
SLURM_JOB_ID=${SLURM_JOB_ID:-$RANDOM}
|
||||
MULTI_NODE=${MULTI_NODE:-''}
|
||||
echo "Run vars: id $SLURM_JOB_ID gpus $SLURM_NTASKS_PER_NODE mparams $MULTI_NODE"
|
||||
|
||||
# run training
|
||||
BIND_LAUNCH=1 ## should be the default
|
||||
|
||||
if [[ $BIND_LAUNCH -eq 1 ]]; then
|
||||
LAUNCH_OPT="bind_pyt --nsockets_per_node 2 --ncores_per_socket ${DGXSOCKETCORES} --nproc_per_node ${SLURM_NTASKS_PER_NODE} ${MULTI_NODE}"
|
||||
else
|
||||
LAUNCH_OPT="torch.distributed.launch --nproc_per_node ${SLURM_NTASKS_PER_NODE} ${MULTI_NODE}"
|
||||
fi
|
||||
|
||||
# Options
|
||||
python -m $LAUNCH_OPT \
|
||||
run_pretraining.py --seed=${SEED} \
|
||||
--train_batch_size=${BATCHSIZE} \
|
||||
--learning_rate=${LEARNING_RATE} \
|
||||
--warmup_proportion=${WARMUP_UPDATES} \
|
||||
$EXTRA_PARAMS
|
|
@ -1,6 +1,7 @@
|
|||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
#
|
||||
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
|
@ -18,10 +19,10 @@ from __future__ import absolute_import
|
|||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
|
||||
#==================
|
||||
# ==================
|
||||
import csv
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import argparse
|
||||
import random
|
||||
|
@ -34,65 +35,73 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Datas
|
|||
from torch.utils.data.distributed import DistributedSampler
|
||||
import math
|
||||
from apex import amp
|
||||
|
||||
|
||||
import multiprocessing
|
||||
|
||||
from tokenization import BertTokenizer
|
||||
from modeling import BertForPreTraining, BertConfig
|
||||
from optimization import BertAdam, BertAdam_FP16
|
||||
from optimization import BertLAMB
|
||||
|
||||
# from fused_adam_local import FusedAdamBert
|
||||
from file_utils import PYTORCH_PRETRAINED_BERT_CACHE
|
||||
|
||||
from apex.optimizers import FusedAdam #, FP16_Optimizer
|
||||
#from apex.optimizers import FusedAdam
|
||||
from utils import is_main_process
|
||||
from apex.parallel import DistributedDataParallel as DDP
|
||||
from schedulers import LinearWarmUpScheduler
|
||||
from apex.parallel.distributed import flat_dist_call
|
||||
import amp_C
|
||||
import apex_C
|
||||
from apex.amp import _amp_state
|
||||
|
||||
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt = '%m/%d/%Y %H:%M:%S',
|
||||
level = logging.INFO)
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
|
||||
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
|
||||
datefmt='%m/%d/%Y %H:%M:%S',
|
||||
level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def create_pretraining_dataset(input_file, max_pred_length, shared_list, args):
|
||||
|
||||
train_data = pretraining_dataset(input_file=input_file, max_pred_length=max_pred_length)
|
||||
train_sampler = RandomSampler(train_data)
|
||||
train_dataloader = DataLoader(train_data, sampler=train_sampler,
|
||||
batch_size=args.train_batch_size * args.n_gpu, num_workers=4,
|
||||
pin_memory=True)
|
||||
# shared_list["0"] = (train_dataloader, input_file)
|
||||
return train_dataloader, input_file
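# Usage sketch (the path and max_pred_length are illustrative; args must
# provide train_batch_size and n_gpu):
#   loader, path = create_pretraining_dataset('/workspace/data/0.hdf5', 20,
#                                             shared_list=None, args=args)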
class pretraining_dataset(Dataset):

    def __init__(self, input_file, max_pred_length):
        self.input_file = input_file
        self.max_pred_length = max_pred_length
        f = h5py.File(input_file, "r")
        # (per-field loads below were replaced by the generic self.inputs list in this commit)
        # self.input_ids = np.asarray(f["input_ids"][:]).astype(np.int64)  # [num_instances x max_seq_length]
        # self.input_masks = np.asarray(f["input_mask"][:]).astype(np.int64)  # [num_instances x max_seq_length]
        # self.segment_ids = np.asarray(f["segment_ids"][:]).astype(np.int64)  # [num_instances x max_seq_length]
        # self.masked_lm_positions = np.asarray(f["masked_lm_positions"][:]).astype(np.int64)  # [num_instances x max_pred_length]
        # self.masked_lm_ids = np.asarray(f["masked_lm_ids"][:]).astype(np.int64)  # [num_instances x max_pred_length]
        # self.next_sentence_labels = np.asarray(f["next_sentence_labels"][:]).astype(np.int64)  # [num_instances]
        keys = ['input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions', 'masked_lm_ids',
                'next_sentence_labels']
        self.inputs = [np.asarray(f[key][:]) for key in keys]
        f.close()

    def __len__(self):
        'Denotes the total number of samples'
        # return len(self.input_ids)   (removed)
        return len(self.inputs[0])

    def __getitem__(self, index):
        # (per-field conversions below were replaced by the list comprehension in this commit)
        # input_ids = torch.from_numpy(self.input_ids[index])  # [max_seq_length]
        # input_mask = torch.from_numpy(self.input_masks[index])  # [max_seq_length]
        # segment_ids = torch.from_numpy(self.segment_ids[index])  # [max_seq_length]
        # masked_lm_positions = torch.from_numpy(self.masked_lm_positions[index])  # [max_pred_length]
        # masked_lm_ids = torch.from_numpy(self.masked_lm_ids[index])  # [max_pred_length]
        # next_sentence_labels = torch.from_numpy(np.asarray(self.next_sentence_labels[index]))  # [1]

        [input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels] = [
            torch.from_numpy(input[index].astype(np.int64)) if indice < 5 else torch.from_numpy(
                np.asarray(input[index].astype(np.int64))) for indice, input in enumerate(self.inputs)]

        masked_lm_labels = torch.ones(input_ids.shape, dtype=torch.long) * -1
        index = self.max_pred_length
        # store number of masked tokens in index
        # if len((masked_lm_positions == 0).nonzero()) != 0:
        #     index = (masked_lm_positions == 0).nonzero()[0].item()   (removed)
        padded_mask_indices = (masked_lm_positions == 0).nonzero()
        if len(padded_mask_indices) != 0:
            index = padded_mask_indices[0].item()
        masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]

        # return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels]   (removed)
        return [input_ids, segment_ids, input_mask,
                masked_lm_labels, next_sentence_labels]
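To make the label construction in `__getitem__` concrete, here is a small self-contained sketch (toy tensors with illustrative values, not the HDF5 layout above) of how zero-padded entries in `masked_lm_positions` cut off the label scatter:

import torch

# Toy case: max_pred_length = 5, but only two positions were actually masked;
# unused slots in masked_lm_positions are zero-padded.
masked_lm_positions = torch.tensor([3, 7, 0, 0, 0])
masked_lm_ids = torch.tensor([1042, 2305, 0, 0, 0])

masked_lm_labels = torch.ones(10, dtype=torch.long) * -1  # -1 = ignored by the loss
padded_mask_indices = (masked_lm_positions == 0).nonzero()
index = padded_mask_indices[0].item() if len(padded_mask_indices) != 0 else len(masked_lm_positions)
masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index]
# masked_lm_labels is now -1 everywhere except positions 3 and 7,
# which hold the original token ids 1042 and 2305.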
# def main():   (renamed to parse_arguments() in this commit)
def parse_arguments():

    # print("IN NEW MAIN XD\n")   (removed debug print)
    parser = argparse.ArgumentParser()

    ## Required parameters

@@ -186,233 +195,407 @@ def main():

                        help="Step to resume training from.")
    parser.add_argument('--num_steps_per_checkpoint',
                        type=int,
                        # default=2000,   (old default)
                        default=100,
                        help="Number of update steps until a model checkpoint is saved to disk.")

    parser.add_argument('--phase2',
                        default=False,
                        action='store_true',
                        help="Whether to train with seq len 512")
    parser.add_argument('--allreduce_post_accumulation',
                        default=False,
                        action='store_true',
                        help="Whether to do allreduces during gradient accumulation steps.")
    parser.add_argument('--allreduce_post_accumulation_fp16',
                        default=False,
                        action='store_true',
                        help="Whether to do fp16 allreduce post accumulation.")
    parser.add_argument('--accumulate_into_fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use fp16 gradient accumulators.")
    parser.add_argument('--phase1_end_step',
                        type=int,
                        default=7038,
                        help="Number of training steps in Phase1 - seq len 128")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    args = parser.parse_args()
    return args
def setup_training(args):

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    assert (torch.cuda.is_available())

    if args.local_rank == -1:
        device = torch.device("cuda")
        n_gpu = torch.cuda.device_count()        # (old)
        args.n_gpu = torch.cuda.device_count()   # (new)
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1        # (old)
        args.n_gpu = 1   # (new)
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    # logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))   (old)
    logger.info("device %s n_gpu %d distributed training %r", device, args.n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    if args.train_batch_size % args.gradient_accumulation_steps != 0:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format(
            args.gradient_accumulation_steps, args.train_batch_size))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if not args.do_train:
        raise ValueError(" `do_train` must be True.")

    if not args.resume_from_checkpoint and os.path.exists(args.output_dir) and (
            os.listdir(args.output_dir) and os.listdir(args.output_dir) != ['logfile.txt']):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))

    if not args.resume_from_checkpoint:
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
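Note the batch-size rescaling above: after `setup_training`, `args.train_batch_size` is the per-forward-pass micro-batch. A quick sketch of the arithmetic, using the defaults from scripts/run_pretraining.sh further down (variable names here are illustrative, not from the file):

# Effective batch accounting, assuming single-node data parallelism.
train_batch_size = 8192            # per-GPU batch requested on the command line
gradient_accumulation_steps = 128
n_gpu = 8

micro_batch = train_batch_size // gradient_accumulation_steps  # 64 sequences per forward pass
global_batch = train_batch_size * n_gpu                        # 65536 sequences per optimizer step
assert micro_batch * gradient_accumulation_steps * n_gpu == global_batch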
def prepare_model_and_optimizer(args, device):

    # Prepare model
    config = BertConfig.from_json_file(args.config_file)

    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)
    model = BertForPreTraining(config)

    checkpoint = None
    if not args.resume_from_checkpoint:
        global_step = 0
    else:
        if args.resume_step == -1:
            model_names = [f for f in os.listdir(args.output_dir) if f.endswith(".pt")]
            args.resume_step = max([int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names])

        global_step = args.resume_step

        checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu")
        model.load_state_dict(checkpoint['model'], strict=False)

        # print("resume step from ", args.resume_step)   (old: unconditional print)
        if args.phase2:
            global_step -= args.phase1_end_step
        if is_main_process():
            print("resume step from ", args.resume_step)

    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # (old list-comprehension grouping, replaced by the per-parameter loop below)
    # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # optimizer_grouped_parameters = [
    #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    # ]
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']

    optimizer_grouped_parameters = []
    names = []

    count = 1
    for n, p in param_optimizer:
        count += 1
        if not any(nd in n for nd in no_decay):
            optimizer_grouped_parameters.append({'params': [p], 'weight_decay': 0.01, 'name': n})
            names.append({'params': [n], 'weight_decay': 0.01})
        if any(nd in n for nd in no_decay):
            optimizer_grouped_parameters.append({'params': [p], 'weight_decay': 0.00, 'name': n})
            names.append({'params': [n], 'weight_decay': 0.00})

    optimizer = BertLAMB(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=args.max_steps)
    if args.fp16:
        # optimizer = FusedAdam(optimizer_grouped_parameters,   (old optimizer, replaced by BertLAMB above)
        #                       lr=args.learning_rate,
        #                       #warmup=args.warmup_proportion,
        #                       #t_total=args.max_steps,
        #                       bias_correction=False,
        #                       weight_decay=0.01,
        #                       max_grad_norm=1.0)
        if args.loss_scale == 0:
            # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            # model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale="dynamic")   (old)
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic",
                                              master_weights=False if args.accumulate_into_fp16 else True)
        else:
            # optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            # model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale)   (old)
            # scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=args.max_steps)   (old)
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale,
                                              master_weights=False if args.accumulate_into_fp16 else True)
        amp._amp_state.loss_scalers[0]._loss_scale = 2**20
    # else:   (old non-fp16 path used BertAdam)
    #     optimizer = BertAdam(optimizer_grouped_parameters,
    #                          lr=args.learning_rate,
    #                          warmup=args.warmup_proportion,
    #                          t_total=args.max_steps)

    if args.resume_from_checkpoint:
        if args.phase2:
            keys = list(checkpoint['optimizer']['state'].keys())
            # Override hyperparameters from Phase 1
            for key in keys:
                checkpoint['optimizer']['state'][key]['step'] = global_step
            for iter, item in enumerate(checkpoint['optimizer']['param_groups']):
                checkpoint['optimizer']['param_groups'][iter]['t_total'] = args.max_steps
                checkpoint['optimizer']['param_groups'][iter]['warmup'] = args.warmup_proportion
                checkpoint['optimizer']['param_groups'][iter]['lr'] = args.learning_rate
        optimizer.load_state_dict(checkpoint['optimizer'])  # , strict=False)

        # Restore AMP master parameters
        if args.fp16:
            optimizer._lazy_init_maybe_master_weights()
            optimizer._amp_stash.lazy_init_called = True
            optimizer.load_state_dict(checkpoint['optimizer'])
            for param, saved_param in zip(amp.master_params(optimizer), checkpoint['master params']):
                param.data.copy_(saved_param.data)

    # --- old single-process training loop (deleted from main() in this commit) ---
    # if args.local_rank != -1:
    #     model = DDP(model)
    # elif n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    # files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f))]
    # files.sort()
    # num_files = len(files)
    # logger.info("***** Running training *****")
    # # logger.info("  Num examples = %d", len(train_data))
    # logger.info("  Batch size = %d", args.train_batch_size)
    # print("  LR = ", args.learning_rate)
    # model.train()
    # print("Training. . .")
    # most_recent_ckpts_paths = []
    # print("Training. . .")
    # tr_loss = 0.0  # total added training loss
    # average_loss = 0.0  # averaged loss every args.log_freq steps
    # epoch = 0
    # training_steps = 0
    # while True:
    #     if not args.resume_from_checkpoint:
    #         random.shuffle(files)
    #         f_start_id = 0
    #     else:
    #         f_start_id = checkpoint['files'][0]
    #         files = checkpoint['files'][1:]
    #         args.resume_from_checkpoint = False
    #     for f_id in range(f_start_id, len(files)):
    #         data_file = files[f_id]
    #         logger.info("file no %s file %s" % (f_id, data_file))
    #         train_data = pretraining_dataset(input_file=data_file, max_pred_length=args.max_predictions_per_seq)
    #         if args.local_rank == -1:
    #             train_sampler = RandomSampler(train_data)
    #             train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size * n_gpu, num_workers=4, pin_memory=True)

    # (new DDP wiring for single-node LAMB)
    if args.local_rank != -1:
        if not args.allreduce_post_accumulation:
            model = DDP(model, message_size=250000000, gradient_predivide_factor=torch.distributed.get_world_size())
        else:
            flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0,))
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    return model, optimizer, checkpoint, global_step
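The per-parameter grouping above keeps weight decay off biases and normalization weights, as LAMB (and AdamW-style optimizers) expect. A compact, self-contained sketch of the same idea — a standalone helper, not part of this file:

import torch

def group_parameters(model, weight_decay=0.01, no_decay=('bias', 'gamma', 'beta', 'LayerNorm')):
    """Split parameters into decayed and non-decayed groups."""
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        (no_decay_params if any(nd in name for nd in no_decay) else decay_params).append(param)
    return [{'params': decay_params, 'weight_decay': weight_decay},
            {'params': no_decay_params, 'weight_decay': 0.0}]

# e.g. optimizer = torch.optim.AdamW(group_parameters(model), lr=1e-4)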
def take_optimizer_step(args, optimizer, model, overflow_buf, global_step):

    if args.allreduce_post_accumulation:
        # manually allreduce gradients after all accumulation steps
        # check for Inf/NaN
        # 1. allocate an uninitialized buffer for flattened gradient
        scaler = _amp_state.loss_scalers[0]
        master_grads = [p.grad for p in amp.master_params(optimizer) if p.grad is not None]
        flat_grad_size = sum(p.numel() for p in master_grads)
        allreduce_dtype = torch.float16 if args.allreduce_post_accumulation_fp16 else torch.float32
        flat_raw = torch.empty(flat_grad_size, device='cuda', dtype=allreduce_dtype)
        # 2. combine unflattening and predivision of unscaled 'raw' gradient
        allreduced_views = apex_C.unflatten(flat_raw, master_grads)
        overflow_buf.zero_()
        amp_C.multi_tensor_scale(65536,
                                 overflow_buf,
                                 [master_grads, allreduced_views],
                                 scaler.loss_scale() / (torch.distributed.get_world_size() * args.gradient_accumulation_steps))
        # 3. sum gradient across ranks. Because of the predivision, this averages the gradient
        torch.distributed.all_reduce(flat_raw)
        # 4. combine unscaling and unflattening of allreduced gradient
        overflow_buf.zero_()
        amp_C.multi_tensor_scale(65536,
                                 overflow_buf,
                                 [allreduced_views, master_grads],
                                 1. / scaler.loss_scale())
        # 5. update loss scale
        scaler = _amp_state.loss_scalers[0]
        old_overflow_buf = scaler._overflow_buf
        scaler._overflow_buf = overflow_buf
        had_overflow = scaler.update_scale()
        scaler._overflow_buf = old_overflow_buf
        # 6. call optimizer step function
        if had_overflow == 0:
            optimizer.step()
            global_step += 1
        else:
            # Overflow detected, print message and clear gradients
            if is_main_process():
                print(("Rank {} :: Gradient overflow.  Skipping step, " +
                       "reducing loss scale to {}").format(
                    torch.distributed.get_rank(),
                    scaler.loss_scale()))
            if _amp_state.opt_properties.master_weights:
                for param in optimizer._amp_stash.all_fp32_from_fp16_params:
                    param.grad = None
        for param in model.parameters():
            param.grad = None
    else:
        optimizer.step()
        # optimizer.zero_grad()
        for param in model.parameters():
            param.grad = None
        global_step += 1

    return global_step
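The apex_C/amp_C helpers above fuse flattening, loss-scale handling and overflow checks into multi-tensor kernels. For intuition only, here is a plain-PyTorch sketch of the same flatten → all-reduce → unflatten pattern, without loss scaling, using torch's private but long-standing dense-tensor helpers (an assumption worth re-checking against your torch version):

import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def allreduce_gradients(params, world_size):
    # One fused all-reduce over a single flat buffer instead of one per tensor.
    grads = [p.grad for p in params if p.grad is not None]
    flat = _flatten_dense_tensors(grads)
    flat.div_(world_size)   # predivide so the sum becomes an average
    dist.all_reduce(flat)   # sum across ranks
    for grad, synced in zip(grads, _unflatten_dense_tensors(flat, grads)):
        grad.copy_(synced)  # copy averaged values back into each .grad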
def main():

    args = parse_arguments()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    device, args = setup_training(args)

    # Prepare optimizer
    model, optimizer, checkpoint, global_step = prepare_model_and_optimizer(args, device)

    if is_main_process():
        print("SEED {}".format(args.seed))

    if args.do_train:
        if is_main_process():
            logger.info("***** Running training *****")
            # logger.info("  Num examples = %d", len(train_data))
            logger.info("  Batch size = %d", args.train_batch_size)
            print("  LR = ", args.learning_rate)
            print("Training. . .")

        model.train()
        most_recent_ckpts_paths = []
        average_loss = 0.0  # averaged loss every args.log_freq steps
        epoch = 0
        training_steps = 0

        pool = ProcessPoolExecutor(1)

        # Note: We loop infinitely over epochs, termination is handled via iteration count
        while True:
            thread = None
            if not args.resume_from_checkpoint or epoch > 0 or args.phase2:
                files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
                         os.path.isfile(os.path.join(args.input_dir, f))]
                files.sort()
                num_files = len(files)
                random.shuffle(files)
                f_start_id = 0
            else:
                f_start_id = checkpoint['files'][0]
                files = checkpoint['files'][1:]
                args.resume_from_checkpoint = False
                num_files = len(files)

            # --- old inner training loop (deleted in this commit) ---
            # train_sampler = DistributedSampler(train_data)
            # train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=4, pin_memory=True)
            # for step, batch in enumerate(tqdm(train_dataloader, desc="File Iteration")):
            #     training_steps += 1
            #     batch = [t.to(device) for t in batch]
            #     input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
            #     loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels, checkpoint_activations=args.checkpoint_activations)
            #     if n_gpu > 1:
            #         loss = loss.mean()  # mean() to average on multi-gpu.
            #     if args.gradient_accumulation_steps > 1:
            #         loss = loss / args.gradient_accumulation_steps
            #     if args.fp16:
            #         # optimizer.backward(loss)
            #         with amp.scale_loss(loss, optimizer) as scaled_loss:
            #             scaled_loss.backward()
            #     else:
            #         loss.backward()
            #     tr_loss += loss
            #     average_loss += loss.item()
            #     if training_steps % args.gradient_accumulation_steps == 0:
            #         if args.fp16:
            #             scheduler.step()
            #         optimizer.step()
            #         optimizer.zero_grad()
            #         global_step += 1
            #     if training_steps == 1 * args.gradient_accumulation_steps:
            #         logger.info("Step:{} Average Loss = {} Step Loss = {} LR {}".format(global_step, average_loss,
            #                                                                             loss.item(), optimizer.param_groups[0]['lr']))
            #     if training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
            #         logger.info("Step:{} Average Loss = {} Step Loss = {} LR {}".format(global_step, average_loss / args.log_freq,
            #                                                                             loss.item(), optimizer.param_groups[0]['lr']))
            #         average_loss = 0
            #     if global_step >= args.max_steps or training_steps % (args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0:
            #         if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)):
            #             # Save a trained model
            #             logger.info("** ** * Saving fine - tuned model ** ** * ")
            #             model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
            #             output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
            #             torch.save({'model': model_to_save.state_dict(),
            #                         'optimizer': optimizer.state_dict(),
            #                         'files': [f_id] + files}, output_save_file)
            #             most_recent_ckpts_paths.append(output_save_file)
            #             if len(most_recent_ckpts_paths) > 3:
            #                 ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
            #                 os.remove(ckpt_to_be_removed)
            #     if global_step >= args.max_steps:
            #         tr_loss = tr_loss * args.gradient_accumulation_steps / training_steps
            #         if torch.distributed.is_initialized():
            #             tr_loss /= torch.distributed.get_world_size()
            #             torch.distributed.all_reduce(tr_loss)
            #         logger.info("Total Steps:{} Final Loss = {}".format(training_steps, tr_loss.item()))
            #         return
            # del train_dataloader
            # del train_sampler
            # del train_data
            # #for obj in gc.get_objects():
            # #  if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            # #    del obj
            # torch.cuda.empty_cache()
            # epoch += 1

            shared_file_list = {}

            if torch.distributed.is_initialized() and torch.distributed.get_world_size() > num_files:
                remainder = torch.distributed.get_world_size() % num_files
                data_file = files[(f_start_id * torch.distributed.get_world_size() + torch.distributed.get_rank() + remainder * f_start_id) % num_files]
            else:
                data_file = files[(f_start_id * torch.distributed.get_world_size() + torch.distributed.get_rank()) % num_files]

            previous_file = data_file

            train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                          batch_size=args.train_batch_size * args.n_gpu, num_workers=4,
                                          pin_memory=True)
            # shared_file_list["0"] = (train_dataloader, data_file)

            overflow_buf = None
            if args.allreduce_post_accumulation:
                overflow_buf = torch.cuda.IntTensor([0])

            for f_id in range(f_start_id + 1, len(files)):

                # torch.cuda.synchronize()
                # f_start = time.time()
                if torch.distributed.get_world_size() > num_files:
                    data_file = files[(f_id * torch.distributed.get_world_size() + torch.distributed.get_rank() + remainder * f_id) % num_files]
                else:
                    data_file = files[(f_id * torch.distributed.get_world_size() + torch.distributed.get_rank()) % num_files]

                logger.info("file no %s file %s" % (f_id, previous_file))

                previous_file = data_file

                # thread = multiprocessing.Process(
                #     name="LOAD DATA:" + str(f_id) + ":" + str(data_file),
                #     target=create_pretraining_dataset,
                #     args=(data_file, args.max_predictions_per_seq, shared_file_list, args, n_gpu)
                # )
                # thread.start()
                dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args)
                # torch.cuda.synchronize()
                # f_end = time.time()
                # print('[{}] : shard overhead {}'.format(torch.distributed.get_rank(), f_end - f_start))

                train_iter = tqdm(train_dataloader, desc="Iteration") if is_main_process() else train_dataloader
                for step, batch in enumerate(train_iter):
                    # torch.cuda.synchronize()
                    # iter_start = time.time()

                    training_steps += 1
                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                                 masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels,
                                 checkpoint_activations=args.checkpoint_activations)
                    if args.n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.

                    divisor = args.gradient_accumulation_steps
                    if args.gradient_accumulation_steps > 1:
                        if not args.allreduce_post_accumulation:
                            # this division was merged into predivision
                            loss = loss / args.gradient_accumulation_steps
                            divisor = 1.0
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer, delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    average_loss += loss.item()

                    if training_steps % args.gradient_accumulation_steps == 0:
                        global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)

                    if global_step >= args.max_steps:
                        last_num_steps = global_step % args.log_freq
                        last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
                        average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
                        average_loss = average_loss / (last_num_steps * divisor)
                        if torch.distributed.is_initialized():
                            average_loss /= torch.distributed.get_world_size()
                            torch.distributed.all_reduce(average_loss)
                        if is_main_process():
                            logger.info("Total Steps:{} Final Loss = {}".format(training_steps, average_loss.item()))
                    elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
                        if is_main_process():
                            print("Step:{} Average Loss = {} Step Loss = {} LR {}".format(
                                global_step, average_loss / (args.log_freq * divisor),
                                loss.item() * args.gradient_accumulation_steps / divisor,
                                optimizer.param_groups[0]['lr']))
                        average_loss = 0

                    if global_step >= args.max_steps or training_steps % (
                            args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0:
                        if is_main_process():
                            # Save a trained model
                            logger.info("** ** * Saving fine - tuned model ** ** * ")
                            model_to_save = model.module if hasattr(model,
                                                                    'module') else model  # Only save the model itself
                            if args.resume_step < 0 or not args.phase2:
                                output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
                            else:
                                output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step))
                            if args.do_train:
                                torch.save({'model': model_to_save.state_dict(),
                                            'optimizer': optimizer.state_dict(),
                                            'master params': list(amp.master_params(optimizer)),
                                            'files': [f_id] + files}, output_save_file)

                                most_recent_ckpts_paths.append(output_save_file)
                                if len(most_recent_ckpts_paths) > 3:
                                    ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
                                    os.remove(ckpt_to_be_removed)

                        if global_step >= args.max_steps:
                            del train_dataloader
                            # thread.join()
                            return args

                    # torch.cuda.synchronize()
                    # iter_end = time.time()
                    # if torch.distributed.get_rank() == 0:
                    #     print('step {} : {}'.format(global_step, iter_end - iter_start))

                del train_dataloader
                # thread.join()
                # Make sure pool has finished and switch train_dataloader
                # NOTE: Will block until complete
                train_dataloader, data_file = dataset_future.result(timeout=None)

            epoch += 1


if __name__ == "__main__":
    # main()   (old entry point, replaced by the timed call below)
    now = time.time()
    args = main()
    if is_main_process():
        print("Total time taken {}".format(time.time() - now))
@@ -101,11 +101,15 @@ def main():

                        type=str,
                        required=False,
                        help="The BERT model config")
    # parser.add_argument("--ckpt_dir",   (old: single required argument)
    ckpt_group = parser.add_mutually_exclusive_group(required=True)
    ckpt_group.add_argument("--ckpt_dir",
                            default=None,
                            type=str,
                            # required=True,   (removed; the group enforces the requirement)
                            help="The ckpt directory, e.g. /results")
    ckpt_group.add_argument("--ckpt_path",
                            default=None,
                            type=str,
                            help="Path to the specific checkpoint")

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--eval', dest='do_eval', action='store_true')

@@ -184,16 +188,21 @@ def main():

    # Prepare model
    config = BertConfig.from_json_file(args.config_file)
    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)
    model = BertForPreTraining(config)

    # (old, unconditional loading path)
    # if args.ckpt_step == -1:
    #     #retrieve latest model
    #     model_names = [f for f in os.listdir(args.ckpt_dir) if f.endswith(".model")]
    #     args.ckpt_step = max([int(x.split('.model')[0].split('_')[1].strip()) for x in model_names])
    # print("load model saved at iteration", args.ckpt_step)
    # model_file = os.path.join(args.ckpt_dir, "ckpt_" + str(args.ckpt_step) + ".model")
    # state_dict = torch.load(model_file, map_location="cpu")
    if args.ckpt_dir:
        if args.ckpt_step == -1:
            #retrieve latest model
            model_names = [f for f in os.listdir(args.ckpt_dir) if f.endswith(".model")]
            args.ckpt_step = max([int(x.split('.model')[0].split('_')[1].strip()) for x in model_names])
        print("load model saved at iteration", args.ckpt_step)
        model_file = os.path.join(args.ckpt_dir, "ckpt_" + str(args.ckpt_step) + ".pt")
    else:
        model_file = args.ckpt_path
    state_dict = torch.load(model_file, map_location="cpu")["model"]
    model.load_state_dict(state_dict, strict=False)

    if args.fp16:
@@ -916,11 +916,16 @@ def main():

    # Prepare model
    config = BertConfig.from_json_file(args.config_file)
    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    model = BertForQuestionAnswering(config)
    # model = BertForQuestionAnswering.from_pretrained(args.bert_model,
    #             cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)))
    # model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)   (old: whole file was the state dict)

    print("USING CHECKPOINT")
    model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False)
    print("USED CHECKPOINT \n\n")
    model.to(device)
    if args.fp16 and args.old:
        model.half()
@@ -1,3 +1,2 @@

#!/bin/bash

# docker build . --rm -t bert   (old image tag)
docker build . --rm -t bert_pyt
@@ -1,17 +1,18 @@

#!/bin/bash

# Old defaults (replaced in this commit):
# DATA_DIR=${1:-"/mnt/dldata/bert"}
# VOCAB_DIR=${2:-"/mnt/dldata/bert/vocab"}
# CHECKPOINT_DIR=${3:-"/mnt/dldata/bert/pretrained_models_nvidia_pytorch"}
DATA_DIR=${1:-"${PWD}/data/hdf5/books_wiki_en_corpus"}
VOCAB_DIR=${2:-"${PWD}/data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16"}
CHECKPOINT_DIR=${3:-"${PWD}/checkpoints"}
RESULTS_DIR=${4:-"${PWD}/results"}

# Old mounts (replaced in this commit):
#   -v $DATA_DIR:/workspace/bert/data \
#   -v $VOCAB_DIR:/workspace/bert/vocab \
#   -v $PWD/results:/results \
#   bert bash

docker run -it --rm \
  --runtime=nvidia \
  -p 8888:8888 \
  --shm-size=1g \
  --ulimit memlock=-1 \
  --ulimit stack=67108864 \
  -v ${PWD}:/workspace/bert \
  -v $DATA_DIR:/workspace/bert/data/hdf5/books_wiki_en_corpus \
  -v $CHECKPOINT_DIR:/workspace/checkpoints \
  -v $VOCAB_DIR:/workspace/bert/data/download/google_pretrained_weights/uncased_L-24_H-1024_A-16 \
  -v $RESULTS_DIR:/results \
  bert_pyt bash
@@ -1,28 +1,38 @@

#!/bin/bash

echo "Container nvidia build = " $NVIDIA_BUILD_ID
# Old defaults (replaced by the LAMB recipe below):
# train_batch_size=${1:-14}
# learning_rate=${2:-"0.4375e-4"}
# warmup_proportion=${5:-"0.01"}
# train_steps=${6:-2285714}
# save_checkpoint_steps=${7:-2000}
# accumulate_gradients=${10:-"false"}
# gradient_accumulation_steps=${11:-1}
# seed=${12:-42}
# job_name=${13:-"job"}
train_batch_size=${1:-8192}
learning_rate=${2:-"6e-3"}
precision=${3:-"fp16"}
num_gpus=${4:-8}
warmup_proportion=${5:-"0.2843"}
train_steps=${6:-7038}
save_checkpoint_steps=${7:-200}
resume_training=${8:-"false"}
create_logfile=${9:-"true"}
accumulate_gradients=${10:-"true"}
gradient_accumulation_steps=${11:-128}
seed=${12:-$RANDOM}
job_name=${13:-"bert_lamb_pretraining"}
allreduce_post_accumulation=${14:-"true"}
allreduce_post_accumulation_fp16=${15:-"true"}
accumulate_into_fp16=${16:-"true"}

train_batch_size_phase2=${1:-4096}
learning_rate_phase2=${2:-"4e-3"}
warmup_proportion_phase2=${5:-"0.128"}
train_steps_phase2=${6:-1563}
gradient_accumulation_steps_phase2=${11:-512}

# DATASET=wikipedia_corpus # change this for other datasets   (old)
DATASET=books_wiki_en_corpus # change this for other datasets

# DATA_DIR=data/${DATASET}/hdf5_shards/   (old)
DATA_DIR=data/${DATASET}/training/
#DATA_DIR=data/hdf5/wiki+book/bert_pytorch_wikipedia_bookcorpus_interseqmix_seq_128_pred_20/
BERT_CONFIG=bert_config.json
RESULTS_DIR=/results
CHECKPOINTS_DIR=/results/checkpoints

mkdir -p $CHECKPOINTS_DIR
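The new defaults encode the LAMB recipe: phase 1 trains at a 64K-sequence global batch for 7038 steps at seq len 128, phase 2 at a 32K global batch for 1563 steps at seq len 512. A quick sanity check of the numbers — illustrative Python, not part of the scripts:

# Phase 1: 8 GPUs, per-GPU batch 8192, 128 accumulation steps
assert 8192 // 128 == 64    # micro-batch per forward pass
assert 8192 * 8 == 65536    # global batch per LAMB step (64K)
# Phase 2: per-GPU batch 4096, 512 accumulation steps
assert 4096 // 512 == 8     # micro-batch at seq len 512
assert 4096 * 8 == 32768    # global batch per LAMB step (32K)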
@@ -63,6 +73,21 @@ if [ "$resume_training" == "true" ] ; then

  CHECKPOINT="--resume_from_checkpoint"
fi

ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
  ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi

ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
  ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi

ACCUMULATE_INTO_FP16=""
if [ "$accumulate_into_fp16" == "true" ] ; then
  ACCUMULATE_INTO_FP16="--accumulate_into_fp16"
fi

echo $DATA_DIR
INPUT_DIR=$DATA_DIR
CMD=" /workspace/bert/run_pretraining.py"

@@ -71,8 +96,8 @@ CMD+=" --output_dir=$CHECKPOINTS_DIR"

CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size"
# CMD+=" --max_seq_length=512"   (old)
# CMD+=" --max_predictions_per_seq=80"   (old)
CMD+=" --max_seq_length=128"
CMD+=" --max_predictions_per_seq=20"
CMD+=" --max_steps=$train_steps"
CMD+=" --warmup_proportion=$warmup_proportion"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"

@@ -81,7 +106,10 @@ CMD+=" --seed=$seed"

CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $ACCUMULATE_INTO_FP16"
CMD+=" --do_train"

if [ "$num_gpus" -gt 1 ] ; then
  CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
@@ -115,39 +143,107 @@ target_loss=15

THROUGHPUT=10
THRESHOLD=0.9

# throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F's/it' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`   (old: parsed s/it)
throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`

# echo "throughput: $throughput s/it"   (old)
train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size')}')
echo " training throughput phase1: $train_perf sequences/second"
echo "average loss: $loss"
echo "final loss: $final_loss"

# Old pass/fail checks, removed in this commit:
# ACCURACY_TEST_RESULT=$(awk 'BEGIN {print ('${loss}' <= '${target_loss}')}')
# if [ $ACCURACY_TEST_RESULT == 1 ]; then
#     echo "&&&& ACCURACY TEST PASSED"
# else
#     echo "&&&& ACCURACY TEST FAILED"
# fi
# PERFORMANCE_TEST_RESULT=$(awk 'BEGIN {print ('${throughput}' <= ('${THROUGHPUT}' * '${THRESHOLD}'))}')
# if [ $PERFORMANCE_TEST_RESULT == 1 ]; then
#     echo "&&&& PERFORMANCE TEST PASSED"
# else
#     echo "&&&& PERFORMANCE TEST FAILED"
# fi
# if [ $ACCURACY_TEST_RESULT == 1 -a $PERFORMANCE_TEST_RESULT == 1 ]; then
#     echo "&&&& PASSED"
#     exit 0
# else
#     echo "&&&& FAILED"
#     exit 1
# fi

#Start Phase2
DATASET=merged_wiki+books_phase2 # change this for other datasets

DATA_DIR=data/${DATASET}/hdf5_shards/
#DATA_DIR=data/hdf5/wiki+book/bert_pytorch_wikipedia_bookcorpus_interseqmix_seq_512_pred_80/

PREC=""
if [ "$precision" = "fp16" ] ; then
  PREC="--fp16"
elif [ "$precision" = "fp32" ] ; then
  PREC=""
else
  echo "Unknown <precision> argument"
  exit -2
fi

ACCUMULATE_GRADIENTS=""
if [ "$accumulate_gradients" == "true" ] ; then
  ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_phase2"
fi

ALL_REDUCE_POST_ACCUMULATION=""
if [ "$allreduce_post_accumulation" == "true" ] ; then
  ALL_REDUCE_POST_ACCUMULATION="--allreduce_post_accumulation"
fi

ALL_REDUCE_POST_ACCUMULATION_FP16=""
if [ "$allreduce_post_accumulation_fp16" == "true" ] ; then
  ALL_REDUCE_POST_ACCUMULATION_FP16="--allreduce_post_accumulation_fp16"
fi

ACCUMULATE_INTO_FP16=""
if [ "$accumulate_into_fp16" == "true" ] ; then
  ACCUMULATE_INTO_FP16="--accumulate_into_fp16"
fi

echo $DATA_DIR
INPUT_DIR=$DATA_DIR
CMD=" /workspace/bert/run_pretraining.py"
CMD+=" --input_dir=$DATA_DIR"
CMD+=" --output_dir=$CHECKPOINTS_DIR"
CMD+=" --config_file=$BERT_CONFIG"
CMD+=" --bert_model=bert-large-uncased"
CMD+=" --train_batch_size=$train_batch_size_phase2"
CMD+=" --max_seq_length=512"
CMD+=" --max_predictions_per_seq=80"
CMD+=" --max_steps=$train_steps_phase2"
CMD+=" --warmup_proportion=$warmup_proportion_phase2"
CMD+=" --num_steps_per_checkpoint=$save_checkpoint_steps"
CMD+=" --learning_rate=$learning_rate_phase2"
CMD+=" --seed=$seed"
CMD+=" $PREC"
CMD+=" $ACCUMULATE_GRADIENTS"
CMD+=" $CHECKPOINT"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION"
CMD+=" $ALL_REDUCE_POST_ACCUMULATION_FP16"
CMD+=" $ACCUMULATE_INTO_FP16"
CMD+=" --do_train --phase2 --resume_from_checkpoint --phase1_end_step=$train_steps"

if [ "$num_gpus" -gt 1 ] ; then
  CMD="python3 -m torch.distributed.launch --nproc_per_node=$num_gpus $CMD"
else
  CMD="python3 $CMD"
fi

if [ "$create_logfile" = "true" ] ; then
  export GBS=$(expr $train_batch_size \* $num_gpus)
  printf -v TAG "pyt_bert_pretraining_%s_gbs%d" "$precision" $GBS
  DATESTAMP=`date +'%y%m%d%H%M%S'`
  LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log
  printf "Logs written to %s\n" "$LOGFILE"
fi

set -x
if [ -z "$LOGFILE" ] ; then
  $CMD
else
  (
    $CMD
  ) |& tee $LOGFILE
fi

set +x

echo "finished phase2"
throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
loss=`cat $LOGFILE | grep 'Average Loss' | tail -1 | awk -F'Average Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
final_loss=`cat $LOGFILE | grep 'Total Steps' | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`

train_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$train_batch_size_phase2')}')
echo " training throughput phase2: $train_perf sequences/second"
echo "average loss: $loss"
echo "final loss: $final_loss"
@@ -96,51 +96,6 @@ else

fi
set +x

# Old pass/fail evaluation, removed in this commit:
# target_loss=15
# THROUGHPUT=1.0
# THRESHOLD=0.9
#
# throughput=`cat $LOGFILE | grep Iteration | tail -1 | awk -F'it/s' '{print $1}' | awk -F',' '{print $2}' | egrep -o [0-9.]+`
#
# echo "throughput: $throughput it/s"
#
# PERFORMANCE_TEST_RESULT=$(awk 'BEGIN {print ('${throughput}' >= \
#     ('${THROUGHPUT}' * '${THRESHOLD}'))}')
#
# if [ $PERFORMANCE_TEST_RESULT == 1 ]; then
#     echo "&&&& PERFORMANCE TEST PASSED"
# else
#     echo "&&&& PERFORMANCE TEST FAILED"
# fi
#
# if [ "$inference_mode" = "eval" ] ; then
#     loss=`cat $LOGFILE | grep Finished | tail -1 | awk -F'Final Loss =' '{print $2}' | awk -F' ' '{print $1}' | egrep -o [0-9.]+`
#
#     echo "final loss: $loss"
#
#     ACCURACY_TEST_RESULT=$(awk 'BEGIN {print ('${loss}' <= '${target_loss}')}')
#
#     if [ $ACCURACY_TEST_RESULT == 1 ]; then
#         echo "&&&& ACCURACY TEST PASSED"
#     else
#         echo "&&&& ACCURACY TEST FAILED"
#     fi
#
#     if [ $ACCURACY_TEST_RESULT == 1 -a $PERFORMANCE_TEST_RESULT == 1 ]; then
#         echo "&&&& PASSED"
#         exit 0
#     else
#         echo "&&&& FAILED"
#         exit 1
#     fi
# fi

inference_perf=$(awk 'BEGIN {print ('$throughput' * '$num_gpus' * '$eval_batch_size')}')
echo " inference throughput : $inference_perf sequences/second"
@@ -13,7 +13,7 @@ precision=${5:-"fp16"}

num_gpu=${6:-"8"}
seed=${7:-"1"}
squad_dir=${8:-"/workspace/bert/data/squad/v1.1"}
# vocab_file=${9:-"/workspace/bert/vocab/vocab"}   (old)
vocab_file=${9:-"/workspace/bert/data/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt"}
OUT_DIR=${10:-"/results/SQuAD"}
mode=${11:-"train eval"}
CONFIG_FILE=${12:-"/workspace/bert/bert_config.json"}

@@ -45,7 +45,7 @@ fi

printf -v EXTRA_PARAMS "%d %d %e %s 1 %d %d %d false" $train_batch_size $eval_batch_size $learning_rate "$precision" $warmup_proportion $train_steps $save_checkpoint_steps

export ROOTDIR=$root_dir
# export DATA_DIR=${DATA_DIR:-$CODEDIR/data/wikipedia_corpus/pyt_hdf5_shards}   (old)
export DATA_DIR=${DATA_DIR:-$CODEDIR/data/hdf5/books_wiki_en_corpus}

VOLS="-v $ROOTDIR:/workspace/bert"
VOLS+=" -v $DATA_DIR:/workspace/bert/data/wikipedia_corpus/pyt_hdf5_shards"
PyTorch/LanguageModeling/BERT/utils.py (new file, 12 lines)

@@ -0,0 +1,12 @@

import torch
import torch.distributed as dist


def get_rank():
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0
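A typical use of the new helper is gating rank-dependent side effects, mirroring how run_pretraining.py uses it above (illustrative snippet):

from utils import is_main_process

if is_main_process():
    # Only rank 0 writes checkpoints and prints progress; other ranks stay quiet.
    print("saving checkpoint ...")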
@@ -1,123 +0,0 @@

# NVIDIA

import hashlib
import urllib.request
import zipfile

# Download urls
model_urls = {
    'bert_base_uncased' : ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'),
    'bert_large_uncased' : ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'),
    'bert_base_cased' : ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'),
    'bert_large_cased' : ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'),
    'bert_base_multilingual_cased' : ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'),
    'bert_large_multilingual_uncased' : ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'),
    'bert_base_chinese' : ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip')
}

# SHA256sum verification for file download integrity (and checking for changes from the download source over time)
bert_base_uncased_sha = {
    'bert_config.json' : '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc',
    'bert_model.ckpt.data-00000-of-00001' : '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84',
    'bert_model.ckpt.index' : '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b',
    'bert_model.ckpt.meta' : 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e',
    'vocab.txt' : '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}

bert_large_uncased_sha = {
    'bert_config.json' : 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb',
    'bert_model.ckpt.data-00000-of-00001' : 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1',
    'bert_model.ckpt.index' : '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093',
    'bert_model.ckpt.meta' : '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1',
    'vocab.txt' : '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}

bert_base_cased_sha = {
    'bert_config.json' : 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc',
    'bert_model.ckpt.data-00000-of-00001' : '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea',
    'bert_model.ckpt.index' : '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1',
    'bert_model.ckpt.meta' : '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98',
    'vocab.txt' : 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}

bert_large_cased_sha = {
    'bert_config.json' : '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57',
    'bert_model.ckpt.data-00000-of-00001' : '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0',
    'bert_model.ckpt.index' : 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf',
    'bert_model.ckpt.meta' : 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1',
    'vocab.txt' : 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}

bert_base_multilingual_cased_sha = {
    'bert_config.json' : 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0',
    'bert_model.ckpt.data-00000-of-00001' : '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5',
    'bert_model.ckpt.index' : '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37',
    'bert_model.ckpt.meta' : '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa',
    'vocab.txt' : 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c',
}

bert_large_multilingual_uncased_sha = {
    'bert_config.json' : '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624',
    'bert_model.ckpt.data-00000-of-00001' : '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429',
    'bert_model.ckpt.index' : '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7',
    'bert_model.ckpt.meta' : '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29',
    'vocab.txt' : '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f',
}

bert_base_chinese_sha = {
    'bert_config.json' : '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015',
    'bert_model.ckpt.data-00000-of-00001' : '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba',
    'bert_model.ckpt.index' : '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e',
    'bert_model.ckpt.meta' : 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047',
    'vocab.txt' : '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c',
}

# Relate SHA to urls for loop below
model_sha = {
    'bert_base_uncased' : bert_base_uncased_sha,
    'bert_large_uncased' : bert_large_uncased_sha,
    'bert_base_cased' : bert_base_cased_sha,
    'bert_large_cased' : bert_large_cased_sha,
    'bert_base_multilingual_cased' : bert_base_multilingual_cased_sha,
    'bert_large_multilingual_uncased' : bert_large_multilingual_uncased_sha,
    'bert_base_chinese' : bert_base_chinese_sha
}

# Helper to get sha256sum of a file
def sha256sum(filename):
    h = hashlib.sha256()
    b = bytearray(128*1024)
    mv = memoryview(b)
    with open(filename, 'rb', buffering=0) as f:
        for n in iter(lambda : f.readinto(mv), 0):
            h.update(mv[:n])
    return h.hexdigest()

# Iterate over urls: download, unzip, verify sha256sum
found_mismatch_sha = False
for model in model_urls:
    url = model_urls[model][0]
    file = model_urls[model][1]

    print("Downloading", url)
    response = urllib.request.urlopen(url)
    with open(file, "wb") as handle:
        handle.write(response.read())

    print("Unzipping", file)
    zip = zipfile.ZipFile(file, 'r')
    zip.extractall()
    zip.close()

    sha_dict = model_sha[model]
    for extracted_file in sha_dict:
        sha = sha_dict[extracted_file]
        if sha != sha256sum(file[:-4] + "/" + extracted_file):
            found_mismatch_sha = True
            print("SHA256sum does not match on file:", extracted_file, "from download url:", url)
        else:
            print(file[:-4] + "/" + extracted_file, "\t", "verified")

if not found_mismatch_sha:
    print("All downloads pass sha256sum verification.")
File diff suppressed because it is too large
Load diff