[Electra/TF2] Adding new model

This commit is contained in:
Przemek Strzelczyk 2020-07-20 20:39:35 +02:00
parent 180382499f
commit b31d091cf3
38 changed files with 12623 additions and 0 deletions

View file

@ -0,0 +1,130 @@
# Initially taken from Github's Python gitignore file
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Data checkpoints and results
data/*/*/
data/*/*.zip
checkpoints/
results
results/*
# Editor
.idea
.idea/*
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# vscode
.vscode

View file

@ -0,0 +1,40 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf2-py3
#FROM gitlab-master.nvidia.com:5005/dl/dgx/tensorrtserver:master-py3.1164446-client as trt
#FROM nvcr.io/nvidia/tensorrtserver:20.03-py3-clientsdk as trt
FROM ${FROM_IMAGE_NAME}
RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract
ENV DATA_PREP_WORKING_DIR /workspace/electra/data
WORKDIR /workspace
RUN git clone https://github.com/attardi/wikiextractor.git
RUN git clone https://github.com/soskek/bookcorpus.git
# Copy the perf_client over
#COPY --from=trt /workspace/install/ /workspace/install/
#ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH}
# Install trt python api
#RUN pip install /workspace/install/python/tensorrtserver-1.*-py3-none-linux_x86_64.whl
WORKDIR /workspace/electra
RUN pip install --upgrade --no-cache-dir pip \
&& pip install --no-cache-dir \
tqdm boto3 requests six ipdb h5py html2text nltk progressbar filelock tokenizers==0.7.0 \
git+https://github.com/NVIDIA/dllogger
RUN apt-get install -y iputils-ping
COPY . .

View file

@ -0,0 +1,607 @@
# ELECTRA For TensorFlow2
This repository provides a script and recipe to train the ELECTRA model for TensorFlow2 to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA.
## Table Of Contents
- [Model overview](#model-overview)
    * [Model architecture](#model-architecture)
    * [Default configuration](#default-configuration)
    * [Feature support matrix](#feature-support-matrix)
        * [Features](#features)
    * [Mixed precision training](#mixed-precision-training)
        * [Enabling mixed precision](#enabling-mixed-precision)
        * [Enabling TF32](#enabling-tf32)
    * [Glossary](#glossary)
- [Setup](#setup)
    * [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
    * [Scripts and sample code](#scripts-and-sample-code)
    * [Parameters](#parameters)
        + [Fine tuning parameters](#fine-tuning-parameters)
    * [Command-line options](#command-line-options)
    * [Getting the data](#getting-the-data)
    * [Training process](#training-process)
        + [Fine-tuning](#fine-tuning)
    * [Inference process](#inference-process)
        + [Fine-tuning inference](#fine-tuning-inference)
- [Performance](#performance)
    * [Benchmarking](#benchmarking)
        + [Training performance benchmark](#training-performance-benchmark)
        + [Inference performance benchmark](#inference-performance-benchmark)
    * [Results](#results)
        + [Training accuracy results](#training-accuracy-results)
            - [Fine-tuning accuracy: NVIDIA DGX A100 (8x A100 40GB)](#fine-tuning-accuracy-nvidia-dgx-a100-8x-a100-40gb)
            - [Fine-tuning accuracy: NVIDIA DGX-1 (8x V100 16GB)](#fine-tuning-accuracy-nvidia-dgx-1-8x-v100-16gb)
            - [Fine-tuning accuracy: NVIDIA DGX-2 (16x V100 32GB)](#fine-tuning-accuracy-nvidia-dgx-2-16x-v100-32gb)
            - [Training stability test](#training-stability-test)
                * [Fine-tuning stability test: NVIDIA DGX-1 (8x V100 16GB)](#fine-tuning-stability-test-nvidia-dgx-1-8x-v100-16gb)
        + [Training performance results](#training-performance-results)
            - [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
                * [Fine-tuning NVIDIA DGX A100 (8x A100 40GB)](#fine-tuning-nvidia-dgx-a100-8x-a100-40gb)
            - [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
                * [Fine-tuning NVIDIA DGX-1 (8x V100 16GB)](#fine-tuning-nvidia-dgx-1-8x-v100-16gb)
            - [Training performance: NVIDIA DGX-2 (16x V100 32GB)](#training-performance-nvidia-dgx-2-16x-v100-32gb)
                * [Fine-tuning NVIDIA DGX-2 With 32GB](#fine-tuning-nvidia-dgx-2-with-32gb)
        + [Inference performance results](#inference-performance-results)
            - [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb)
                * [Fine-tuning inference on NVIDIA DGX A100 (1x A100 40GB)](#fine-tuning-inference-on-nvidia-dgx-a100-1x-a100-40gb)
            - [Inference performance: NVIDIA DGX-1 (1x V100 16GB)](#inference-performance-nvidia-dgx-1-1x-v100-16gb)
                * [Fine-tuning inference on NVIDIA DGX-1 with 16GB](#fine-tuning-inference-on-nvidia-dgx-1-with-16gb)
            - [Inference performance: NVIDIA DGX-2 (1x V100 32GB)](#inference-performance-nvidia-dgx-2-1x-v100-32gb)
                * [Fine-tuning inference on NVIDIA DGX-2 with 32GB](#fine-tuning-inference-on-nvidia-dgx-2-with-32gb)
- [Release notes](#release-notes)
    * [Changelog](#changelog)
    * [Known issues](#known-issues)
## Model overview
ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately) is a novel method for pre-training language representations that outperforms existing techniques, given the same compute budget, on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/forum?id=r1xMH1BtvB) paper. NVIDIA's implementation of ELECTRA is an optimized version of the [Hugging Face implementation](https://huggingface.co/transformers/model_doc/electra.html), leveraging mixed precision arithmetic and Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures for faster training times while maintaining target accuracy.
This repository contains scripts to interactively launch data download, training, benchmarking and inference routines in a Docker container for fine-tuning for tasks such as question answering. The major differences between the original implementation of the paper and this version of ELECTRA are as follows:
- Fused Adam optimizer for fine-tuning tasks
- Fused CUDA kernels for faster LayerNorm
- Automatic mixed precision (AMP) training support
Other publicly available implementations of ELECTRA include:
1. [Hugging Face](https://huggingface.co/transformers/model_doc/electra.html)
2. [Google's implementation](https://github.com/google-research/electra)
This model trains with mixed precision using Tensor Cores on Volta and provides a push-button solution to pretraining on a corpus of choice. As a result, researchers can get results 4x faster than training without Tensor Cores. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Model architecture
ELECTRA is a combination of two transformer models: a generator and a discriminator. The generator's role is to replace tokens in a sequence, and it is therefore trained as a masked language model. The discriminator, which is the model we're interested in, tries to identify which tokens in the sequence were replaced by the generator. Both the generator and the discriminator use the same architecture as the encoder of the Transformer. The encoder is simply a stack of Transformer blocks, each consisting of a multi-head attention layer followed by successive stages of feed-forward networks and layer normalization. The multi-head attention layer performs self-attention on multiple input representations.
![Figure 1-1](https://1.bp.blogspot.com/-sHybc03nJRo/XmfLongdVYI/AAAAAAAAFbI/a0t5w_zOZ-UtxYaoQlVkmTRsyFJyFddtQCLcBGAsYHQ/s1600/image1.png "ELECTRA architecture")
### Default configuration
ELECTRA uses a new pre-training task, called replaced token detection (RTD), that trains a bidirectional model (like a masked language model, MLM) while learning from all input positions (like a language model, LM). Inspired by generative adversarial networks (GANs), instead of corrupting the input by replacing tokens with “[MASK]” as in BERT, the generator is trained to corrupt the input by replacing some input tokens with incorrect, but somewhat plausible, fakes. The discriminator, in turn, is trained to distinguish between “real” and “fake” input tokens.
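As a toy illustration of the RTD objective (made-up token IDs, not the repository's code): the discriminator's per-position labels are simply the positions where the generator-corrupted sequence differs from the original.
```python
import numpy as np

original  = np.array([101, 7592, 2088, 2003, 4965, 102])  # e.g. "[CLS] ... [SEP]"
corrupted = np.array([101, 7592, 4248, 2003, 4965, 102])  # generator swapped one token

# 1 = token was replaced ("fake"), 0 = token was kept ("real")
rtd_labels = (original != corrupted).astype(np.int32)
print(rtd_labels)  # [0 0 1 0 0 0]
```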
The [Google ELECTRA repository](https://github.com/google-research/electra) reports the results for three configurations of ELECTRA, each corresponding to a unique model size. This implementation provides the same configurations by default, which are described in the table below.
| **Model** | **Hidden layers** | **Hidden unit size** | **Parameters** |
|:---------:|:----------:|:---:|:----:|
|ELECTRA_SMALL|12 encoder| 256 | 14M|
|ELECTRA_BASE |12 encoder| 768 |110M|
|ELECTRA_LARGE|24 encoder|1024 |335M|
The following features were implemented in this model:
- General:
    - Mixed precision support with TensorFlow Automatic Mixed Precision (TF-AMP)
    - Multi-GPU support using Horovod
    - XLA support
- Inference:
    - Joint predictions with beam search. The default beam size is 4; see the sketch after this list.
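To make the joint prediction concrete, here is a deliberately simplified sketch: it keeps the top-`beam_size` start positions and scores (start, end) pairs by summed logits, whereas the actual joint head also conditions the end logits on the chosen start.
```python
import numpy as np

def joint_predict(start_logits, end_logits, beam_size=4, max_answer_length=30):
    """Toy joint span selection over the start/end logits of one sequence."""
    top_starts = np.argsort(start_logits)[::-1][:beam_size]
    best_score, best_span = -np.inf, (0, 0)
    for s in top_starts:
        # only consider ends within a bounded window after the start
        for e in range(s, min(s + max_answer_length, len(end_logits))):
            score = start_logits[s] + end_logits[e]
            if score > best_score:
                best_score, best_span = score, (s, e)
    return best_span

print(joint_predict(np.random.randn(384), np.random.randn(384)))
```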
### Feature support matrix
The following features are supported by this model.
| **Feature** | **ELECTRA** |
|:---------:|:----------:|
|Automatic mixed precision (AMP)|Yes|
|Horovod Multi-GPU|Yes|
#### Features
[AMP](https://nvidia.github.io/apex/amp.html) is an abbreviation used for automatic mixed precision training.
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code. AMP enables mixed precision training on Volta, Turing, and NVIDIA Ampere GPU architectures automatically. The TensorFlow framework code makes all necessary model changes internally.
In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing tf.contrib loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
- APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
#### Enabling mixed precision
In this repository, mixed precision is enabled in TensorFlow by using the Automatic Mixed Precision (TF-AMP) extension, which casts variables to half-precision upon retrieval while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In TensorFlow, loss scaling can be applied statically, by simple multiplication of the loss by a constant value, or automatically, by TF-AMP. Automatic mixed precision makes all the adjustments internally in TensorFlow, providing two benefits over manual operations. First, programmers need not modify network model code, reducing development and maintenance effort. Second, using AMP maintains forward and backward compatibility with all the APIs for defining and running TensorFlow models.
To enable mixed precision, simply add the `--amp` flag to the command line used to run the model.
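For orientation, the sketch below shows how mixed precision is typically turned on with the Keras mixed-precision API; this is a minimal illustration of what the `--amp` flag arranges, not the repository's exact code path, and it assumes TF 2.4 or later (earlier TF2 releases expose the same functionality under `tf.keras.mixed_precision.experimental`):
```python
import tensorflow as tf

# Compute in float16, keep variables in float32.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

optimizer = tf.keras.optimizers.Adam(learning_rate=4e-4)
# Loss scaling preserves small gradient magnitudes under FP16.
optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
```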
#### Enabling TF32
TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
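When exact FP32 arithmetic is needed (for example, when debugging numerical differences), TF32 can be switched off. A minimal sketch, assuming a TF2 release that exposes the toggle (2.4 and later); exporting `NVIDIA_TF32_OVERRIDE=0` before launching achieves a similar library-wide effect:
```python
import tensorflow as tf

# Disable TF32 execution on Ampere GPUs so FP32 matmuls/convolutions
# run in full single precision (API available in newer TF2 releases).
tf.config.experimental.enable_tensor_float_32_execution(False)
```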
### Glossary
**Fine-tuning**
Training an already pretrained model further using a task-specific dataset for subject-specific refinements, by adding task-specific layers on top if required.
**Language Model**
Assigns a probability distribution over a sequence of words. Given a sequence of words, it assigns a probability to the whole sequence.
**Pre-training**
Training a model on vast amounts of data on the same (or different) task to build general understanding.
**Transformer**
The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another.
## Setup
The following section lists the requirements that you need to meet in order to start training the ELECTRA model.
### Requirements
This repository contains a Dockerfile that extends the TensorFlow2 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [TensorFlow2 20.06-py3 NGC container or later](https://ngc.nvidia.com/registry/nvidia-tensorflow)
- Supported GPUs:
    - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
    - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/)
    - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
- [Running TensorFlow2](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running)
For those unable to use the TensorFlow2 NGC container, to set up the required environment or to create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html).
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the ELECTRA model. The default parameters for pretraining have been set to run on 8x A100 40G cards. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
1. Clone the repository.
`git clone https://github.com/NVIDIA/DeepLearningExamples.git`
`cd DeepLearningExamples/TensorFlow2/LanguageModeling/ELECTRA`
2. Build ELECTRA on top of the NGC container.
`bash scripts/docker/build.sh`
3. Start an interactive session in the NGC container to run training/inference.
`bash scripts/docker/launch.sh`
Resultant logs of pretraining and fine-tuning routines are stored in the `results/` folder. Checkpoints are stored in the `checkpoints/` folder.
Required data is downloaded into the `data/` directory by default.
4. Download and preprocess the dataset.
This repository provides scripts to download, verify, and extract the following datasets:
- [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) (fine-tuning for question answering)
To download, verify, and extract the datasets, and to create the shards in `.hdf5` format, run:
`/workspace/electra/data/create_datasets_from_start.sh`
5. Start fine-tuning with the SQuAD dataset.
The above pretrained ELECTRA representations can be fine-tuned with just one additional output layer for a state-of-the-art question answering system. Running the following script launches fine-tuning for question answering with the SQuAD dataset.
`bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_amp) train_eval`
More configs for different V100 and A100 hardware setups can be found in `scripts/configs/squad_config.sh`.
6. Start validation/evaluation.
Validation can be performed with `bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_amp) eval`. Training must be run first to generate the required checkpoints.
7. Start inference/predictions.
Inference can be performed with `bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_amp) prediction`. Inference predictions are saved to `<OUTPUT_DIRECTORY>/predictions.json`.
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
Descriptions of the key scripts and folders are provided below.
- `data/` - Contains scripts for downloading and preparing individual datasets, and will contain downloaded and processed datasets.
- `scripts/` - Contains shell scripts to launch data download, pre-training, and fine-tuning.
    - `run_squad.sh` - Interface for launching question answering fine-tuning with `run_squad.py`.
- `modeling.py` - Implements the ELECTRA pre-training and fine-tuning model architectures with TensorFlow2.
- `optimization.py` - Implements the Adam optimizer with TensorFlow2.
- `run_squad.py` - Implements fine tuning training and evaluation for question answering on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset.
### Parameters
#### Fine tuning parameters
Default arguments are listed below in the order the script expects them:
- ELECTRA MODEL - The default is `"google/electra-base-discriminator"`.
- Number of training Epochs - The default is `2`.
- Batch size - The default is `16`.
- Learning rate - The default is `4e-4`.
- Precision (either `amp` or `fp32`) - The default is `amp`.
- Number of GPUs - The default is `8`.
- Seed - The default is `1`.
- SQuAD version - The default is `1.1`.
- SQuAD directory - The default is `/workspace/electra/data/download/squad/v$SQUAD_VERSION`.
- Output directory for result - The default is `results/`.
- Initialize checkpoint - The default is `"None"`.
- Mode (`train`, `eval`, `train_eval`, `prediction`) - The default is `train_eval`.
The script saves the checkpoint at the end of each epoch to the `checkpoints/` folder.
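Putting those defaults together, a fully spelled-out invocation would look like the following (illustrative; every positional value is the default listed above):
`bash scripts/run_squad.sh google/electra-base-discriminator 2 16 4e-4 amp 8 1 1.1 /workspace/electra/data/download/squad/v1.1 results/ None train_eval`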
The main script, `run_tf_squad.py`, accepts the following script-specific parameters:
```
--electra_model ELECTRA_MODEL   - Specifies the type of ELECTRA model to use;
                                  should be one of the following:
                                      google/electra-small-generator
                                      google/electra-base-generator
                                      google/electra-large-generator
                                      google/electra-small-discriminator
                                      google/electra-base-discriminator
                                      google/electra-large-discriminator
--data_dir DATA_DIR             - Path to the SQuAD JSON files for training and evaluation.
--max_seq_length MAX_SEQ_LENGTH
                                - The maximum total input sequence length
                                  after WordPiece tokenization.
                                  Sequences longer than this will be truncated,
                                  and sequences shorter than this will be padded.
--doc_stride DOC_STRIDE         - When splitting up a long document into chunks,
                                  this parameter sets how much stride to take
                                  between chunks of tokens.
--max_query_length MAX_QUERY_LENGTH
                                - The maximum number of tokens for the question.
                                  Questions longer than <max_query_length>
                                  will be truncated to the value specified.
--n_best_size N_BEST_SIZE       - The total number of n-best predictions to
                                  generate in the nbest_predictions.json
                                  output file.
--max_answer_length MAX_ANSWER_LENGTH
                                - The maximum length of an answer that can be
                                  generated. This is needed because the start and
                                  end predictions are not conditioned on one another.
--joint_head <True|False>       - If true, beam search will be used to jointly predict
                                  the start and end positions. Default is True.
--beam_size BEAM_SIZE           - The beam size used to do joint predictions.
--verbose_logging               - If true, all the warnings related to data
                                  processing will be printed. A number of warnings
                                  are expected for a normal SQuAD evaluation.
--do_lower_case                 - Whether to lower case the input text. Set to
                                  true for uncased models and false for cased models.
--version_2_with_negative       - If true, the SQuAD examples contain questions
                                  that do not have an answer.
--null_score_diff_threshold NULL_SCORE_DIFF_THRESHOLD
                                - A null answer will be predicted if null_score
                                  is greater than NULL_SCORE_DIFF_THRESHOLD.
```
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example:
`python run_tf_squad.py --help`
Detailed descriptions of command-line options can be found in the [Parameters](#parameters) section.
### Getting the data
For fine-tuning a pre-trained ELECTRA model on specific tasks, this repository prepares the following dataset by default:
- [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/): for question answering
### Training process
The training process consists of two steps: pre-training and fine-tuning.
#### Fine-tuning
Fine-tuning is provided for a variety of tasks. The following tasks are included with this repository through the following scripts:
- Question Answering (`scripts/run_squad.sh`)
By default, each Python script implements fine-tuning a pre-trained ELECTRA model for a specified number of training epochs as well as evaluation of the fine-tuned model. Each shell script invokes the associated Python script with the following default parameters:
- Uses 8 GPUs
- Has FP16 precision enabled
- Has XLA enabled
- Saves a checkpoint at the end of training to the `checkpoints/` folder
Fine-tuning Python scripts implement support for mixed precision and multi-GPU training through [Horovod](https://github.com/horovod/horovod). For a full list of parameters and associated explanations, see the [Parameters](#parameters) section.
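For reference, the canonical Horovod training-step pattern for TF2 that such scripts build on looks roughly like the sketch below (simplified; the repository's actual training loop may differ):
```python
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
# Pin each worker process to a single GPU.
gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")

@tf.function
def train_step(model, optimizer, loss_fn, x, y):
    with tf.GradientTape() as tape:
        loss = loss_fn(y, model(x, training=True))
    # Average gradients across all workers before applying them.
    tape = hvd.DistributedGradientTape(tape)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss
```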
All fine-tuning shell scripts have the same positional arguments, outlined below:
```bash scripts/run_squad.sh <pretrained electra model> <epochs> <batch size> <learning rate> <amp|fp32> <num_gpus> <seed> <SQuAD version> <path to SQuAD dataset> <results directory> <checkpoint_to_load> <mode (either `train`, `eval` or `train_eval`)>```
By default, the mode positional argument is set to `train_eval`. See the [Quick Start Guide](#quick-start-guide) for explanations of each positional argument.
Note: The first positional argument (the pretrained ELECTRA model) is required.
Each fine-tuning script assumes that the corresponding dataset files exist in the `data/` directory; alternatively, a separate path can be supplied as a command-line input to `run_squad.sh`.
### Inference process
#### Fine-tuning inference
Fine-tuning evaluation is performed by the same scripts used for training:
- Question Answering (`scripts/run_squad.sh`)
The mode positional argument of the shell script is used to run in evaluation mode. The fine-tuned ELECTRA model will be run on the evaluation dataset, and the evaluation loss and accuracy will be displayed.
Each inference shell script expects dataset files to exist in the same locations as the corresponding training scripts. The inference scripts can be run with default settings. By setting the `mode` positional argument to either `eval` or `prediction`, you can choose between running predictions and evaluating them on a given dataset, or running predictions only.
`bash scripts/run_squad.sh <pretrained electra model> <epochs> <batch size> <learning rate> <amp|fp32> <num_gpus> <seed> <SQuAD version> <path to SQuAD dataset> <results directory> <path to fine-tuned model checkpoint> <eval or prediction>`
To run inference interactively on question-context pairs, use the script `run_inference.py` as follows:
`python run_inference.py --electra_model <electra_model_type> --init_checkpoint <fine_tuned_checkpoint> --question="What food does Harry like?" --context="My name is Harry and I grew up in Canada. I love apples."`
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
Training performance benchmarks for fine-tuning can be obtained by running `scripts/benchmark.sh`. The required parameters can be passed through the command-line as described in [Training process](#training-process). The performance information is printed at the end of each epoch.
To benchmark the training performance on a specific batch size, run:
`bash scripts/benchmark.sh train <num_gpus> <batch size> <infer_batch_size> <amp|fp32> <SQuAD version> <path to SQuAD dataset> <results directory> <checkpoint_to_load> <cache_dir>`
An example call used to generate throughput numbers:
`bash scripts/benchmark.sh train 8 16`
#### Inference performance benchmark
Inference performance benchmarks for fine-tuning can be obtained by running `scripts/benchmark.sh`. The required parameters can be passed through the command-line as described in [Inference process](#inference-process). This script runs one epoch by default on the SQuAD v1.1 dataset and extracts the averaged performance for the given configuration.
To benchmark the inference performance on a specific batch size, run:
`bash scripts/benchmark.sh eval <num_gpus> <batch size> <infer_batch_size> <amp|fp32> <SQuAD version> <path to SQuAD dataset> <results directory> <checkpoint_to_load> <cache_dir>`
An example call used to generate throughput numbers:
`bash scripts/benchmark.sh eval 8 256`
### Results
The following sections provide details on how we achieved our performance and accuracy in training and inference. All results are for the ELECTRA-base model on the SQuAD v1.1 dataset with a sequence length of 384, unless otherwise noted.
#### Training accuracy results
##### Fine-tuning accuracy: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by running the `scripts/run_squad.sh` training script in the tensorflow:20.06-tf2-py3 NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs.
| GPUs | Batch size / GPU | Accuracy / F1 - TF32 | Accuracy / F1 - mixed precision | Time to train - TF32 (sec) | Time to train - mixed precision (sec) | Time to train speedup (TF32 to mixed precision) |
|---------|---------------------|------------------|-----------------------------|--------------------------|---------------------------------|-------------------------------------------------|
| 1 | 32 | 87.19 / 92.85 | 87.19 / 92.84 | 1699 | 749 | 2.27 |
| 8 | 32 | 86.84 / 92.57 | 86.83 / 92.56 | 263 | 201 | 1.30 |
##### Fine-tuning accuracy: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the `scripts/run_squad.sh` training script in the tensorflow:20.06-tf2-py3 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs.
| GPUs | Batch size / GPU (FP32 : mixed precision) | Accuracy / F1 - FP32 | Accuracy / F1 - mixed precision | Time to train - FP32 (sec) | Time to train - mixed precision (sec) | Time to train speedup (FP32 to mixed precision) |
|---------|---------------------|------------------|-----------------------------|--------------------------|---------------------------------|-------------------------------------------------|
| 1 | 8 : 16 | 87.36 / 92.82 | 87.32 / 92.74 | 5136 | 1378 | 3.73 |
| 8 | 8 : 16 | 87.02 / 92.73 | 87.02 / 92.72 | 730 | 334 | 2.18 |
##### Fine-tuning accuracy: NVIDIA DGX-2 (16x V100 32GB)
Our results were obtained by running the `scripts/run_squad.sh` training script in the tensorflow:20.06-tf2-py3 NGC container on NVIDIA DGX-2 (16x V100 32G) GPUs.
| GPUs | Batch size / GPU | Accuracy / F1 - FP32 | Accuracy / F1 - mixed precision | Time to train - FP32 (sec) | Time to train - mixed precision (sec) | Time to train speedup (FP32 to mixed precision) |
|---------|---------------------|------------------|-----------------------------|--------------------------|---------------------------------|-------------------------------------------------|
| 1 | 32 | 87.14 / 92.69 | 86.95 / 92.69 | 4478 | 1162 | 3.85 |
| 16 | 32 | 86.95 / 90.58 | 86.93 / 92.48 | 333 | 229 | 1.45 |
##### Training stability test
###### Fine-tuning stability test: NVIDIA DGX-1 (8x V100 16GB)
Training stability with 8 GPUs, FP16 computations, batch size of 16 on SQuAD v1.1:
| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation
|---|---|---|---|---|---|---|---
|Exact Match %| 86.99 | 86.81 | 86.95 | 87.10 | 87.26 | 87.02 | 0.17
| f1 % | 92.7 | 92.66 | 92.65 | 92.61 | 92.97 | 92.72 | 0.14
Training stability with 8 GPUs, FP16 computations, batch size of 16 on SQuAD v2.0:
| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation
|---|---|---|---|---|---|---|---
|Exact Match %| 83.00 | 82.84 | 83.11 | 82.70 | 82.94 | 82.91 | 0.15
| f1 % | 85.63 | 85.48 | 85.69 | 85.31 | 85.57 | 85.54 | 0.15
#### Training performance results
##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by running the `scripts/benchmark.sh` training script in the tensorflow:20.06-tf2-py3 NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
###### Fine-tuning NVIDIA DGX A100 (8x A100 40GB)
| GPUs | Batch size / GPU | Throughput - TF32 (sequences/sec) | Throughput - mixed precision (sequences/sec) | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
|------------------|----------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
| 1 | 32 | 104 | 285 | 2.73 | 1.00 | 1.00
| 4 | 32 | 405 | 962 | 2.37 | 3.88 | 3.37
| 8 | 32 | 809 | 1960| 2.42 | 7.75 | 6.87
##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the `scripts/benchmark.sh` training scripts in the tensorflow:20.06-tf2-py3 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
###### Fine-tuning NVIDIA DGX-1 (8x V100 16GB)
| GPUs | Batch size / GPU (FP32 : mixed precision) | Throughput - FP32 (sequences/sec) | Throughput - mixed precision (sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|------------------|----------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|----------------------------------------------
|1 | 8 : 16| 35| 144| 4.11| 1.00| 1.00
|4 | 8 : 16| 133| 508| 3.81| 3.80| 3.52
|8 | 8 : 16| 263| 965| 3.67| 7.51| 6.70
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Training performance: NVIDIA DGX-2 (16x V100 32GB)
Our results were obtained by running the `scripts/benchmark.sh` training scripts in the tensorflow:20.06-tf2-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch.
###### Fine-tuning NVIDIA DGX-2 With 32GB
| GPUs | Batch size / GPU | Throughput - FP32 (sequences/sec) | Throughput - mixed precision (sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|------|------------------|----------------------------------|---------------------------------------------|---------------------------------------------|---------------------|--------------------------------|
| 1 | 16 | 40 | 173 | 4.33 | 1.00 | 1.00 |
| 4 | 16 | 157 | 625 | 3.98 | 3.93 | 3.61 |
| 8 | 16 | 311 | 1209 | 3.89 | 7.78 | 6.99 |
| 16 | 16 | 611 | 2288 | 3.74 | 15.28 | 13.23 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Inference performance results
##### Inference performance: NVIDIA DGX A100 (1x A100 40GB)
Our results were obtained by running the `scripts/benchmark.sh` inference benchmarking script in the tensorflow:20.06-tf2-py3 NGC container on an NVIDIA DGX A100 (1x A100 40GB) GPU.
###### Fine-tuning inference on NVIDIA DGX A100 (1x A100 40GB)
FP16
| Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
|------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------|
| 1 | 384 | 178 | 5.630 | 5.500 | 5.555 | 5.608 |
| 256 | 384 | 857 | 1.112 | 1.111 | 1.111 | 1.112 |
| 512 | 384 | 864 | 1.054 | 1.051 | 1.053 | 1.053 |
TF32
| Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
|------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------|
| 1 | 384 | 123 | 8.186 | 7.995 | 8.078 | 8.152 |
| 256 | 384 | 344 | 2.832 | 2.822 | 2.826 | 2.830 |
| 512 | 384 | 351 | 2.787 | 2.781 | 2.784 | 2.784 |
##### Inference performance: NVIDIA DGX-1 (1x V100 16GB)
Our results were obtained by running the `scripts/benchmark.sh` script in the tensorflow:20.06-tf2-py3 NGC container on NVIDIA DGX-1 with one V100 16GB GPU.
###### Fine-tuning inference on NVIDIA DGX-1 with 16GB
FP16
| Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
|------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------|
| 1 | 384 | 141 | 7.100 | 7.071 | 7.081 | 7.091 |
| 128 | 384 | 517 | 1.933 | 1.930 | 1.930 | 1.932 |
| 256 | 384 | 524 | 1.910 | 1.907 | 1.908 | 1.909 |
FP32
| Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
|------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------|
| 1 | 384 | 84 | 11.869 | 11.814 | 11.832 | 11.850 |
| 128 | 384 | 117 | 8.548 | 8.527 | 8.529 | 8.537 |
| 256 | 384 | 141 | 7.100 | 7.071 | 7.081 | 7.091 |
##### Inference performance: NVIDIA DGX-2 (1x V100 32GB)
Our results were obtained by running the `scripts/benchmark.sh` scripts in the tensorflow:20.06-tf2-py3 NGC container on NVIDIA DGX-2 with one V100 32GB GPU.
###### Fine-tuning inference on NVIDIA DGX-2 with 32GB
FP16
| Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
|------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------|
| 1 | 384 | 144 | 6.953 | 6.888 | 6.910 | 6.932 |
| 128 | 384 | 547 | 1.828 | 1.827 | 1.827 | 1.828 |
| 256 | 384 | 557 | 1.795 | 1.792 | 1.793 | 1.794 |
FP32
| Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) |
|------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------|
| 1 | 384 | 86 | 11.580 | 11.515 | 11.535 | 11.558 |
| 128 | 384 | 124 | 8.056 | 8.05 | 8.052 | 8.055 |
| 256 | 384 | 125 | 8.006 | 8.002 | 8.004 | 8.005 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
## Release notes
### Changelog
July 2020
- Initial release.
### Known issues
There are no known issues with this model.

View file

@ -0,0 +1,234 @@
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Writes out text data as tfrecords that ELECTRA can be pre-trained on."""
import argparse
import multiprocessing
import os
import random
import time
import tensorflow.compat.v1 as tf
import utils
from tokenization import ElectraTokenizer
def create_int_feature(values):
  feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
  return feature


class ExampleBuilder(object):
  """Given a stream of input text, creates pretraining examples."""

  def __init__(self, tokenizer, max_length):
    self._tokenizer = tokenizer
    self._current_sentences = []
    self._current_length = 0
    self._max_length = max_length
    self._target_length = max_length

  def add_line(self, line):
    """Adds a line of text to the current example being built."""
    line = line.strip().replace("\n", " ")
    if (not line) and self._current_length != 0:  # empty lines separate docs
      return self._create_example()
    bert_tokens = self._tokenizer.tokenize(line)
    bert_tokids = self._tokenizer.convert_tokens_to_ids(bert_tokens)
    self._current_sentences.append(bert_tokids)
    self._current_length += len(bert_tokids)
    if self._current_length >= self._target_length:
      return self._create_example()
    return None

  def _create_example(self):
    """Creates a pre-training example from the current list of sentences."""
    # small chance to only have one segment as in classification tasks
    if random.random() < 0.1:
      first_segment_target_length = 100000
    else:
      # -3 due to not yet having [CLS]/[SEP] tokens in the input text
      first_segment_target_length = (self._target_length - 3) // 2

    first_segment = []
    second_segment = []
    for sentence in self._current_sentences:
      # the sentence goes to the first segment if (1) the first segment is
      # empty, (2) the sentence doesn't put the first segment over length or
      # (3) 50% of the time when it does put the first segment over length
      if (len(first_segment) == 0 or
          len(first_segment) + len(sentence) < first_segment_target_length or
          (len(second_segment) == 0 and
           len(first_segment) < first_segment_target_length and
           random.random() < 0.5)):
        first_segment += sentence
      else:
        second_segment += sentence

    # trim to max_length while accounting for not-yet-added [CLS]/[SEP] tokens
    first_segment = first_segment[:self._max_length - 2]
    second_segment = second_segment[:max(0, self._max_length -
                                         len(first_segment) - 3)]

    # prepare to start building the next example
    self._current_sentences = []
    self._current_length = 0
    # small chance for random-length instead of max_length-length example
    if random.random() < 0.05:
      self._target_length = random.randint(5, self._max_length)
    else:
      self._target_length = self._max_length

    return self._make_tf_example(first_segment, second_segment)

  def _make_tf_example(self, first_segment, second_segment):
    """Converts two "segments" of text into a tf.train.Example."""
    vocab = self._tokenizer.vocab
    input_ids = [vocab["[CLS]"]] + first_segment + [vocab["[SEP]"]]
    segment_ids = [0] * len(input_ids)
    if second_segment:
      input_ids += second_segment + [vocab["[SEP]"]]
      segment_ids += [1] * (len(second_segment) + 1)
    input_mask = [1] * len(input_ids)
    input_ids += [0] * (self._max_length - len(input_ids))
    input_mask += [0] * (self._max_length - len(input_mask))
    segment_ids += [0] * (self._max_length - len(segment_ids))
    tf_example = tf.train.Example(features=tf.train.Features(feature={
        "input_ids": create_int_feature(input_ids),
        "input_mask": create_int_feature(input_mask),
        "segment_ids": create_int_feature(segment_ids)
    }))
    return tf_example


class ExampleWriter(object):
  """Writes pre-training examples to disk."""

  def __init__(self, job_id, vocab_file, output_dir, max_seq_length,
               num_jobs, blanks_separate_docs, do_lower_case,
               num_out_files=1000):
    self._blanks_separate_docs = blanks_separate_docs
    tokenizer = ElectraTokenizer(
        vocab_file=vocab_file,
        do_lower_case=do_lower_case)
    self._example_builder = ExampleBuilder(tokenizer, max_seq_length)
    self._writers = []
    for i in range(num_out_files):
      if i % num_jobs == job_id:
        output_fname = os.path.join(
            output_dir, "pretrain_data.tfrecord-{:}-of-{:}".format(
                i, num_out_files))
        self._writers.append(tf.io.TFRecordWriter(output_fname))
    self.n_written = 0

  def write_examples(self, input_file):
    """Writes out examples from the provided input file."""
    with tf.io.gfile.GFile(input_file) as f:
      for line in f:
        line = line.strip()
        if line or self._blanks_separate_docs:
          example = self._example_builder.add_line(line)
          if example:
            self._writers[self.n_written % len(self._writers)].write(
                example.SerializeToString())
            self.n_written += 1
      example = self._example_builder.add_line("")
      if example:
        self._writers[self.n_written % len(self._writers)].write(
            example.SerializeToString())
        self.n_written += 1

  def finish(self):
    for writer in self._writers:
      writer.close()


def write_examples(job_id, args):
  """A single process creating and writing out pre-processed examples."""

  def log(*args):
    msg = " ".join(map(str, args))
    print("Job {}:".format(job_id), msg)

  log("Creating example writer")
  example_writer = ExampleWriter(
      job_id=job_id,
      vocab_file=args.vocab_file,
      output_dir=args.output_dir,
      max_seq_length=args.max_seq_length,
      num_jobs=args.num_processes,
      blanks_separate_docs=args.blanks_separate_docs,
      do_lower_case=args.do_lower_case,
      num_out_files=args.num_out_files,
  )
  log("Writing tf examples")
  fnames = sorted(tf.io.gfile.listdir(args.corpus_dir))
  fnames = [f for (i, f) in enumerate(fnames)
            if i % args.num_processes == job_id]
  random.shuffle(fnames)
  start_time = time.time()
  for file_no, fname in enumerate(fnames):
    if file_no > 0:
      elapsed = time.time() - start_time
      log("processed {:}/{:} files ({:.1f}%), ELAPSED: {:}s, ETA: {:}s, "
          "{:} examples written".format(
              file_no, len(fnames), 100.0 * file_no / len(fnames), int(elapsed),
              int((len(fnames) - file_no) / (file_no / elapsed)),
              example_writer.n_written))
    example_writer.write_examples(os.path.join(args.corpus_dir, fname))
  example_writer.finish()
  log("Done!")

# Usage: python build_pretraining_dataset.py --corpus-dir <dir> --vocab-file <file> --output-dir <dir>
def main():
  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument("--corpus-dir", required=True,
                      help="Location of pre-training text files.")
  parser.add_argument("--vocab-file", required=True,
                      help="Location of vocabulary file.")
  parser.add_argument("--output-dir", required=True,
                      help="Where to write out the tfrecords.")
  parser.add_argument("--max-seq-length", default=128, type=int,
                      help="Number of tokens per example.")
  parser.add_argument("--num-processes", default=1, type=int,
                      help="Parallelize across multiple processes.")
parser.add_argument("--blanks-separate-docs", default=True, type=bool,
help="Whether blank lines indicate document boundaries.")
parser.add_argument("--do-lower-case", dest='do_lower_case',
action='store_true', help="Lower case input text.")
parser.add_argument("--no-lower-case", dest='do_lower_case',
action='store_false', help="Don't lower case input text.")
parser.add_argument("--num-out-files", default=1000, type=int,
help="Number of output files.")
parser.set_defaults(do_lower_case=True)
args = parser.parse_args()
utils.rmkdir(args.output_dir)
if args.num_processes == 1:
write_examples(0, args)
else:
jobs = []
for i in range(args.num_processes):
job = multiprocessing.Process(target=write_examples, args=(i, args))
jobs.append(job)
job.start()
for job in jobs:
job.join()
if __name__ == "__main__":
main()
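To sanity-check the generated shards, one can parse a record back with the same feature layout the writer emits above (a minimal sketch; the glob pattern assumes the default output naming, and `<output-dir>` is a placeholder):
```python
import tensorflow as tf

# Feature spec mirrors _make_tf_example: three int64 vectors padded to
# --max-seq-length (128 by default).
feature_spec = {
    "input_ids": tf.io.FixedLenFeature([128], tf.int64),
    "input_mask": tf.io.FixedLenFeature([128], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([128], tf.int64),
}

dataset = tf.data.TFRecordDataset(
    tf.io.gfile.glob("<output-dir>/pretrain_data.tfrecord-*"))
for record in dataset.take(1):
    example = tf.io.parse_single_example(record, feature_spec)
    print(example["input_ids"].numpy())
```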

View file

@ -0,0 +1,132 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" ELECTRA model configuration """
import logging
from configuration_utils import PretrainedConfig
logger = logging.getLogger(__name__)
ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/config.json",
    "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/config.json",
    "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/config.json",
    "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/config.json",
    "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/config.json",
    "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/config.json",
}


class ElectraConfig(PretrainedConfig):
    r"""
        This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`.
        It is used to instantiate an ELECTRA model according to the specified arguments, defining the model
        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
        the ELECTRA `google/electra-small-discriminator <https://huggingface.co/google/electra-small-discriminator>`__
        architecture.

        Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
        to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
        for more information.

        Args:
            vocab_size (:obj:`int`, optional, defaults to 30522):
                Vocabulary size of the ELECTRA model. Defines the different tokens that
                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`.
            embedding_size (:obj:`int`, optional, defaults to 128):
                Dimensionality of the token embeddings.
            hidden_size (:obj:`int`, optional, defaults to 256):
                Dimensionality of the encoder layers and the pooler layer.
            num_hidden_layers (:obj:`int`, optional, defaults to 12):
                Number of hidden layers in the Transformer encoder.
            num_attention_heads (:obj:`int`, optional, defaults to 4):
                Number of attention heads for each attention layer in the Transformer encoder.
            intermediate_size (:obj:`int`, optional, defaults to 1024):
                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
                The non-linear activation function (function or string) in the encoder and pooler.
                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
                The dropout ratio for the attention probabilities.
            max_position_embeddings (:obj:`int`, optional, defaults to 512):
                The maximum sequence length that this model might ever be used with.
                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
            type_vocab_size (:obj:`int`, optional, defaults to 2):
                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`.
            initializer_range (:obj:`float`, optional, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
                The epsilon used by the layer normalization layers.

        Example::

            from transformers import ElectraModel, ElectraConfig

            # Initializing an ELECTRA electra-base-uncased style configuration
            configuration = ElectraConfig()

            # Initializing a model from the electra-base-uncased style configuration
            model = ElectraModel(configuration)

            # Accessing the model configuration
            configuration = model.config

        Attributes:
            pretrained_config_archive_map (Dict[str, str]):
                A dictionary containing all the available pre-trained checkpoints.
    """
    pretrained_config_archive_map = ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP
    model_type = "electra"

    def __init__(
        self,
        vocab_size=30522,
        embedding_size=128,
        hidden_size=256,
        num_hidden_layers=12,
        num_attention_heads=4,
        intermediate_size=1024,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        **kwargs
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
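The defaults above correspond to the small model. A hedged sketch of how a base-sized configuration could be built by overriding those defaults follows; the authoritative values ship in the hosted `config.json` files, so treat these numbers as illustrative assumptions:
```python
# Illustrative base-sized ELECTRA configuration (values are assumptions;
# the published sizes come from the config.json files in the archive map).
base_config = ElectraConfig(
    embedding_size=768,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
)
```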

View file

@ -0,0 +1,517 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Configuration base class and utilities."""
import copy
import json
import logging
import os
from typing import Dict, Optional, Tuple
from file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url
logger = logging.getLogger(__name__)
class PretrainedConfig(object):
r""" Base class for all configuration classes.
Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
Note:
A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
It only affects the model's configuration.
Class attributes (overridden by derived classes):
- ``pretrained_config_archive_map``: a python ``dict`` with `shortcut names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
- ``model_type``: a string that identifies the model type, that we serialize into the JSON file, and that we use to recreate the correct object in :class:`~transformers.AutoConfig`.
Args:
finetuning_task (:obj:`string` or :obj:`None`, `optional`, defaults to :obj:`None`):
Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
num_labels (:obj:`int`, `optional`, defaults to `2`):
Number of classes to use when the model is a classification model (sequences/tokens)
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
Should the model returns attentions weights.
output_hidden_states (:obj:`string`, `optional`, defaults to :obj:`False`):
Should the model returns all hidden-states.
torchscript (:obj:`bool`, `optional`, defaults to :obj:`False`):
Is the model used with Torchscript (for PyTorch models).
"""
pretrained_config_archive_map = {} # type: Dict[str, str]
model_type = "" # type: str
def __init__(self, **kwargs):
# Attributes with defaults
self.output_attentions = kwargs.pop("output_attentions", False)
self.output_hidden_states = kwargs.pop("output_hidden_states", False)
self.output_past = kwargs.pop("output_past", True) # Not used by all models
self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models
self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
self.pruned_heads = kwargs.pop("pruned_heads", {})
# Is decoder is used in encoder-decoder models to differentiate encoder from decoder
self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False)
self.is_decoder = kwargs.pop("is_decoder", False)
# Parameters for sequence generation
self.max_length = kwargs.pop("max_length", 20)
self.min_length = kwargs.pop("min_length", 0)
self.do_sample = kwargs.pop("do_sample", False)
self.early_stopping = kwargs.pop("early_stopping", False)
self.num_beams = kwargs.pop("num_beams", 1)
self.temperature = kwargs.pop("temperature", 1.0)
self.top_k = kwargs.pop("top_k", 50)
self.top_p = kwargs.pop("top_p", 1.0)
self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0)
self.length_penalty = kwargs.pop("length_penalty", 1.0)
self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0)
self.bad_words_ids = kwargs.pop("bad_words_ids", None)
self.num_return_sequences = kwargs.pop("num_return_sequences", 1)
# Fine-tuning task arguments
self.architectures = kwargs.pop("architectures", None)
self.finetuning_task = kwargs.pop("finetuning_task", None)
self.num_labels = kwargs.pop("num_labels", 2)
self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)})
self.id2label = dict((int(key), value) for key, value in self.id2label.items())
self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys())))
self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
# Tokenizer arguments TODO: eventually tokenizer and models should share the same config
self.prefix = kwargs.pop("prefix", None)
self.bos_token_id = kwargs.pop("bos_token_id", None)
self.pad_token_id = kwargs.pop("pad_token_id", None)
self.eos_token_id = kwargs.pop("eos_token_id", None)
self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None)
# task specific arguments
self.task_specific_params = kwargs.pop("task_specific_params", None)
# TPU arguments
self.xla_device = kwargs.pop("xla_device", None)
# Additional attributes without default values
for key, value in kwargs.items():
try:
setattr(self, key, value)
except AttributeError as err:
logger.error("Can't set {} with value {} for {}".format(key, value, self))
raise err
@property
def num_labels(self):
return self._num_labels
@num_labels.setter
def num_labels(self, num_labels):
self._num_labels = num_labels
self.id2label = {i: "LABEL_{}".format(i) for i in range(self.num_labels)}
self.id2label = dict((int(key), value) for key, value in self.id2label.items())
self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))
self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
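# Illustration (comment added for clarity): assigning ``num_labels`` rebuilds
# both label maps, e.g. ``config.num_labels = 3`` yields
# ``id2label == {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}`` and
# ``label2id == {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}``.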
def save_pretrained(self, save_directory):
"""
Save a configuration object to the directory `save_directory`, so that it
can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
Args:
save_directory (:obj:`string`):
Directory where the configuration JSON file will be saved.
"""
assert os.path.isdir(
save_directory
), "Saving path should be a directory where the model and configuration can be saved"
# If we save using the predefined names, we can load using `from_pretrained`
output_config_file = os.path.join(save_directory, CONFIG_NAME)
self.to_json_file(output_config_file)
logger.info("Configuration saved in {}".format(output_config_file))
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig":
r"""
Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
Args:
pretrained_model_name_or_path (:obj:`string`):
either:
- a string with the `shortcut name` of a pre-trained model configuration to load from cache or
download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to
our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a configuration file saved using the
:func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
- a path or url to a saved configuration JSON `file`, e.g.:
``./my_model_directory/configuration.json``.
cache_dir (:obj:`string`, `optional`):
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
kwargs (:obj:`Dict[str, any]`, `optional`):
The values in kwargs of any keys which are configuration attributes will be used to override the loaded
values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is
controlled by the `return_unused_kwargs` keyword parameter.
force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
proxies (:obj:`Dict`, `optional`):
A dictionary of proxy servers to use by protocol or endpoint, e.g.:
:obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`.
The proxies are used on each request.
return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
If False, this function returns just the final configuration object.
If True, it returns a :obj:`Tuple(config, unused_kwargs)`, where `unused_kwargs` is a
dictionary consisting of the key/value pairs whose keys are not configuration attributes, i.e. the part
of kwargs which has not been used to update `config` and is otherwise ignored.
Returns:
:class:`PretrainedConfig`: An instance of a configuration object
Examples::
# We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
# derived class: BertConfig
config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
assert config.output_attentions == True
config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True,
foo=False, return_unused_kwargs=True)
assert config.output_attentions == True
assert unused_kwargs == {'foo': False}
"""
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
return cls.from_dict(config_dict, **kwargs)
@classmethod
def get_config_dict(
cls, pretrained_model_name_or_path: str, pretrained_config_archive_map: Optional[Dict] = None, **kwargs
) -> Tuple[Dict, Dict]:
"""
From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used
for instantiating a Config using `from_dict`.
Parameters:
pretrained_model_name_or_path (:obj:`string`):
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
pretrained_config_archive_map (:obj:`Dict[str, str]`, `optional`):
A map of `shortcut names` to `url`. By default, the current class attribute is used.
Returns:
:obj:`Tuple[Dict, Dict]`: The dictionary that will be used to instantiate the configuration object.
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
proxies = kwargs.pop("proxies", None)
local_files_only = kwargs.pop("local_files_only", False)
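# Note (comment added for clarity): the branches below resolve the config
# location in order: a known shortcut name from the archive map, then a local
# directory expected to contain CONFIG_NAME, then an explicit file path or
# remote URL used verbatim, and finally a bucket URL derived from the model
# identifier via hf_bucket_url.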
if pretrained_config_archive_map is None:
pretrained_config_archive_map = cls.pretrained_config_archive_map
if pretrained_model_name_or_path in pretrained_config_archive_map:
config_file = pretrained_config_archive_map[pretrained_model_name_or_path]
elif os.path.isdir(pretrained_model_name_or_path):
config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
config_file = pretrained_model_name_or_path
else:
config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)
try:
# Load from URL or cache if already cached
resolved_config_file = cached_path(
config_file,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
local_files_only=local_files_only,
)
# Load config dict
if resolved_config_file is None:
raise EnvironmentError
config_dict = cls._dict_from_json_file(resolved_config_file)
except EnvironmentError:
if pretrained_model_name_or_path in pretrained_config_archive_map:
msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
config_file
)
else:
msg = (
"Can't load '{}'. Make sure that:\n\n"
"- '{}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
"- or '{}' is the correct path to a directory containing a '{}' file\n\n".format(
pretrained_model_name_or_path,
pretrained_model_name_or_path,
pretrained_model_name_or_path,
CONFIG_NAME,
)
)
raise EnvironmentError(msg)
except json.JSONDecodeError:
msg = (
"Couldn't reach server at '{}' to download configuration file or "
"configuration file is not a valid JSON file. "
"Please check network or file content here: {}.".format(config_file, resolved_config_file)
)
raise EnvironmentError(msg)
if resolved_config_file == config_file:
logger.info("loading configuration file {}".format(config_file))
else:
logger.info("loading configuration file {} from cache at {}".format(config_file, resolved_config_file))
return config_dict, kwargs
@classmethod
def from_dict(cls, config_dict: Dict, **kwargs) -> "PretrainedConfig":
"""
Constructs a `Config` from a Python dictionary of parameters.
Args:
config_dict (:obj:`Dict[str, any]`):
Dictionary that will be used to instantiate the configuration object. Such a dictionary can be retrieved
from a pre-trained checkpoint by leveraging the :func:`~transformers.PretrainedConfig.get_config_dict`
method.
kwargs (:obj:`Dict[str, any]`):
Additional parameters from which to initialize the configuration object.
Returns:
:class:`PretrainedConfig`: An instance of a configuration object
"""
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
config = cls(**config_dict)
if hasattr(config, "pruned_heads"):
config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
# Update config with kwargs if needed
to_remove = []
for key, value in kwargs.items():
if hasattr(config, key):
setattr(config, key, value)
to_remove.append(key)
for key in to_remove:
kwargs.pop(key, None)
logger.info("Model config %s", str(config))
if return_unused_kwargs:
return config, kwargs
else:
return config
@classmethod
def from_json_file(cls, json_file: str) -> "PretrainedConfig":
"""
Constructs a `Config` from the path to a json file of parameters.
Args:
json_file (:obj:`string`):
Path to the JSON file containing the parameters.
Returns:
:class:`PretrainedConfig`: An instance of a configuration object
"""
config_dict = cls._dict_from_json_file(json_file)
return cls(**config_dict)
@classmethod
def _dict_from_json_file(cls, json_file: str):
with open(json_file, "r", encoding="utf-8") as reader:
text = reader.read()
return json.loads(text)
def __eq__(self, other):
return self.__dict__ == other.__dict__
def __repr__(self):
return "{} {}".format(self.__class__.__name__, self.to_json_string())
def to_dict(self):
"""
Serializes this instance to a Python dictionary.
Returns:
:obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
if hasattr(self.__class__, "model_type"):
output["model_type"] = self.__class__.model_type
return output
def to_json_string(self):
"""
Serializes this instance to a JSON string.
Returns:
:obj:`string`: String containing all the attributes that make up this configuration instance in JSON format.
"""
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
def to_json_file(self, json_file_path):
"""
Save this instance to a json file.
Args:
json_file_path (:obj:`string`):
Path to the JSON file in which this configuration instance's parameters will be saved.
"""
with open(json_file_path, "w", encoding="utf-8") as writer:
writer.write(self.to_json_string())
def update(self, config_dict: Dict):
"""
Updates attributes of this instance with attributes from `config_dict`.
Args:
config_dict (:obj:`Dict[str, any]`): Dictionary of attributes that shall be updated for this instance.
"""
for key, value in config_dict.items():
setattr(self, key, value)
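# --- Usage sketch (added for illustration; not called by the library) -------
# A minimal save/reload round trip through the JSON helpers above. The derived
# class, attribute names, and directory below are hypothetical.
def _config_roundtrip_example(save_dir="/tmp/toy_config"):
    class _ToyConfig(PretrainedConfig):
        model_type = "toy"

        def __init__(self, hidden_size=128, **kwargs):
            super().__init__(**kwargs)
            self.hidden_size = hidden_size

    os.makedirs(save_dir, exist_ok=True)
    config = _ToyConfig(hidden_size=256, num_labels=3)
    config.save_pretrained(save_dir)  # writes CONFIG_NAME as JSON
    reloaded = _ToyConfig.from_json_file(os.path.join(save_dir, CONFIG_NAME))
    assert reloaded.hidden_size == 256  # attributes survive the round trip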
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
"bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
"bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
"bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
"bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
"bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
"bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
"bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
"bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
"bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
"bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
"bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
"bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
"bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
"bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
"bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
"bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
"bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
"bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
"bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json",
}
class BertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
It is used to instantiate a BERT model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
for more information.
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
Vocabulary size of the BERT model. Defines the different tokens that
can be represented by the `input_ids` passed to the forward method of :class:`~transformers.BertModel`.
hidden_size (:obj:`int`, optional, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
The epsilon used by the layer normalization layers.
Example::
from transformers import BertModel, BertConfig
# Initializing a BERT bert-base-uncased style configuration
configuration = BertConfig()
# Initializing a model from the bert-base-uncased style configuration
model = BertModel(configuration)
# Accessing the model configuration
configuration = model.config
Attributes:
pretrained_config_archive_map (Dict[str, str]):
A dictionary containing all the available pre-trained checkpoints.
"""
pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
model_type = "bert"
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
**kwargs
):
super().__init__(pad_token_id=pad_token_id, **kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps

View file

@ -0,0 +1,26 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
class BooksDownloader:
def __init__(self, save_path):
self.save_path = save_path
def download(self):
bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out'
bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus'
bookscorpus_download_command += ' --trash-bad-count'
bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True)

View file

@ -0,0 +1,32 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
class BookscorpusTextFormatting:
def __init__(self, books_path, output_filename, recursive = False):
self.books_path = books_path
self.recursive = recursive
self.output_filename = output_filename
# This puts one book per line
def merge(self):
with open(self.output_filename, mode='w', newline='\n') as ofile:
for filename in glob.glob(self.books_path + '/' + '*.txt', recursive=self.recursive):
with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file:
for line in file:
if line.strip() != '':
ofile.write(line.strip() + ' ')
ofile.write("\n\n")

View file

@ -0,0 +1,91 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader
from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader
from WikiDownloader import WikiDownloader
from BooksDownloader import BooksDownloader
from MRPCDownloader import MRPCDownloader
from SquadDownloader import SquadDownloader
class Downloader:
def __init__(self, dataset_name, save_path):
self.dataset_name = dataset_name
self.save_path = save_path
def download(self):
if self.dataset_name == 'bookscorpus':
self.download_bookscorpus()
elif self.dataset_name == 'wikicorpus_en':
self.download_wikicorpus('en')
elif self.dataset_name == 'wikicorpus_zh':
self.download_wikicorpus('zh')
elif self.dataset_name == 'google_pretrained_weights':
self.download_google_pretrained_weights()
elif self.dataset_name == 'nvidia_pretrained_weights':
self.download_nvidia_pretrained_weights()
elif self.dataset_name == 'mrpc':
self.download_mrpc()
elif self.dataset_name == 'squad':
self.download_squad()
elif self.dataset_name == 'all':
self.download_bookscorpus()
self.download_wikicorpus('en')
self.download_wikicorpus('zh')
self.download_google_pretrained_weights()
self.download_nvidia_pretrained_weights()
self.download_mrpc()
self.download_squad()
else:
print(self.dataset_name)
assert False, 'Unknown dataset_name provided to downloader'
def download_bookscorpus(self):
downloader = BooksDownloader(self.save_path)
downloader.download()
def download_wikicorpus(self, language):
downloader = WikiDownloader(language, self.save_path)
downloader.download()
def download_google_pretrained_weights(self):
downloader = GooglePretrainedWeightDownloader(self.save_path)
downloader.download()
def download_nvidia_pretrained_weights(self):
downloader = NVIDIAPretrainedWeightDownloader(self.save_path)
downloader.download()
def download_mrpc(self):
downloader = MRPCDownloader(self.save_path)
downloader.download()
def download_squad(self):
downloader = SquadDownloader(self.save_path)
downloader.download()
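# --- Usage sketch (added for illustration) ----------------------------------
# Mirrors how dataPrep.py drives this class for the 'download' action; the
# save path below follows the DATA_PREP_WORKING_DIR layout used in this repo.
if __name__ == "__main__":
    downloader = Downloader('squad', '/workspace/electra/data/download')
    downloader.download()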

View file

@ -0,0 +1,158 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import os
import urllib.request
import zipfile
class GooglePretrainedWeightDownloader:
def __init__(self, save_path):
self.save_path = save_path + '/google_pretrained_weights'
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
# Download urls
self.model_urls = {
'bert_base_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'),
'bert_large_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'),
'bert_base_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'),
'bert_large_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'),
'bert_base_multilingual_cased': ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'),
'bert_large_multilingual_uncased': ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'),
'bert_base_chinese': ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip')
}
# SHA256sum verification for file download integrity (and checking for changes from the download source over time)
self.bert_base_uncased_sha = {
'bert_config.json': '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc',
'bert_model.ckpt.data-00000-of-00001': '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84',
'bert_model.ckpt.index': '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b',
'bert_model.ckpt.meta': 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e',
'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}
self.bert_large_uncased_sha = {
'bert_config.json': 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb',
'bert_model.ckpt.data-00000-of-00001': 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1',
'bert_model.ckpt.index': '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093',
'bert_model.ckpt.meta': '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1',
'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3',
}
self.bert_base_cased_sha = {
'bert_config.json': 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc',
'bert_model.ckpt.data-00000-of-00001': '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea',
'bert_model.ckpt.index': '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1',
'bert_model.ckpt.meta': '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98',
'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}
self.bert_large_cased_sha = {
'bert_config.json': '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57',
'bert_model.ckpt.data-00000-of-00001': '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0',
'bert_model.ckpt.index': 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf',
'bert_model.ckpt.meta': 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1',
'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02',
}
self.bert_base_multilingual_cased_sha = {
'bert_config.json': 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0',
'bert_model.ckpt.data-00000-of-00001': '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5',
'bert_model.ckpt.index': '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37',
'bert_model.ckpt.meta': '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa',
'vocab.txt': 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c',
}
self.bert_large_multilingual_uncased_sha = {
'bert_config.json': '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624',
'bert_model.ckpt.data-00000-of-00001': '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429',
'bert_model.ckpt.index': '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7',
'bert_model.ckpt.meta': '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29',
'vocab.txt': '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f',
}
self.bert_base_chinese_sha = {
'bert_config.json': '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015',
'bert_model.ckpt.data-00000-of-00001': '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba',
'bert_model.ckpt.index': '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e',
'bert_model.ckpt.meta': 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047',
'vocab.txt': '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c',
}
# Relate SHA to urls for loop below
self.model_sha = {
'bert_base_uncased': self.bert_base_uncased_sha,
'bert_large_uncased': self.bert_large_uncased_sha,
'bert_base_cased': self.bert_base_cased_sha,
'bert_large_cased': self.bert_large_cased_sha,
'bert_base_multilingual_cased': self.bert_base_multilingual_cased_sha,
'bert_large_multilingual_uncased': self.bert_large_multilingual_uncased_sha,
'bert_base_chinese': self.bert_base_chinese_sha
}
# Helper to get sha256sum of a file
def sha256sum(self, filename):
h = hashlib.sha256()
b = bytearray(128*1024)
mv = memoryview(b)
with open(filename, 'rb', buffering=0) as f:
for n in iter(lambda : f.readinto(mv), 0):
h.update(mv[:n])
return h.hexdigest()
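# Usage note (added): the digest matches the coreutils tool, e.g. running
# `sha256sum bert_config.json` on the command line yields the same hex string.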
def download(self):
# Iterate over urls: download, unzip, verify sha256sum
found_mismatch_sha = False
for model in self.model_urls:
url = self.model_urls[model][0]
file = self.save_path + '/' + self.model_urls[model][1]
print('Downloading', url)
response = urllib.request.urlopen(url)
with open(file, 'wb') as handle:
handle.write(response.read())
print('Unzipping', file)
with zipfile.ZipFile(file, 'r') as archive:
archive.extractall(self.save_path)
sha_dict = self.model_sha[model]
for extracted_file in sha_dict:
sha = sha_dict[extracted_file]
if sha != self.sha256sum(file[:-4] + '/' + extracted_file):
found_mismatch_sha = True
print('SHA256sum does not match on file:', extracted_file, 'from download url:', url)
else:
print(file[:-4] + '/' + extracted_file, '\t', 'verified')
if not found_mismatch_sha:
print("All downloads pass sha256sum verification.")
def serialize(self):
pass
def deserialize(self):
pass
def listAvailableWeights(self):
print("Available Weight Datasets")
for item in self.model_urls:
print(item)
def listLocallyStoredWeights(self):
pass

View file

@ -0,0 +1,44 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import urllib.request
class MRPCDownloader:
def __init__(self, save_path):
self.save_path = save_path + '/mrpc'
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
# Documentation - Download link obtained from here: https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py
self.download_urls = {
'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc' : 'mrpc_dev_ids.tsv'
}
def download(self):
for item in self.download_urls:
url = item
file = self.download_urls[item]
print('Downloading:', url)
if os.path.isfile(self.save_path + '/' + file):
print('** Download file already exists, skipping download')
else:
response = urllib.request.urlopen(url)
with open(self.save_path + '/' + file, "wb") as handle:
handle.write(response.read())

View file

@ -0,0 +1,27 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
class NVIDIAPretrainedWeightDownloader:
def __init__(self, save_path):
self.save_path = save_path + '/nvidia_pretrained_weights'
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
def download(self):
assert False, 'NVIDIAPretrainedWeightDownloader not implemented yet.'

View file

@ -0,0 +1,54 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import urllib.request
class SquadDownloader:
def __init__(self, save_path):
self.save_path = save_path + '/squad'
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
if not os.path.exists(self.save_path + '/v1.1'):
os.makedirs(self.save_path + '/v1.1')
if not os.path.exists(self.save_path + '/v2.0'):
os.makedirs(self.save_path + '/v2.0')
self.download_urls = {
'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json',
'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json',
'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py',
'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json',
'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json',
'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py',
}
def download(self):
for item in self.download_urls:
url = item
file = self.download_urls[item]
print('Downloading:', url)
if os.path.isfile(self.save_path + '/' + file):
print('** Download file already exists, skipping download')
else:
response = urllib.request.urlopen(url)
with open(self.save_path + '/' + file, "wb") as handle:
handle.write(response.read())

View file

@ -0,0 +1,327 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from itertools import islice
import multiprocessing
import statistics
class Sharding:
def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards, fraction_test_set):
assert len(input_files) > 0, 'The input file list must contain at least one file.'
assert n_training_shards > 0, 'There must be at least one output shard.'
assert n_test_shards > 0, 'There must be at least one output shard.'
self.n_training_shards = n_training_shards
self.n_test_shards = n_test_shards
self.fraction_test_set = fraction_test_set
self.input_files = input_files
self.output_name_prefix = output_name_prefix
self.output_training_identifier = '_training'
self.output_test_identifier = '_test'
self.output_file_extension = '.txt'
self.articles = {} # key: integer identifier, value: list of articles
self.sentences = {} # key: integer identifier, value: list of sentences
self.output_training_files = {} # key: filename, value: list of articles to go into file
self.output_test_files = {} # key: filename, value: list of articles to go into file
self.init_output_files()
# Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines)
def load_articles(self):
print('Start: Loading Articles')
global_article_count = 0
for input_file in self.input_files:
print('input file:', input_file)
with open(input_file, mode='r', newline='\n') as f:
for i, line in enumerate(f):
if line.strip():
self.articles[global_article_count] = line.rstrip()
global_article_count += 1
print('End: Loading Articles: There are', len(self.articles), 'articles.')
def segment_articles_into_sentences(self, segmenter):
print('Start: Sentence Segmentation')
if len(self.articles) == 0:
self.load_articles()
assert len(self.articles) != 0, 'Please check that input files are present and contain data.'
# TODO: WIP: multiprocessing (create independent ranges and spawn processes)
use_multiprocessing = 'serial'
def chunks(data, size=len(self.articles)):
it = iter(data)
for i in range(0, len(data), size):
yield {k: data[k] for k in islice(it, size)}
if use_multiprocessing == 'manager':
manager = multiprocessing.Manager()
return_dict = manager.dict()
jobs = []
n_processes = 7 # in addition to the main process, total = n_proc+1
def work(articles, return_dict):
sentences = {}
for i, article in enumerate(articles):
sentences[i] = segmenter.segment_string(articles[article])
if i % 5000 == 0:
print('Segmenting article', i)
return_dict.update(sentences)
for item in chunks(self.articles, len(self.articles)):
p = multiprocessing.Process(target=work, args=(item, return_dict))
# Busy wait
while len(jobs) >= n_processes:
pass
jobs.append(p)
p.start()
for proc in jobs:
proc.join()
elif use_multiprocessing == 'queue':
work_queue = multiprocessing.Queue()
jobs = []
for item in chunks(self.articles, len(self.articles)):
pass
else: # serial option
for i, article in enumerate(self.articles):
self.sentences[i] = segmenter.segment_string(self.articles[article])
if i % 5000 == 0:
print('Segmenting article', i)
print('End: Sentence Segmentation')
def init_output_files(self):
print('Start: Init Output Files')
assert len(self.output_training_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'
assert len(self.output_test_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.'
for i in range(self.n_training_shards):
name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension
self.output_training_files[name] = []
for i in range(self.n_test_shards):
name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension
self.output_test_files[name] = []
print('End: Init Output Files')
def get_sentences_per_shard(self, shard):
result = 0
for article_id in shard:
result += len(self.sentences[article_id])
return result
def distribute_articles_over_shards(self):
print('Start: Distribute Articles Over Shards')
assert len(self.articles) >= self.n_training_shards + self.n_test_shards, 'There are fewer articles than shards. Please add more data or reduce the number of shards requested.'
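# Strategy (comment added for clarity): articles are bucketed by sentence
# count; each shard is first seeded with one of the longest remaining
# articles, then repeated passes greedily add the largest article that still
# fits the shard's nominal sentence budget (shards above the median size are
# skipped), and the budget is nudged upward whenever several consecutive
# passes fail to place anything.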
# Create dictionary with - key: sentence count per article, value: article id number
sentence_counts = defaultdict(lambda: [])
max_sentences = 0
total_sentences = 0
for article_id in self.sentences:
current_length = len(self.sentences[article_id])
sentence_counts[current_length].append(article_id)
max_sentences = max(max_sentences, current_length)
total_sentences += current_length
n_sentences_assigned_to_training = int((1 - self.fraction_test_set) * total_sentences)
nominal_sentences_per_training_shard = n_sentences_assigned_to_training // self.n_training_shards
nominal_sentences_per_test_shard = (total_sentences - n_sentences_assigned_to_training) // self.n_test_shards
consumed_article_set = set({})
unused_article_set = set(self.articles.keys())
# Make first pass and add one article worth of lines per file
for file in self.output_training_files:
current_article_id = sentence_counts[max_sentences][-1]
sentence_counts[max_sentences].pop(-1)
self.output_training_files[file].append(current_article_id)
consumed_article_set.add(current_article_id)
unused_article_set.remove(current_article_id)
# Maintain the max sentence count
while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
max_sentences -= 1
if len(self.sentences[current_article_id]) > nominal_sentences_per_training_shard:
nominal_sentences_per_training_shard = len(self.sentences[current_article_id])
print('Warning: A single article contains more than the nominal number of sentences per training shard.')
for file in self.output_test_files:
current_article_id = sentence_counts[max_sentences][-1]
sentence_counts[max_sentences].pop(-1)
self.output_test_files[file].append(current_article_id)
consumed_article_set.add(current_article_id)
unused_article_set.remove(current_article_id)
# Maintain the max sentence count
while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
max_sentences -= 1
if len(self.sentences[current_article_id]) > nominal_sentences_per_test_shard:
nominal_sentences_per_test_shard = len(self.sentences[current_article_id])
print('Warning: A single article contains more than the nominal number of sentences per test shard.')
training_counts = []
test_counts = []
for shard in self.output_training_files:
training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))
for shard in self.output_test_files:
test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))
training_median = statistics.median(training_counts)
test_median = statistics.median(test_counts)
# Make subsequent passes over files to find articles to add without going over limit
history_remaining = []
n_history_remaining = 4
while len(consumed_article_set) < len(self.articles):
for fidx, file in enumerate(self.output_training_files):
nominal_next_article_size = min(nominal_sentences_per_training_shard - training_counts[fidx], max_sentences)
# Maintain the max sentence count
while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
max_sentences -= 1
while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
nominal_next_article_size -= 1
if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or training_counts[fidx] > training_median:
continue # skip adding to this file, will come back later if no file can accept unused articles
current_article_id = sentence_counts[nominal_next_article_size][-1]
sentence_counts[nominal_next_article_size].pop(-1)
self.output_training_files[file].append(current_article_id)
consumed_article_set.add(current_article_id)
unused_article_set.remove(current_article_id)
for fidx, file in enumerate(self.output_test_files):
nominal_next_article_size = min(nominal_sentences_per_test_shard - test_counts[fidx], max_sentences)
# Maintain the max sentence count
while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0:
max_sentences -= 1
while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0:
nominal_next_article_size -= 1
if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or test_counts[fidx] > test_median:
continue # skip adding to this file, will come back later if no file can accept unused articles
current_article_id = sentence_counts[nominal_next_article_size][-1]
sentence_counts[nominal_next_article_size].pop(-1)
self.output_test_files[file].append(current_article_id)
consumed_article_set.add(current_article_id)
unused_article_set.remove(current_article_id)
# If unable to place articles a few times, bump up nominal sizes by fraction until articles get placed
if len(history_remaining) == n_history_remaining:
history_remaining.pop(0)
history_remaining.append(len(unused_article_set))
history_same = True
for i in range(1, len(history_remaining)):
history_same = history_same and (history_remaining[i-1] == history_remaining[i])
if history_same:
nominal_sentences_per_training_shard += 1
# nominal_sentences_per_test_shard += 1
training_counts = []
test_counts = []
for shard in self.output_training_files:
training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard]))
for shard in self.output_test_files:
test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard]))
training_median = statistics.median(training_counts)
test_median = statistics.median(test_counts)
print('Distributing data over shards:', len(unused_article_set), 'articles remaining.')
if len(unused_article_set) != 0:
print('Warning: Some articles did not make it into output files.')
for shard in self.output_training_files:
print('Training shard:', self.get_sentences_per_shard(self.output_training_files[shard]))
for shard in self.output_test_files:
print('Test shard:', self.get_sentences_per_shard(self.output_test_files[shard]))
print('End: Distribute Articles Over Shards')
def write_shards_to_disk(self):
print('Start: Write Shards to Disk')
for shard in self.output_training_files:
self.write_single_shard(shard, self.output_training_files[shard])
for shard in self.output_test_files:
self.write_single_shard(shard, self.output_test_files[shard])
print('End: Write Shards to Disk')
def write_single_shard(self, shard_name, shard):
with open(shard_name, mode='w', newline='\n') as f:
for article_id in shard:
for line in self.sentences[article_id]:
f.write(line + '\n')
f.write('\n') # Line break between articles
import nltk
nltk.download('punkt')
class NLTKSegmenter:
def __init__(self):
pass
def segment_string(self, article):
return nltk.tokenize.sent_tokenize(article)
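# --- End-to-end sketch (added for illustration) ------------------------------
# Mirrors the call sequence dataPrep.py uses for the 'sharding' action; the
# file names and shard counts below are hypothetical.
if __name__ == "__main__":
    segmenter = NLTKSegmenter()
    sharding = Sharding(
        input_files=['formatted/wikicorpus_en_one_article_per_line.txt'],
        output_name_prefix='sharded/wikicorpus_en',
        n_training_shards=256,
        n_test_shards=1,
        fraction_test_set=0.1,
    )
    sharding.load_articles()
    sharding.segment_articles_into_sentences(segmenter)
    sharding.distribute_articles_over_shards()
    sharding.write_shards_to_disk()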

View file

@ -0,0 +1,57 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import urllib.request
import subprocess
class WikiDownloader:
def __init__(self, language, save_path):
self.save_path = save_path + '/wikicorpus_' + language
if not os.path.exists(self.save_path):
os.makedirs(self.save_path)
self.language = language
self.download_urls = {
'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
}
self.output_files = {
'en' : 'wikicorpus_en.xml.bz2',
'zh' : 'wikicorpus_zh.xml.bz2'
}
def download(self):
if self.language in self.download_urls:
url = self.download_urls[self.language]
filename = self.output_files[self.language]
print('Downloading:', url)
if os.path.isfile(self.save_path + '/' + filename):
print('** Download file already exists, skipping download')
else:
response = urllib.request.urlopen(url)
with open(self.save_path + '/' + filename, "wb") as handle:
handle.write(response.read())
# Always unzipping since this is relatively fast and will overwrite
print('Unzipping:', self.output_files[self.language])
subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
else:
assert False, 'WikiDownloader not implemented for this language yet.'

View file

@ -0,0 +1,46 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
class WikicorpusTextFormatting:
def __init__(self, wiki_path, output_filename, recursive = False):
self.wiki_path = wiki_path
self.recursive = recursive
self.output_filename = output_filename
# This puts one article per line
def merge(self):
with open(self.output_filename, mode='w', newline='\n') as ofile:
for dirname in glob.glob(self.wiki_path + '/*/', recursive=False):
for filename in glob.glob(dirname + 'wiki_*', recursive=self.recursive):
print(filename)
article_lines = []
article_open = False
with open(filename, mode='r', newline='\n') as file:
for line in file:
if '<doc id=' in line:
article_open = True
elif '</doc>' in line:
article_open = False
for oline in article_lines[1:]:
if oline != '\n':
ofile.write(oline.rstrip() + " ")
ofile.write("\n\n")
article_lines = []
else:
if article_open:
article_lines.append(line)
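# Input markup note (added): wikiextractor emits blocks of the form
#   <doc id="..." url="..." title="...">
#   <title line>
#   <body lines...>
#   </doc>
# The first captured line of each block (article_lines[0], typically the
# title) is intentionally skipped when writing the merged output.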

View file

@ -0,0 +1,12 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View file

@ -0,0 +1,43 @@
#!/bin/bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Download
#python3 /workspace/electra/data/dataPrep.py --action download --dataset bookscorpus
#python3 /workspace/electra/data/dataPrep.py --action download --dataset wikicorpus_en
#python3 /workspace/electra/data/dataPrep.py --action download --dataset google_pretrained_weights # Includes vocab
# All other pretraining-related options are commented out since only fine-tuning is supported at the moment.
python3 /workspace/electra/data/dataPrep.py --action download --dataset squad
#python3 /workspace/electra/data/dataPrep.py --action download --dataset mrpc
# Properly format the text files
#python3 /workspace/electra/data/dataPrep.py --action text_formatting --dataset bookscorpus
#python3 /workspace/electra/data/dataPrep.py --action text_formatting --dataset wikicorpus_en
# Shard the text files (group wiki+books then shard)
#python3 /workspace/electra/data/dataPrep.py --action sharding --dataset books_wiki_en_corpus
# Create HDF5 files Phase 1
#python3 /workspace/electra/data/dataPrep.py --action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 128 \
# --max_predictions_per_seq 20 --vocab_file $DATA_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1
# Create HDF5 files Phase 2
#python3 /workspace/electra/data/dataPrep.py --action create_hdf5_files --dataset books_wiki_en_corpus --max_seq_length 512 \
# --max_predictions_per_seq 80 --vocab_file $DATA_PREP_WORKING_DIR/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt --do_lower_case 1

View file

@ -0,0 +1,362 @@
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import BookscorpusTextFormatting
import Downloader
import TextSharding
import WikicorpusTextFormatting
import argparse
import itertools
import multiprocessing
import os
import pprint
import subprocess
def main(args):
working_dir = os.environ['DATA_PREP_WORKING_DIR']
print('Working Directory:', working_dir)
print('Action:', args.action)
print('Dataset Name:', args.dataset)
if args.input_files:
args.input_files = args.input_files.split(',')
hdf5_tfrecord_folder_prefix = "_lower_case_" + str(args.do_lower_case) + "_seq_len_" + str(args.max_seq_length) \
+ "_max_pred_" + str(args.max_predictions_per_seq) + "_masked_lm_prob_" + str(args.masked_lm_prob) \
+ "_random_seed_" + str(args.random_seed) + "_dupe_factor_" + str(args.dupe_factor)
directory_structure = {
'download' : working_dir + '/download', # Downloaded and decompressed
'extracted' : working_dir +'/extracted', # Extracted from whatever the initial format is (e.g., wikiextractor)
'formatted' : working_dir + '/formatted_one_article_per_line', # This is the level where all sources should look the same
'sharded' : working_dir + '/sharded_' + "training_shards_" + str(args.n_training_shards) + "_test_shards_" + str(args.n_test_shards) + "_fraction_" + str(args.fraction_test_set),
'tfrecord' : working_dir + '/tfrecord'+ hdf5_tfrecord_folder_prefix,
'hdf5': working_dir + '/hdf5' + hdf5_tfrecord_folder_prefix
}
print('\nDirectory Structure:')
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(directory_structure)
print('')
if args.action == 'download':
if not os.path.exists(directory_structure['download']):
os.makedirs(directory_structure['download'])
downloader = Downloader.Downloader(args.dataset, directory_structure['download'])
downloader.download()
elif args.action == 'text_formatting':
assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' and args.dataset != 'squad' and args.dataset != 'mrpc', 'text_formatting is only applicable to raw text corpora, not pretrained weights or fine-tuning datasets'
if not os.path.exists(directory_structure['extracted']):
os.makedirs(directory_structure['extracted'])
if not os.path.exists(directory_structure['formatted']):
os.makedirs(directory_structure['formatted'])
if args.dataset == 'bookscorpus':
books_path = directory_structure['download'] + '/bookscorpus'
#books_path = directory_structure['download']
output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt'
books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True)
books_formatter.merge()
elif args.dataset == 'wikicorpus_en':
if args.skip_wikiextractor == 0:
path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
#wikiextractor_process.communicate()
wiki_path = directory_structure['extracted'] + '/wikicorpus_en'
output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'
wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
wiki_formatter.merge()
elif args.dataset == 'wikicorpus_zh':
assert False, 'wikicorpus_zh is not fully supported at this time. The simplified/traditional Chinese data still needs to be translated and properly segmented; it should work once this step is added.'
if args.skip_wikiextractor == 0:
path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py'
wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset
print('WikiExtractor Command:', wikiextractor_command)
wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True)
#wikiextractor_process.communicate()
wiki_path = directory_structure['extracted'] + '/wikicorpus_zh'
output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'
wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True)
wiki_formatter.merge()
assert os.stat(output_filename).st_size > 0, 'File glob did not pick up extracted wiki files from WikiExtractor.'
elif args.action == 'sharding':
# Note: books+wiki requires user to provide list of input_files (comma-separated with no spaces)
if args.dataset == 'bookscorpus' or 'wikicorpus' in args.dataset or 'books_wiki' in args.dataset:
if args.input_files is None:
if args.dataset == 'bookscorpus':
args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt']
elif args.dataset == 'wikicorpus_en':
args.input_files = [directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
elif args.dataset == 'wikicorpus_zh':
args.input_files = [directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt']
elif args.dataset == 'books_wiki_en_corpus':
args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt']
output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset
if not os.path.exists(directory_structure['sharded']):
os.makedirs(directory_structure['sharded'])
if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset):
os.makedirs(directory_structure['sharded'] + '/' + args.dataset)
# Segmentation is here because all datasets look the same in one article/book/whatever per line format, and
# it seemed unnecessarily complicated to add an additional preprocessing step to call just for this.
# Different languages (e.g., Chinese simplified/traditional) may require translation and
# other packages to be called from here -- just add a conditional branch for those extra steps
segmenter = TextSharding.NLTKSegmenter()
sharding = TextSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set)
sharding.load_articles()
sharding.segment_articles_into_sentences(segmenter)
sharding.distribute_articles_over_shards()
sharding.write_shards_to_disk()
else:
assert False, 'Unsupported dataset for sharding'
elif args.action == 'create_tfrecord_files':
assert False, 'TFRecord creation is not supported in this model example release.'
if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset):
os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset)
def create_record_worker(filename_prefix, shard_id, output_format='tfrecord'):
bert_preprocessing_command = 'python /workspace/bert/create_pretraining_data.py'
bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
last_process = bert_preprocessing_process
# This could be better optimized (fine if all take equal time)
if shard_id % args.n_processes == 0 and shard_id > 0:
bert_preprocessing_process.wait()
return last_process
output_file_prefix = args.dataset
for i in range(args.n_training_shards):
last_process = create_record_worker(output_file_prefix + '_training', i)
last_process.wait()
for i in range(args.n_test_shards):
last_process = create_record_worker(output_file_prefix + '_test', i)
last_process.wait()
elif args.action == 'create_hdf5_files':
last_process = None
if not os.path.exists(directory_structure['hdf5'] + "/" + args.dataset):
os.makedirs(directory_structure['hdf5'] + "/" + args.dataset)
def create_record_worker(filename_prefix, shard_id, output_format='hdf5'):
bert_preprocessing_command = 'python /workspace/bert/create_pretraining_data.py'
bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt'
bert_preprocessing_command += ' --output_file=' + directory_structure['hdf5'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format
bert_preprocessing_command += ' --vocab_file=' + args.vocab_file
bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else ''
bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length)
bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq)
bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob)
bert_preprocessing_command += ' --random_seed=' + str(args.random_seed)
bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor)
bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True)
last_process = bert_preprocessing_process
# This could be better optimized (fine if all take equal time)
if shard_id % args.n_processes == 0 and shard_id > 0:
bert_preprocessing_process.wait()
return last_process
output_file_prefix = args.dataset
for i in range(args.n_training_shards):
last_process = create_record_worker(output_file_prefix + '_training', i)
last_process.wait()
for i in range(args.n_test_shards):
last_process = create_record_worker(output_file_prefix + '_test', i)
last_process.wait()
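The shard workers above are throttled by waiting on every n_processes-th process, which the comments note is suboptimal when shards take unequal time. A minimal alternative sketch using a process pool (the helper names and the command list are assumptions for illustration):
import multiprocessing
import subprocess
def run_shard(shard_command):
    # Each worker shells out exactly as create_record_worker does.
    subprocess.run(shard_command, shell=True, check=True)
def run_all_shards(shard_commands, n_processes=4):
    # Keeps n_processes workers busy regardless of per-shard runtime.
    with multiprocessing.Pool(processes=n_processes) as pool:
        pool.map(run_shard, shard_commands)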
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Preprocessing Application for Everything BERT-related'
)
parser.add_argument(
'--action',
type=str,
help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords',
choices={
'download', # Download and verify md5/sha sums
'text_formatting', # Convert into a file that contains one article/book per line
'sharding', # Convert previous formatted text into shards containing one sentence per line
'create_tfrecord_files', # Turn each shard into a TFRecord with masking and next sentence prediction info
'create_hdf5_files' # Turn each shard into an HDF5 file with masking and next sentence prediction info
}
)
parser.add_argument(
'--dataset',
type=str,
help='Specify the dataset to perform --action on',
choices={
'bookscorpus',
'wikicorpus_en',
'wikicorpus_zh',
'books_wiki_en_corpus',
'google_pretrained_weights',
'nvidia_pretrained_weights',
'mrpc',
'squad',
'all'
}
)
parser.add_argument(
'--input_files',
type=str,
help='Specify the input files in a comma-separated list (no spaces)'
)
parser.add_argument(
'--n_training_shards',
type=int,
help='Specify the number of training shards to generate',
default=256
)
parser.add_argument(
'--n_test_shards',
type=int,
help='Specify the number of test shards to generate',
default=256
)
parser.add_argument(
'--fraction_test_set',
type=float,
help='Specify the fraction (0..1) of the data to withhold for the test data split (based on number of sequences)',
default=0.2
)
parser.add_argument(
'--segmentation_method',
type=str,
help='Specify your choice of sentence segmentation',
choices={
'nltk'
},
default='nltk'
)
parser.add_argument(
'--n_processes',
type=int,
help='Specify the max number of processes to allow at one time',
default=4
)
parser.add_argument(
'--random_seed',
type=int,
help='Specify the base seed to use for any random number generation',
default=12345
)
parser.add_argument(
'--dupe_factor',
type=int,
help='Specify the duplication factor',
default=5
)
parser.add_argument(
'--masked_lm_prob',
type=float,
help='Specify the probability for masked lm',
default=0.15
)
parser.add_argument(
'--max_seq_length',
type=int,
help='Specify the maximum sequence length',
default=512
)
parser.add_argument(
'--max_predictions_per_seq',
type=int,
help='Specify the maximum number of masked words per sequence',
default=20
)
parser.add_argument(
'--do_lower_case',
type=int,
help='Specify whether the text is cased (0) or uncased (1); any number greater than 0 will be treated as uncased',
default=1
)
parser.add_argument(
'--vocab_file',
type=str,
help='Specify the absolute path to the vocab file to use'
)
parser.add_argument(
'--skip_wikiextractor',
type=int,
help='Specify whether to skip the wikiextractor step (0=False, 1=True)',
default=0
)
parser.add_argument(
'--interactive_json_config_generator',
type=str,
help='Specify the interactive JSON config generator to use'
)
args = parser.parse_args()
main(args)
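A hedged end-to-end driver for the books+wiki path described in the sharding branch above (the script filename dataPrep.py, the vocab path, and the flag values are assumptions; DATA_PREP_WORKING_DIR must be exported first):
import subprocess
def run(action, dataset, extra=()):
    subprocess.run(["python", "dataPrep.py", "--action", action, "--dataset", dataset, *extra], check=True)
for ds in ("bookscorpus", "wikicorpus_en"):
    run("download", ds)
    run("text_formatting", ds)
# books_wiki_en_corpus picks up both formatted files automatically, so --input_files can be omitted
run("sharding", "books_wiki_en_corpus")
run("create_hdf5_files", "books_wiki_en_corpus", ("--vocab_file", "/workspace/electra/vocab.txt"))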

View file

@ -0,0 +1,20 @@
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Downloading MRPC data"
wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
python download_glue_data.py --data_dir . --tasks MRPC

View file

@ -0,0 +1,73 @@
#!/usr/bin/env bash
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Downloading dataset for squad..."
# Download SQuAD
v1="v1.1"
mkdir $v1
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $v1/train-v1.1.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $v1/dev-v1.1.json
wget https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/ -O $v1/evaluate-v1.1.py
EXP_TRAIN_v1='981b29407e0affa3b1b156f72073b945 -'
EXP_DEV_v1='3e85deb501d4e538b6bc56f786231552 -'
EXP_EVAL_v1='afb04912d18ff20696f7f88eed49bea9 -'
CALC_TRAIN_v1=`cat ${v1}/train-v1.1.json |md5sum`
CALC_DEV_v1=`cat ${v1}/dev-v1.1.json |md5sum`
CALC_EVAL_v1=`cat ${v1}/evaluate-v1.1.py |md5sum`
v2="v2.0"
mkdir $v2
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O $v2/train-v2.0.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O $v2/dev-v2.0.json
wget https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -O $v2/evaluate-v2.0.py
EXP_TRAIN_v2='62108c273c268d70893182d5cf8df740 -'
EXP_DEV_v2='246adae8b7002f8679c027697b0b7cf8 -'
EXP_EVAL_v2='ff23213bed5516ea4a6d9edb6cd7d627 -'
CALC_TRAIN_v2=`cat ${v2}/train-v2.0.json |md5sum`
CALC_DEV_v2=`cat ${v2}/dev-v2.0.json |md5sum`
CALC_EVAL_v2=`cat ${v2}/evaluate-v2.0.py |md5sum`
echo "Squad data download done!"
echo "Verifying Dataset...."
if [ "$EXP_TRAIN_v1" != "$CALC_TRAIN_v1" ]; then
echo "train-v1.1.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_DEV_v1" != "$CALC_DEV_v1" ]; then
echo "dev-v1.1.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_EVAL_v1" != "$CALC_EVAL_v1" ]; then
echo "evaluate-v1.1.py is corrupted! md5sum doesn't match"
fi
if [ "$EXP_TRAIN_v2" != "$CALC_TRAIN_v2" ]; then
echo "train-v2.0.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_DEV_v2" != "$CALC_DEV_v2" ]; then
echo "dev-v2.0.json is corrupted! md5sum doesn't match"
fi
if [ "$EXP_EVAL_v2" != "$CALC_EVAL_v2" ]; then
echo "evaluate-v2.0.py is corrupted! md5sum doesn't match"
fi
echo "Complete!"

View file

@ -0,0 +1,515 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
import fnmatch
import json
import logging
import os
import shutil
import sys
import tarfile
import tempfile
from contextlib import contextmanager
from functools import partial, wraps
from hashlib import sha256
from typing import Optional
from urllib.parse import urlparse
from zipfile import ZipFile, is_zipfile
import boto3
import requests
from botocore.config import Config
from botocore.exceptions import ClientError
from filelock import FileLock
from tqdm.auto import tqdm
# from examples import __version__
__version__ = "0.1"
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
try:
USE_TF = os.environ.get("USE_TF", "AUTO").upper()
USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
if USE_TORCH in ("1", "ON", "YES", "AUTO") and USE_TF not in ("1", "ON", "YES"):
import torch
_torch_available = True # pylint: disable=invalid-name
logger.info("PyTorch version {} available.".format(torch.__version__))
else:
logger.info("Disabling PyTorch because USE_TF is set")
_torch_available = False
except ImportError:
_torch_available = False # pylint: disable=invalid-name
try:
USE_TF = os.environ.get("USE_TF", "AUTO").upper()
USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
if USE_TF in ("1", "ON", "YES", "AUTO") and USE_TORCH not in ("1", "ON", "YES"):
import tensorflow as tf
assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2
_tf_available = True # pylint: disable=invalid-name
logger.info("TensorFlow version {} available.".format(tf.__version__))
else:
logger.info("Disabling Tensorflow because USE_TORCH is set")
_tf_available = False
except (ImportError, AssertionError):
_tf_available = False # pylint: disable=invalid-name
try:
from torch.hub import _get_torch_home
torch_cache_home = _get_torch_home()
except ImportError:
torch_cache_home = os.path.expanduser(
os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"))
)
default_cache_path = os.path.join(torch_cache_home, "transformers")
try:
from pathlib import Path
PYTORCH_PRETRAINED_BERT_CACHE = Path(
os.getenv("PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path))
)
except (AttributeError, ImportError):
PYTORCH_PRETRAINED_BERT_CACHE = os.getenv(
"PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)
)
PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility
WEIGHTS_NAME = "pytorch_model.bin"
TF2_WEIGHTS_NAME = "tf_model.h5"
TF_WEIGHTS_NAME = "model.ckpt"
CONFIG_NAME = "config.json"
MODEL_CARD_NAME = "modelcard.json"
MULTIPLE_CHOICE_DUMMY_INPUTS = [[[0], [1]], [[0], [1]]]
DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net"
def is_torch_available():
return _torch_available
def is_tf_available():
return _tf_available
def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
return fn
return docstring_decorator
def add_start_docstrings_to_callable(*docstr):
def docstring_decorator(fn):
class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0])
intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name)
note = r"""
.. note::
Although the recipe for forward pass needs to be defined within
this function, one should call the :class:`Module` instance afterwards
instead of this since the former takes care of running the
pre and post processing steps while the latter silently ignores them.
"""
fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
return fn
return docstring_decorator
def add_end_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + "".join(docstr)
return fn
return docstring_decorator
def is_remote_url(url_or_filename):
parsed = urlparse(url_or_filename)
return parsed.scheme in ("http", "https", "s3")
def hf_bucket_url(identifier, postfix=None, cdn=False) -> str:
endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX
if postfix is None:
return "/".join((endpoint, identifier))
else:
return "/".join((endpoint, identifier, postfix))
def url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name
so that TF 2.0 can identify it as a HDF5 file
(see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
"""
url_bytes = url.encode("utf-8")
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode("utf-8")
etag_hash = sha256(etag_bytes)
filename += "." + etag_hash.hexdigest()
if url.endswith(".h5"):
filename += ".h5"
return filename
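A quick illustration of the naming scheme (the URL is one from the ELECTRA archive map later in this commit; the etag value is made up):
from hashlib import sha256
url = "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/tf_model.h5"
etag = '"5d41402abc4b2a76b9719d911017c592"'
name = sha256(url.encode("utf-8")).hexdigest() + "." + sha256(etag.encode("utf-8")).hexdigest() + ".h5"
# the trailing ".h5" is preserved so TF 2.0 recognizes the cached file as Keras HDF5 weights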
def filename_to_url(filename, cache_dir=None):
"""
Return the url and etag (which may be ``None``) stored for `filename`.
Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
if not os.path.exists(cache_path):
raise EnvironmentError("file {} not found".format(cache_path))
meta_path = cache_path + ".json"
if not os.path.exists(meta_path):
raise EnvironmentError("file {} not found".format(meta_path))
with open(meta_path, encoding="utf-8") as meta_file:
metadata = json.load(meta_file)
url = metadata["url"]
etag = metadata["etag"]
return url, etag
def cached_path(
url_or_filename,
cache_dir=None,
force_download=False,
proxies=None,
resume_download=False,
user_agent=None,
extract_compressed_file=False,
force_extract=False,
local_files_only=False,
) -> Optional[str]:
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
Args:
cache_dir: specify a cache directory to save the file to (overrides the default cache dir).
force_download: if True, re-download the file even if it's already cached in the cache dir.
resume_download: if True, resume the download if an incompletely received file is found.
user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
extract_compressed_file: if True and the path points to a zip or tar file, extract the compressed
file into a folder alongside the archive.
force_extract: if True when extract_compressed_file is True and the archive was already extracted,
re-extract the archive and override the folder where it was extracted.
Return:
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
Local path (string) otherwise
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
if isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if is_remote_url(url_or_filename):
# URL, so get it from the cache (downloading if necessary)
output_path = get_from_cache(
url_or_filename,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
user_agent=user_agent,
local_files_only=local_files_only,
)
elif os.path.exists(url_or_filename):
# File, and it exists.
output_path = url_or_filename
elif urlparse(url_or_filename).scheme == "":
# File, but it doesn't exist.
raise EnvironmentError("file {} not found".format(url_or_filename))
else:
# Something unknown
raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
if extract_compressed_file:
if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path):
return output_path
# Path where we extract compressed archives
# We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/"
output_dir, output_file = os.path.split(output_path)
output_extract_dir_name = output_file.replace(".", "-") + "-extracted"
output_path_extracted = os.path.join(output_dir, output_extract_dir_name)
if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract:
return output_path_extracted
# Prevent parallel extractions
lock_path = output_path + ".lock"
with FileLock(lock_path):
shutil.rmtree(output_path_extracted, ignore_errors=True)
os.makedirs(output_path_extracted)
if is_zipfile(output_path):
with ZipFile(output_path, "r") as zip_file:
zip_file.extractall(output_path_extracted)
zip_file.close()
elif tarfile.is_tarfile(output_path):
tar_file = tarfile.open(output_path)
tar_file.extractall(output_path_extracted)
tar_file.close()
else:
raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
return output_path_extracted
return output_path
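Typical usage, sketched with a discriminator checkpoint URL from the archive map further down in this commit:
weights_path = cached_path(
    "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/tf_model.h5",
    resume_download=True,
)
# returns a local path like <TRANSFORMERS_CACHE>/<sha256(url)>.<sha256(etag)>.h5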
def split_s3_path(url):
"""Split a full s3 path into the bucket name and path."""
parsed = urlparse(url)
if not parsed.netloc or not parsed.path:
raise ValueError("bad s3 path {}".format(url))
bucket_name = parsed.netloc
s3_path = parsed.path
# Remove '/' at beginning of path.
if s3_path.startswith("/"):
s3_path = s3_path[1:]
return bucket_name, s3_path
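For example (a quick sanity check):
assert split_s3_path("s3://my-bucket/models/weights.bin") == ("my-bucket", "models/weights.bin")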
def s3_request(func):
"""
Wrapper function for s3 requests in order to create more helpful error
messages.
"""
@wraps(func)
def wrapper(url, *args, **kwargs):
try:
return func(url, *args, **kwargs)
except ClientError as exc:
if int(exc.response["Error"]["Code"]) == 404:
raise EnvironmentError("file {} not found".format(url))
else:
raise
return wrapper
@s3_request
def s3_etag(url, proxies=None):
"""Check ETag on S3 object."""
s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
bucket_name, s3_path = split_s3_path(url)
s3_object = s3_resource.Object(bucket_name, s3_path)
return s3_object.e_tag
@s3_request
def s3_get(url, temp_file, proxies=None):
"""Pull a file directly from S3."""
s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
bucket_name, s3_path = split_s3_path(url)
s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None):
ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
if is_torch_available():
ua += "; torch/{}".format(torch.__version__)
if is_tf_available():
ua += "; tensorflow/{}".format(tf.__version__)
if isinstance(user_agent, dict):
ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
elif isinstance(user_agent, str):
ua += "; " + user_agent
headers = {"user-agent": ua}
if resume_size > 0:
headers["Range"] = "bytes=%d-" % (resume_size,)
response = requests.get(url, stream=True, proxies=proxies, headers=headers)
if response.status_code == 416: # Range not satisfiable
return
content_length = response.headers.get("Content-Length")
total = resume_size + int(content_length) if content_length is not None else None
progress = tqdm(
unit="B",
unit_scale=True,
total=total,
initial=resume_size,
desc="Downloading",
disable=bool(logger.getEffectiveLevel() == logging.NOTSET),
)
for chunk in response.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
progress.update(len(chunk))
temp_file.write(chunk)
progress.close()
def get_from_cache(
url,
cache_dir=None,
force_download=False,
proxies=None,
etag_timeout=10,
resume_download=False,
user_agent=None,
local_files_only=False,
) -> Optional[str]:
"""
Given a URL, look for the corresponding file in the local cache.
If it's not there, download it. Then return the path to the cached file.
Return:
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
Local path (string) otherwise
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
os.makedirs(cache_dir, exist_ok=True)
etag = None
if not local_files_only:
# Get eTag to add to filename, if it exists.
if url.startswith("s3://"):
etag = s3_etag(url, proxies=proxies)
else:
try:
response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout)
if response.status_code == 200:
etag = response.headers.get("ETag")
except (EnvironmentError, requests.exceptions.Timeout):
# etag is already None
pass
filename = url_to_filename(url, etag)
# get cache path to put the file
cache_path = os.path.join(cache_dir, filename)
# etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible.
# try to get the last downloaded one
if etag is None:
if os.path.exists(cache_path):
return cache_path
else:
matching_files = [
file
for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*")
if not file.endswith(".json") and not file.endswith(".lock")
]
if len(matching_files) > 0:
return os.path.join(cache_dir, matching_files[-1])
else:
# If files cannot be found and local_files_only=True,
# the models might've been found if local_files_only=False
# Notify the user about that
if local_files_only:
raise ValueError(
"Cannot find the requested files in the cached path and outgoing traffic has been"
" disabled. To enable model look-ups and downloads online, set 'local_files_only'"
" to False."
)
return None
# From now on, etag is not None.
if os.path.exists(cache_path) and not force_download:
return cache_path
# Prevent parallel downloads of the same file with a lock.
lock_path = cache_path + ".lock"
with FileLock(lock_path):
if resume_download:
incomplete_path = cache_path + ".incomplete"
@contextmanager
def _resumable_file_manager():
with open(incomplete_path, "a+b") as f:
yield f
temp_file_manager = _resumable_file_manager
if os.path.exists(incomplete_path):
resume_size = os.stat(incomplete_path).st_size
else:
resume_size = 0
else:
temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False)
resume_size = 0
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with temp_file_manager() as temp_file:
logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
# GET file object
if url.startswith("s3://"):
if resume_download:
logger.warning('Resumable downloads are not implemented for "s3://" urls')
s3_get(url, temp_file, proxies=proxies)
else:
http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
logger.info("storing %s in cache at %s", url, cache_path)
os.replace(temp_file.name, cache_path)
logger.info("creating metadata file for %s", cache_path)
meta = {"url": url, "etag": etag}
meta_path = cache_path + ".json"
with open(meta_path, "w") as meta_file:
json.dump(meta, meta_file)
return cache_path

View file

@ -0,0 +1,893 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import tensorflow as tf
from configuration import ElectraConfig
from file_utils import add_start_docstrings, add_start_docstrings_to_callable
from modeling_utils import ACT2FN, TFBertEncoder, TFBertPreTrainedModel
from modeling_utils import get_initializer, shape_list
from tokenization_utils import BatchEncoding
logger = logging.getLogger(__name__)
TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP = {
"google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/tf_model.h5",
"google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/tf_model.h5",
"google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/tf_model.h5",
"google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/tf_model.h5",
"google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/tf_model.h5",
"google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/tf_model.h5",
}
class TFElectraEmbeddings(tf.keras.layers.Layer):
"""Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.vocab_size = config.vocab_size
self.embedding_size = config.embedding_size
self.initializer_range = config.initializer_range
self.position_embeddings = tf.keras.layers.Embedding(
config.max_position_embeddings,
config.embedding_size,
embeddings_initializer=get_initializer(self.initializer_range),
name="position_embeddings",
)
self.token_type_embeddings = tf.keras.layers.Embedding(
config.type_vocab_size,
config.embedding_size,
embeddings_initializer=get_initializer(self.initializer_range),
name="token_type_embeddings",
)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
def build(self, input_shape):
"""Build shared word embedding layer """
with tf.name_scope("word_embeddings"):
# Create and initialize weights. The random normal initializer was chosen
# arbitrarily, and works well.
self.word_embeddings = self.add_weight(
"weight",
shape=[self.vocab_size, self.embedding_size],
initializer=get_initializer(self.initializer_range),
)
super().build(input_shape)
def call(self, inputs, mode="embedding", training=False):
"""Get token embeddings of inputs.
Args:
inputs: list of four tensors: (input_ids, position_ids, token_type_ids, inputs_embeds); the id tensors are int64 with shape [batch_size, length], and any entry may be None
mode: string, a valid value is one of "embedding" and "linear".
Returns:
outputs: (1) If mode == "embedding", output embedding tensor, float32 with
shape [batch_size, length, embedding_size]; (2) mode == "linear", output
linear tensor, float32 with shape [batch_size, length, vocab_size].
Raises:
ValueError: if mode is not valid.
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
return self._embedding(inputs, training=training)
elif mode == "linear":
return self._linear(inputs)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, inputs, training=False):
"""Applies embedding based on inputs tensor."""
input_ids, position_ids, token_type_ids, inputs_embeds = inputs
if input_ids is not None:
input_shape = shape_list(input_ids)
else:
input_shape = shape_list(inputs_embeds)[:-1]
seq_length = input_shape[1]
if position_ids is None:
position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0)
if inputs_embeds is None:
inputs_embeds = tf.gather(self.word_embeddings, input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings, training=training)
return embeddings
def _linear(self, inputs):
"""Computes logits by running inputs through a linear layer.
Args:
inputs: A float32 tensor with shape [batch_size, length, hidden_size]
Returns:
float32 tensor with shape [batch_size, length, vocab_size].
"""
batch_size = shape_list(inputs)[0]
length = shape_list(inputs)[1]
x = tf.reshape(inputs, [-1, self.embedding_size])
logits = tf.matmul(x, self.word_embeddings, transpose_b=True)
return tf.reshape(logits, [batch_size, length, self.vocab_size])
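A shape sketch of the two shared-weight modes (the ElectraConfig constructor arguments here are assumptions; ELECTRA-small uses embedding_size=128):
import tensorflow as tf
config = ElectraConfig(vocab_size=30522, embedding_size=128)
emb = TFElectraEmbeddings(config)
ids = tf.constant([[7, 6, 0, 0, 1]])
hidden = emb([ids, None, None, None], mode="embedding")  # float32, shape (1, 5, 128)
logits = emb(hidden, mode="linear")                      # float32, shape (1, 5, 30522)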
class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense")
self.dense_prediction = tf.keras.layers.Dense(1, name="dense_prediction")
self.config = config
def call(self, discriminator_hidden_states, training=False):
hidden_states = self.dense(discriminator_hidden_states)
hidden_states = ACT2FN[self.config.hidden_act](hidden_states)
logits = tf.squeeze(self.dense_prediction(hidden_states))
return logits
class TFElectraGeneratorPredictions(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense")
def call(self, generator_hidden_states, training=False):
hidden_states = self.dense(generator_hidden_states)
hidden_states = ACT2FN["gelu"](hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
class TFElectraPreTrainedModel(TFBertPreTrainedModel):
config_class = ElectraConfig
pretrained_model_archive_map = TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "electra"
def get_extended_attention_mask(self, attention_mask, input_shape):
if attention_mask is None:
attention_mask = tf.fill(input_shape, 1)
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is simpler than the triangular masking of causal attention
# used in OpenAI GPT; we just need to prepare the broadcast dimension here.
extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
return extended_attention_mask
def get_head_mask(self, head_mask):
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.config.num_hidden_layers
return head_mask
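A minimal numeric check of the extended attention mask above (illustrative):
import tensorflow as tf
attention_mask = tf.constant([[1, 1, 0]])  # last position is padding
ext = tf.cast(attention_mask[:, tf.newaxis, tf.newaxis, :], tf.float32)
ext = (1.0 - ext) * -10000.0  # shape (1, 1, 1, 3): [0., 0., -10000.], broadcast over heads and query positions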
class TFElectraMainLayer(TFElectraPreTrainedModel):
config_class = ElectraConfig
def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)
self.embeddings = TFElectraEmbeddings(config, name="embeddings")
if config.embedding_size != config.hidden_size:
self.embeddings_project = tf.keras.layers.Dense(config.hidden_size, name="embeddings_project")
self.encoder = TFBertEncoder(config, name="encoder")
self.config = config
def get_input_embeddings(self):
return self.embeddings
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
raise NotImplementedError
def call(
self,
inputs,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
position_ids = inputs[3] if len(inputs) > 3 else position_ids
head_mask = inputs[4] if len(inputs) > 4 else head_mask
inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, (dict, BatchEncoding)):
input_ids = inputs.get("input_ids")
attention_mask = inputs.get("attention_mask", attention_mask)
token_type_ids = inputs.get("token_type_ids", token_type_ids)
position_ids = inputs.get("position_ids", position_ids)
head_mask = inputs.get("head_mask", head_mask)
inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_shape = shape_list(input_ids)
elif inputs_embeds is not None:
input_shape = shape_list(inputs_embeds)[:-1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if attention_mask is None:
attention_mask = tf.fill(input_shape, 1)
if token_type_ids is None:
token_type_ids = tf.fill(input_shape, 0)
extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
head_mask = self.get_head_mask(head_mask)
hidden_states = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)
if hasattr(self, "embeddings_project"):
hidden_states = self.embeddings_project(hidden_states, training=training)
hidden_states = self.encoder([hidden_states, extended_attention_mask, head_mask], training=training)
return hidden_states
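The embeddings_project branch fires whenever the embedding and hidden sizes differ, as in ELECTRA-small; a sketch (the config values here are assumptions):
config = ElectraConfig(embedding_size=128, hidden_size=256)
layer = TFElectraMainLayer(config, name="electra")
# hasattr(layer, "embeddings_project") is True; embeddings are projected 128 -> 256
# before entering the TFBertEncoder stack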
ELECTRA_START_DOCSTRING = r"""
This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
Use it as a regular TF 2.0 Keras Model and
refer to the TF 2.0 documentation for all matters related to general usage and behavior.
.. note::
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional arguments.
This second option is useful when using the :obj:`tf.keras.Model.fit()` method, which currently requires having
all the tensors in the first argument of the model call function: :obj:`model(inputs)`.
If you choose this second option, there are three possibilities you can use to gather all the input Tensors
in the first positional argument:
- a single Tensor with input_ids only and nothing else: :obj:`model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
:obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
:obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
Parameters:
config (:class:`~transformers.ElectraConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
ELECTRA_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`transformers.ElectraTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.encode_plus` for details.
`What are input IDs? <../glossary.html#input-ids>`__
attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
`What are attention masks? <../glossary.html#attention-mask>`__
head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
:obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
training (:obj:`boolean`, `optional`, defaults to :obj:`False`):
Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them
(if set to :obj:`False`) for evaluation.
"""
@add_start_docstrings(
"The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to "
"the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the "
"hidden size and embedding size are different."
""
"Both the generator and discriminator checkpoints may be loaded into this model.",
ELECTRA_START_DOCSTRING,
)
class TFElectraModel(TFElectraPreTrainedModel):
def __init__(self, config, *inputs, **kwargs):
super().__init__(config, *inputs, **kwargs)
self.electra = TFElectraMainLayer(config, name="electra")
def get_input_embeddings(self):
return self.electra.embeddings
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
def call(self, inputs, **kwargs):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraModel
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraModel.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
outputs = self.electra(inputs, **kwargs)
return outputs
@add_start_docstrings(
"""
Electra model with a binary classification head on top as used during pre-training for identifying generated
tokens.
Even though both the discriminator and generator may be loaded into this model, the discriminator is
the only model of the two to have the correct classification head to be used for this model.""",
ELECTRA_START_DOCSTRING,
)
class TFElectraForPreTraining(TFElectraPreTrainedModel):
def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)
self.electra = TFElectraMainLayer(config, name="electra")
self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions")
def get_input_embeddings(self):
return self.electra.embeddings
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Prediction scores of the head (scores for each token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForPreTraining
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
scores = outputs[0]
"""
discriminator_hidden_states = self.electra(
input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training
)
discriminator_sequence_output = discriminator_hidden_states[0]
logits = self.discriminator_predictions(discriminator_sequence_output)
output = (logits,)
output += discriminator_hidden_states[1:]
return output # (loss), scores, (hidden_states), (attentions)
class TFElectraMaskedLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super().__init__(**kwargs)
self.vocab_size = config.vocab_size
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
super().build(input_shape)
def call(self, hidden_states, training=False):
hidden_states = self.input_embeddings(hidden_states, mode="linear")
hidden_states = hidden_states + self.bias
return hidden_states
@add_start_docstrings(
"""
Electra model with a language modeling head on top.
Even though both the discriminator and generator may be loaded into this model, the generator is
the only model of the two to have been trained for the masked language modeling task.""",
ELECTRA_START_DOCSTRING,
)
class TFElectraForMaskedLM(TFElectraPreTrainedModel):
def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)
self.vocab_size = config.vocab_size
self.electra = TFElectraMainLayer(config, name="electra")
self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions")
if isinstance(config.hidden_act, str):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head")
def get_input_embeddings(self):
return self.electra.embeddings
def get_output_embeddings(self):
return self.generator_lm_head
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForMaskedLM
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0]
"""
generator_hidden_states = self.electra(
input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training
)
generator_sequence_output = generator_hidden_states[0]
prediction_scores = self.generator_predictions(generator_sequence_output, training=training)
prediction_scores = self.generator_lm_head(prediction_scores, training=training)
output = (prediction_scores,)
output += generator_hidden_states[1:]
return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
@add_start_docstrings(
"""
Electra model with a token classification head on top.
Both the discriminator and generator may be loaded into this model.""",
ELECTRA_START_DOCSTRING,
)
class TFElectraForTokenClassification(TFElectraPreTrainedModel):
def __init__(self, config, **kwargs):
super().__init__(config, **kwargs)
self.electra = TFElectraMainLayer(config, name="electra")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier")
@add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING)
def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
r"""
Returns:
:obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`):
Classification scores (before SoftMax).
hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
tuple of :obj:`tf.Tensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForTokenClassification
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
scores = outputs[0]
"""
discriminator_hidden_states = self.electra(
input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training
)
discriminator_sequence_output = discriminator_hidden_states[0]
discriminator_sequence_output = self.dropout(discriminator_sequence_output)
logits = self.classifier(discriminator_sequence_output)
output = (logits,)
output += discriminator_hidden_states[1:]
return output # (loss), scores, (hidden_states), (attentions)
class TFPoolerStartLogits(tf.keras.Model):
""" Compute SQuAD start_logits from sequence hidden states. """
def __init__(self, config, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
self.dense = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="start_logit_pooler_dense"
)
def call(self, hidden_states, p_mask=None, next_layer_dtype=tf.float32):
""" Args:
**p_mask**: (`optional`) ``tf.Tensor`` of shape ``(batch_size, seq_len)``
invalid position mask such as query and special symbols (PAD, SEP, CLS);
1.0 means the token should be masked.
"""
x = tf.squeeze(self.dense(hidden_states), axis=-1,
name="squeeze_start_logit_pooler")
if p_mask is not None:
if self.dense.dtype == tf.float16:
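# -65500 stays within float16's finite range (max ~65504); -1e30 would overflow to -inf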
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x
class TFPoolerEndLogits(tf.keras.Model):
""" Compute SQuAD end_logits from sequence hidden states and start token hidden state.
"""
def __init__(self, config, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
self.dense_0 = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range),
name="end_logit_pooler_dense_0"
)
self.activation = tf.keras.layers.Activation('tanh') # nn.Tanh()
self.LayerNorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=config.layer_norm_eps,
name="end_logit_pooler_LayerNorm")
self.dense_1 = tf.keras.layers.Dense(
1, kernel_initializer=get_initializer(config.initializer_range), name="end_logit_pooler_dense_1"
)
def call(self, hidden_states, start_states=None, start_positions=None, p_mask=None, training=False,
next_layer_dtype=tf.float32):
""" Args:
One of ``start_states``, ``start_positions`` should be not None.
If both are set, ``start_positions`` overrides ``start_states``.
**start_states**: ``torch.LongTensor`` of shape identical to hidden_states
hidden states of the first tokens for the labeled span.
**start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
position of the first token for the labeled span:
**p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
1.0 means token should be masked.
"""
assert (
start_states is not None or start_positions is not None
), "One of start_states, start_positions should be not None"
if start_positions is not None and training:
bsz, slen, hsz = hidden_states.shape
start_states = tf.gather(hidden_states, start_positions[:, None], axis=1,
batch_dims=1) # shape (bsz, 1, hsz)
start_states = tf.broadcast_to(start_states, (bsz, slen, hsz)) # shape (bsz, slen, hsz)
x = self.dense_0(tf.concat([hidden_states, start_states], axis=-1))
x = self.activation(x)
if training:
# no beam search during training, so add a singleton dimension matching the top_k dimension used at inference; without it the LayerNorm input rank mismatches and crashes
x = tf.expand_dims(x, axis=2)
x = self.LayerNorm(x)
if training:
# undo the additional dimension added above
x = tf.squeeze(self.dense_1(x), axis=[-1, -2])
else:
x = tf.squeeze(self.dense_1(x), axis=-1)
if p_mask is not None:
if next_layer_dtype == tf.float16:
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x
class TFPoolerAnswerClass(tf.keras.Model):
""" Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """
def __init__(self, config, *inputs, **kwargs):
super().__init__(*inputs, **kwargs)
self.dense_0 = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range),
name="pooler_answer_class_dense_0"
)
self.activation = tf.keras.layers.Activation('tanh')
self.dense_1 = tf.keras.layers.Dense(
1, use_bias=False, kernel_initializer=get_initializer(config.initializer_range),
name="pooler_answer_class_dense_1"
)
def call(self, hidden_states, start_states=None, start_positions=None, cls_index=None):
"""
Args:
One of ``start_states``, ``start_positions`` must not be None.
If both are set, ``start_positions`` overrides ``start_states``.
**start_states**: ``tf.Tensor`` of shape identical to ``hidden_states``.
Hidden states of the first tokens for the labeled span.
**start_positions**: ``tf.Tensor`` of shape ``(batch_size,)``
Position of the first token for the labeled span.
**cls_index**: ``tf.Tensor`` of shape ``(batch_size,)``
Position of the CLS token. If None, take the first token.
Note (from the original repo): there is no dependency on end features, so one
single `cls_logits` is obtained for each sample.
"""
assert (
start_states is not None or start_positions is not None
), "One of start_states, start_positions should be not None"
if start_positions is not None:
start_states = tf.gather(hidden_states, start_positions[:, None], axis=1,
batch_dims=1) # shape (bsz, 1, hsz)
start_states = tf.squeeze(start_states, axis=1) # shape (bsz, hsz)
if cls_index is not None:
cls_token_state = tf.gather(hidden_states, cls_index[:, None], axis=1, batch_dims=1) # shape (bsz, 1, hsz)
cls_token_state = tf.squeeze(cls_token_state, axis=1) # shape (bsz, hsz)
else:
cls_token_state = hidden_states[:, 0, :] # shape (bsz, hsz)
x = self.dense_0(tf.concat([start_states, cls_token_state], axis=-1))
x = self.activation(x)
x = tf.squeeze(self.dense_1(x), axis=-1)
return x
class TFElectraForQuestionAnswering(TFElectraPreTrainedModel):
def __init__(self, config, args):
super().__init__(config, args)
self.start_n_top = args.beam_size # config.start_n_top
self.end_n_top = args.beam_size # config.end_n_top
self.joint_head = args.joint_head
self.v2 = args.version_2_with_negative
self.electra = TFElectraMainLayer(config, name="electra")
self.num_hidden_layers = config.num_hidden_layers
self.amp = config.amp
# simple (non-joint) span head; the joint beam-search head is built below
if not self.joint_head:
self.qa_outputs = tf.keras.layers.Dense(
2, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs")
else:
self.start_logits = TFPoolerStartLogits(config, name='start_logits')
self.end_logits = TFPoolerEndLogits(config, name='end_logits')
if self.v2:
self.answer_class = TFPoolerAnswerClass(config, name='answer_class')
def call(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
start_positions=None,
end_positions=None,
cls_index=None,
p_mask=None,
is_impossible=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
outputs = self.electra(
input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training
)
discriminator_sequence_output = outputs[0]
# Simple head model
if not self.joint_head:
logits = self.qa_outputs(discriminator_sequence_output)
[start_logits, end_logits] = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1, name="squeeze_start_logit")
end_logits = tf.squeeze(end_logits, axis=-1, name="squeeze_end_logit")
outputs = (start_logits, end_logits) + outputs
return outputs
start_logits = self.start_logits(discriminator_sequence_output, p_mask=p_mask,
next_layer_dtype=self.end_logits.dense_0.dtype)
if training: # start_positions is not None and end_positions is not None:
# during training, compute the end logits based on the ground truth of the start position
end_logits = self.end_logits(discriminator_sequence_output, start_positions=start_positions, p_mask=p_mask,
training=training,
next_layer_dtype=tf.float16 if self.amp else tf.float32)
if self.v2:  # i.e. cls_index is not None and is_impossible is not None
# Predict answerability from the representation of CLS and START
cls_logits = self.answer_class(discriminator_sequence_output, start_positions=start_positions,
cls_index=cls_index)
else:
cls_logits = None
outputs = (start_logits, end_logits, cls_logits) + outputs
else:
# during inference, compute the end logits based on beam search
bsz, slen, hsz = discriminator_sequence_output.shape
start_n_top = min(self.start_n_top, slen)
end_n_top = min(self.end_n_top, slen)
start_log_probs = tf.nn.log_softmax(start_logits, axis=-1, name="start_logit_softmax") # shape (bsz, slen)
start_top_log_probs, start_top_index = tf.math.top_k(start_log_probs, k=start_n_top,
name="start_log_probs_top_k")
start_states = tf.gather(discriminator_sequence_output, start_top_index, axis=1,
batch_dims=1) # shape (bsz, start_n_top, hsz)
start_states = tf.broadcast_to(tf.expand_dims(start_states, axis=1),
[bsz, slen, start_n_top, hsz]) # shape (bsz, slen, start_n_top, hsz)
discriminator_sequence_output_expanded = tf.broadcast_to(
tf.expand_dims(discriminator_sequence_output, axis=2),
list(start_states.shape)) # shape (bsz, slen, start_n_top, hsz)
p_mask = tf.expand_dims(p_mask, axis=-1) if p_mask is not None else None
end_logits = self.end_logits(discriminator_sequence_output_expanded, start_states=start_states,
p_mask=p_mask, next_layer_dtype=tf.float16 if self.amp else tf.float32) # self.answer_class.dense_0.dtype)
end_log_probs = tf.nn.log_softmax(end_logits, axis=1,
name="end_logit_softmax") # shape (bsz, slen, start_n_top)
# need to transpose because tf.math.top_k works on default axis=-1
end_log_probs = tf.transpose(end_log_probs, perm=[0, 2, 1])
end_top_log_probs, end_top_index = tf.math.top_k(
end_log_probs, k=end_n_top) # shape (bsz, end_n_top, start_n_top).perm(0,2,1)
end_top_log_probs = tf.reshape(end_top_log_probs, (
-1, start_n_top * end_n_top)) # shape (bsz, self.start_n_top * self.end_n_top)
end_top_index = tf.reshape(end_top_index,
(-1, start_n_top * end_n_top)) # shape (bsz, self.start_n_top * self.end_n_top)
if self.v2: # cls_index is not None:
start_p = tf.nn.softmax(start_logits, axis=-1, name="start_softmax")
start_states = tf.einsum(
"blh,bl->bh", discriminator_sequence_output, start_p
) # get the representation of START as weighted sum of hidden states
# explicitly setting cls_index to None
cls_logits = self.answer_class(
discriminator_sequence_output, start_states=start_states, cls_index=None)
# one single `cls_logits` for each sample
else:
cls_logits = tf.fill([bsz], 0.0)
outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
# return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
return outputs

File diff suppressed because it is too large

View file

@ -0,0 +1,360 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""
import re
import collections
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import training_ops
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Applys a warmup schedule on a given learning rate decay schedule."""
def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None):
super().__init__()
self.initial_learning_rate = initial_learning_rate
self.warmup_steps = warmup_steps
self.power = power
self.decay_schedule_fn = decay_schedule_fn
self.name = name
def __call__(self, step):
with tf.name_scope(self.name or "WarmUp") as name:
# Implements polynomial warmup. i.e., if global_step < warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
global_step_float = tf.cast(step, tf.float32)
warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
warmup_percent_done = global_step_float / warmup_steps_float
warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
return tf.cond(
global_step_float < warmup_steps_float,
lambda: warmup_learning_rate,
lambda: self.decay_schedule_fn(step),
name=name,
)
def get_config(self):
return {
"initial_learning_rate": self.initial_learning_rate,
"decay_schedule_fn": self.decay_schedule_fn,
"warmup_steps": self.warmup_steps,
"power": self.power,
"name": self.name,
}
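# Usage sketch (illustrative, not part of the original file): WarmUp ramps the
# learning rate linearly (power=1.0) from 0 to `initial_learning_rate` over
# `warmup_steps`, then hands off to the wrapped decay schedule:
# decay_fn = tf.keras.optimizers.schedules.PolynomialDecay(
#     initial_learning_rate=1e-4, decay_steps=1000, end_learning_rate=0.0)
# schedule = WarmUp(initial_learning_rate=1e-4, decay_schedule_fn=decay_fn,
#                   warmup_steps=100)
# schedule(50)    # ~5e-5: halfway through warmup
# schedule(1000)  # ~0.0: fully decayed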
def create_optimizer(init_lr, num_train_steps, num_warmup_steps, weight_decay_rate=0.01,
layerwise_lr_decay=-1, n_transformer_layers=None, clip_norm=1.0):
"""Creates an optimizer with learning rate schedule."""
# Implements linear decay of the learning rate.
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=0.0
)
if num_warmup_steps:
learning_rate_fn = WarmUp(
initial_learning_rate=init_lr, decay_schedule_fn=learning_rate_fn, warmup_steps=num_warmup_steps
)
layer_decay = None
if layerwise_lr_decay > 0 and n_transformer_layers is not None:
layer_decay = _get_layer_decay(layerwise_lr_decay, n_transformer_layers)
optimizer = AdamWeightDecay(
learning_rate=learning_rate_fn,
weight_decay_rate=weight_decay_rate, # TODO (yy): update this as flag
layer_decay=layer_decay,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=["layer_norm", "bias"],
clip_norm=clip_norm,
)
return optimizer
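# Usage sketch (illustrative; the hyperparameter values are assumptions):
# optimizer = create_optimizer(init_lr=8e-4, num_train_steps=10000,
#                              num_warmup_steps=1000, weight_decay_rate=0.01,
#                              layerwise_lr_decay=0.8, n_transformer_layers=12)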
class AdamWeightDecay(tf.keras.optimizers.Adam):
"""Adam enables L2 weight decay and clip_by_global_norm on gradients.
Just adding the square of the weights to the loss function is *not* the
correct way of using L2 regularization/weight decay with Adam, since that will
interact with the m and v parameters in strange ways.
Instead we want to decay the weights in a manner that doesn't interact with
the m/v parameters. This is equivalent to adding the square of the weights to
the loss with plain (non-momentum) SGD.
"""
def __init__(
self,
learning_rate=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-7,
amsgrad=False,
weight_decay_rate=0.0,
include_in_weight_decay=None,
exclude_from_weight_decay=None,
layer_decay=None,
clip_norm=1.0,
name="AdamWeightDecay",
**kwargs
):
super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
self.weight_decay_rate = weight_decay_rate
self._include_in_weight_decay = include_in_weight_decay
self._exclude_from_weight_decay = exclude_from_weight_decay
self.layer_decay = layer_decay
self.clip_norm = clip_norm
@classmethod
def from_config(cls, config):
"""Creates an optimizer from its config with WarmUp custom object."""
custom_objects = {"WarmUp": WarmUp}
return super().from_config(config, custom_objects=custom_objects)
def _prepare_local(self, var_device, var_dtype, apply_state):
super()._prepare_local(var_device, var_dtype, apply_state)
apply_state["weight_decay_rate"] = tf.constant(self.weight_decay_rate, name="adam_weight_decay_rate")
def _decay_weights_op(self, var, learning_rate, apply_state):
do_decay = self._do_use_weight_decay(var.name)
if do_decay:
return var.assign_sub(
learning_rate * var * apply_state["weight_decay_rate"], use_locking=self._use_locking
)
return tf.no_op()
def apply_gradients(self, grads_and_vars, name=None, experimental_aggregate_gradients=True):
grads, tvars = list(zip(*grads_and_vars))
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=self.clip_norm)
return super().apply_gradients(zip(grads, tvars), name=name,
experimental_aggregate_gradients=experimental_aggregate_gradients)
def _get_lr(self, var, apply_state):
"""Retrieves the learning rate with the given state."""
# if apply_state is None:
# return self._decayed_lr_t[var_dtype], {}
var_name, var_device, var_dtype = var.name, var.device, var.dtype.base_dtype
apply_state = apply_state or {}
coefficients = apply_state.get((var_device, var_dtype))
if coefficients is None:
coefficients = self._fallback_apply_state(var_device, var_dtype)
apply_state[(var_device, var_dtype)] = coefficients
lr_t = coefficients["lr_t"]
lr = coefficients["lr"]
if self.layer_decay is not None:
update_for_var = False
for key in self.layer_decay:
if key in var_name:
update_for_var = True
lr_t *= self.layer_decay[key]
lr *= self.layer_decay[key]
break
if not update_for_var:
raise ValueError("No learning rate specified for variable", var)
return lr_t, lr, coefficients, dict(apply_state=apply_state)
def _resource_apply_dense(self, grad, var, apply_state=None):
# print("Dense: {} {} {}".format(var.name, var.device, var.dtype.base_dtype))
lr_t, _, coefficients, kwargs = self._get_lr(var, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
m = self.get_slot(var, 'm')
v = self.get_slot(var, 'v')
if not self.amsgrad:
return training_ops.resource_apply_adam(
var.handle,
m.handle,
v.handle,
coefficients['beta_1_power'],
coefficients['beta_2_power'],
lr_t,
coefficients['beta_1_t'],
coefficients['beta_2_t'],
coefficients['epsilon'],
grad,
use_locking=self._use_locking)
else:
vhat = self.get_slot(var, 'vhat')
return training_ops.resource_apply_adam_with_amsgrad(
var.handle,
m.handle,
v.handle,
vhat.handle,
coefficients['beta_1_power'],
coefficients['beta_2_power'],
lr_t,
coefficients['beta_1_t'],
coefficients['beta_2_t'],
coefficients['epsilon'],
grad,
use_locking=self._use_locking)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
# print("Sparse: {} {} {}".format(var.name, var.device, var.dtype.base_dtype))
lr_t, lr, coefficients, kwargs = self._get_lr(var, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, 'm')
m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
m_t = state_ops.assign(m, m * coefficients['beta_1_t'],
use_locking=self._use_locking)
with tf.control_dependencies([m_t]):
m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, 'v')
v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
v_t = state_ops.assign(v, v * coefficients['beta_2_t'],
use_locking=self._use_locking)
with tf.control_dependencies([v_t]):
v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
if not self.amsgrad:
v_sqrt = math_ops.sqrt(v_t)
var_update = state_ops.assign_sub(
var, lr * m_t / (v_sqrt + coefficients['epsilon']),
use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t])
else:
v_hat = self.get_slot(var, 'vhat')
v_hat_t = math_ops.maximum(v_hat, v_t)
with tf.control_dependencies([v_hat_t]):
v_hat_t = state_ops.assign(
v_hat, v_hat_t, use_locking=self._use_locking)
v_hat_sqrt = math_ops.sqrt(v_hat_t)
var_update = state_ops.assign_sub(
var,
lr * m_t / (v_hat_sqrt + coefficients['epsilon']),
use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t])
def get_config(self):
config = super().get_config()
config.update({"weight_decay_rate": self.weight_decay_rate})
return config
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if self.weight_decay_rate == 0:
return False
if self._include_in_weight_decay:
for r in self._include_in_weight_decay:
if re.search(r, param_name) is not None:
return True
if self._exclude_from_weight_decay:
for r in self._exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
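# Decoupled decay sketch (illustrative, not part of the original file): for a
# variable that passes _do_use_weight_decay, each step effectively applies
#   var <- var - lr_t * weight_decay_rate * var   (the decay op above)
#   var <- var - <Adam update>                    (the usual Adam step)
# instead of folding 0.5 * weight_decay_rate * ||var||^2 into the loss, which
# would leak the decay term into Adam's m/v statistics.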
# Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
class GradientAccumulator(object):
"""Distribution strategies-aware gradient accumulation utility."""
def __init__(self):
"""Initializes the accumulator."""
self._gradients = []
self._accum_steps = tf.Variable(
initial_value=0, dtype=tf.int64, trainable=False, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA
)
@property
def step(self):
"""Number of accumulated steps."""
return self._accum_steps.value()
@property
def gradients(self):
"""The accumulated gradients."""
return list(
gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients()
)
def __call__(self, gradients):
"""Accumulates :obj:`gradients`."""
if not self._gradients:
self._gradients.extend(
[
tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient
for gradient in gradients
]
)
if len(gradients) != len(self._gradients):
raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))
for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients):
if accum_gradient is not None and gradient is not None:
accum_gradient.assign_add(gradient)
self._accum_steps.assign_add(1)
def reset(self):
"""Resets the accumulated gradients."""
if self._gradients:
self._accum_steps.assign(0)
for gradient in self._get_replica_gradients():
if gradient is not None:
gradient.assign(tf.zeros_like(gradient))
def _get_replica_gradients(self):
if tf.distribute.has_strategy():
# In a replica context, we want to accumulate gradients on each replica
# without synchronization, so we directly assign the value of the
# current replica.
replica_context = tf.distribute.get_replica_context()
if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
return self._gradients
return (
gradient.device_map.select_for_current_replica(gradient.values, replica_context)
for gradient in self._gradients
if gradient is not None
)
else:
return self._gradients
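# Usage sketch (illustrative; `accumulation_steps` and the loop are assumptions):
# accumulator = GradientAccumulator()
# for grads in per_microbatch_gradients:
#     accumulator(grads)                      # adds into persistent variables
# if accumulator.step == accumulation_steps:
#     optimizer.apply_gradients(zip(accumulator.gradients,
#                                   model.trainable_variables))
#     accumulator.reset()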
def _get_layer_decay(layer_decay, n_layers):
"""Have lower learning rates for layers closer to the input."""
key_to_depths = collections.OrderedDict({
"/embeddings/": 0,
"/embeddings_project/": 0,
"/start_logits/": n_layers + 2,
"/end_logits/": n_layers + 2,
"/answer_class/": n_layers + 2,
"/qa_outputs/": n_layers + 2,
})
for layer in range(n_layers):
key_to_depths["encoder/layer_._" + str(layer) + "/"] = layer + 1
return {
key: layer_decay ** (n_layers + 2 - depth)
for key, depth in key_to_depths.items()
}
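# Worked example (illustrative): with layer_decay=0.8 and n_layers=12 the
# multiplier is 0.8 ** (14 - depth), so
#   "/embeddings/"        (depth 0)  -> 0.8**14 ~= 0.044
#   "encoder/layer_._0/"  (depth 1)  -> 0.8**13 ~= 0.055
#   "encoder/layer_._11/" (depth 12) -> 0.8**2   = 0.64
#   "/qa_outputs/"        (depth 14) -> 0.8**0   = 1.0
# i.e. layers closer to the input train with smaller learning rates.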

View file

@ -0,0 +1,212 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import subprocess
import time
import argparse
import json
import logging
import collections
import tensorflow as tf
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
from configuration import ElectraConfig
from modeling import TFElectraForQuestionAnswering
from tokenization import ElectraTokenizer
from squad_utils import SquadResult, RawResult, _get_best_indices
TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/electra-small-generator",
"google/electra-base-generator",
"google/electra-large-generator",
"google/electra-small-discriminator",
"google/electra-base-discriminator",
"google/electra-large-discriminator",
# See all ELECTRA models at https://huggingface.co/models?filter=electra
]
_PrelimPrediction = collections.namedtuple(
"PrelimPrediction",
["start_index", "end_index", "start_logit", "end_logit"])
def parse_args():
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument("--electra_model", default=None, type=str, required=True,
help="Model selected in the list: " + ", ".join(TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST))
parser.add_argument("--init_checkpoint",
default=None,
type=str,
required=True,
help="The checkpoint file from pretraining")
parser.add_argument("--question",
default=None,
type=str,
required=True,
help="Question")
parser.add_argument("--context",
default=None,
type=str,
required=True,
help="Context")
parser.add_argument(
"--joint_head",
default=True,
type=bool,
help="Jointly predict the start and end positions",
)
parser.add_argument(
"--beam_size",
default=4,
type=int,
help="Beam size when doing joint predictions",
)
parser.add_argument("--n_best_size", default=20, type=int,
help="The total number of n-best predictions to generate in the nbest_predictions.json "
"output file.")
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, the SQuAD examples contain some that do not have an answer.')
parser.add_argument('--null_score_diff_threshold',
type=float, default=0.0,
help="If null_score - best_non_null is greater than the threshold predict null.")
args = parser.parse_args()
return args
def get_predictions_joint_head(start_indices, end_indices, result, max_len, args):
predictions = []
for i in range(args.beam_size):
start_index = start_indices[i]
for j in range(args.beam_size):
# for end_index in end_indices:
end_index = end_indices[i * args.beam_size + j]
if start_index >= max_len:
continue
if end_index >= max_len:
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > args.max_answer_length:
continue
predictions.append(
_PrelimPrediction(
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[i],
end_logit=result.end_logits[i * args.beam_size + j]))
return predictions
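# Indexing sketch (illustrative): the joint head returns end candidates
# flattened to shape (beam_size * beam_size,), so the j-th end candidate for
# the i-th start candidate lives at a computed offset, e.g. with beam_size=4:
# i, j = 2, 1
# end_index = end_indices[i * 4 + j]  # pairs with start_indices[2]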
def get_predictions(start_indices, end_indices, result, max_len, args):
predictions = []
for start_index in start_indices:
for end_index in end_indices:
if start_index >= max_len:
continue
if end_index >= max_len:
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > args.max_answer_length:
continue
predictions.append(
_PrelimPrediction(
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
return predictions
def main():
args = parse_args()
print("***** Loading tokenizer and model *****")
electra_model = args.electra_model
config = ElectraConfig.from_pretrained(electra_model)
tokenizer = ElectraTokenizer.from_pretrained(electra_model)
model = TFElectraForQuestionAnswering.from_pretrained(electra_model, config=config, args=args)
print("***** Loading fine-tuned checkpoint: {} *****".format(args.init_checkpoint))
model.load_weights(args.init_checkpoint, by_name=False, skip_mismatch=False).expect_partial()
question, text = args.question, args.context
encoding = tokenizer.encode_plus(question, text, return_tensors='tf')
input_ids, token_type_ids, attention_mask = encoding["input_ids"], encoding["token_type_ids"], \
encoding["attention_mask"]
all_tokens = tokenizer.convert_ids_to_tokens(input_ids.numpy()[0])
if not args.joint_head:
start_logits, end_logits = model(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
)[:2]
start_logits = start_logits[0].numpy().tolist()
end_logits = end_logits[0].numpy().tolist()
result = RawResult(unique_id=0,
start_logits=start_logits,
end_logits=end_logits)
start_indices = _get_best_indices(result.start_logits, args.n_best_size)
end_indices = _get_best_indices(result.end_logits, args.n_best_size)
predictions = get_predictions(start_indices, end_indices, result, len(all_tokens), args)
null_score = result.start_logits[0] + result.end_logits[0]
else:
outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
output = [tensor[0].numpy().tolist() for tensor in outputs]
start_logits = output[0]
start_top_index = output[1]
end_logits = output[2]
end_top_index = output[3]
cls_logits = output[4]
result = SquadResult(
0,
start_logits,
end_logits,
start_top_index=start_top_index,
end_top_index=end_top_index,
cls_logits=cls_logits,
)
predictions = get_predictions_joint_head(result.start_top_index, result.end_top_index, result, len(all_tokens), args)
null_score = result.cls_logits
predictions = sorted(predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
answer = predictions[0]
answer = ' '.join(all_tokens[answer.start_index: answer.end_index + 1])
if args.null_score_diff_threshold > null_score and args.version_2_with_negative:
answer = ''
print(answer)
return answer
if __name__ == "__main__":
main()
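# Example invocation (illustrative; the script name, checkpoint path, and the
# question/context strings are assumptions):
# python run_inference.py \
#     --electra_model google/electra-base-discriminator \
#     --init_checkpoint checkpoints/electra_base_qa_v2_False_epoch_2_ckpt \
#     --question "Who discovered penicillin?" \
#     --context "Alexander Fleming discovered penicillin in 1928."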

View file

@ -0,0 +1,654 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import subprocess
import time
import argparse
import json
import logging
import tensorflow as tf
import horovod.tensorflow as hvd
from horovod.tensorflow.compression import Compression
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
from tqdm import tqdm
import dllogger
from utils import is_main_process, format_step, get_rank, get_world_size
from configuration import ElectraConfig
from modeling import TFElectraForQuestionAnswering
from tokenization import ElectraTokenizer
from optimization import create_optimizer
from squad_utils import SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features, \
SquadResult, RawResult, get_answers
# create logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/electra-small-generator",
"google/electra-base-generator",
"google/electra-large-generator",
"google/electra-small-discriminator",
"google/electra-base-discriminator",
"google/electra-large-discriminator",
# See all ELECTRA models at https://huggingface.co/models?filter=electra
]
def parse_args():
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument("--electra_model", default=None, type=str, required=True,
help="Model selected in the list: " + ", ".join(TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST))
parser.add_argument("--data_dir", default=None, type=str, required=True,
help="Path to dataset.")
parser.add_argument("--output_dir", default=".", type=str, required=True,
help="The output directory where the model checkpoints and predictions will be written.")
parser.add_argument("--init_checkpoint",
default=None,
type=str,
# required=True,
help="The checkpoint file from pretraining")
# Other parameters
parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
parser.add_argument("--do_eval",
action='store_true',
help="Whether to use evaluate accuracy of predictions")
parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
parser.add_argument("--predict_file", default=None, type=str,
help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
parser.add_argument("--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay_rate", default=0.01, type=float, help="Weight decay if we apply some.")
parser.add_argument("--layerwise_lr_decay", default=0.8, type=float,
help="The layerwise learning rate decay. Shallower layers have lower learning rates.")
parser.add_argument("--num_train_epochs", default=3, type=int,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1.0, type=float,
help="Total number of training steps to perform.")
parser.add_argument("--warmup_proportion", default=0.1, type=float,
help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
"of training.")
parser.add_argument("--max_seq_length", default=384, type=int,
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
"longer than this will be truncated, and sequences shorter than this will be padded.")
parser.add_argument("--doc_stride", default=128, type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks.")
parser.add_argument("--max_query_length", default=64, type=int,
help="The maximum number of tokens for the question. Questions longer than this will "
"be truncated to this length.")
parser.add_argument(
"--joint_head",
default=True,
type=bool,
help="Jointly predict the start and end positions",
)
parser.add_argument(
"--beam_size",
default=4,
type=int,
help="Beam size when doing joint predictions",
)
parser.add_argument("--n_best_size", default=20, type=int,
help="The total number of n-best predictions to generate in the nbest_predictions.json "
"output file.")
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
parser.add_argument("--verbose_logging", action='store_true',
help="If true, all of the warnings related to data processing will be printed. "
"A number of warnings are expected for a normal SQuAD evaluation.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument(
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
)
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument("--local_rank",
type=int,
default=os.getenv('LOCAL_RANK', -1),
help="local_rank for distributed training on gpus")
parser.add_argument('--amp',
action='store_true',
help="Automatic mixed precision training")
parser.add_argument('--fp16_all_reduce',
action='store_true',
help="Whether to use 16-bit all reduce")
parser.add_argument('--xla',
action='store_true',
help="Whether to use XLA")
parser.add_argument('--version_2_with_negative',
action='store_true',
help='If true, the SQuAD examples contain some that do not have an answer.')
parser.add_argument('--null_score_diff_threshold',
type=float, default=0.0,
help="If null_score - best_non_null is greater than the threshold predict null.")
parser.add_argument('--log_freq',
type=int, default=50,
help='frequency of logging loss.')
parser.add_argument('--json-summary', type=str, default="results/dllogger.json",
help='If provided, the json summary will be written to the specified file.')
parser.add_argument("--eval_script",
help="Script to evaluate squad predictions",
default="evaluate.py",
type=str)
parser.add_argument("--use_env",
action='store_true',
help="Whether to read local rank from ENVVAR")
parser.add_argument('--skip_checkpoint',
default=False,
action='store_true',
help="Whether to save checkpoints")
parser.add_argument('--disable-progress-bar',
default=False,
action='store_true',
help='Disable tqdm progress bar')
parser.add_argument("--skip_cache",
default=False,
action='store_true',
help="Whether to cache train features")
parser.add_argument("--cache_dir",
default=None,
type=str,
help="Location to cache train feaures. Will default to the dataset direct")
args = parser.parse_args()
if not args.do_train and (not args.init_checkpoint or args.init_checkpoint == 'None'):
raise ValueError("Checkpoint is required if do_train is not set")
return args
def get_dataset_from_features(features, batch_size, drop_remainder=True, ngpu=8, mode="train", v2=False):
"""Input function for training"""
all_input_ids = tf.convert_to_tensor([f.input_ids for f in features], dtype=tf.int64)
all_input_mask = tf.convert_to_tensor([f.attention_mask for f in features], dtype=tf.int64)
all_segment_ids = tf.convert_to_tensor([f.token_type_ids for f in features], dtype=tf.int64)
all_start_pos = tf.convert_to_tensor([f.start_position for f in features], dtype=tf.int64)
all_end_pos = tf.convert_to_tensor([f.end_position for f in features], dtype=tf.int64)
# if v2 else None:
all_cls_index = tf.convert_to_tensor([f.cls_index for f in features], dtype=tf.int64)
all_p_mask = tf.convert_to_tensor([f.p_mask for f in features], dtype=tf.float32)
all_is_impossible = tf.convert_to_tensor([f.is_impossible for f in features], dtype=tf.float32)
dataset = tf.data.Dataset.from_tensor_slices(
(all_input_ids, all_input_mask, all_segment_ids, all_start_pos, all_end_pos)
+ (all_cls_index, all_p_mask, all_is_impossible))
if ngpu > 1:
dataset = dataset.shard(get_world_size(), get_rank())
if mode == "train":
dataset = dataset.shuffle(batch_size * 3)
# dataset = dataset.map(self._preproc_samples,
# num_parallel_calls=multiprocessing.cpu_count()//self._num_gpus)
dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
dataset = dataset.prefetch(batch_size)
return dataset
@tf.function
def train_step(model, inputs, loss, amp, opt, init, v2=False, loss_class=None, fp16=False):
with tf.GradientTape() as tape:
[input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, is_impossible] = inputs
if not v2:
is_impossible = None
start_logits, end_logits, cls_logits = model(input_ids,
# input_ids=input_ids,
attention_mask=input_mask,
token_type_ids=segment_ids,
start_positions=start_positions,
end_positions=end_positions,
cls_index=cls_index,
p_mask=p_mask,
is_impossible=is_impossible,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=True,
)[0:3]
# If the position tensors carry an extra trailing dimension (e.g. on multi-GPU), squeeze it
if len(start_positions.shape) > 1:
start_positions = tf.squeeze(start_positions, axis=-1, name="squeeze_start_positions")
if len(end_positions.shape) > 1:
end_positions = tf.squeeze(end_positions, axis=-1, name="squeeze_end_positions")
if is_impossible is not None and len(is_impossible.shape) > 1 and v2 and cls_logits is not None:
is_impossible = tf.squeeze(is_impossible, axis=-1, name="squeeze_is_impossible")
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.shape[1]
start_positions = tf.clip_by_value(start_positions, 0, ignored_index, name="clip_start_positions")
end_positions = tf.clip_by_value(end_positions, 0, ignored_index, name="clip_end_positions")
start_loss = loss(y_true=start_positions, y_pred=start_logits)
end_loss = loss(y_true=end_positions, y_pred=end_logits)
loss_value = (start_loss + end_loss) / 2
if v2:
cls_loss_value = loss_class(y_true=is_impossible, y_pred=cls_logits)
loss_value += cls_loss_value * 0.5
unscaled_loss = tf.stop_gradient(loss_value)
if amp:
loss_value = opt.get_scaled_loss(loss_value)
tape = hvd.DistributedGradientTape(tape, sparse_as_dense=True,
compression=Compression.fp16 if fp16 else Compression.none)
gradients = tape.gradient(loss_value, model.trainable_variables)
if amp:
gradients = opt.get_unscaled_gradients(gradients)
opt.apply_gradients(zip(gradients, model.trainable_variables)) # , clip_norm=1.0)
if init:
hvd.broadcast_variables(model.variables, root_rank=0)
hvd.broadcast_variables(opt.variables(), root_rank=0)
return unscaled_loss # , outputs#, tape.gradient(loss_value, model.trainable_variables)
@tf.function
def infer_step(model, input_ids,
attention_mask=None,
token_type_ids=None,
cls_index=None,
p_mask=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
training=False,
):
return model(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
cls_index=cls_index,
p_mask=p_mask,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
training=training,
)
def main():
args = parse_args()
hvd.init()
if is_main_process():
print("Running total processes: {}".format(get_world_size()))
print("Starting process: {}".format(get_rank()))
if is_main_process():
dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
filename=args.json_summary),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)])
else:
dllogger.init(backends=[])
tf.random.set_seed(args.seed)
dllogger.log(step="PARAMETER", data={"SEED": args.seed})
# script parameters
BATCH_SIZE = args.train_batch_size
EVAL_BATCH_SIZE = args.predict_batch_size
USE_XLA = args.xla
USE_AMP = args.amp
EPOCHS = args.num_train_epochs
if not args.do_train:
EPOCHS = args.num_train_epochs = 1
print("Since running inference only, setting args.num_train_epochs to 1")
if not os.path.exists(args.output_dir) and is_main_process():
os.makedirs(args.output_dir)
# TensorFlow configuration
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
tf.config.optimizer.set_jit(USE_XLA)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
if is_main_process():
logger.info("***** Loading tokenizer and model *****")
# Load tokenizer and model from pretrained model/vocabulary
electra_model = args.electra_model
config = ElectraConfig.from_pretrained(electra_model, cache_dir=args.cache_dir)
config.update({"amp": args.amp})
tokenizer = ElectraTokenizer.from_pretrained(electra_model, cache_dir=args.cache_dir)
model = TFElectraForQuestionAnswering.from_pretrained(electra_model, config=config, cache_dir=args.cache_dir, args=args)
if is_main_process():
logger.info("***** Loading dataset *****")
# Load data
processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
train_examples = processor.get_train_examples(args.data_dir) if args.do_train else None
dev_examples = processor.get_dev_examples(args.data_dir) if args.do_predict else None
if is_main_process():
logger.info("***** Loading features *****")
# Load cached features
squad_version = '2.0' if args.version_2_with_negative else '1.1'
if args.cache_dir is None:
args.cache_dir = args.data_dir
cached_train_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_train-v{4}.json_{0}_{1}_{2}_{3}'.format(
electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride),
str(args.max_query_length), squad_version)
cached_dev_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_dev-v{4}.json_{0}_{1}_{2}_{3}'.format(
electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride),
str(args.max_query_length), squad_version)
try:
with open(cached_train_features_file, "rb") as reader:
train_features = pickle.load(reader) if args.do_train else []
with open(cached_dev_features_file, "rb") as reader:
dev_features = pickle.load(reader) if args.do_predict else []
except Exception:  # cached features missing or unreadable; rebuild them below
train_features = ( # TODO: (yy) do on rank 0?
squad_convert_examples_to_features(
examples=train_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=True,
return_dataset="",
)
if args.do_train
else []
)
dev_features = (
squad_convert_examples_to_features(
examples=dev_examples,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
is_training=False,
return_dataset="",
)
if args.do_predict
else []
)
# Dump Cached features
if not args.skip_cache and is_main_process():
if args.do_train:
print("***** Building Cache Files: {} *****".format(cached_train_features_file))
with open(cached_train_features_file, "wb") as writer:
pickle.dump(train_features, writer)
if args.do_predict:
print("***** Building Cache Files: {} *****".format(cached_dev_features_file))
with open(cached_dev_features_file, "wb") as writer:
pickle.dump(dev_features, writer)
len_train_features = len(train_features)
total_train_steps = int((len_train_features * EPOCHS / BATCH_SIZE) / get_world_size()) + 1
train_steps_per_epoch = int((len_train_features / BATCH_SIZE) / get_world_size()) + 1
len_dev_features = len(dev_features)
total_dev_steps = int((len_dev_features / EVAL_BATCH_SIZE)) + 1
train_dataset = get_dataset_from_features(train_features, BATCH_SIZE,
v2=args.version_2_with_negative) if args.do_train else []
dev_dataset = get_dataset_from_features(dev_features, EVAL_BATCH_SIZE, drop_remainder=False, ngpu=1, mode="dev",
v2=args.version_2_with_negative) if args.do_predict else []
opt = create_optimizer(init_lr=args.learning_rate, num_train_steps=total_train_steps,
num_warmup_steps=int(args.warmup_proportion * total_train_steps),
weight_decay_rate=args.weight_decay_rate,
layerwise_lr_decay=args.layerwise_lr_decay,
n_transformer_layers=model.num_hidden_layers)
if USE_AMP:
# loss scaling is currently required when using mixed precision
opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic")
# Define loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss_class = tf.keras.losses.BinaryCrossentropy(
from_logits=True,
name='binary_crossentropy'
)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=opt, loss=loss, metrics=[metric])
train_loss_results = []
if args.do_train and is_main_process():
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len_train_features)
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Instantaneous batch size per GPU = %d", args.train_batch_size)
logger.info(
" Total train batch size (w. parallel, distributed & accumulation) = %d",
args.train_batch_size
* get_world_size(),
)
logger.info(" Total optimization steps = %d", total_train_steps)
total_train_time = 0
latency = []
for epoch in range(EPOCHS):
if args.do_train:
epoch_loss_avg = tf.keras.metrics.Mean()
epoch_perf_avg = tf.keras.metrics.Mean()
epoch_start = time.time()
epoch_iterator = tqdm(train_dataset, total=train_steps_per_epoch, desc="Iteration", mininterval=5,
disable=not is_main_process())
for iter, inputs in enumerate(epoch_iterator):
# stopping criterion when max_steps > 0
if args.max_steps > 0 and (epoch * train_steps_per_epoch + iter) > args.max_steps:
break
iter_start = time.time()
# Optimize the model
loss_value = train_step(model, inputs, loss, USE_AMP, opt, (iter == 0 and epoch == 0),
v2=args.version_2_with_negative, loss_class=loss_class, fp16=USE_AMP)
epoch_perf_avg.update_state(1. * BATCH_SIZE / (time.time() - iter_start))
if iter % 100 == 0:
if is_main_process():
print("Epoch: {:03d}, Step:{:6d}, Loss:{:12.8f}, Perf:{:5.0f}".format(epoch, iter, loss_value,
epoch_perf_avg.result() * get_world_size()))
dllogger.log(step=(epoch, iter,), data={"step_loss": float(loss_value.numpy()),
"train_perf": float( epoch_perf_avg.result().numpy() * get_world_size())})
# Track progress
epoch_loss_avg.update_state(loss_value) # Add current batch loss
# End epoch
train_loss_results.append(epoch_loss_avg.result())
total_train_time += float(time.time() - epoch_start)
# Summarize and save checkpoint at the end of each epoch
if is_main_process():
# print(
# "**TRAIN SUMMARY** - Epoch {:03d}, Train_Loss: {:12.8f}, Train_Perf: {:5.0f} seq/s, Train_Time: {:5.0f} s"
# .format(epoch, epoch_loss_avg.result(), epoch_perf_avg.result() * get_world_size(), total_train_time))
dllogger.log(step=tuple(), data={"e2e_train_time": total_train_time,
"training_sequences_per_second": float(
epoch_perf_avg.result().numpy() * get_world_size()),
"final_loss": float(epoch_loss_avg.result().numpy())})
if not args.skip_checkpoint:
# checkpoint_name = "/workspace/electra/checkpoints/electra_base_qa_v2_{}_joint_head_{}_seed_{}_lr_{}_ckpt_{}".format(
# args.version_2_with_negative, args.joint_head, args.seed, args.learning_rate, epoch + 1)
checkpoint_name = "checkpoints/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.version_2_with_negative, epoch + 1)
if is_main_process():
model.save_weights(checkpoint_name)
if args.do_predict and (args.evaluate_during_training or epoch == args.num_train_epochs - 1):
if not args.do_train:
logger.info("***** Loading checkpoint: {} *****".format(args.init_checkpoint))
model.load_weights(args.init_checkpoint).expect_partial()
current_feature_id = 0
all_results = []
if is_main_process():
logger.info("***** Running evaluation *****")
logger.info(" Num examples = %d", total_dev_steps)
logger.info(" Batch size = %d", args.predict_batch_size)
raw_infer_start = time.time()
if is_main_process():
infer_perf_avg = tf.keras.metrics.Mean()
dev_iterator = tqdm(dev_dataset, total=total_dev_steps, desc="Iteration", mininterval=5,
disable=not is_main_process())
for input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, is_impossible in dev_iterator:
# training=False is needed only if there are layers with different
# behavior during training versus inference (e.g. Dropout).
iter_start = time.time()
if not args.joint_head:
batch_start_logits, batch_end_logits = infer_step(model, input_ids,
attention_mask=input_mask,
token_type_ids=segment_ids,
)[:2]
else:
outputs = infer_step(model, input_ids,
attention_mask=input_mask,
token_type_ids=segment_ids,
cls_index=cls_index,
p_mask=p_mask,
)
infer_time = (time.time() - iter_start)
infer_perf_avg.update_state(1. * EVAL_BATCH_SIZE / infer_time)
latency.append(1. * infer_time / EVAL_BATCH_SIZE)
for iter_ in range(input_ids.shape[0]):
if not args.joint_head:
start_logits = batch_start_logits[iter_].numpy().tolist()
end_logits = batch_end_logits[iter_].numpy().tolist()
dev_feature = dev_features[current_feature_id]
current_feature_id += 1
unique_id = int(dev_feature.unique_id)
all_results.append(RawResult(unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
else:
dev_feature = dev_features[current_feature_id]
current_feature_id += 1
unique_id = int(dev_feature.unique_id)
output = [tensor[iter_].numpy().tolist() for tensor in outputs]
start_logits = output[0]
start_top_index = output[1]
end_logits = output[2]
end_top_index = output[3]
cls_logits = output[4]
result = SquadResult(
unique_id,
start_logits,
end_logits,
start_top_index=start_top_index,
end_top_index=end_top_index,
cls_logits=cls_logits,
)
all_results.append(result)
# Compute and save predictions
answers, nbest_answers = get_answers(dev_examples, dev_features, all_results, args)
output_prediction_file = os.path.join(args.output_dir, "predictions.json")
output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
e2e_infer_time = time.time() - raw_infer_start
# if args.version_2_with_negative:
# output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
# else:
# output_null_log_odds_file = None
with open(output_prediction_file, "w") as f:
f.write(json.dumps(answers, indent=4) + "\n")
with open(output_nbest_file, "w") as f:
f.write(json.dumps(nbest_answers, indent=4) + "\n")
if args.do_eval:
if args.version_2_with_negative:
dev_file = "dev-v2.0.json"
else:
dev_file = "dev-v1.1.json"
eval_out = subprocess.check_output([sys.executable, args.eval_script,
args.data_dir + "/" + dev_file, output_prediction_file])
print(eval_out.decode('UTF-8'))
scores = str(eval_out).strip()
exact_match = float(scores.split(":")[1].split(",")[0])
if args.version_2_with_negative:
f1 = float(scores.split(":")[2].split(",")[0])
else:
f1 = float(scores.split(":")[2].split("}")[0])
logger.info("Epoch: {:03d} Results: {}".format(epoch, eval_out.decode('UTF-8')))
print("**EVAL SUMMARY** - Epoch: {:03d}, EM: {:6.3f}, F1: {:6.3f}, Infer_Perf: {:4.0f} seq/s"
.format(epoch, exact_match, f1, infer_perf_avg.result()))
latency_all = sorted(latency)[:-2]
print(
"**LATENCY SUMMARY** - Epoch: {:03d}, Ave: {:6.3f} ms, 90%: {:6.3f} ms, 95%: {:6.3f} ms, 99%: {:6.3f} ms"
.format(epoch, sum(latency_all) / len(latency_all) * 1000,
sum(latency_all[:int(len(latency_all) * 0.9)]) / int(len(latency_all) * 0.9) * 1000,
sum(latency_all[:int(len(latency_all) * 0.95)]) / int(len(latency_all) * 0.95) * 1000,
sum(latency_all[:int(len(latency_all) * 0.99)]) / int(len(latency_all) * 0.99) * 1000,
))
dllogger.log(step=tuple(),
data={"inference_sequences_per_second": float(infer_perf_avg.result().numpy()),
"e2e_inference_time": e2e_infer_time})
if is_main_process() and args.do_train and args.do_eval:
print(
"**RESULTS SUMMARY** - EM: {:6.3f}, F1: {:6.3f}, Train_Time: {:4.0f} s, Train_Perf: {:4.0f} seq/s, Infer_Perf: {:4.0f} seq/s"
.format(exact_match, f1, total_train_time, epoch_perf_avg.result() * get_world_size(),
infer_perf_avg.result()))
dllogger.log(step=tuple(), data={"exact_match": exact_match, "F1": f1})
if __name__ == "__main__":
main()

View file

@ -0,0 +1,28 @@
#!/usr/bin/env bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
mode=${1:-"train"}
num_gpu=${2:-"8"}
batch_size=${3:-"16"}
infer_batch_size=${4:-"$batch_size"}
precision=${5:-"fp16"}
SQUAD_VERSION=${6:-"1.1"}
squad_dir=${7:-"/workspace/electra/data/download/squad/v$SQUAD_VERSION"}
OUT_DIR=${8:-"results/"}
init_checkpoint=${9:-"None"}
cache_dir=${10:-"$squad_dir"}
bash scripts/run_squad.sh google/electra-base-discriminator 1 $batch_size $infer_batch_size 8e-4 $precision $num_gpu $RANDOM $SQUAD_VERSION $squad_dir $OUT_DIR $init_checkpoint $mode interactive $cache_dir 200
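# Example (illustrative; the wrapper's own path is an assumption): fine-tune on
# SQuAD v1.1 with 8 GPUs, per-GPU batch size 16 and mixed precision:
# bash scripts/run_squad_interactive.sh train 8 16 16 fp16 1.1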

View file

@ -0,0 +1,240 @@
#!/usr/bin/env bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
dgxa100_8gpu_amp ()
{
electra_model="google/electra-base-discriminator"
epochs="2"
batch_size="32"
learning_rate="8e-4"
precision="amp"
num_gpu="8"
seed="1"
SQUAD_VERSION="1.1"
squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
OUT_DIR="results/"
init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
echo $electra_model $epochs $batch_size $batch_size $learning_rate \
$precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
$OUT_DIR $init_checkpoint
}
dgxa100_8gpu_tf32 ()
{
electra_model="google/electra-base-discriminator"
epochs="2"
batch_size="32"
learning_rate="8e-4"
precision="tf32"
num_gpu="8"
seed="1"
SQUAD_VERSION="1.1"
squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
OUT_DIR="results/"
init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
echo $electra_model $epochs $batch_size $batch_size $learning_rate \
$precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
$OUT_DIR $init_checkpoint
}
# Full SQuAD training configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU)
dgx2_16gpu_amp ()
{
electra_model="google/electra-base-discriminator"
epochs="2"
batch_size="32"
learning_rate="1e-3"
precision="amp"
num_gpu="16"
seed="1"
SQUAD_VERSION="1.1"
squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
OUT_DIR="results/"
init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
echo $electra_model $epochs $batch_size $batch_size $learning_rate \
$precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
$OUT_DIR $init_checkpoint
}
dgx2_16gpu_fp32 ()
{
electra_model="google/electra-base-discriminator"
epochs="2"
batch_size="32"
learning_rate="1e-3"
precision="fp32"
num_gpu="16"
seed="1"
SQUAD_VERSION="1.1"
squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
OUT_DIR="results/"
init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
echo $electra_model $epochs $batch_size $batch_size $learning_rate \
$precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
$OUT_DIR $init_checkpoint
}
# Full SQuAD training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
dgx1_8gpu_amp ()
{
electra_model="google/electra-base-discriminator"
epochs="2"
batch_size="16"
learning_rate="4e-4"
precision="amp"
num_gpu="8"
seed="1"
SQUAD_VERSION="1.1"
squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
OUT_DIR="results/"
init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
echo $electra_model $epochs $batch_size $batch_size $learning_rate \
$precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
$OUT_DIR $init_checkpoint
}
dgx1_8gpu_fp32 ()
{
electra_model="google/electra-base-discriminator"
epochs="2"
batch_size="8"
learning_rate="3e-4"
precision="fp32"
num_gpu="8"
seed="1"
SQUAD_VERSION="1.1"
squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
OUT_DIR="results/"
init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
echo $electra_model $epochs $batch_size $batch_size $learning_rate \
$precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
$OUT_DIR $init_checkpoint
}
# 1GPU configs
dgxa100_1gpu_amp ()
{
electra_model="google/electra-base-discriminator"
epochs="2"
batch_size="32"
learning_rate="2e-4"
precision="amp"
num_gpu="1"
seed="1"
SQUAD_VERSION="1.1"
squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
OUT_DIR="results/"
init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
echo $electra_model $epochs $batch_size $batch_size $learning_rate \
$precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
$OUT_DIR $init_checkpoint
}
dgxa100_1gpu_tf32 ()
{
electra_model="google/electra-base-discriminator"
epochs="2"
batch_size="32"
learning_rate="2e-4"
precision="tf32"
num_gpu="1"
seed="1"
SQUAD_VERSION="1.1"
squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
OUT_DIR="results/"
init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
echo $electra_model $epochs $batch_size $batch_size $learning_rate \
$precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
$OUT_DIR $init_checkpoint
}
# Single-GPU SQuAD training configs for NVIDIA DGX-2H (1x NVIDIA V100 32GB GPU)
dgx2_1gpu_amp ()
{
electra_model="google/electra-base-discriminator"
epochs="2"
batch_size="32"
learning_rate="2e-4"
precision="amp"
num_gpu="1"
seed="1"
SQUAD_VERSION="1.1"
squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
OUT_DIR="results/"
init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
echo $electra_model $epochs $batch_size $batch_size $learning_rate \
$precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
$OUT_DIR $init_checkpoint
}
dgx2_1gpu_fp32 ()
{
electra_model="google/electra-base-discriminator"
epochs="2"
batch_size="32"
learning_rate="2e-4"
precision="fp32"
num_gpu="1"
seed="1"
SQUAD_VERSION="1.1"
squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
OUT_DIR="results/"
init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
echo $electra_model $epochs $batch_size $batch_size $learning_rate \
$precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
$OUT_DIR $init_checkpoint
}
# Single-GPU SQuAD training configs for NVIDIA DGX-1 (1x NVIDIA V100 16GB GPU)
dgx1_1gpu_amp ()
{
electra_model="google/electra-base-discriminator"
epochs="2"
batch_size="16"
learning_rate="1e-4"
precision="amp"
num_gpu="1"
seed="1"
SQUAD_VERSION="1.1"
squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
OUT_DIR="results/"
init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
echo $electra_model $epochs $batch_size $batch_size $learning_rate \
$precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
$OUT_DIR $init_checkpoint
}
dgx1_1gpu_fp32 ()
{
electra_model="google/electra-base-discriminator"
epochs="2"
batch_size="8"
learning_rate="1e-4"
precision="fp32"
num_gpu="1"
seed="1"
SQUAD_VERSION="1.1"
squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
OUT_DIR="results/"
init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
echo $electra_model $epochs $batch_size $batch_size $learning_rate \
$precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
$OUT_DIR $init_checkpoint
}
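These config functions only echo their settings; a minimal usage sketch (assuming the file lives at scripts/configs/squad_config.sh and is consumed by the run_squad.sh wrapper shown further below; both paths are assumptions, since the diff view elides file names) splices a config into the wrapper's positional arguments:

source scripts/configs/squad_config.sh
bash scripts/run_squad.sh $(dgxa100_8gpu_amp) train_eval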

View file

@ -0,0 +1,2 @@
#!/bin/bash
docker build --network=host . --rm -t electra

View file

@ -0,0 +1,16 @@
#!/bin/bash
CMD=${1:-/bin/bash}
NV_VISIBLE_DEVICES=${2:-"all"}
DOCKER_BRIDGE=${3:-"host"}
docker run -it --rm \
--gpus device=$NV_VISIBLE_DEVICES \
--net=$DOCKER_BRIDGE \
--shm-size=1g \
--ulimit memlock=-1 \
--ulimit stack=67108864 \
--privileged \
-e LD_LIBRARY_PATH='/workspace/install/lib/' \
    -v "$PWD":/workspace/electra \
electra $CMD
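A hedged example of driving these container helpers (the script paths are assumptions, since the diff view elides file names; the build script above tags the image as electra, which this launch script runs):

bash scripts/docker/build.sh                  # build the electra image
bash scripts/docker/launch.sh                 # interactive shell, all GPUs
bash scripts/docker/launch.sh "nvidia-smi" 0  # run a single command on GPU 0 only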

View file

@ -0,0 +1,111 @@
#!/usr/bin/env bash
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
echo "Container nvidia build = " $NVIDIA_BUILD_ID
electra_model=${1:-"google/electra-base-discriminator"}
epochs=${2:-"2"}
batch_size=${3:-"16"}
infer_batch_size=${4:-"$batch_size"}
learning_rate=${5:-"4e-4"}
precision=${6:-"amp"}
num_gpu=${7:-"8"}
seed=${8:-"1"}
SQUAD_VERSION=${9:-"1.1"}
squad_dir=${10:-"/workspace/electra/data/download/squad/v$SQUAD_VERSION"}
OUT_DIR=${11:-"results/"}
init_checkpoint=${12:-"None"}
mode=${13:-"train_eval"}
env=${14:-"interactive"}
cache_dir=${15:-"$squad_dir"}
max_steps=${16:-"-1"}
echo "out dir is $OUT_DIR"
mkdir -p $OUT_DIR
if [ ! -d "$OUT_DIR" ]; then
    echo "ERROR: output directory $OUT_DIR does not exist and could not be created"
exit 1
fi
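# precision="amp" enables TensorFlow automatic mixed precision together with
# XLA JIT compilation; any other value runs in the framework's default precision.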
use_fp16=""
if [ "$precision" = "amp" ] ; then
    echo "mixed-precision training and XLA enabled!"
use_fp16=" --amp --xla "
fi
if [ "$num_gpu" = "1" ] ; then
export CUDA_VISIBLE_DEVICES=0
mpi_command=" "
else
unset CUDA_VISIBLE_DEVICES
mpi_command=" horovodrun -np $num_gpu "
fi
if [ "$env" = "cluster" ] ; then
unset CUDA_VISIBLE_DEVICES
mpi_command=" "
fi
v2=""
echo "Running SQuAD-v$SQUAD_VERSION"
if [ "$SQUAD_VERSION" = "2.0" ] ; then
v2=" --version_2_with_negative "
fi
CMD=" $mpi_command python run_tf_squad.py "
CMD+="--init_checkpoint=$init_checkpoint "
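# mode selects which phases run: "train", "eval" (prediction plus scoring with
# the official evaluate script), "prediction" (prediction only), or the default
# "train_eval", which chains training and evaluation.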
if [ "$mode" = "train" ] ; then
CMD+="--do_train "
CMD+="--train_batch_size=$batch_size "
elif [ "$mode" = "eval" ] ; then
CMD+="--do_predict "
CMD+="--predict_batch_size=$batch_size "
CMD+="--eval_script=$squad_dir/evaluate-v$SQUAD_VERSION.py "
CMD+="--do_eval "
elif [ "$mode" = "prediction" ] ; then
CMD+="--do_predict "
CMD+="--predict_batch_size=$batch_size "
else
CMD+=" --do_train "
CMD+=" --train_batch_size=$batch_size "
CMD+="--do_predict "
CMD+="--predict_batch_size=$batch_size "
CMD+="--eval_script=$squad_dir/evaluate-v$SQUAD_VERSION.py "
CMD+="--do_eval "
fi
CMD+=" $v2 "
CMD+=" --data_dir $squad_dir "
CMD+=" --do_lower_case "
CMD+=" --electra_model=$electra_model "
CMD+=" --learning_rate=$learning_rate "
CMD+=" --warmup_proportion 0.05 "
CMD+=" --weight_decay_rate 0.01 "
CMD+=" --layerwise_lr_decay 0.8 "
CMD+=" --seed=$seed "
CMD+=" --num_train_epochs=$epochs "
CMD+=" --max_seq_length=384 "
CMD+=" --doc_stride=128 "
CMD+=" --beam_size 4 "
CMD+=" --joint_head True "
CMD+=" --null_score_diff_threshold -5.6 "
CMD+=" --output_dir=$OUT_DIR "
CMD+=" $use_fp16"
CMD+=" --cache_dir=$cache_dir "
CMD+=" --max_steps=$max_steps "
LOGFILE=$OUT_DIR/logfile.txt
echo "$CMD |& tee $LOGFILE"
time $CMD |& tee $LOGFILE
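For instance (a sketch; the wrapper's file name and the repo layout are assumed from context), fine-tuning and then evaluating ELECTRA-base on SQuAD v1.1 with 8 GPUs in mixed precision, leaving the remaining arguments at their defaults:

bash scripts/run_squad.sh google/electra-base-discriminator 2 32 32 8e-4 amp 8 1 1.1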

File diff suppressed because it is too large

View file

@ -0,0 +1,64 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tokenization_utils import BertTokenizer
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/vocab.txt",
"google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/vocab.txt",
"google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/vocab.txt",
"google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/vocab.txt",
"google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt",
"google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/electra-small-generator": 512,
"google/electra-base-generator": 512,
"google/electra-large-generator": 512,
"google/electra-small-discriminator": 512,
"google/electra-base-discriminator": 512,
"google/electra-large-discriminator": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"google/electra-small-generator": {"do_lower_case": True},
"google/electra-base-generator": {"do_lower_case": True},
"google/electra-large-generator": {"do_lower_case": True},
"google/electra-small-discriminator": {"do_lower_case": True},
"google/electra-base-discriminator": {"do_lower_case": True},
"google/electra-large-discriminator": {"do_lower_case": True},
}
class ElectraTokenizer(BertTokenizer):
r"""
Constructs an Electra tokenizer.
:class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
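A short usage sketch (a hypothetical snippet: from_pretrained, tokenize, and convert_tokens_to_ids are inherited from the BertTokenizer superclass in tokenization_utils, and the model identifier resolves through PRETRAINED_VOCAB_FILES_MAP above):

tokenizer = ElectraTokenizer.from_pretrained("google/electra-base-discriminator")
tokens = tokenizer.tokenize("ELECTRA detects replaced tokens.")  # wordpiece tokens
ids = tokenizer.convert_tokens_to_ids(tokens)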

File diff suppressed because it is too large

View file

@ -0,0 +1,159 @@
# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import pickle
import sys
import unicodedata
import six
import horovod.tensorflow as hvd
import tensorflow as tf
def get_rank():
    try:
        return hvd.rank()
    except ValueError:  # Horovod has not been initialized via hvd.init()
        return 0
def get_world_size():
    try:
        return hvd.size()
    except ValueError:  # Horovod has not been initialized via hvd.init()
        return 1
def is_main_process():
return get_rank() == 0
def format_step(step):
if isinstance(step, str):
return step
s = ""
if len(step) > 0:
s += "Training Epoch: {} ".format(step[0])
if len(step) > 1:
s += "Training Iteration: {} ".format(step[1])
if len(step) > 2:
s += "Validation Iteration: {} ".format(step[2])
return s
def load_json(path):
with tf.io.gfile.GFile(path, "r") as f:
return json.load(f)
def write_json(o, path):
if "/" in path:
tf.io.gfile.makedirs(path.rsplit("/", 1)[0])
with tf.io.gfile.GFile(path, "w") as f:
json.dump(o, f)
def load_pickle(path):
with tf.io.gfile.GFile(path, "rb") as f:
return pickle.load(f)
def write_pickle(o, path):
if "/" in path:
tf.io.gfile.makedirs(path.rsplit("/", 1)[0])
with tf.io.gfile.GFile(path, "wb") as f:
pickle.dump(o, f, -1)
def mkdir(path):
if not tf.io.gfile.exists(path):
tf.io.gfile.makedirs(path)
def rmrf(path):
if tf.io.gfile.exists(path):
tf.io.gfile.rmtree(path)
def rmkdir(path):
rmrf(path)
mkdir(path)
def log(*args):
msg = " ".join(map(str, args))
sys.stdout.write(msg + "\n")
sys.stdout.flush()
def log_config(config):
for key, value in sorted(config.__dict__.items()):
log(key, value)
log()
def heading(*args):
log(80 * "=")
log(*args)
log(80 * "=")
def nest_dict(d, prefixes, delim="_"):
    """Go from {prefix_key: value} to {prefix: {key: value}}."""
    nested = {}
    for k, v in d.items():
        for prefix in prefixes:
            if k.startswith(prefix + delim):
                if prefix not in nested:
                    nested[prefix] = {}
                nested[prefix][k.split(delim, 1)[1]] = v
                break
        else:
            # The for/else runs only when no prefix matched, so an unprefixed
            # key is copied through exactly once even with multiple prefixes.
            nested[k] = v
    return nested
def flatten_dict(d, delim="_"):
"""Go from {prefix: {key: value}} to {prefix_key: value}."""
flattened = {}
for k, v in d.items():
if isinstance(v, dict):
for k2, v2 in v.items():
flattened[k + delim + k2] = v2
else:
flattened[k] = v
return flattened
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
        raise ValueError("Not running on Python 2 or Python 3?")
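To make the nest/flatten contract concrete, a small round-trip sketch (hypothetical keys):

config = {"opt_lr": 1e-4, "opt_beta1": 0.9, "seed": 1}
nested = nest_dict(config, prefixes=["opt"])
# nested == {"opt": {"lr": 1e-4, "beta1": 0.9}, "seed": 1}
assert flatten_dict(nested) == config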