diff --git a/README.md b/README.md index 1ba59c12..c6f55be9 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ The examples are organized first by framework, such as TensorFlow, PyTorch, etc. - __GNMT__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Translation/GNMT)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Translation/GNMT)] - __Transformer__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Translation/Transformer)] - __BERT__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT)] -- __Transformer-XL__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/Transformer-XL)] +- __Transformer-XL__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/Transformer-XL)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/Transformer-XL)] ### Recommender Systems @@ -79,6 +79,7 @@ The examples are organized first by framework, such as TensorFlow, PyTorch, etc. | [SSD320 v1.2](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Detection/SSD) | TensorFlow | N/A | Yes | Yes | - | - | - | - | - | | [BERT](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT) |TensorFlow | N/A | Yes | Yes | Yes | Yes | - | [Yes](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT/trtis) | Yes | | [BioBert](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT/biobert) | TensorFlow | N/A | Yes | Yes | - | - | - | - | - | +| [Transformer-XL](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/Transformer-XL) |TensorFlow | N/A | Yes | Yes | - | - | - | - | - | | [Neural Collaborative Filtering](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/NCF) |TensorFlow | N/A | Yes | Yes | - | - | - | - | - | | [Variational Autoencoder Collaborative Filtering](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/VAE-CF) |TensorFlow | N/A | Yes | Yes | - | - | - | - | - | | [WideAndDeep](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/WideAndDeep) | TensorFlow | N/A | Yes | Yes | - | - | - | - | - | diff --git a/TensorFlow/LanguageModeling/Transformer-XL/Dockerfile b/TensorFlow/LanguageModeling/Transformer-XL/Dockerfile new file mode 100755 index 00000000..b2299f4f --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/Dockerfile @@ -0,0 +1,7 @@ +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:19.12-tf1-py3 +FROM ${FROM_IMAGE_NAME} + +WORKDIR /workspace/transformer-xl/tf +RUN pip --no-cache-dir --no-cache install 'git+https://github.com/NVIDIA/dllogger' + +ADD tf/ /workspace/transformer-xl/tf diff --git a/TensorFlow/LanguageModeling/Transformer-XL/LICENSE b/TensorFlow/LanguageModeling/Transformer-XL/LICENSE new file mode 100755 index 00000000..261eeb9e --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/TensorFlow/LanguageModeling/Transformer-XL/NOTICE b/TensorFlow/LanguageModeling/Transformer-XL/NOTICE new file mode 100644 index 00000000..bda76718 --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/NOTICE @@ -0,0 +1,9 @@ +Transformer-XL for Tensorflow + +This repository includes software from https://github.com/kimiyoung/transformer-xl licensed under the Apache License 2.0. + +This repository includes software from https://github.com/salesforce/awd-lstm-lm licensed under the BSD-3-Clause license. + +This repository includes software from https://github.com/cybertronai/transformer-xl licensed under the Apache License 2.0. + +This repository includes software from https://github.com/cybertronai/pytorch-lamb licensed under the MIT license. diff --git a/TensorFlow/LanguageModeling/Transformer-XL/README.md b/TensorFlow/LanguageModeling/Transformer-XL/README.md new file mode 100755 index 00000000..b678bca9 --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/README.md @@ -0,0 +1,945 @@ +# Transformer-XL For TensorFlow + +This repository provides a script and recipe to train the Transformer-XL model +to achieve state-of-the-art accuracy and is tested and maintained by NVIDIA. + +## Table Of Contents + + + +* [Model overview](#model-overview) + * [Model architecture](#model-architecture) + * [Default configuration](#default-configuration) + * [Feature support matrix](#feature-support-matrix) + * [Features](#features) + * [Mixed precision training](#mixed-precision-training) + * [Enabling mixed precision](#enabling-mixed-precision) +* [Setup](#setup) + * [Requirements](#requirements) +* [Quick Start Guide](#quick-start-guide) +* [Advanced](#advanced) + * [Scripts and sample code](#scripts-and-sample-code) + * [Parameters](#parameters) + * [Command-line options](#command-line-options) + * [Getting the data](#getting-the-data) + * [Dataset guidelines](#dataset-guidelines) + * [Multi-dataset](#multi-dataset) + * [Training process](#training-process) + * [Inference process](#inference-process) +* [Performance](#performance) + * [Benchmarking](#benchmarking) + * [Training performance benchmark](#training-performance-benchmark) + * [Inference performance benchmark](#inference-performance-benchmark) + * [Results](#results) + * [Training accuracy results](#training-accuracy-results) + * [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g) + * [Base model](#base-model) + * [Training accuracy: NVIDIA DGX-2 (16x V100 32G)](#training-accuracy-nvidia-dgx-2-16x-v100-32g) + * [Base model](#base-model-1) + * [Training loss plot](#training-loss-plot) + * [Base model](#base-model-2) + * [Training stability test](#training-stability-test) + * [Base model](#base-model-3) + * [Training performance results](#training-performance-results) + * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g) + * [Base model](#base-model-4) + * [Training performance: NVIDIA DGX-2 (16x V100 32G)](#training-performance-nvidia-dgx-2-16x-v100-32g) + * [Base model](#base-model-5) + * [Inference performance results](#inference-performance-results) + * [Inference performance: NVIDIA DGX-1 (1x V100 16G)](#inference-performance-nvidia-dgx-1-1x-v100-16g) + * [Base model](#base-model-6) + * [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4) + * [Base model](#base-model-7) +* [Release notes](#release-notes) + * [Changelog](#changelog) + * [Known issues](#known-issues) + + + +## Model overview + +This 
repository provides an implementation of the Transformer-XL model in +[TensorFlow](https://www.tensorflow.org) from the paper [Transformer-XL: Attentive +Language Models Beyond a Fixed-Length +Context](https://arxiv.org/abs/1901.02860). Transformer-XL is a +transformer-based language model with a segment-level recurrence and a novel +relative positional encoding. Enhancements introduced in Transformer-XL help +capture better long-term dependencies by attending to tokens from multiple +previous segments. + +Our implementation is based on the +[codebase](https://github.com/kimiyoung/transformer-xl) published by the +authors of the Transformer-XL paper. +Our implementation uses a modified model architecture. Our +modifications were made to achieve better hardware utilization and to take +advantage of Tensor Cores. Similar modifications were also proposed in an +implementation available from +[github.com/cybertronai/transformer-xl](https://github.com/cybertronai/transformer-xl). +Refer to the [Model architecture](#model-architecture) section for more +details. + +This model is trained with mixed precision using Tensor Cores on NVIDIA Volta +GPUs and evaluated on Volta and Turing GPUs. Therefore, researchers can get +results up to 1.5x faster than training without Tensor Cores, while +experiencing the benefits of mixed precision training. This model is tested +against each NGC monthly container release to ensure consistent accuracy and +performance over time. + +### Model architecture + +The Transformer-XL "base" model for WikiText-103 dataset available in this +repository was modified to use the following hyperparameter values: + + +|**Hyperparameter**|**Description**|**Original setting for the base model**|**Our modification to the base model**| +|------------------|---------------|--------------------------------------:|--------------------------------------:| +| `d_model` | hidden size | 410 | 512 | +| `n_head` | number of attention heads | 10 | 8 | +| `d_head` | size of each attention head | 41 | 64 | +| `d_inner` | hidden size in fully-connected layers | 2100 | 2048 | +| `tgt_len` | number of tokens to predict during training | 150 | 192 | +| `mem_len` | number of tokens cached from previous iterations during training | 150 | 192 | + +Changes described above were made to align certain hyperparameters with powers +of two, with this modification, the model is able to achieve better hardware +utilization, and therefore higher training throughput. + +The following table lists the hyperparameters for the base +Transformer-XL model for WikiText-103 dataset available in this repository. 
+ +| **Hyperparameter** | **Description** | **Base model** | +| ------------------ | ---------------------------------------------------------------- | -------------: | +| `n_layer` | number of layers | 16 | +| `d_model` | hidden size | 512 | +| `n_head` | number of attention heads | 8 | +| `d_head` | size of each attention head | 64 | +| `d_inner` | inner hidden size in fully-connected layers | 2048 | +| `dropout` | dropout | 0.1 | +| `dropatt` | dropout after softmax in the attention | 0.0 | +| `lr` | base learning rate | 0.01 | +| `min_lr_ratio` | minimum ratio learning rate (for cosine decay) | 0.1 | +| `max_step` | number of training steps | 40,000 | +| `warmup_step` | number of learning rate warmup steps | 1,000 | +| `batch_size` | training batch size | 256 | +| `tgt_len` | number of tokens to predict during training | 192 | +| `mem_len` | number of tokens cached from previous iterations during training | 192 | + + +The Transformer-XL model addresses the limitations of vanilla transformer-based +language models, which are only able to use relatively short context, bounded +by the segment length. The Transformer-XL introduces a recurrence mechanism, +which is able to use a cached hidden state from previous segments. During +training, the context consists of a concatenation of the current segment's hidden +state and cached states from previous iterations. Gradients are backpropagated +only through the current segment, although the model is able to take advantage +of the extra information stored in the cache and therefore is able to model +long-term dependencies. + +An illustration of the recurrence mechanism taken from the [Transformer-XL +paper](https://arxiv.org/abs/1901.02860) is shown below. +![model](tf/img/model.png) + + +### Default configuration + +The following features were implemented in this model: + +* general + * single-node, Horovod multi-GPU training + * training and inference with mixed precision using Tensor Cores + * automatic mixed precision training (AMP) + +* model + * 16-layer base Transformer-XL model with hidden size 512, 8 attention heads, + each head with hidden size 64 + * the model trained on + [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) + dataset, using word-level vocabulary and + adaptive softmax + * embedding weights are tied with weights in the classifier + +* training + * training with [LAMB](https://arxiv.org/abs/1904.00962) optimizer, the + implementation of the optimizer uses [XLA](https://www.tensorflow.org/xla), which enables + the fusion of elementwise operations and accelerates the training + * support for training with a gradient accumulation + * base model: + * linear learning rate warmup for 1,000 iterations, followed by the cosine + learning rate schedule, the initial learning rate is set to 0.0, and the final + learning rate is set to 0.001 (min_lr_ratio * base_lr) + * training for 40,000 steps, using a batch size of 256 + +* inference + * support for single-GPU inference + * each token is using the same size of the context from previous time steps. 
+ * base model: + * target length is set to 64, length of memory is set to 640 + * positional embeddings are clamped after 400 time steps + +### Feature support matrix + +The following features are supported by this model: + +| **Feature** | **Transformer-XL** | +|:------------|-------------------:| +|[Automatic mixed precision (AMP)](https://nvidia.github.io/apex/amp.html) | Yes | +|[Horovod Multi-GPU (NCCL)](https://github.com/horovod/horovod) | Yes | +|[LAMB](https://arxiv.org/abs/1904.00962v3) | Yes | + + +#### Features + +[TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) - a +tool that enables Tensor Core-accelerated training. Refer to the [Enabling +mixed precision](#enabling-mixed-precision) section for more details. + +[Horovod](https://github.com/horovod/horovod) - Horovod +is a distributed training framework for TensorFlow, Keras, PyTorch, and MXNet. +The goal of Horovod is to make distributed deep learning fast and easy to use. +For more information about how to get started with Horovod, see the [Horovod: +Official repository](https://github.com/horovod/horovod). + +[Multi-GPU training with Horovod](https://github.com/horovod/horovod/#usage) - our model +uses Horovod to implement efficient multi-GPU training with NCCL. For details, +see example sources in this repository or see the [TensorFlow +tutorial](https://github.com/horovod/horovod/#usage). + +[LAMB](https://arxiv.org/abs/1904.00962v3) - stands +for Layerwise Adaptive Moments Based optimizer, is a large batch optimization +technique that helps accelerate training of deep neural networks using large +minibatches. + +### Mixed precision training + +Mixed precision is the combined use of different numerical precisions in a +computational method. +[Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant +computational speedup by performing operations in half-precision format while +storing minimal information in single-precision to retain as much information +as possible in critical parts of the network. Since the introduction of [Tensor +Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing +architectures, significant training speedups are experienced by switching to +mixed precision -- up to 3x overall speedup on the most arithmetically intense +model architectures. Using mixed precision training previously required two +steps: + +1. Porting the model to use the FP16 data type where appropriate. +2. Manually adding loss scaling to preserve small gradient values. + +The ability to train deep learning networks with lower precision was introduced +in the Pascal architecture and first supported in [CUDA +8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep +Learning SDK. + +For information about: + +* How to train using mixed precision, see the [Mixed Precision + Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed + Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) + documentation. +* Techniques used for mixed precision training, see the [Mixed-Precision + Training of Deep Neural + Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) + blog. +* How to access and enable AMP for TensorFlow, see [Using + TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) + from the TensorFlow User Guide. 
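+
+As a minimal, hedged sketch of the usual TF 1.x entry points for AMP (this repository wires AMP through its own `--fp16` flag, so the snippet below is generic and illustrative rather than the exact mechanism used here; the next section describes how TF-AMP works):
+
+```
+import os
+import tensorflow as tf
+
+# Option 1 (NVIDIA TensorFlow containers): request the automatic mixed
+# precision graph rewrite for every session through an environment variable.
+os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
+
+# Option 2 (TensorFlow >= 1.14): wrap the optimizer explicitly; the rewrite
+# also inserts automatic loss scaling.
+opt = tf.train.AdamOptimizer(learning_rate=0.01)
+opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(opt)
+```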
+ +#### Enabling mixed precision + +Automatic Mixed Precision (AMP) for TensorFlow enables the full [mixed precision +methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing +TensorFlow model code. AMP enables mixed precision training on Volta and Turing GPUs automatically. The TensorFlow +framework code makes all necessary model changes internally. + +In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximizes the use of FP16, and the +loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing +`tf.contrib` loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the +automatic mixed precision optimization. It accomplishes this by automatically rewriting all computation graphs with the +necessary operations to enable mixed precision training and automatic loss scaling. + +## Setup + +The following section lists the requirements that you need to meet in order to +start training the Transformer-XL model. + +### Requirements + +This repository contains `Dockerfile` which extends the TensorFlow NGC container +and encapsulates some dependencies. Aside from these dependencies, ensure you +have the following components: + +* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) +* [TensorFlow 19.12-tf1-py3](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) NGC container +* [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) + or [Turing](https://www.nvidia.com/pl-pl/geforce/turing/) based GPU + +For more information about how to get started with NGC containers, see the +following sections from the NVIDIA GPU Cloud Documentation and the Deep +Learning DGX Documentation: + +* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html), +* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry), +* [Running TensorFlow](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running) + +For those unable to use the TensorFlow NGC container, to set up the required environment or create your own container, +see the versioned [NVIDIA Container Support +Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). + +## Quick Start Guide + +To train your model using mixed precision with Tensor Cores or using FP32, +perform the following steps using the default parameters of the Transformer-XL +base model on the +[WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) +dataset. + +For the specifics concerning training +and inference, see the [Advanced](#advanced) section. + +1. Clone the repository. + +``` +git clone https://github.com/NVIDIA/DeepLearningExamples +cd DeepLearningExamples/TensorFlow/LanguageModeling/Transformer-XL +``` + +2. Download and preprocess the dataset. + +``` +bash getdata.sh +``` + +3. Build the Transformer-XL TensorFlow NGC container. + +``` +bash tf/scripts/docker/build.sh +``` + +4. Start an interactive session in the NGC container to run training/inference. + +``` +bash tf/scripts/docker/interactive.sh +``` + +5. Create tfrecords before your first training/evaluation for a given batch size per GPU. +Use same --batch_chunk and --training_batch_size flags as in the training. 
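+
+The tfrecord layout depends on the per-GPU batch size implied by these flags. As a tiny illustrative sketch (the helper below is not part of this repository), the effective batch per GPU follows from the global batch size, the number of GPUs, and the number of accumulation chunks; the commands that follow use values consistent with this:
+
+```
+# Illustrative only: effective number of examples processed per GPU in a
+# single forward/backward pass. The global batch size defaults to 256.
+def per_gpu_batch(global_batch_size, num_gpus, batch_chunk):
+    assert global_batch_size % (num_gpus * batch_chunk) == 0
+    return global_batch_size // (num_gpus * batch_chunk)
+
+print(per_gpu_batch(256, 8, 2))   # 16 -> DGX-1 example below
+print(per_gpu_batch(256, 1, 16))  # 16 -> single-GPU example below
+```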
+ +For training on DGX-1 with gradient accumulation in 2 steps: +``` +bash run_wt103_base.sh train_data --batch_chunk 2 +``` + +For single GPU training with gradient accumulation in 16 steps: +``` +bash run_wt103_base.sh train_data --batch_chunk 16 +``` + +For evaluation: +``` +bash run_wt103_base.sh test_data +``` + +6. Start training. + +To start mixed precision training on 8 GPUs on DGX-1, run: + +``` +bash run_wt103_base.sh train 8 --fp16 --batch_chunk 2 +``` + +To start FP32 training on single GPU, run: + +``` +bash run_wt103_base.sh train 1 --batch_chunk 16 +``` + +To start mixed precision training on 16 GPUs on DGX-2, run: + +``` +bash run_wt103_base.sh train 16 --fp16 +``` + +To start FP32 training on 16 GPUs on DGX-2, run: + +``` +bash run_wt103_base.sh train 16 +``` + +For more information on the available options, and for an explanation of what +happens at the end of training, refer to the [Training +process](#training-process) section. + +7. Start evaluation. + +To start mixed precision inference on the test set, run: + +``` +bash run_wt103_base.sh eval [--fp16] +``` + +The `--fp16` flag is optional, however, if it's set, then the script +launches mixed precision inference with Tensor Cores. If the flag is not +present, then the script launches FP32 inference. +By default, the script is loading the checkpoint from +`LM-TFM/model.ckpt`, which contains the model corresponding to the +last checkpoint from the previous training run. The path to the +checkpoint can be customized by setting the `--model_dir` flag. + +For more information on the available options, refer to the [Inference +process](#inference-process) section. + +## Advanced + +The following sections provide greater details of the dataset, running training +and inference, and the training results. + +### Scripts and sample code + +* `Dockerfile`: a container with the basic set of dependencies to run + Transformer-XL + +In the `tf` directory, the most important files are: + +* `data_utils.py`: data loading utilities +* `exp_utils.py`: utility functions for running training and benchmarking +* `lamb.py`: implementation of [LAMB](https://arxiv.org/abs/1904.00962) + optimizer +* `main.py`: serves as the entry point to launch the training and inference +* `model.py`: implementation of the Transformer-XL model +* `vocabulary.py`: implementation of word-level vocabulary + +### Parameters + +The complete list of available parameters for the `tf/main.py` script contains: + +``` + --batch_chunk: Number of accumulation steps. + (default: '1') + (an integer) + --clamp_len: Clamp length + (default: '-1') + (an integer) + --clip: Gradient clipping value. + (default: '0.25') + (a number) + --corpus_info_path: Path to corpus-info.json file. + (default: '') + --d_embed: Dimension of the embeddings. + (default: '512') + (an integer) + --d_head: Dimension of each attention head. + (default: '64') + (an integer) + --d_inner: Dimension of inner hidden size in positionwise feed-forward. + (default: '2048') + (an integer) + --d_model: Dimension of the model. + (default: '512') + (an integer) + --data_dir: Path to tf-records directory. + (default: '') + --div_val: Divide the embedding size by this val for each bin + (default: '1') + (an integer) + --[no]do_eval: Whether to run eval on the dev set. + (default: 'false') + --[no]do_train: Whether to run training. + (default: 'true') + --dropatt: Attention dropout rate. + (default: '0.0') + (a number) + --dropout: Dropout rate. 
+ (default: '0.1') + (a number) + --eval_batch_size: Size of valid batch. + (default: '16') + (an integer) + --eval_ckpt_path: Checkpoint path for do_test evaluation.If set, model_dir will be ignored.If unset, will use the latest ckpt in model_dir. + --eval_split: Which data split to evaluate. + (default: 'valid') + --[no]fp16: Whether to enable AMP ops. + (default: 'false') + --init: : Initialization method. + (default: 'normal') + --init_range: Initialization std when init is uniform. + (default: '0.1') + (a number) + --init_std: Initialization std when init is normal. + (default: '0.02') + (a number) + --learning_rate: Maximum learning rate. + (default: '0.01') + (a number) + --log_interval: Number of iterations per repeat loop. + (default: '100') + (an integer) + --max_eval_batch: Set -1 to turn off. Only used in test mode. + (default: '-1') + (an integer) + --mem_len: Number of steps to cache + (default: '192') + (an integer) + --min_lr_ratio: Minimum ratio learning rate. + (default: '0.1') + (a number) + --model_dir: Estimator model_dir. + (default: 'LM-TFM') + --n_head: Number of attention heads. + (default: '8') + (an integer) + --n_layer: Number of layers. + (default: '16') + (an integer) + --num_core_per_host: Number of cores per host + (default: '8') + (an integer) + --percentiles: percentiles for latency confidence intervals + (default: '90,95,99') + (a comma separated list) + --proj_init_std: Initialization std for embedding projection. + (default: '0.01') + (a number) + --[no]proj_same_dim: Project the bin with the same dimension. + (default: 'true') + --[no]proj_share_all_but_first: True to share all but first projs, False not to share. + (default: 'false') + --record_info_dir: Path to local directory containing filenames.txt. + (default: '') + --[no]same_length: Same length attention + (default: 'false') + --save_steps: number of steps for model checkpointing. + (default: '5000') + (an integer) + --tgt_len: Number of steps to predict + (default: '192') + (an integer) + --[no]tie_weight: Tie embedding and softmax weight. + (default: 'true') + --train_batch_size: Size of train batch. + (default: '256') + (an integer) + --train_steps: Total number of training steps. + (default: '40000') + (an integer) + --[no]untie_r: untie r_w_bias and r_r_bias + (default: 'false') + --warmup_steps: Number of steps for linear lr warmup. + (default: '1000') + (an integer) +``` + +### Command-line options + +To see the full list of available options and their descriptions, use the `--help` command-line option. +For example: + +``` +python3 main.py --help +``` + +### Getting the data + +The Transformer-XL model was trained on the +[WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) +dataset. The WikiText-103 dataset is a collection of over 100 million tokens +extracted from the set of verified +[Good](https://en.wikipedia.org/wiki/Wikipedia:Good_articles) and +[Featured](https://en.wikipedia.org/wiki/Wikipedia:Featured_articles) articles +on Wikipedia. + +This repository contains the `getdata.sh` download script which +automatically downloads and extracts the training, validation and test +datasets. By default, data is downloaded to the `data` directory. + +In order to test with other datasets, the script needs to be customized +accordingly. + +#### Dataset guidelines + +The WikiText-103 dataset was already pre-tokenized with word-level tokens. 
The +dataset features a large vocabulary of 267,735 tokens and retains the original +case, punctuation and numbers. + +The `getdata.sh` script downloads the data, extracts the archive and renames +the training, validation, and test set to `train.txt`, `valid.txt`, `test.txt` +respectively. + +#### Multi-dataset + +Using other datasets requires changes in the `tf/data_utils.py` file: +* the name of the new dataset should be added to the `dataset` flag +* the support for the new dataset needs to be added to the `Corpus` class: + names of files containing training, validation and test data, options for + the tokenizer, dataset iterator and desired values of cutoffs for adaptive softmax + +The current codebase supports training with word-level vocabulary +(automatically generated based on the provided dataset) + +Additionally, using other datasets may require changes in some hyperparameters +(for example, batch size, learning rate, number of training steps, +and the configuration of learning rate scheduler). + +### Training process + +The default training configuration can be launched by running the +`run_wt103_base.sh` script with the first argument +set to `train`. By default, the training results are saved to `tf/LM-TFM` directory, +and map to your container's `/workspace/transformer-x/tf/LM-TFM` directory; +this can be customized by setting the `--model_dir` parameter. + +The training script launches a single-node data-parallel training with a fixed +global batch size of 256, optionally with gradient accumulation to allow +training on configurations with less than 16 GPUs. + +**Command-line** + +You can launch training of the Transformer-XL base model on the +WikiText-103 dataset with the word-based vocabulary and adaptive softmax using +`<#GPUs>` GPUs. For example: + +``` +bash run_wt103_base.sh train <#GPUs> [--fp16] [--batch_chunk CHUNK] +``` + +The `--fp16` flag is optional, however, if it's set, then the script +launches mixed precision training with Tensor Cores; if the flag is not +present, then the script launches FP32 training. + +The `--batch_chunk CHUNK` parameter controls gradient accumulation. With +gradient accumulation, the batch size is split into `CHUNK` chunks of equal +size, the training script executes the forward and backward pass using each +chunk and then executes the optimizer using accumulated gradients. + +**Examples** + +You can launch mixed precision training of the Transformer-XL base model on the +WikiText-103 dataset using 16 GPUs. For example: + +``` +bash run_wt103_base.sh train 16 --fp16 --batch_chunk 1 +``` + +The batch size per GPU is equal to the default global batch size of 256 +divided by the product of the number of GPUs times the number of chunks. In this +case, batch size per GPU is equal to `256 / (16 * 1) = 16`. + +You can launch FP32 training using 8 GPUs; the batch size per GPU is equal to 16 +(`--batch_chunk` was set to `2` because a local batch size of 32 runs out +of memory on a DGX-1 with Tesla V100 16G in FP32 training). For example: + +``` +bash run_wt103_base.sh train 8 --batch_chunk 2 +``` + +A summary of the training progress is printed after every 100 training +iterations; this can be customized by setting the `--log_interval` parameter. 
+The summary is printed in the following format: + +``` +step 1300 | lr 0.009998686 | loss 5.09 | pplx 162.70, bpc 7.3461, tok/s 138037 +``` + +which contains information about a current training +step, current learning rate, current training loss, +training [perplexity](https://en.wikipedia.org/wiki/Perplexity#Perplexity_per_word), +bits per character and throughput in tokens per second. + + +The script saves one checkpoint: `model.ckpt` which contains the last saved model. +By default, model saving is executed every +5000 training steps, this can be customized by setting the `--save_steps` +parameter. + +Evaluation (inference) benefits from longer attention sequences, therefore to +reproduce perplexity values reported in the [Transformer-XL +paper](https://arxiv.org/abs/1901.02860), it's necessary to run the final +evaluation with a dedicated inference script. Refer to the [Inference +process](#inference-process) section for more details. + +### Inference process + +Inference can be run by launching the `run_wt103_base.sh` script +with the first argument set to `eval`. Running +inference requires a pre-trained model checkpoint. + +The script supports only single-GPU inference. + +**Command-line** + +You can launch inference of the Transformer-XL base model on the +WikiText-103 dataset with the word-based vocabulary and adaptive softmax. + +For example: + +``` +bash run_wt103_base.sh eval --model_dir [--fp16] +``` + +The `--fp16` flag is optional, however, if it's specified, then the script +launches inference with Tensor Cores; if the flag is not present, then the +script launches FP32 inference. + +**Examples** + +To launch mixed precision inference on a single GPU using a checkpoint +loaded from `LM-TFM/model.ckpt*`, run: + +``` +bash run_wt103_base.sh eval --model_dir LM-TFM --fp16 +``` + +To launch FP32 inference on a single GPU using a checkpoint loaded +from `LM-TFM/model.ckpt*`, run: + +``` +bash run_wt103_base.sh eval --model_dir LM-TFM +``` + +After the execution, the script prints a summary in the following format: + +``` +I0109 13:02:31.304439 139903273469760 main.py:440] Evaluating with: math fp16 +INFO:tensorflow:| loss 3.15 | pplx 23.32, bpc 4.5432, tok/s 9946, ms/batch 102.84 +``` + +which contains information about loss, perplexity and execution performance on the test dataset. + +## Performance + +### Benchmarking + +The following section shows how to run benchmarks measuring the model +performance in training and inference modes. + +#### Training performance benchmark + +To benchmark the training performance on a specific global batch size ``, +with a specific number of GPUs `<#GPUs>` for a specific number of training +iterations `` run: + +For the base model: + +``` +bash run_wt103_base.sh train <#GPUs> --train_batch_size --train_steps --log_interval 1 [--fp16] [--batch_chunk CHUNK] +``` + +It's recommended to launch at least 1500 training steps to get a reliable +estimate of training performance. For more information about the available +options, refer to the [Training process](#training-process) section. + +The training script prints information in the following format: + +``` +(...) 
+[1,0]:INFO:tensorflow:step 99 | lr 0.000990000 | loss 9.22 | pplx 10069.60, bpc 13.2977, tok/s 136092 +[1,0]:I0109 12:18:41.333325 140403024426816 main.py:333] step 99 | lr 0.000990000 | loss 9.22 | pplx 10069.60, +bpc 13.2977, tok/s 136092 +[1,0]:INFO:tensorflow:step 100 | lr 0.001000000 | loss 9.21 | pplx 9981.87, bpc 13.2851, tok/s 135309 +[1,0]:I0109 12:18:41.696926 140403024426816 main.py:333] step 100 | lr 0.001000000 | loss 9.21 | pplx 9981.87, +bpc 13.2851, tok/s 135309 +(...) +[1,0]:INFO:tensorflow:Training throughput: 135959 tok/s +``` + +The last two lines contain information on the +average training throughput measured in tokens per second. + +#### Inference performance benchmark + +The inference performance and accuracy benchmarks require a checkpoint from a +trained model. + +To benchmark the inference performance on a specific global batch size ``, run: + +``` +bash run_wt103_base.sh eval --model_dir --eval_batch_size [--fp16] +``` + +The inference script prints information in the following format: + +``` +I0109 13:02:31.304439 139903273469760 main.py:440] Evaluating with: math fp16 +INFO:tensorflow:| loss 3.15 | pplx 23.32, bpc 4.5432, tok/s 9946, ms/batch 102.84 +``` + +The output contains information on the achieved test loss and test perplexity, +average inference throughput (measured in tokens per second), average inference +latency (measured in milliseconds). + +### Results + +The following sections provide details on how we achieved our performance and +accuracy in training and inference. + +#### Training accuracy results + +##### Training accuracy: NVIDIA DGX-1 (8x V100 16G) + +###### Base model +Our results were obtained by running the `tf/run_wt103_base.sh` +training script in the tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-1 +with 8x V100 16G GPUs. + +|**GPUs**|**Batch Size / GPU**|**Accuracy - FP32 (perplexity)**|**Accuracy - Mixed precision (perplexity)**|**Time to Train - FP32 (minutes)**|**Time to Train - Mixed precision (minutes)**|**Time to Train Speedup (FP32 to Mixed precision)**| +|-------:|-------------------:|-------------------------------:|------------------------------------------:|---------------------------------:|--------------------------------------------:|--------------------------------------------------:| +| 1 | 16 | 23.64 | 23.58 | 2943 | 2011 | 1.46 | +| 8 | 16 | 23.36 | 23.38 | 439 | 333 | 1.32 | + +##### Training accuracy: NVIDIA DGX-2 (16x V100 32G) + +###### Base model + +Our results were obtained by running the `tf/run_wt103_base.sh` +training script in the tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-2 +with 16x V100 32G GPUs. + +|**GPUs**|**Batch Size / GPU**|**Accuracy - FP32 (perplexity)**|**Accuracy - Mixed precision (perplexity)**|**Time to Train - FP32 (minutes)**|**Time to Train - Mixed precision (minutes)**|**Time to Train Speedup (FP32 to Mixed precision)**| +|-------:|-------------------:|-------------------------------:|------------------------------------------:|---------------------------------:|--------------------------------------------:|--------------------------------------------------:| +| 16 | 16 | 23.39 | 23.37 | 202 | 161 | 1.25 | +| 8 | 32 | 23.33 | 23.40 | 330 | 227 | 1.46 | + + +##### Training loss plot + +###### Base model + +![TrainingLossBase](tf/img/training_loss_base.png) + +##### Training stability test + +###### Base model +The Transformer-XL base model was trained for 40,000 training steps, starting +from 20 different initial random seeds. 
The training was performed in the tensorflow:19.12-tf1-py3 NGC container on +NVIDIA DGX-1 with 8x V100 16G GPUs. +After training, the models were evaluated on the test dataset. The following +table summarizes the final perplexity on the test set. + +|**Average perplexity**|**Standard deviation**|**Minimum**|**Maximum**|**Median**| +|---------------------:|---------------------:|----------:|----------:|---------:| +| 23.39 | 0.0878 | 23.24 | 23.58 | 23.39 | + +#### Training performance results + +##### Training performance: NVIDIA DGX-1 (8x V100 16G) + +###### Base model + +Our results were obtained by running the `tf/run_wt103_base.sh` +training script in the tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-1 with 8x +V100 16G GPUs. Performance numbers (in tokens per second) were averaged over 2000 +training iterations. + +|**GPUs**|**Batch Size / GPU**|**Throughput - FP32 (tok/s)**|**Throughput - Mixed precision (tok/s)**|**Throughput speedup (FP32 to Mixed precision)**|**Weak Scaling - FP32**|**Weak Scaling - Mixed precision**| +|-------:|-------------------:|----------------------------:|---------------------------------------:|-----------------------------------------------:|----------------------:|---------------------------------:| +| 1 | 16 | 9,104 | 13,004 | 1.428 | 1.000 | 1.000 | +| 2 | 16 | 18,169 | 23,856 | 1.313 | 1.996 | 1.835 | +| 4 | 16 | 38,876 | 50,310 | 1.294 | 4.270 | 3.869 | +| 8 | 16 | 78,626 | 101,954 | 1.297 | 8.636 | 7.840 | + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +##### Training performance: NVIDIA DGX-2 (16x V100 32G) + +###### Base model + +Our results were obtained by running the `tf/run_wt103_base.sh` training +script in the tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-2 with 16x V100 32G +GPUs. Performance numbers (in tokens per second) were averaged over 2000 +training iterations. + +|**GPUs**|**Batch Size / GPU**|**Throughput - FP32 (tok/s)**|**Throughput - Mixed precision (tok/s)**|**Throughput speedup (FP32 to Mixed precision)**|**Weak Scaling - FP32**|**Weak Scaling - Mixed precision**| +|-------:|-------------------:|----------------------------:|---------------------------------------:|-----------------------------------------------:|----------------------:|---------------------------------:| +| 1 | 16 | 9,891 | 13,791 | 1.394 | 1.000 | 1.000 | +| 2 | 16 | 21,550 | 28,306 | 1.314 | 2.179 | 2.052 | +| 4 | 16 | 42,616 | 55,430 | 1.301 | 4.309 | 4.019 | +| 8 | 16 | 83,932 | 107,999 | 1.287 | 8.486 | 7.831 | +| 16 | 16 | 164,675 | 206,906 | 1.256 | 16.649 | 15.003 | + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +#### Inference performance results + +##### Inference performance: NVIDIA DGX-1 (1x V100 16G) + +###### Base model + +Our results were obtained by running the +`tf/scripts/inference_benchmark.sh` inferencing benchmarking script in the +tensorflow:19.12-tf1-py3 NGC container on NVIDIA DGX-1 with 1x V100 16G GPU. + +The command to launch the inference performance benchmark is provided in the +[Inference performance benchmark](#inference-performance-benchmark) section. 
+ +**FP16** + +|**Batch size**|**Sequence length**|**Memory length**|**Throughput Avg (tok/s)**|**Latency Avg (ms)**|**Latency 90% (ms)**|**Latency 95% (ms)**|**Latency 99% (ms)**| +|-------------:|------------------:|----------------:|-------------------------:|-------------------:|-------------------:|-------------------:|-------------------:| +| 1 | 64 | 640 | 1394.7 | 45.91 | 47.18 | 47.98 | 49.47 | +| 2 | 64 | 640 | 2560.9 | 50.00 | 51.30 | 52.08 | 54.94 | +| 4 | 64 | 640 | 4326.6 | 59.14 | 60.47 | 61.21 | 63.00 | +| 8 | 64 | 640 | 6621.9 | 77.29 | 78.50 | 79.01 | 81.36 | +| 16 | 64 | 640 | 8872.3 | 115.34 | 116.93 | 117.98 | 121.15 | +| 32 | 64 | 640 | 10441.9 | 196.00 | 197.94 | 199.43 | 203.96 | + +**FP32** + +|**Batch size**|**Sequence length**|**Memory length**|**Throughput Avg (tok/s)**|**Latency Avg (ms)**|**Latency 90% (ms)**|**Latency 95% (ms)**|**Latency 99% (ms)**| +|-------------:|------------------:|----------------:|-------------------------:|-------------------:|-------------------:|-------------------:|-------------------:| +| 1 | 64 | 640 | 1315.2 | 48.70 | 49.78 | 50.54 | 53.31 | +| 2 | 64 | 640 | 2419.2 | 52.91 | 54.17 | 54.73 | 56.13 | +| 4 | 64 | 640 | 4012.7 | 63.76 | 65.27 | 66.11 | 67.81 | +| 8 | 64 | 640 | 5650.1 | 90.56 | 91.92 | 92.47 | 94.15 | +| 16 | 64 | 640 | 7041.2 | 145.34 | 147.20 | 148.38 | 151.37 | +| 32 | 64 | 640 | 8051.3 | 254.14 | 256.58 | 257.51 | 258.39 | + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +##### Inference performance: NVIDIA T4 + +###### Base model + +Our results were obtained by running the +`tf/scripts/inference_benchmark.sh` inferencing benchmarking script in the +tensorflow:19.12-tf1-py3 NGC container on NVIDIA T4. + +The command to launch the inference performance benchmark is provided in the +[Inference performance benchmark](#inference-performance-benchmark) section. + +**FP16** + +|**Batch size**|**Sequence length**|**Memory length**|**Throughput Avg (tok/s)**|**Latency Avg (ms)**|**Latency 90% (ms)**|**Latency 95% (ms)**|**Latency 99% (ms)**| +|-------------:|------------------:|----------------:|-------------------------:|-------------------:|-------------------:|-------------------:|-------------------:| +| 1 | 64 | 640 | 1053.6 | 60.75 | 61.59 | 62.02 | 63.58 | +| 2 | 64 | 640 | 2024.5 | 63.22 | 63.95 | 64.76 | 67.33 | +| 4 | 64 | 640 | 3309.7 | 77.30 | 78.33 | 78.85 | 80.12 | +| 8 | 64 | 640 | 4713.7 | 108.53 | 109.66 | 110.26 | 111.15 | +| 16 | 64 | 640 | 6075.8 | 168.40 | 169.62 | 170.28 | 171.88 | +| 32 | 64 | 640 | 6850.5 | 298.69 | 300.42 | 301.04 | 302.21 | + +**FP32** + +|**Batch size**|**Sequence length**|**Memory length**|**Throughput Avg (tok/s)**|**Latency Avg (ms)**|**Latency 90% (ms)**|**Latency 95% (ms)**|**Latency 99% (ms)**| +|-------------:|------------------:|----------------:|-------------------------:|-------------------:|-------------------:|-------------------:|-------------------:| +| 1 | 64 | 640 | 929.5 | 68.88 | 70.43 | 70.88 | 72.05 | +| 2 | 64 | 640 | 1757.6 | 72.84 | 74.30 | 75.08 | 76.62 | +| 4 | 64 | 640 | 2696.7 | 94.87 | 97.02 | 97.58 | 99.19 | +| 8 | 64 | 640 | 3561.6 | 143.65 | 145.98 | 146.96 | 148.18 | +| 16 | 64 | 640 | 4190.4 | 244.16 | 246.34 | 246.62 | 247.32 | +| 32 | 64 | 640 | 4567.7 | 447.96 | 451.19 | 452.77 | 455.32 | + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). 
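+
+As a quick consistency check on the tables above, the reported average throughput and average latency are related by throughput ≈ batch size × sequence length / latency. A small sketch using values from the DGX-1 FP16 table (small differences are rounding and measurement noise):
+
+```
+# Values copied from the DGX-1 FP16 row with batch size 32.
+batch_size, seq_len, latency_ms = 32, 64, 196.00
+tokens_per_second = batch_size * seq_len / (latency_ms / 1000.0)
+print(round(tokens_per_second))  # ~10449 vs. 10441.9 reported
+```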
+ +## Release notes + +### Changelog + +* April 2020 + * Initial release + * Support for FP32 and mixed precision training on NVIDIA + DGX-1, NVIDIA DGX-2, and inference on NVIDIA Tesla V100 16G + and NVIDIA T4 + +### Known issues + +There are no known issues with this model. diff --git a/TensorFlow/LanguageModeling/Transformer-XL/getdata.sh b/TensorFlow/LanguageModeling/Transformer-XL/getdata.sh new file mode 100755 index 00000000..51c134b0 --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/getdata.sh @@ -0,0 +1,120 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017, +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +echo "=== Acquiring datasets ===" +echo "---" + +mkdir -p data +cd data + +if [[ ! -d 'wikitext-2' ]]; then + echo "- Downloading WikiText-2 (WT2)" + wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip + unzip -q wikitext-2-v1.zip + cd wikitext-2 + mv wiki.train.tokens train.txt + mv wiki.valid.tokens valid.txt + mv wiki.test.tokens test.txt + cd .. +fi + +echo "- Downloading WikiText-103 (WT2)" +if [[ ! -d 'wikitext-103' ]]; then + wget --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip + unzip -q wikitext-103-v1.zip + cd wikitext-103 + mv wiki.train.tokens train.txt + mv wiki.valid.tokens valid.txt + mv wiki.test.tokens test.txt + cd .. +fi + +echo "- Downloading enwik8 (Character)" +if [[ ! -d 'enwik8' ]]; then + mkdir -p enwik8 + cd enwik8 + wget --continue http://mattmahoney.net/dc/enwik8.zip + wget https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py + python3 prep_enwik8.py + cd .. +fi + +echo "- Downloading text8 (Character)" +if [[ ! -d 'text8' ]]; then + mkdir -p text8 + cd text8 + wget --continue http://mattmahoney.net/dc/text8.zip + python ../../prep_text8.py + cd .. +fi + +echo "- Downloading Penn Treebank (PTB)" +if [[ ! 
-d 'penn' ]]; then + wget --quiet --continue http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz + tar -xzf simple-examples.tgz + + mkdir -p penn + cd penn + mv ../simple-examples/data/ptb.train.txt train.txt + mv ../simple-examples/data/ptb.test.txt test.txt + mv ../simple-examples/data/ptb.valid.txt valid.txt + cd .. + + echo "- Downloading Penn Treebank (Character)" + mkdir -p pennchar + cd pennchar + mv ../simple-examples/data/ptb.char.train.txt train.txt + mv ../simple-examples/data/ptb.char.test.txt test.txt + mv ../simple-examples/data/ptb.char.valid.txt valid.txt + cd .. + + rm -rf simple-examples/ +fi + +echo "- Downloading 1B words" + +if [[ ! -d 'one-billion-words' ]]; then + mkdir -p one-billion-words + cd one-billion-words + + wget --no-proxy http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz + tar xzvf 1-billion-word-language-modeling-benchmark-r13output.tar.gz + + path="1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/" + cat ${path}/news.en.heldout-00000-of-00050 > valid.txt + cat ${path}/news.en.heldout-00000-of-00050 > test.txt + + wget https://github.com/rafaljozefowicz/lm/raw/master/1b_word_vocab.txt + + cd .. +fi + +echo "---" +echo "Happy language modeling :)" diff --git a/TensorFlow/LanguageModeling/Transformer-XL/prep_text8.py b/TensorFlow/LanguageModeling/Transformer-XL/prep_text8.py new file mode 100755 index 00000000..1bae4ae7 --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/prep_text8.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# coding=utf-8 + +# BSD 3-Clause License +# +# Copyright (c) 2017, +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
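+
+# The preprocessing below follows the standard text8 recipe: extract
+# text8.zip, split off the final 5M characters as the test set and the
+# preceding 5M as the validation set (the rest is train), and write two
+# copies of each split: a raw copy ('<split>.txt.raw') and a tokenized
+# copy ('<split>.txt') in which every character is space-separated and
+# original spaces are rewritten as '_'.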
+ +import os +import sys +import zipfile + +from io import open + +if os.path.exists('train.txt'): + print('Tokenized text8 already exists - skipping processing') + sys.exit() + +data = zipfile.ZipFile('text8.zip').extractall() +data = open('text8', 'r', encoding='utf-8').read() + +print('Length of text8: {}'.format(len(data))) + +num_test_chars = 5000000 + +train_data = data[: -2 * num_test_chars] +valid_data = data[-2 * num_test_chars: -num_test_chars] +test_data = data[-num_test_chars:] + +for fn, part in [('train.txt', train_data), ('valid.txt', valid_data), ('test.txt', test_data)]: + print('{} will have {} bytes'.format(fn, len(part))) + print('- Tokenizing...') + # Change space ' ' to underscore '_' + part_str = ' '.join(['_' if c == ' ' else c for c in part.strip()]) + print('- Writing...') + f = open(fn, 'w').write(part_str) + f = open(fn + '.raw', 'w', encoding='utf-8').write(part) diff --git a/TensorFlow/LanguageModeling/Transformer-XL/tf/data_utils.py b/TensorFlow/LanguageModeling/Transformer-XL/tf/data_utils.py new file mode 100755 index 00000000..e5e609c6 --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/tf/data_utils.py @@ -0,0 +1,488 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os +from functools import partial + +from collections import Counter, OrderedDict +import pickle +import json +import multiprocessing as mp + +import numpy as np + +from absl import flags +import tensorflow as tf +from vocabulary import Vocab + +from tensorflow.gfile import Exists as exists +from tensorflow.gfile import MakeDirs as makedirs +from tensorflow.gfile import Glob as glob + + +def _preprocess(shard, train, vocab, save_dir, cutoffs, bin_sizes, bsz, tgt_len, + num_core_per_host, num_shuffle): + file_names = [] + num_batch = 0 + + path = train[shard] + data_shard = vocab.encode_file(path, ordered=False, add_double_eos=True) + + for shuffle in range(num_shuffle): + basename = "train-{:03d}-{:02d}".format(shard, shuffle) + print("Processing shard {} shuffle {}".format(shard, shuffle)) + + np.random.shuffle(data_shard) + file_name, num_batch_shuffle = create_ordered_tfrecords( + save_dir, basename, np.concatenate(data_shard), bsz, tgt_len, + num_core_per_host, cutoffs, bin_sizes) + file_names.append(file_name) + num_batch += num_batch_shuffle + + return file_names, num_batch + + +class Corpus(object): + def __init__(self, path, dataset, *args, **kwargs): + self.dataset = dataset + self.vocab = Vocab(*args, **kwargs) + + if self.dataset in ["ptb", "wt2", "enwik8", "text8"]: + self.vocab.count_file(os.path.join(path, "train.txt")) + self.vocab.count_file(os.path.join(path, "valid.txt")) + self.vocab.count_file(os.path.join(path, "test.txt")) + elif self.dataset == "wt103": + self.vocab.count_file(os.path.join(path, "train.txt")) + elif self.dataset == "lm1b": + train_path_pattern = os.path.join( + path, "1-billion-word-language-modeling-benchmark-r13output", + "training-monolingual.tokenized.shuffled", "news.en-*") + train_paths = glob(train_path_pattern) + + # the vocab will load from file when build_vocab() is called + # for train_path in sorted(train_paths): + # self.vocab.count_file(train_path, verbose=True) + + self.vocab.build_vocab() + + if self.dataset in ["ptb", "wt2", "wt103"]: + self.train = self.vocab.encode_file( + os.path.join(path, "train.txt"), ordered=True) + self.valid = self.vocab.encode_file( + os.path.join(path, "valid.txt"), ordered=True) + self.test = 
self.vocab.encode_file( + os.path.join(path, "test.txt"), ordered=True) + elif self.dataset in ["enwik8", "text8"]: + self.train = self.vocab.encode_file( + os.path.join(path, "train.txt"), ordered=True, add_eos=False) + self.valid = self.vocab.encode_file( + os.path.join(path, "valid.txt"), ordered=True, add_eos=False) + self.test = self.vocab.encode_file( + os.path.join(path, "test.txt"), ordered=True, add_eos=False) + elif self.dataset == "lm1b": + self.train = train_paths + valid_path = os.path.join(path, "valid.txt") + test_path = valid_path + self.valid = self.vocab.encode_file( + valid_path, ordered=True, add_double_eos=True) + self.test = self.vocab.encode_file( + test_path, ordered=True, add_double_eos=True) + + if self.dataset == "wt103": + self.cutoffs = [0, 19997, 39997, 199997] + [len(self.vocab)] + elif self.dataset == "lm1b": + self.cutoffs = [0, 59997, 99997, 639997] + [len(self.vocab)] + else: + self.cutoffs = [] + + + def convert_to_tfrecords(self, split, save_dir, bsz, tgt_len, + num_core_per_host, **kwargs): + FLAGS = kwargs.get('FLAGS') + + file_names = [] + + record_name = "record_info-{}.bsz-{}.tlen-{}.json".format( + split, bsz, tgt_len) + + record_info_path = os.path.join(save_dir, record_name) + + if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: + data = getattr(self, split) + bin_sizes = get_bin_sizes( + data, bsz // num_core_per_host, tgt_len, self.cutoffs) + file_name, num_batch = create_ordered_tfrecords( + save_dir, split, data, bsz, tgt_len, num_core_per_host, + self.cutoffs, bin_sizes, + num_passes=FLAGS.num_passes if split == 'train' else 1) + file_names.append(file_name) + elif self.dataset == "lm1b": + bin_sizes = get_bin_sizes( + self.valid, bsz // num_core_per_host, tgt_len, self.cutoffs) + if split == "train": + np.random.seed(123456) + num_batch = 0 + + if FLAGS.num_procs > 1: + _preprocess_wrapper = partial(_preprocess, + train=self.train, vocab=self.vocab, save_dir=save_dir, + cutoffs=self.cutoffs, bin_sizes=bin_sizes, bsz=bsz, + tgt_len=tgt_len, num_core_per_host=num_core_per_host, + num_shuffle=FLAGS.num_shuffle) + + pool = mp.Pool(processes=FLAGS.num_procs) + results = pool.map(_preprocess_wrapper, range(len(self.train))) + for res in results: + file_names.extend(res[0]) + num_batch += res[1] + else: + for shard, path in enumerate(self.train): + data_shard = self.vocab.encode_file(path, ordered=False, + add_double_eos=True) + + num_shuffle = FLAGS.num_shuffle + + for shuffle in range(num_shuffle): + print("Processing shard {} shuffle {}".format(shard, shuffle)) + basename = "train-{:03d}-{:02d}".format(shard, shuffle) + np.random.shuffle(data_shard) + file_name, num_batch_ = create_ordered_tfrecords( + save_dir, basename, np.concatenate(data_shard), bsz, tgt_len, + num_core_per_host, + self.cutoffs, bin_sizes) + file_names.append(file_name) + num_batch += num_batch_ + + else: + file_name, num_batch = create_ordered_tfrecords( + save_dir, split, getattr(self, split), bsz, tgt_len, + num_core_per_host, + self.cutoffs, bin_sizes) + file_names.append(file_name) + + with open(record_info_path, "w") as fp: + record_info = { + "filenames": file_names, + "bin_sizes": bin_sizes, + "num_batch": num_batch + } + json.dump(record_info, fp) + + +def get_bin_sizes(data, batch_size, tgt_len, cutoffs, std_mult=[2.5, 2.5, 2.5]): + """ + Note: the `batch_size` here should be per-core batch size + """ + bin_sizes = [] + + def _nearest_to_eight(x): + y = x - x % 8 + return y + 8 if x % 8 >= 4 else max(8, y) + + if cutoffs: + num_batch = len(data) // 
batch_size // tgt_len + + data = data[:batch_size * num_batch * tgt_len] + data = data.reshape(batch_size, num_batch, tgt_len) + + tot = batch_size * tgt_len + for b, (left, right) in enumerate(zip(cutoffs[1:-1], cutoffs[2:])): + mask = (data >= left) * (data < right) + percents = mask.astype(np.float64).sum(2).sum(0) / tot + mean = np.mean(percents) + std = np.std(percents) + + bin_size = int(math.ceil(tgt_len * batch_size * (mean + std_mult[b] * std))) + bin_size = _nearest_to_eight(bin_size) + bin_sizes.append(bin_size) + + return bin_sizes + + +def _int64_feature(values): + return tf.train.Feature(int64_list=tf.train.Int64List(value=values)) + +def _float_feature(values): + return tf.train.Feature(float_list=tf.train.FloatList(value=values)) + +def batchify(data, batch_size, num_passes): + """ + if num_passes > 1 + + Here, we use multiple randomly shifted copies. + """ + if num_passes > 1: + data_len = len(data) + double_data = np.concatenate([data, data]) + data_list = [] + for i in range(num_passes): + start = np.random.randint(0, data_len) + data_list.append(double_data[start:start+data_len]) + data = np.concatenate(data_list) + + num_step = len(data) // batch_size + data = data[:batch_size * num_step] + data = data.reshape(batch_size, num_step) + + return data + + +def create_ordered_tfrecords(save_dir, basename, data, batch_size, tgt_len, + num_core_per_host, cutoffs=[], bin_sizes=[], + num_passes=1): + + file_name = "{}.bsz-{}.tlen-{}.tfrecords".format( + basename, batch_size, tgt_len) + + save_path = os.path.join(save_dir, file_name) + record_writer = tf.python_io.TFRecordWriter(save_path) + + batched_data = batchify(data, batch_size, num_passes) + + num_batch = 0 + for t in range(0, batched_data.shape[1] - 1, tgt_len): + cur_tgt_len = min(batched_data.shape[1] - 1 - t, tgt_len) + if num_batch % 500 == 0: + print(" processing batch {}".format(num_batch)) + for idx in range(batch_size): + inputs = batched_data[idx, t:t + cur_tgt_len] + labels = batched_data[idx, t + 1:t + cur_tgt_len + 1] + + # features dict + feature = { + "inputs": _int64_feature(inputs), + "labels": _int64_feature(labels), + } + + example = tf.train.Example(features=tf.train.Features(feature=feature)) + record_writer.write(example.SerializeToString()) + + num_batch += 1 + + record_writer.close() + print("Done writing {}. 
batches: {}".format(file_name, num_batch)) + + return file_name, num_batch + + +def get_lm_corpus(data_dir, dataset): + fn = os.path.join(data_dir, "cache.pkl") + + if exists(fn): + print("Loading cached dataset...") + with open(fn, "rb") as fp: + corpus = pickle.load(fp) + else: + print("Producing dataset...") + kwargs = {} + if dataset in ["wt103", "wt2"]: + kwargs["special"] = [""] + kwargs["lower_case"] = False + elif dataset == "ptb": + kwargs["special"] = [""] + kwargs["lower_case"] = True + elif dataset == "lm1b": + kwargs["special"] = [] + kwargs["lower_case"] = False + kwargs["vocab_file"] = os.path.join(data_dir, "1b_word_vocab.txt") + elif dataset in ["enwik8", "text8"]: + pass + + corpus = Corpus(data_dir, dataset, **kwargs) + + print("Saving dataset...") + with open(fn, "wb") as fp: + pickle.dump(corpus, fp, protocol=2) + + corpus_info = { + "vocab_size" : len(corpus.vocab), + "cutoffs" : corpus.cutoffs, + "dataset" : corpus.dataset + } + with open(os.path.join(data_dir, "corpus-info.json"), "w") as fp: + json.dump(corpus_info, fp) + + return corpus + + +def main(unused_argv): + del unused_argv # Unused + + corpus = get_lm_corpus(FLAGS.data_dir, FLAGS.dataset) + + save_dir = os.path.join(FLAGS.data_dir, "tfrecords") + if not exists(save_dir): + makedirs(save_dir) + + # test mode + if FLAGS.eval_batch_size > 0: + corpus.convert_to_tfrecords("test", save_dir, FLAGS.eval_batch_size, + FLAGS.tgt_len, FLAGS.num_core_per_host, + FLAGS=FLAGS) + return + + for split, batch_size in zip( + ["train", "valid"], + [FLAGS.train_batch_size // FLAGS.batch_chunk, FLAGS.valid_batch_size]): + + if batch_size <= 0: continue + print("Converting {} set...".format(split)) + corpus.convert_to_tfrecords(split, save_dir, batch_size, FLAGS.tgt_len, + FLAGS.num_core_per_host, FLAGS=FLAGS) + + +def load_record_info(record_info_dir, split, per_host_bsz, tgt_len, + num_core_per_host): + record_name = "record_info-{}.bsz-{}.tlen-{}.json".format( + split, per_host_bsz, tgt_len) + + record_info_path = os.path.join(record_info_dir, record_name) + with open(record_info_path, "r") as fp: + record_info = json.load(fp) + + return record_info + +def get_input_fn(record_info_dir, split, per_host_bsz, tgt_len, + num_core_per_host, num_hosts=1): + """Creates input function.""" + record_info = load_record_info(record_info_dir, split, per_host_bsz, tgt_len, + num_core_per_host) + + file_names = record_info["filenames"] + bin_sizes = record_info["bin_sizes"] + num_batch = record_info["num_batch"] + + tf.logging.info("[{}] File names {}".format(split, file_names)) + + def input_fn(params): + # per-core batch size + per_core_bsz = params["batch_size"] // num_core_per_host + + # data_dir could be a remote path, e.g., a google storage url + data_dir = params["data_dir"] + + def parser(record): + # preprocess "inp_perm" and "tgt_perm" + def _process_perm_feature(example, prefix): + for b in range(len(bin_sizes)): + cnt = example.pop("{}_cnt_{}".format(prefix, b))[0] + tup = example.pop("{}_tup_{}".format(prefix, b)) + + tup = tf.reshape( + tf.sparse_tensor_to_dense(tup), + shape=[cnt, 2]) + + # tf.float32 + perm = tf.sparse_to_dense( + sparse_indices=tup, + output_shape=[tgt_len, bin_sizes[b]], + sparse_values=1.0, + default_value=0.0) + + example["{}_perm_{}".format(prefix, b)] = perm + + # whether allow the last batch with a potentially shorter length + record_spec = { + "inputs": tf.VarLenFeature(tf.int64), + "labels": tf.VarLenFeature(tf.int64), + } + + # retrieve serialized example + example = tf.parse_single_example( + 
serialized=record, + features=record_spec) + + # cast int64 into int32 + # cast sparse to dense + for key in list(example.keys()): + val = example[key] + if tf.keras.backend.is_sparse(val): + val = tf.sparse.to_dense(val) + if val.dtype == tf.int64: + val = tf.to_int32(val) + example[key] = val + + return example["inputs"], example["labels"] + + file_paths = [] + for file_name in file_names: + file_path = os.path.join(data_dir, file_name) + file_paths.append(file_path) + + if split == "train": + dataset = tf.data.Dataset.from_tensor_slices(file_paths) + if len(file_paths) > 1: + dataset = dataset.shuffle(len(file_paths)).repeat() + dataset = tf.data.TFRecordDataset(dataset) + elif num_hosts > 1: + host_id = params["context"].current_host + # drop the remaining batches + num_batch_per_host = num_batch // num_hosts + + my_start_sample_id = (host_id * num_batch_per_host * num_core_per_host * + per_core_bsz) + my_sample_num = num_batch_per_host * num_core_per_host * per_core_bsz + dataset = tf.data.TFRecordDataset(dataset).skip( + my_start_sample_id).take(my_sample_num) + else: + dataset = tf.data.TFRecordDataset(dataset) + + if num_core_per_host > 1: + import horovod.tensorflow as hvd + dataset = dataset.shard(hvd.size(), hvd.rank()) + dataset = dataset.map(parser).cache().repeat() + dataset = dataset.batch(per_core_bsz, drop_remainder=True) + dataset = dataset.prefetch(num_core_per_host * per_core_bsz) + else: + # do not shuffle, repeat or cache in evaluation + dataset = tf.data.Dataset.from_tensor_slices(file_paths) + dataset = tf.data.TFRecordDataset(dataset) + dataset = dataset.map(parser) + dataset = dataset.batch(per_core_bsz, drop_remainder=True) + + return dataset + + if split == "train" and num_hosts > 1: + record_info["num_batch"] = num_batch // num_hosts + + return input_fn, record_info + +def get_corpus_info(corpus_info_path): + with open(corpus_info_path, "r") as fp: + corpus_info = json.load(fp) + return corpus_info + +if __name__ == "__main__": + FLAGS = flags.FLAGS + flags.DEFINE_string("data_dir", None, + help="Location of the data corpus") + flags.DEFINE_enum("dataset", "wt103", + ["ptb", "wt2", "wt103", "lm1b", "enwik8", "text8"], + help="Dataset name.") + flags.DEFINE_integer("train_batch_size", 256, + help="train batch size each host") + flags.DEFINE_integer("valid_batch_size", 256, + help="valid batch size each host") + flags.DEFINE_integer("eval_batch_size", 16, + help="If > 0, enter test mode and process test set only." + "Otherwise, process train and dev sets only.") + flags.DEFINE_integer("tgt_len", 70, + help="number of tokens to predict") + flags.DEFINE_integer("max_batch", -1, + help="run in debug mode") + flags.DEFINE_integer("num_core_per_host", 8, + help="number of GPUs per host") + flags.DEFINE_bool("debug", default=False, + help="Process only the first batch without shuffle for lm1b.") + flags.DEFINE_integer("num_procs", 1, + help="number of processes") + flags.DEFINE_integer("num_passes", 10, + help="number of passes") + flags.DEFINE_integer("num_shuffle", 4, + help="number of shuffles for lm1b") + flags.DEFINE_integer("batch_chunk", 1, + help="number of accumulation steps") + + tf.app.run(main) diff --git a/TensorFlow/LanguageModeling/Transformer-XL/tf/exp_utils.py b/TensorFlow/LanguageModeling/Transformer-XL/tf/exp_utils.py new file mode 100755 index 00000000..1d0dd0fa --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/tf/exp_utils.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dllogger +import os + +class AverageMeter: + """ + Computes and stores the average and current value + """ + def __init__(self, warmup=0, keep=False): + self.reset() + self.warmup = warmup + self.keep = keep + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + self.iters = 0 + self.vals = [] + + def update(self, val, n=1): + self.iters += 1 + self.val = val + + if self.iters > self.warmup: + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + if self.keep: + self.vals.append(val) + +def setup_dllogger(enabled=True, filename=os.devnull, rank=0): + if enabled and rank == 0: + backends = [ + dllogger.JSONStreamBackend( + dllogger.Verbosity.VERBOSE, + filename, + ), + ] + dllogger.init(backends) + else: + dllogger.init([]) diff --git a/TensorFlow/LanguageModeling/Transformer-XL/tf/img/model.png b/TensorFlow/LanguageModeling/Transformer-XL/tf/img/model.png new file mode 100644 index 00000000..4c4c7c8e Binary files /dev/null and b/TensorFlow/LanguageModeling/Transformer-XL/tf/img/model.png differ diff --git a/TensorFlow/LanguageModeling/Transformer-XL/tf/img/training_loss_base.png b/TensorFlow/LanguageModeling/Transformer-XL/tf/img/training_loss_base.png new file mode 100644 index 00000000..2ec24fc9 Binary files /dev/null and b/TensorFlow/LanguageModeling/Transformer-XL/tf/img/training_loss_base.png differ diff --git a/TensorFlow/LanguageModeling/Transformer-XL/tf/lamb.py b/TensorFlow/LanguageModeling/Transformer-XL/tf/lamb.py new file mode 100755 index 00000000..adb2cc9c --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/tf/lamb.py @@ -0,0 +1,179 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MIT License +# +# Copyright (c) 2019 cybertronai +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.eager import context +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.training import optimizer + +class LAMBOptimizer(optimizer.Optimizer): + + def __init__(self, learning_rate=0.001, wd= 0.01, beta1=0.9, beta2=0.999, epsilon=1e-6, + use_locking=False, name="LAMB"): + + super(LAMBOptimizer, self).__init__(use_locking, name) + self._lr = learning_rate + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + self._wd = wd + + # Tensor versions of the constructor arguments, created in _prepare(). 
+ self._lr_t = None + self._beta1_t = None + self._beta2_t = None + self._epsilon_t = None + self._wd_t = None + + def _get_beta_accumulators(self): + with ops.init_scope(): + if context.executing_eagerly(): + graph = None + else: + graph = ops.get_default_graph() + return (self._get_non_slot_variable("beta1_power", graph=graph), + self._get_non_slot_variable("beta2_power", graph=graph)) + + def _create_slots(self, var_list): + first_var = min(var_list, key=lambda x: x.name) + self._create_non_slot_variable(initial_value=self._beta1, + name="beta1_power", + colocate_with=first_var) + self._create_non_slot_variable(initial_value=self._beta2, + name="beta2_power", + colocate_with=first_var) + + for v in var_list: + self._zeros_slot(v, "m", self._name) + self._zeros_slot(v, "v", self._name) + + def _prepare(self): + lr = self._call_if_callable(self._lr) + beta1 = self._call_if_callable(self._beta1) + beta2 = self._call_if_callable(self._beta2) + epsilon = self._call_if_callable(self._epsilon) + wd = self._call_if_callable(self._wd) + + self._lr_t = ops.convert_to_tensor(lr, name="learning_rate") + self._beta1_t = ops.convert_to_tensor(beta1, name="beta1") + self._beta2_t = ops.convert_to_tensor(beta2, name="beta2") + self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon") + self._wd_t = ops.convert_to_tensor(wd, name="wd") + + def _apply_dense(self, grad, var): + lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) + beta1_power, beta2_power = self._get_beta_accumulators() + beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) + beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) + eps = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) + wd_lambda = math_ops.cast(self._wd_t, var.dtype.base_dtype) + + v = self.get_slot(var, "v") + v_t = v.assign(beta2_t * v + (1. - beta2_t) * grad**2) + m = self.get_slot(var, "m") + m_t = m.assign(beta1_t * m + (1. 
- beta1_t) * grad) + + # add l2 normalizations and set ratio + r1 = tf.sqrt(tf.reduce_sum(tf.square(var))) + step = m_t / (tf.sqrt(v_t) + eps) + wd_lambda * var + r2 = tf.sqrt(tf.reduce_sum(tf.square(step))) + + ratio = array_ops.where(math_ops.greater(r1, 0), array_ops.where( + math_ops.greater(r2, 0), tf.minimum(r1, 10) / r2, 1.0), 1.0) + var_update = state_ops.assign_sub(var, lr_t * ratio * step) + return control_flow_ops.group(*[var_update, v_t, m_t]) + + def _resource_apply_dense(self, grad, var): + return None + + def _apply_sparse_shared(self, grad, var, indices, scatter_add): + beta1_power, beta2_power = self._get_beta_accumulators() + lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) + beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) + beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) + epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) + # m_t = beta1 * m + (1 - beta1) * g_t + m = self.get_slot(var, "m") + m_scaled_g_values = grad * (1 - beta1_t) + m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking) + with ops.control_dependencies([m_t]): + m_t = scatter_add(m, indices, m_scaled_g_values) + # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) + v = self.get_slot(var, "v") + v_scaled_g_values = (grad * grad) * (1 - beta2_t) + v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking) + with ops.control_dependencies([v_t]): + v_t = scatter_add(v, indices, v_scaled_g_values) + v_sqrt = math_ops.sqrt(v_t) + step = m_t / (v_sqrt + epsilon_t) + w_norm = linalg_ops.norm(var, ord=2) + g_norm = linalg_ops.norm(step, ord=2) + ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where( + math_ops.greater(g_norm, 0), tf.minimum(w_norm, 10) / g_norm, 1.0), 1.0) + var_update = state_ops.assign_sub( + var, ratio * lr_t * step, use_locking=self._use_locking) + return control_flow_ops.group(*[var_update, m_t, v_t]) + + def _apply_sparse(self, grad, var): + return self._apply_sparse_shared( + grad.values, + var, + grad.indices, + lambda x, i, v: state_ops.scatter_add( # pylint: disable=g-long-lambda + x, + i, + v, + use_locking=self._use_locking)) diff --git a/TensorFlow/LanguageModeling/Transformer-XL/tf/main.py b/TensorFlow/LanguageModeling/Transformer-XL/tf/main.py new file mode 100755 index 00000000..cffd458e --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/tf/main.py @@ -0,0 +1,510 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import math +import time + +from absl import flags +import absl.logging as _logging # pylint: disable=unused-import + +import tensorflow as tf +import horovod.tensorflow as hvd +import model +import data_utils +import lamb +import dllogger +from exp_utils import AverageMeter, setup_dllogger + +import numpy as np + +flags.DEFINE_integer("num_core_per_host", default=8, + help="Number of cores per host") +flags.DEFINE_bool('horovod', True, 'Use Horovod ') +# Experiment (data/checkpoint/directory) config +flags.DEFINE_string("raport_file", default="summary.json", + help="Path to dlloger json") +flags.DEFINE_string("data_dir", default="", + help="Path to tf-records directory.") +flags.DEFINE_string("record_info_dir", default="", + help="Path to local directory containing filenames.txt.") +flags.DEFINE_string("corpus_info_path", default="", + help="Path to corpus-info.json file.") +flags.DEFINE_string("model_dir", default="LM-TFM", + help="Estimator model_dir.") +flags.DEFINE_bool("do_train", default=True, + 
help="Whether to run training.") +flags.DEFINE_bool("do_eval", default=False, + help="Whether to run eval on the dev set.") +flags.DEFINE_string("eval_ckpt_path", None, + help="Checkpoint path for do_test evaluation." + "If set, model_dir will be ignored." + "If unset, will use the latest ckpt in model_dir.") +flags.DEFINE_bool("fp16", default=False, + help="Whether to enable AMP ops.") +flags.DEFINE_bool("jit_optimizer", default=True, + help="Whether to enable XLA on optimizer") + +# Optimization config +flags.DEFINE_float("learning_rate", default=0.01, + help="Maximum learning rate.") +flags.DEFINE_float("clip", default=0.25, + help="Gradient clipping value.") +# for cosine decay +flags.DEFINE_float("min_lr_ratio", default=0.1, + help="Minimum ratio learning rate.") +flags.DEFINE_integer("warmup_steps", default=1000, + help="Number of steps for linear lr warmup.") + +# Training config +flags.DEFINE_integer("train_batch_size", default=256, + help="Size of train batch.") +flags.DEFINE_integer("eval_batch_size", default=16, + help="Size of valid batch.") +flags.DEFINE_integer("train_steps", default=40000, + help="Total number of training steps.") +flags.DEFINE_integer("log_interval", default=100, + help="Number of iterations per repeat loop.") +flags.DEFINE_integer("save_steps", default=5000, + help="number of steps for model checkpointing.") +flags.DEFINE_integer("batch_chunk", default=1, + help="Number of accumulation steps.") + +# Evaluation config +flags.DEFINE_integer("max_eval_batch", default=-1, + help="Set -1 to turn off. Only used in test mode.") +flags.DEFINE_string("eval_split", "valid", + help="Which data split to evaluate.") +flags.DEFINE_list("percentiles", default=['90', '95', '99'], + help="percentiles for latency confidence intervals") + +# Model config +flags.DEFINE_integer("tgt_len", default=192, + help="Number of steps to predict") +flags.DEFINE_integer("mem_len", default=192, + help="Number of steps to cache") +flags.DEFINE_bool("same_length", default=False, + help="Same length attention") +flags.DEFINE_integer("clamp_len", default=-1, + help="Clamp length") + +flags.DEFINE_integer("n_layer", default=16, + help="Number of layers.") +flags.DEFINE_integer("d_model", default=512, + help="Dimension of the model.") +flags.DEFINE_integer("d_embed", default=512, + help="Dimension of the embeddings.") +flags.DEFINE_integer("n_head", default=8, + help="Number of attention heads.") +flags.DEFINE_integer("d_head", default=64, + help="Dimension of each attention head.") +flags.DEFINE_integer("d_inner", default=2048, + help="Dimension of inner hidden size in positionwise feed-forward.") +flags.DEFINE_float("dropout", default=0.1, + help="Dropout rate.") +flags.DEFINE_float("dropatt", default=0.0, + help="Attention dropout rate.") +flags.DEFINE_bool("untie_r", default=False, + help="untie r_w_bias and r_r_bias") + +# Adaptive Softmax / Embedding +flags.DEFINE_bool("tie_weight", default=True, + help="Tie embedding and softmax weight.") +flags.DEFINE_integer("div_val", default=1, + help="Divide the embedding size by this val for each bin") +flags.DEFINE_bool("proj_share_all_but_first", default=False, + help="True to share all but first projs, False not to share.") +flags.DEFINE_bool("proj_same_dim", default=True, + help="Project the bin with the same dimension.") + +# Parameter initialization +flags.DEFINE_enum("init", default="normal", + enum_values=["normal", "uniform"], + help="Initialization method.") +flags.DEFINE_float("init_std", default=0.02, + help="Initialization std when 
init is normal.") +flags.DEFINE_float("proj_init_std", default=0.01, + help="Initialization std for embedding projection.") +flags.DEFINE_float("init_range", default=0.1, + help="Initialization std when init is uniform.") + + +FLAGS = flags.FLAGS + +def get_model_fn(n_token, cutoffs): + def model_fn(inp, tgt, mems, is_training): + inp = tf.transpose(inp, [1, 0]) + tgt = tf.transpose(tgt, [1, 0]) + + if FLAGS.init == "uniform": + initializer = tf.initializers.random_uniform( + minval=-FLAGS.init_range, + maxval=FLAGS.init_range, + seed=None) + elif FLAGS.init == "normal": + initializer = tf.initializers.random_normal( + stddev=FLAGS.init_std, + seed=None) + proj_initializer = tf.initializers.random_normal( + stddev=FLAGS.proj_init_std, + seed=None) + + tie_projs = [False for _ in range(len(cutoffs) + 1)] + if FLAGS.proj_share_all_but_first: + for i in range(1, len(tie_projs)): + tie_projs[i] = True + + loss, new_mems = model.transformer( + dec_inp=inp, + target=tgt, + mems=mems, + n_token=n_token, + n_layer=FLAGS.n_layer, + d_model=FLAGS.d_model, + d_embed=FLAGS.d_embed, + n_head=FLAGS.n_head, + d_head=FLAGS.d_head, + d_inner=FLAGS.d_inner, + dropout=FLAGS.dropout, + dropatt=FLAGS.dropatt, + initializer=initializer, + proj_initializer=proj_initializer, + is_training=is_training, + mem_len=FLAGS.mem_len, + cutoffs=cutoffs, + div_val=FLAGS.div_val, + tie_projs=tie_projs, + input_perms=None, + target_perms=None, + head_target=None, + same_length=FLAGS.same_length, + clamp_len=FLAGS.clamp_len, + untie_r=FLAGS.untie_r, + proj_same_dim=FLAGS.proj_same_dim) + + # number of parameters + num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) + tf.logging.info('#params: {}'.format(num_params)) + + if is_training: + all_vars = tf.trainable_variables() + + return loss, new_mems, all_vars + else: + return loss, new_mems + + return model_fn + + +def single_core_graph(n_token, cutoffs, is_training, inp, tgt, mems): + model_fn = get_model_fn( + n_token=n_token, + cutoffs=cutoffs) + + model_ret = model_fn( + inp=inp, + tgt=tgt, + mems=mems, + is_training=is_training) + + return model_ret + + +def train(n_token, cutoffs, rank, local_rank, size): + + meters = {} + warmup = 2 + 12/size + meters['train_throughput'] = AverageMeter(warmup=warmup) + train_batch_size = FLAGS.train_batch_size // FLAGS.batch_chunk + ##### Get input function and model function + train_input_fn, train_record_info = data_utils.get_input_fn( + record_info_dir=FLAGS.record_info_dir, + split="train", + per_host_bsz=train_batch_size, + tgt_len=FLAGS.tgt_len, + num_core_per_host=FLAGS.num_core_per_host, + num_hosts=1) + + tf.logging.info("num of batches {}".format(train_record_info["num_batch"])) + + ##### Create computational graph + train_set = train_input_fn({ + "batch_size": train_batch_size, + "data_dir": FLAGS.data_dir}) + + inputs, labels = train_set.make_one_shot_iterator().get_next() + + per_core_bsz = train_batch_size // FLAGS.num_core_per_host + + with tf.variable_scope(tf.get_variable_scope()): + mems = [tf.Variable(tf.zeros([FLAGS.mem_len, per_core_bsz, FLAGS.d_model], tf.float32), trainable=False) + for _ in range(FLAGS.n_layer)] + + loss, new_mems, all_vars = single_core_graph( + n_token=n_token, + cutoffs=cutoffs, + is_training=True, + inp=inputs, + tgt=labels, + mems=mems) + + assign_mems = [mems[i].assign(new_mems[i]) for i in range(FLAGS.n_layer)] + + target_tokens = tf.size(labels) + + ## configure the optimizer + global_step = tf.train.get_or_create_global_step() + + # warmup stage: increase the learning 
rate linearly + if FLAGS.warmup_steps > 0: + warmup_lr = tf.to_float(global_step) / tf.to_float(FLAGS.warmup_steps) \ + * FLAGS.learning_rate + else: + warmup_lr = 0.0 + + # decay stage: decay the learning rate using the cosine schedule + decay_lr = tf.train.cosine_decay( + FLAGS.learning_rate, + global_step=global_step-FLAGS.warmup_steps, + decay_steps=FLAGS.train_steps-FLAGS.warmup_steps, + alpha=FLAGS.min_lr_ratio) + + # choose warmup or decay + learning_rate = tf.where(global_step < FLAGS.warmup_steps, + warmup_lr, decay_lr) + + # get the train op + optimizer = lamb.LAMBOptimizer(learning_rate=learning_rate) + if FLAGS.horovod: + optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True) + grads_and_vars = optimizer.compute_gradients(loss/FLAGS.batch_chunk, all_vars) + grads, all_vars = zip(*grads_and_vars) + + accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False) for tv in all_vars] + in_progress = tf.get_variable(name="in_progress", shape=[], dtype=tf.bool, trainable=False, + initializer=tf.zeros_initializer) + accum_ops = tf.cond(in_progress, + lambda: [accum_vars[i].assign_add(grad) for i, grad in enumerate(grads)], + lambda: [accum_vars[i].assign(grad) for i, grad in enumerate(grads)]) + with tf.control_dependencies(accum_ops + assign_mems): + acc_op = in_progress.assign(tf.ones_like(in_progress)) + final_accum_vars = [accum_vars[i] + gv for i,gv in enumerate(grads)] + acc_clipped, acc_gnorm = tf.clip_by_global_norm(final_accum_vars, FLAGS.clip) + clipped, gnorm = tf.clip_by_global_norm(grads, FLAGS.clip) + acc_train_op = optimizer.apply_gradients(list(zip(acc_clipped, all_vars)), global_step) + grads_and_vars = list(zip(clipped, all_vars)) + if FLAGS.jit_optimizer: + jit_scope = tf.contrib.compiler.jit.experimental_jit_scope + with jit_scope(): + train_op = optimizer.apply_gradients(grads_and_vars, global_step) + else: + train_op = optimizer.apply_gradients(grads_and_vars, global_step) + final_op = tf.group(train_op, assign_mems) + acc_final_op = tf.group(acc_train_op, assign_mems, in_progress.assign(tf.zeros_like(in_progress))) + ##### Training loop + saver = tf.train.Saver() + + gpu_options = tf.GPUOptions(allow_growth = True, visible_device_list = str(local_rank)) + with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options = gpu_options)) as sess: + sess.run(tf.global_variables_initializer()) + if FLAGS.horovod: + sess.run(hvd.broadcast_global_variables(0)) + + accum = [acc_op, target_tokens] + fetches = [loss, global_step, target_tokens, learning_rate, final_op if FLAGS.batch_chunk == 1 else acc_final_op] + total_loss, prev_step, target_tokens = 0., -1, 0 + start_time = time.time() + while True: + for i in range(FLAGS.batch_chunk-1): + _,tt = sess.run(accum) + target_tokens += tt + fetched = sess.run(fetches) + + loss_np, curr_step, tt = fetched[:3] + total_loss += loss_np + target_tokens += tt + + if curr_step > 0 and curr_step % FLAGS.log_interval == 0: + curr_loss = total_loss / (curr_step - prev_step) + throughput = target_tokens * size / (time.time()-start_time) + meters['train_throughput'].update(throughput) + if rank == 0: + tf.logging.info("step {} | lr {:8.9f} " + "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}, tok/s {:>6.0f}".format( + curr_step, fetched[-2], + curr_loss, math.exp(curr_loss), curr_loss / math.log(2), throughput)) + dllogger_data = { + 'lr': fetched[-1], + 'train_loss': curr_loss, + 'train_perplexity': math.exp(curr_loss), + 'train_throughput': throughput, + } + dllogger.log(step=int(curr_step), 
data=dllogger_data) + total_loss, prev_step, target_tokens = 0., curr_step, 0 + start_time = time.time() + + if curr_step > 0 and curr_step % FLAGS.save_steps == 0 and rank == 0: + save_path = os.path.join(FLAGS.model_dir, "model.ckpt") + saver.save(sess, save_path) + tf.logging.info("Model saved in path: {}".format(save_path)) + + if curr_step == FLAGS.train_steps: + break + if rank == 0: + tf.logging.info("Training throughput: {:>6.0f} tok/s".format(meters['train_throughput'].avg)) + summary = { + 'train_throughput': meters['train_throughput'].avg, + } + dllogger.log(step=tuple(), data=summary) + + + +def evaluate(n_token, cutoffs): + ##### Get input function and model function + eval_input_fn, eval_record_info = data_utils.get_input_fn( + record_info_dir=FLAGS.record_info_dir, + split=FLAGS.eval_split, + per_host_bsz=FLAGS.eval_batch_size, + tgt_len=FLAGS.tgt_len, + num_core_per_host=FLAGS.num_core_per_host, + num_hosts=1) + + meters = {} + warmup = 2 + meters['eval_throughput'] = AverageMeter(warmup=warmup) + meters['eval_latency'] = AverageMeter(warmup=warmup, keep=True) + + num_batch = eval_record_info["num_batch"] + if FLAGS.max_eval_batch > 0: + num_batch = FLAGS.max_eval_batch + tf.logging.info("num of batches {}".format(num_batch)) + + ##### Create computational graph + eval_set = eval_input_fn({ + "batch_size": FLAGS.eval_batch_size, + "data_dir": FLAGS.data_dir}) + + inputs, labels = eval_set.make_one_shot_iterator().get_next() + + bsz = FLAGS.eval_batch_size + + with tf.variable_scope(tf.get_variable_scope()): + mems = [tf.placeholder(tf.float32, + [FLAGS.mem_len, bsz, FLAGS.d_model]) + for _ in range(FLAGS.n_layer)] + + loss, new_mems = single_core_graph( + n_token=n_token, + cutoffs=cutoffs, + is_training=False, + inp=inputs, + tgt=labels, + mems=mems) + + target_tokens = tf.size(labels) + ##### Evaluation loop + mems_np = [np.zeros([FLAGS.mem_len, bsz, FLAGS.d_model], dtype=np.float32) + for layer in range(FLAGS.n_layer)] + + saver = tf.train.Saver() + + with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: + sess.run(tf.global_variables_initializer()) + + if FLAGS.eval_ckpt_path is None: + eval_ckpt_path = tf.train.latest_checkpoint(FLAGS.model_dir) + else: + eval_ckpt_path = FLAGS.eval_ckpt_path + tf.logging.info("Evaluate {}".format(eval_ckpt_path)) + saver.restore(sess, eval_ckpt_path) + + fetches = [loss, new_mems, target_tokens] + + format_str = " >> processing batch {{:{0}d}}/{{:{0}d}}".format( + len(str(num_batch))) + + total_loss, total_cnt, target_tokens = 0, 0, 0 + start_time = time.time() + for step in range(num_batch): + feed_dict = {} + for m, m_np in zip(mems, mems_np): + feed_dict[m] = m_np + + fetched = sess.run(fetches, feed_dict=feed_dict) + + loss_np, mems_np, tt = fetched + target_tokens += tt + cnt_np = 1 + total_loss += loss_np * cnt_np + total_cnt += cnt_np + + elapsed = time.time()-start_time + throughput = target_tokens / elapsed + latency = elapsed*1000 + meters['eval_throughput'].update(throughput) + meters['eval_latency'].update(latency) + target_tokens = 0 + if (step+1) % (num_batch // 10) == 0: + tf.logging.info(format_str.format(step+1, num_batch)) + dllogger_data = { + 'eval_latency': latency, + 'eval_throughput': throughput, + } + dllogger.log(step=step+1, data=dllogger_data) + + + start_time = time.time() + avg_loss = total_loss / total_cnt + latency_data = np.array(meters['eval_latency'].vals) + tf.logging.info("Evaluating with: bs {}, math {} ".format(FLAGS.eval_batch_size, "fp16" if FLAGS.fp16 else "fp32")) + 
tf.logging.info("| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}, tok/s {:>6.1f}, ms/batch {:>4.2f}".format( + avg_loss, math.exp(avg_loss), avg_loss / math.log(2), meters['eval_throughput'].avg, meters['eval_latency'].avg)) + summary = { + 'eval_loss': avg_loss, + 'eval_ppl': math.exp(avg_loss), + 'eval_avg_throughput': meters['eval_throughput'].avg, + 'eval_avg_latency': meters['eval_latency'].avg, + } + for p in FLAGS.percentiles: + p = int(p) + tf.logging.info("Latency {}%: {:>4.2f} ms".format( + p, np.percentile(latency_data, p))) + summary[f'eval_{p}%_latency'] = np.percentile(latency_data, p) + dllogger.log(step=tuple(), data=summary) + + + +def main(unused_argv): + rank, local_rank, size = 0, 0, 1 + if FLAGS.horovod: + hvd.init() + rank = hvd.rank() + local_rank = hvd.local_rank() + size = hvd.size() + del unused_argv # Unused + + tf.logging.set_verbosity(tf.logging.INFO) + + if FLAGS.fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1" + else: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "0" + + # Get corpus info + corpus_info = data_utils.get_corpus_info(FLAGS.corpus_info_path) + n_token = corpus_info["vocab_size"] + cutoffs = corpus_info["cutoffs"][1:-1] + tf.logging.info("n_token {}".format(n_token)) + + setup_dllogger(enabled=True, filename=FLAGS.raport_file, rank=rank) + + if FLAGS.do_train: + train(n_token, cutoffs, rank, local_rank, size) + if FLAGS.do_eval: + evaluate(n_token, cutoffs) + + + +if __name__ == "__main__": + tf.app.run() diff --git a/TensorFlow/LanguageModeling/Transformer-XL/tf/model.py b/TensorFlow/LanguageModeling/Transformer-XL/tf/model.py new file mode 100755 index 00000000..84127df7 --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/tf/model.py @@ -0,0 +1,539 @@ +import tensorflow as tf + + +def positional_embedding(pos_seq, inv_freq, bsz=None): + sinusoid_inp = tf.einsum('i,j->ij', pos_seq, inv_freq) + pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1) + if bsz is not None: + return tf.tile(pos_emb[:, None, :], [1, bsz, 1]) + else: + return pos_emb[:, None, :] + + +def positionwise_FF(inp, d_model, d_inner, dropout, kernel_initializer, + scope='ff', is_training=True): + output = inp + with tf.variable_scope(scope): + output = tf.layers.dense(inp, d_inner, activation=tf.nn.relu, + kernel_initializer=kernel_initializer, + name='layer_1') + output = tf.layers.dropout(output, dropout, training=is_training, + name='drop_1') + output = tf.layers.dense(output, d_model, + kernel_initializer=kernel_initializer, + name='layer_2') + output = tf.layers.dropout(output, dropout, training=is_training, + name='drop_2') + output = tf.contrib.layers.layer_norm(output + inp, begin_norm_axis=-1) + return output + + +def rel_shift(x): + x_size = tf.shape(x) + + x = tf.pad(x, [[0, 0], [0, 0], [0, 0], [1, 0]]) + x = tf.reshape(x, [x_size[0], x_size[1], x_size[3] + 1, x_size[2]]) + x = tf.slice(x, [0, 0, 1, 0], [-1, -1, -1, -1]) + x = tf.reshape(x, x_size) + + return x + + +def rel_multihead_attn(w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model, + n_head, d_head, dropout, dropatt, is_training, + kernel_initializer, scope='rel_attn'): + scale = 1 / (d_head ** 0.5) + with tf.variable_scope(scope): + qlen = tf.shape(w)[0] + rlen = tf.shape(r)[0] + bsz = tf.shape(w)[1] + + cat = tf.concat([mems, w], + 0) if mems is not None and mems.shape.ndims > 1 else w + w_heads = tf.layers.dense(cat, 3 * n_head * d_head, use_bias=False, + kernel_initializer=kernel_initializer, name='qkv') + r_head_k = tf.layers.dense(r, n_head * d_head, use_bias=False, + 
kernel_initializer=kernel_initializer, name='r') + + w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, -1) + w_head_q = w_head_q[-qlen:] + + klen = tf.shape(w_head_k)[0] + + w_head_q = tf.reshape(w_head_q, [qlen, bsz, n_head, d_head]) + w_head_k = tf.reshape(w_head_k, [klen, bsz, n_head, d_head]) + w_head_v = tf.reshape(w_head_v, [klen, bsz, n_head, d_head]) + + r_head_k = tf.reshape(r_head_k, [rlen, n_head, d_head]) + + rw_head_q = w_head_q + r_w_bias + rr_head_q = w_head_q + r_r_bias + + AC = tf.einsum('ibnd,jbnd->bnij', rw_head_q, w_head_k) + BD = tf.einsum('ibnd,jnd->bnij', rr_head_q, r_head_k) + BD = rel_shift(BD) + + attn_score = (AC + BD) * scale + attn_mask_t = attn_mask[None, None, :, :] + attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t + + attn_prob = tf.nn.softmax(attn_score, 3) + attn_prob = tf.layers.dropout(attn_prob, dropatt, training=is_training) + + attn_vec = tf.einsum('bnij,jbnd->ibnd', attn_prob, w_head_v) + size_t = tf.shape(attn_vec) + attn_vec = tf.reshape(attn_vec, [size_t[0], size_t[1], n_head * d_head]) + + attn_out = tf.layers.dense(attn_vec, d_model, use_bias=False, + kernel_initializer=kernel_initializer, name='o') + attn_out = tf.layers.dropout(attn_out, dropout, training=is_training) + + output = tf.contrib.layers.layer_norm(attn_out + w, begin_norm_axis=-1) + return output + + +def embedding_lookup(lookup_table, x, use_tpu=True): + if use_tpu: + n_token = tf.shape(lookup_table)[0] + one_hot_idx = tf.one_hot(x, n_token) + if one_hot_idx.shape.ndims == 2: + return tf.einsum('nd,in->id', lookup_table, one_hot_idx) + else: + return tf.einsum('nd,ibn->ibd', lookup_table, one_hot_idx) + else: + return tf.nn.embedding_lookup(lookup_table, x) + + +def mask_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer, + proj_initializer, div_val=1, + proj_same_dim=True, + scope='adaptive_embed', **kwargs): + emb_scale = d_proj ** 0.5 + with tf.variable_scope(scope): + if div_val == 1: + lookup_table = tf.get_variable('lookup_table', [n_token, d_embed], + initializer=initializer) + y = embedding_lookup(lookup_table, x, use_tpu=False) + if d_proj != d_embed: + proj_W = tf.get_variable('proj_W', [d_embed, d_proj], + initializer=proj_initializer) + y = tf.einsum('ibe,ed->ibd', y, proj_W) + else: + proj_W = None + ret_params = [lookup_table, proj_W] + else: + tables, projs = [], [] + cutoff_ends = [0] + cutoffs + [n_token] + x_size = tf.shape(x) + y = tf.zeros([x_size[0], x_size[1], d_proj]) + for i in range(len(cutoff_ends) - 1): + with tf.variable_scope('cutoff_{}'.format(i)): + l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1] + mask = (x >= l_idx) & (x < r_idx) + cur_x = tf.boolean_mask(x, mask) - l_idx + cur_d_embed = d_embed // (div_val ** i) + lookup_table = tf.get_variable('lookup_table', + [r_idx - l_idx, cur_d_embed], + initializer=initializer) + cur_y = embedding_lookup(lookup_table, cur_x, use_tpu=False) + if d_proj == cur_d_embed and not proj_same_dim: + proj_W = None + else: + proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj], + initializer=proj_initializer) + cur_y = tf.einsum('id,de->ie', cur_y, proj_W) + mask_idx = tf.to_int64(tf.where(mask)) + y += tf.scatter_nd(mask_idx, cur_y, tf.to_int64(tf.shape(y))) + tables.append(lookup_table) + projs.append(proj_W) + ret_params = [tables, projs] + + y *= emb_scale + return y, ret_params + + +def mul_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer, + proj_initializer, div_val=1, perms=None, + proj_same_dim=True, + scope='adaptive_embed'): + """ + 
perms: If None, first compute W = W1 x W2 (projection for each bin), + and then compute X x W (embedding lookup). If not None, + use bin-based embedding lookup with max_bin_size defined by + the shape of perms. + """ + emb_scale = d_proj ** 0.5 + with tf.variable_scope(scope): + if div_val == 1: + lookup_table = tf.get_variable('lookup_table', [n_token, d_embed], + initializer=initializer) + y = embedding_lookup(lookup_table, x) + if d_proj != d_embed: + proj_W = tf.get_variable('proj_W', [d_embed, d_proj], + initializer=proj_initializer) + y = tf.einsum('ibe,ed->ibd', y, proj_W) + else: + proj_W = None + ret_params = [lookup_table, proj_W] + else: + tables, projs = [], [] + cutoff_ends = [0] + cutoffs + [n_token] + x_size = tf.shape(x) + if perms is None: + cat_lookup = [] + else: + cat_lookup = tf.zeros([x_size[0], x_size[1], d_proj]) + for i in range(len(cutoff_ends) - 1): + with tf.variable_scope('cutoff_{}'.format(i)): + l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1] + cur_d_embed = d_embed // (div_val ** i) + lookup_table = tf.get_variable('lookup_table', + [r_idx - l_idx, cur_d_embed], + initializer=initializer) + if cur_d_embed == d_proj and not proj_same_dim: + proj_W = None + else: + proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj], + initializer=proj_initializer) + if perms is None: + cat_lookup.append(tf.einsum('ie,ed->id', lookup_table, proj_W)) + else: + # speed up the computation of the first bin + # also save some meory + if i == 0: + cur_y = embedding_lookup(lookup_table, tf.minimum(x, r_idx - 1)) + if proj_W is not None: + cur_y = tf.einsum('ibe,ed->ibd', cur_y, proj_W) + cur_y *= perms[i][:, :, None] + cat_lookup += cur_y + else: + cur_x = tf.einsum('ib,ibk->k', tf.to_float(x - l_idx), perms[i]) + cur_x = tf.to_int32(cur_x) + cur_y = embedding_lookup(lookup_table, cur_x) + if proj_W is not None: + cur_y = tf.einsum('ke,ed->kd', cur_y, proj_W) + cat_lookup += tf.einsum('kd,ibk->ibd', cur_y, perms[i]) + tables.append(lookup_table) + projs.append(proj_W) + if perms is None: + cat_lookup = tf.concat(cat_lookup, 0) + y = embedding_lookup(cat_lookup, x) + else: + y = cat_lookup + ret_params = [tables, projs] + + y *= emb_scale + return y, ret_params + + +def mask_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs, + params, tie_projs, + initializer=None, proj_initializer=None, + div_val=1, scope='adaptive_softmax', + proj_same_dim=True, + return_mean=True, **kwargs): + def _logit(x, W, b, proj): + y = x + if proj is not None: + y = tf.einsum('ibd,ed->ibe', y, proj) + return tf.einsum('ibd,nd->ibn', y, W) + b + + params_W, params_projs = params[0], params[1] + + def _gather_logprob(logprob, target): + lp_size = tf.shape(logprob) + r = tf.range(lp_size[0]) + idx = tf.stack([r, target], 1) + return tf.gather_nd(logprob, idx) + + with tf.variable_scope(scope): + if len(cutoffs) == 0: + softmax_b = tf.get_variable('bias', [n_token], + initializer=tf.zeros_initializer()) + output = _logit(hidden, params_W, softmax_b, params_projs) + nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, + logits=output) + else: + cutoff_ends = [0] + cutoffs + [n_token] + nll = tf.zeros_like(target, dtype=tf.float32) + for i in range(len(cutoff_ends) - 1): + with tf.variable_scope('cutoff_{}'.format(i)): + l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1] + mask = (target >= l_idx) & (target < r_idx) + mask_idx = tf.where(mask) + cur_target = tf.boolean_mask(target, mask) - l_idx + cur_d_embed = d_embed // (div_val ** i) + + if div_val == 1: + cur_W = 
params_W[l_idx: r_idx] + else: + cur_W = params_W[i] + cur_b = tf.get_variable('b', [r_idx - l_idx], + initializer=tf.zeros_initializer()) + if tie_projs[i]: + if div_val == 1: + cur_proj = params_projs + else: + cur_proj = params_projs[i] + else: + if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed: + cur_proj = None + else: + cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj], + initializer=proj_initializer) + if i == 0: + cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed], + initializer=tf.zeros_initializer()) + cluster_b = tf.get_variable('cluster_b', [len(cutoffs)], + initializer=tf.zeros_initializer()) + cur_W = tf.concat([cur_W, cluster_W], 0) + cur_b = tf.concat([cur_b, cluster_b], 0) + + head_logit = _logit(hidden, cur_W, cur_b, cur_proj) + head_logprob = tf.nn.log_softmax(head_logit) + cur_head_logprob = tf.boolean_mask(head_logprob, mask) + cur_logprob = _gather_logprob(cur_head_logprob, cur_target) + else: + cur_head_logprob = tf.boolean_mask(head_logprob, mask) + cur_hidden = tf.boolean_mask(hidden, mask) + tail_logit = tf.squeeze(_logit( + cur_hidden[None], cur_W, cur_b, cur_proj), 0) + tail_logprob = tf.nn.log_softmax(tail_logit) + cur_logprob = (cur_head_logprob[:, cutoff_ends[1] + i - 1] + + _gather_logprob(tail_logprob, cur_target)) + nll += tf.scatter_nd(mask_idx, -cur_logprob, + tf.to_int64(tf.shape(nll))) + if return_mean: + nll = tf.reduce_mean(nll) + return nll + + +def mul_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs, + params, tie_projs, + initializer=None, proj_initializer=None, + div_val=1, perms=None, proj_same_dim=True, + scope='adaptive_softmax', + **kwargs): + def _logit(x, W, b, proj): + y = x + if x.shape.ndims == 3: + if proj is not None: + y = tf.einsum('ibd,ed->ibe', y, proj) + return tf.einsum('ibd,nd->ibn', y, W) + b + else: + if proj is not None: + y = tf.einsum('id,ed->ie', y, proj) + return tf.einsum('id,nd->in', y, W) + b + + params_W, params_projs = params[0], params[1] + + with tf.variable_scope(scope): + if len(cutoffs) == 0: + softmax_b = tf.get_variable('bias', [n_token], + initializer=tf.zeros_initializer()) + output = _logit(hidden, params_W, softmax_b, params_projs) + nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, + logits=output) + nll = tf.reduce_mean(nll) + else: + total_loss, total_cnt = 0, 0 + cutoff_ends = [0] + cutoffs + [n_token] + for i in range(len(cutoff_ends) - 1): + with tf.variable_scope('cutoff_{}'.format(i)): + l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1] + + cur_d_embed = d_embed // (div_val ** i) + + if div_val == 1: + cur_W = params_W[l_idx: r_idx] + else: + cur_W = params_W[i] + cur_b = tf.get_variable('b', [r_idx - l_idx], + initializer=tf.zeros_initializer()) + if tie_projs[i]: + if div_val == 1: + cur_proj = params_projs + else: + cur_proj = params_projs[i] + else: + if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed: + cur_proj = None + else: + cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj], + initializer=proj_initializer) + + if i == 0: + cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed], + initializer=tf.zeros_initializer()) + cluster_b = tf.get_variable('cluster_b', [len(cutoffs)], + initializer=tf.zeros_initializer()) + cur_W = tf.concat([cur_W, cluster_W], 0) + cur_b = tf.concat([cur_b, cluster_b], 0) + + head_logit = _logit(hidden, cur_W, cur_b, cur_proj) + + head_target = kwargs.get("head_target") + head_nll = tf.nn.sparse_softmax_cross_entropy_with_logits( + labels=head_target, + 
logits=head_logit)
+
+            masked_loss = head_nll * perms[i]
+            total_loss += tf.reduce_sum(masked_loss)
+            total_cnt += tf.reduce_sum(perms[i])
+          else:
+            cur_head_nll = tf.einsum('ib,ibk->k', head_nll, perms[i])
+
+            cur_hidden = tf.einsum('ibd,ibk->kd', hidden, perms[i])
+            tail_logit = _logit(cur_hidden, cur_W, cur_b, cur_proj)
+
+            tail_target = tf.einsum('ib,ibk->k', tf.to_float(target - l_idx),
+                                    perms[i])
+            tail_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
+                labels=tf.to_int32(tail_target),
+                logits=tail_logit)
+
+            sum_nll = cur_head_nll + tail_nll
+            mask = tf.reduce_sum(perms[i], [0, 1])
+
+            masked_loss = sum_nll * mask
+            total_loss += tf.reduce_sum(masked_loss)
+            total_cnt += tf.reduce_sum(mask)
+
+      nll = total_loss / total_cnt
+
+  return nll
+
+
+# Causal attention mask over [qlen, mlen + qlen] positions: a value of 1 marks
+# a position that must not be attended to.
+def _create_mask(qlen, mlen, same_length=False):
+  attn_mask = tf.ones([qlen, qlen])
+  mask_u = tf.matrix_band_part(attn_mask, 0, -1)
+  mask_dia = tf.matrix_band_part(attn_mask, 0, 0)
+  attn_mask_pad = tf.zeros([qlen, mlen])
+  ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
+  if same_length:
+    mask_l = tf.matrix_band_part(attn_mask, -1, 0)
+    ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
+  return ret
+
+# Cache the last `mem_len` hidden states of the current segment as memory for
+# the next segment; gradients do not flow through the cached states.
+def _cache_mem(curr_out, prev_mem, mem_len=None):
+  if mem_len is None or prev_mem is None:
+    new_mem = curr_out
+  elif mem_len == 0:
+    return prev_mem
+  else:
+    new_mem = tf.concat([prev_mem, curr_out], 0)[- mem_len:]
+
+  return tf.stop_gradient(new_mem)
+
+
+def transformer(dec_inp, target, mems, n_token, n_layer, d_model, d_embed,
+                n_head, d_head, d_inner, dropout, dropatt,
+                initializer, is_training, proj_initializer=None,
+                mem_len=None, cutoffs=[], div_val=1, tie_projs=[],
+                same_length=False, clamp_len=-1, use_tpu=False,
+                input_perms=None, target_perms=None, head_target=None,
+                untie_r=False, proj_same_dim=True,
+                scope='transformer'):
+  """
+  cutoffs: a list of python int. Cutoffs for adaptive softmax.
+  tie_projs: a list of python bools. Whether to tie the projections.
+  use_tpu: if True, use one_hot in embedding lookup and bin-based implementation
+    of adaptive softmax.
+  perms: a list of tensors. Each tensor should be of size [len, bsz, bin_size].
+    Only used in the adaptive setting.
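+  input_perms, target_perms, head_target: auxiliary tensors for the bin-based
+    (use_tpu=True) adaptive embedding lookup and softmax; all of them can be
+    left as None when use_tpu is False.
+  Returns: a (loss, new_mems) tuple.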
+  """
+  new_mems = []
+  with tf.variable_scope(scope):
+    if untie_r:
+      r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head],
+                                 initializer=initializer)
+      r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head],
+                                 initializer=initializer)
+    else:
+      r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head],
+                                 initializer=initializer)
+      r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head],
+                                 initializer=initializer)
+
+    qlen = tf.shape(dec_inp)[0]
+    mlen = tf.shape(mems[0])[0] if mems is not None else 0
+    klen = mlen + qlen
+
+    if proj_initializer is None:
+      proj_initializer = initializer
+    lookup_fn = (mul_adaptive_embedding_lookup if use_tpu else
+                 mask_adaptive_embedding_lookup)
+    embeddings, shared_params = lookup_fn(
+        x=dec_inp,
+        n_token=n_token,
+        d_embed=d_embed,
+        d_proj=d_model,
+        cutoffs=cutoffs,
+        initializer=initializer,
+        proj_initializer=proj_initializer,
+        div_val=div_val,
+        perms=input_perms,
+        proj_same_dim=proj_same_dim)
+
+    attn_mask = _create_mask(qlen, mlen, same_length)
+
+    pos_seq = tf.range(klen - 1, -1, -1.0)
+    if clamp_len > 0:
+      pos_seq = tf.minimum(pos_seq, clamp_len)
+    inv_freq = 1 / (10000 ** (tf.range(0, d_model, 2.0) / d_model))
+    pos_emb = positional_embedding(pos_seq, inv_freq)
+
+    output = tf.layers.dropout(embeddings, dropout, training=is_training)
+    pos_emb = tf.layers.dropout(pos_emb, dropout, training=is_training)
+
+    if mems is None:
+      mems = [None] * n_layer
+
+    for i in range(n_layer):
+      # cache new mems
+      new_mems.append(_cache_mem(output, mems[i], mem_len))
+
+      with tf.variable_scope('layer_{}'.format(i)):
+        output = rel_multihead_attn(
+            w=output,
+            r=pos_emb,
+            r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
+            r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
+            attn_mask=attn_mask,
+            mems=mems[i],
+            d_model=d_model,
+            n_head=n_head,
+            d_head=d_head,
+            dropout=dropout,
+            dropatt=dropatt,
+            is_training=is_training,
+            kernel_initializer=initializer)
+        output = positionwise_FF(
+            inp=output,
+            d_model=d_model,
+            d_inner=d_inner,
+            dropout=dropout,
+            kernel_initializer=initializer,
+            is_training=is_training)
+
+    output = tf.layers.dropout(output, dropout, training=is_training)
+
+    logsoftmax_fn = (mul_adaptive_logsoftmax if use_tpu else
+                     mask_adaptive_logsoftmax)
+    loss = logsoftmax_fn(
+        hidden=output,
+        target=target,
+        n_token=n_token,
+        d_embed=d_embed,
+        d_proj=d_model,
+        cutoffs=cutoffs,
+        params=shared_params,
+        tie_projs=tie_projs,
+        initializer=initializer,
+        proj_initializer=proj_initializer,
+        div_val=div_val,
+        perms=target_perms,
+        head_target=head_target,
+        proj_same_dim=proj_same_dim)
+    return loss, new_mems
+
diff --git a/TensorFlow/LanguageModeling/Transformer-XL/tf/run_wt103_base.sh b/TensorFlow/LanguageModeling/Transformer-XL/tf/run_wt103_base.sh
new file mode 100755
index 00000000..dd16c40e
--- /dev/null
+++ b/TensorFlow/LanguageModeling/Transformer-XL/tf/run_wt103_base.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+
+# Data
+DATA_ROOT=../data/wikitext-103/
+
+# Model
+DIV_VAL=1
+N_LAYER=16
+D_MODEL=512
+D_EMBED=512
+N_HEAD=8
+D_HEAD=64
+D_INNER=2048
+
+# Training
+TGT_LEN=192
+MEM_LEN=192
+
+NUM_CORE=${2:-"8"}
+
+# Testing
+TEST_TGT_LEN=64
+TEST_MEM_LEN=640
+TEST_CLAMP_LEN=400
+
+TEST_NUM_CORE=1
+
+
+if [[ $1 == 'train_data' ]]; then
+    python data_utils.py \
+        --data_dir=${DATA_ROOT}/ \
+        --dataset=wt103 \
+        --tgt_len=${TGT_LEN} \
+        --num_passes=2 \
+        --use_tpu=False \
+        --eval_batch_size=0 \
+        ${@:2}
+elif [[ $1 == 'test_data' ]]; then
+    python data_utils.py \
+        --data_dir=${DATA_ROOT}/ \
+        --dataset=wt103 \
+        
--tgt_len=${TEST_TGT_LEN} \ + --num_passes=1 \ + --use_tpu=False \ + ${@:2} +elif [[ $1 == 'train' ]]; then + echo 'Run training...' + horovodrun -np ${NUM_CORE} -H localhost:${NUM_CORE} python main.py \ + --data_dir=${DATA_ROOT}/tfrecords \ + --record_info_dir=${DATA_ROOT}/tfrecords/ \ + --corpus_info_path=${DATA_ROOT}/corpus-info.json \ + --div_val=${DIV_VAL} \ + --untie_r=True \ + --proj_share_all_but_first=True \ + --n_layer=${N_LAYER} \ + --d_model=${D_MODEL} \ + --d_embed=${D_EMBED} \ + --n_head=${N_HEAD} \ + --d_head=${D_HEAD} \ + --d_inner=${D_INNER} \ + --dropout=0.1 \ + --dropatt=0.0 \ + --learning_rate=0.01 \ + --warmup_steps=1000 \ + --tgt_len=${TGT_LEN} \ + --mem_len=${MEM_LEN} \ + --num_core_per_host=${NUM_CORE} \ + ${@:3} +elif [[ $1 == 'eval' ]]; then + echo 'Run evaluation...' + python main.py \ + --data_dir=${DATA_ROOT}/tfrecords \ + --record_info_dir=${DATA_ROOT}/tfrecords/ \ + --corpus_info_path=${DATA_ROOT}/corpus-info.json \ + --div_val=${DIV_VAL} \ + --untie_r=True \ + --proj_share_all_but_first=True \ + --n_layer=${N_LAYER} \ + --d_model=${D_MODEL} \ + --d_embed=${D_EMBED} \ + --n_head=${N_HEAD} \ + --d_head=${D_HEAD} \ + --d_inner=${D_INNER} \ + --dropout=0.0 \ + --dropatt=0.0 \ + --tgt_len=${TEST_TGT_LEN} \ + --mem_len=${TEST_MEM_LEN} \ + --clamp_len=${TEST_CLAMP_LEN} \ + --same_length=True \ + --num_core_per_host=${TEST_NUM_CORE} \ + --do_train=False \ + --do_eval=True \ + --horovod=False \ + --eval_split=test \ + ${@:2} +else + echo 'unknown argment 1' +fi diff --git a/TensorFlow/LanguageModeling/Transformer-XL/tf/scripts/docker/build.sh b/TensorFlow/LanguageModeling/Transformer-XL/tf/scripts/docker/build.sh new file mode 100755 index 00000000..99d21cb0 --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/tf/scripts/docker/build.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +docker build . --network=host --rm -t transformer-xl:latest diff --git a/TensorFlow/LanguageModeling/Transformer-XL/tf/scripts/docker/interactive.sh b/TensorFlow/LanguageModeling/Transformer-XL/tf/scripts/docker/interactive.sh new file mode 100755 index 00000000..94161a06 --- /dev/null +++ b/TensorFlow/LanguageModeling/Transformer-XL/tf/scripts/docker/interactive.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
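+
+# Launch an interactive development container with the current directory
+# mounted at /workspace/transformer-xl; --network=host and --ipc=host give the
+# container access to the host network stack and shared memory.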
+
+nvidia-docker run --init -it --rm --network=host --ipc=host -v $PWD:/workspace/transformer-xl transformer-xl bash
diff --git a/TensorFlow/LanguageModeling/Transformer-XL/tf/scripts/inference_benchmark.sh b/TensorFlow/LanguageModeling/Transformer-XL/tf/scripts/inference_benchmark.sh
new file mode 100755
index 00000000..3041b24b
--- /dev/null
+++ b/TensorFlow/LanguageModeling/Transformer-XL/tf/scripts/inference_benchmark.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+BATCH_SIZES=(1 2 4 8 16 32)
+# "empty" MATH corresponds to fp32
+MATHS=("" "--fp16")
+
+
+for (( j = 0; j < ${#BATCH_SIZES[@]}; j++ )); do
+    for (( k = 0; k < ${#MATHS[@]}; k++ )); do
+        echo batch size: ${BATCH_SIZES[j]} math: ${MATHS[k]}
+        taskset -c 0 bash run_wt103_base.sh eval \
+            --eval_batch_size "${BATCH_SIZES[j]}" \
+            "${MATHS[k]}" \
+            "${@:1}"
+    done
+done
diff --git a/TensorFlow/LanguageModeling/Transformer-XL/tf/vocabulary.py b/TensorFlow/LanguageModeling/Transformer-XL/tf/vocabulary.py
new file mode 100755
index 00000000..20c728fd
--- /dev/null
+++ b/TensorFlow/LanguageModeling/Transformer-XL/tf/vocabulary.py
@@ -0,0 +1,170 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import Counter, OrderedDict
+
+import numpy as np
+
+import tensorflow as tf
+
+from tensorflow.gfile import Open as open
+from tensorflow.gfile import Exists as exists
+
+class Vocab(object):
+  def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True,
+               delimiter=None, vocab_file=None):
+    self.counter = Counter()
+    self.special = special
+    self.min_freq = min_freq
+    self.max_size = max_size
+    self.lower_case = lower_case
+    self.delimiter = delimiter
+    self.vocab_file = vocab_file
+
+  def tokenize(self, line, add_eos=False, add_double_eos=False):
+    line = line.strip()
+    # convert to lower case
+    if self.lower_case:
+      line = line.lower()
+
+    # empty delimiter '' will evaluate False
+    if self.delimiter == '':
+      symbols = line
+    else:
+      symbols = line.split(self.delimiter)
+
+    if add_double_eos: # lm1b
+      return ['<S>'] + symbols + ['<S>']
+    elif add_eos:
+      return symbols + ['<eos>']
+    else:
+      return symbols
+
+  def count_file(self, path, verbose=False, add_eos=False):
+    if verbose: print('counting file {} ...'.format(path))
+    assert exists(path)
+
+    sents = []
+    with open(path, 'r') as f:
+      for idx, line in enumerate(f):
+        if verbose and idx > 0 and idx % 500000 == 0:
+          print('    line {}'.format(idx))
+        symbols = self.tokenize(line, add_eos=add_eos)
+        self.counter.update(symbols)
+        sents.append(symbols)
+
+    return sents
+
+  def count_sents(self, sents, verbose=False):
+    """
+      sents : a list of sentences, each a list of tokenized symbols
+    """
+    if verbose: print('counting {} sents ...'.format(len(sents)))
+    for idx, symbols in enumerate(sents):
+      if verbose and idx > 0 and idx % 500000 == 0:
+        print('    line {}'.format(idx))
+      self.counter.update(symbols)
+
+  def 
_build_from_file(self, vocab_file):
+    self.idx2sym = []
+    self.sym2idx = OrderedDict()
+
+    with open(vocab_file, 'r') as f:
+      for line in f:
+        symb = line.strip().split()[0]
+        self.add_symbol(symb)
+    self.unk_idx = self.sym2idx['<UNK>']
+
+  def build_vocab(self):
+    if self.vocab_file:
+      print('building vocab from {}'.format(self.vocab_file))
+      self._build_from_file(self.vocab_file)
+      print('final vocab size {}'.format(len(self)))
+    else:
+      print('building vocab with min_freq={}, max_size={}'.format(
+          self.min_freq, self.max_size))
+      self.idx2sym = []
+      self.sym2idx = OrderedDict()
+
+      for sym in self.special:
+        self.add_special(sym)
+
+      for sym, cnt in self.counter.most_common(self.max_size):
+        if cnt < self.min_freq: break
+        self.add_symbol(sym)
+
+      print('final vocab size {} from {} unique tokens'.format(
+          len(self), len(self.counter)))
+
+  def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
+                  add_double_eos=False):
+    if verbose: print('encoding file {} ...'.format(path))
+    assert exists(path)
+    encoded = []
+    with open(path, 'r') as f:
+      for idx, line in enumerate(f):
+        if verbose and idx > 0 and idx % 500000 == 0:
+          print('    line {}'.format(idx))
+        symbols = self.tokenize(line, add_eos=add_eos,
+                                add_double_eos=add_double_eos)
+        encoded.append(self.convert_to_nparray(symbols))
+
+    if ordered:
+      encoded = np.concatenate(encoded)
+
+    return encoded
+
+  def encode_sents(self, sents, ordered=False, verbose=False):
+    if verbose: print('encoding {} sents ...'.format(len(sents)))
+    encoded = []
+    for idx, symbols in enumerate(sents):
+      if verbose and idx > 0 and idx % 500000 == 0:
+        print('    line {}'.format(idx))
+      encoded.append(self.convert_to_nparray(symbols))
+
+    if ordered:
+      encoded = np.concatenate(encoded)
+
+    return encoded
+
+  def add_special(self, sym):
+    if sym not in self.sym2idx:
+      self.idx2sym.append(sym)
+      self.sym2idx[sym] = len(self.idx2sym) - 1
+      setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
+
+  def add_symbol(self, sym):
+    if sym not in self.sym2idx:
+      self.idx2sym.append(sym)
+      self.sym2idx[sym] = len(self.idx2sym) - 1
+
+  def get_sym(self, idx):
+    assert 0 <= idx < len(self), 'Index {} out of range'.format(idx)
+    return self.idx2sym[idx]
+
+  def get_idx(self, sym):
+    if sym in self.sym2idx:
+      return self.sym2idx[sym]
+    else:
+      assert hasattr(self, 'unk_idx')
+      return self.sym2idx.get(sym, self.unk_idx)
+
+  def get_symbols(self, indices):
+    return [self.get_sym(idx) for idx in indices]
+
+  def get_indices(self, symbols):
+    return [self.get_idx(sym) for sym in symbols]
+
+  def convert_to_nparray(self, symbols):
+    nparray = np.array(self.get_indices(symbols), dtype=np.int64)
+    return nparray
+
+  def convert_to_sent(self, indices, exclude=None):
+    if exclude is None:
+      return ' '.join([self.get_sym(idx) for idx in indices])
+    else:
+      return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
+
+  def __len__(self):
+    return len(self.idx2sym)