Adding WideAndDeep/TF

Przemek Strzelczyk 2020-03-24 23:05:23 +01:00
parent 6227f32700
commit 639de18491
32 changed files with 5602 additions and 0 deletions


@ -34,6 +34,7 @@ The examples are organized first by framework, such as TensorFlow, PyTorch, etc.
### Recommender Systems
- __NCF__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/NCF)]
- __VAE-CF__ [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/VAE-CF)]
- __WideAndDeep__ [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/WideAndDeep)]
### Text to Speech
@ -80,6 +81,7 @@ The examples are organized first by framework, such as TensorFlow, PyTorch, etc.
| [BioBert](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT/biobert) | TensorFlow | N/A | Yes | Yes | - | - | - | - | - |
| [Neural Collaborative Filtering](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/NCF) |TensorFlow | N/A | Yes | Yes | - | - | - | - | - |
| [Variational Autoencoder Collaborative Filtering](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/VAE-CF) |TensorFlow | N/A | Yes | Yes | - | - | - | - | - |
| [WideAndDeep](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/WideAndDeep) | TensorFlow | N/A | Yes | Yes | - | - | - | - | - |
| [U-Net Industrial](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Segmentation/UNet_Industrial) |TensorFlow | N/A | Yes | Yes | - | Yes | - | - | Yes |
| [U-Net Medical](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Segmentation/UNet_Medical) | TensorFlow | N/A | Yes | Yes | - | Yes |- | - | Yes |
| [V-Net Medical](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Segmentation/VNet) | TensorFlow | N/A | Yes | Yes | - | Yes | Yes | - | Yes |


@ -0,0 +1,48 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.02-tf1-py3
FROM ${FROM_IMAGE_NAME}
USER root
# Spark dependencies
ENV APACHE_SPARK_VERSION 2.3.1
ENV HADOOP_VERSION 2.7
RUN apt-get -y update && \
apt-get install --no-install-recommends -y openjdk-8-jre-headless ca-certificates-java && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN cd /tmp && \
wget -q http://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
echo "DC3A97F3D99791D363E4F70A622B84D6E313BD852F6FDBC777D31EAB44CBC112CEEAA20F7BF835492FB654F48AE57E9969F93D3B0E6EC92076D1C5E1B40B4696 *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
tar xzf spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /usr/local --owner root --group root --no-same-owner && \
rm spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
RUN cd /usr/local && ln -s spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark
# Spark config
ENV SPARK_HOME /usr/local/spark
ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip
ENV SPARK_OPTS --driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info
RUN pip install pyspark==2.3.1
RUN pip install --no-deps tensorflow-transform apache-beam==2.14 tensorflow-metadata==0.14.0 pydot dill
RUN pip install ipdb
RUN pip install -e git+https://github.com/NVIDIA/dllogger#egg=dllogger
WORKDIR /wd
COPY . .


@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 NVIDIA Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@ -0,0 +1,5 @@
Wide & Deep TensorFlow
This repository includes software from
https://github.com/gabrielspmoreira/kaggle_outbrain_click_prediction_google_cloud_ml_engine
licensed under the Apache License, Version 2.0.


@ -0,0 +1,416 @@
# Wide & Deep Recommender Model Training For TensorFlow
This repository provides a script and recipe to train the Wide & Deep Recommender model to achieve state-of-the-art accuracy. The content is tested and maintained by NVIDIA.
## Table Of Contents
- [Model overview](#model-overview)
  * [Model architecture](#model-architecture)
  * [Applications and dataset](#applications-and-dataset)
  * [Default configuration](#default-configuration)
  * [Feature support matrix](#feature-support-matrix)
    * [Features](#features)
  * [Mixed precision](#mixed-precision)
    * [Enabling mixed precision](#enabling-mixed-precision)
    * [Impact of mixed precision on training accuracy](#impact-of-mixed-precision-on-training-accuracy)
    * [Impact of mixed precision on inference accuracy](#impact-of-mixed-precision-on-inference-accuracy)
  * [Glossary](#glossary)
- [Setup](#setup)
  * [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
  * [Scripts and sample code](#scripts-and-sample-code)
  * [Parameters](#parameters)
  * [Command-line options](#command-line-options)
  * [Getting the data](#getting-the-data)
    * [Dataset guidelines](#dataset-guidelines)
  * [Training process](#training-process)
  * [Deploying the Wide & Deep model using Triton Inference Server](#deploying-the-wide-deep-model-using-triton-inference-server)
- [Performance](#performance)
  * [Benchmarking](#benchmarking)
    * [Training performance benchmark](#training-performance-benchmark)
  * [Results](#results)
    * [Training accuracy results](#training-accuracy-results)
      * [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g)
      * [Training accuracy plots](#training-accuracy-plots)
      * [Training stability test](#training-stability-test)
    * [Training performance results](#training-performance-results)
      * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g)
- [Release notes](#release-notes)
  * [Changelog](#changelog)
  * [Known issues](#known-issues)
## Model overview
Recommendation systems drive engagement on many of the most popular online platforms. As the volume of data available to power these systems grows exponentially, data scientists are increasingly turning from more traditional machine learning methods to highly expressive deep learning models to improve the quality of their recommendations. Google's [Wide & Deep Learning for Recommender Systems](https://arxiv.org/abs/1606.07792) has emerged as a popular model for these problems both for its robustness to signal sparsity as well as its user-friendly implementation in [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier).
The difference between this Wide & Deep Recommender model and the model from the paper is the size of the deep part of the model. Originally, in Google's paper, the fully connected part consisted of three layers of 1024, 512, and 256 neurons. Our model consists of five layers of 1024 neurons each.
The model enables you to train a recommender that combines the memorization of the wide part and the generalization of the deep part of the network.
This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Researchers can therefore get results 1.32 times faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
### Model architecture
Wide & Deep refers to a class of networks that use the output of two parts working in parallel, a wide model and a deep model, to make recommendation predictions. The wide model is a generalized linear model of features together with their transforms. The deep model is a series of 5 hidden MLP layers of 1024 neurons each, beginning with a dense embedding of features. The architecture is presented in Figure 1.
<p align="center">
<img width="70%" src="https://developer.download.nvidia.com/w-and-d-recommender/model.svg" />
<br>
Figure 1. The architecture of the Wide & Deep model.
</p>
### Applications and dataset
The basis of our API lies in the observation that in recommendation problems there are hierarchies of features: those which describe the person or object _to which_ we wish to make recommendations (*request* level features), and those which describe those objects which we are considering recommending (*item* level features). Additionally, these features often need to undergo some transformation from their raw representation in data stores to a representation digestible by neural networks. These transformations, defined by [TensorFlow `tf.feature_column`](https://www.tensorflow.org/api_docs/python/tf/feature_column), include nontrivial operations such as hashing, binning, vocabulary lookups, and embedding (indicator columns can be thought of as embeddings with the identity matrix as the embedding table).
In most APIs, including those implemented in standard TensorFlow, these transformations need to be computed for request level features repeatedly for _every_ item on which we want to compute a recommendation score. Moreover, if the model is being hosted on a dedicated remote inference server, this requires us to send copies of the request level data for every item as well.
To address this, we built a custom GPU op which computes _all_ these transformations in parallel, and only reads and computes request level features once before fanning them out to the rest of the batch. Besides saving on redundant compute and network I/O, this implementation leverages the exceptional parallel computing power of NVIDIA GPUs to provide massive inference time accelerations compared to native CPU based implementations.
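To make the idea concrete, here is a toy sketch in plain NumPy (hypothetical shapes and a stand-in transform, not the actual CUDA op) of why fanning out request-level features saves work:
```python
import numpy as np

num_items = 4096                      # candidate items scored per request
request_raw = np.random.rand(26)      # a single copy of request-level features

def transform_request(x):
    # stand-in for the hashing/binning/embedding transforms described above
    return np.concatenate([x, np.log1p(x)])

# Naive approach: recompute the identical request transform once per item.
naive = np.stack([transform_request(request_raw) for _ in range(num_items)])

# Fused approach: transform once, then broadcast ("fan out") across the batch.
fanned = np.broadcast_to(transform_request(request_raw),
                         (num_items, request_raw.size * 2))

assert np.allclose(naive, fanned)  # same result, ~num_items times less compute
```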
As a reference dataset, we used a subset of [the features engineered](https://github.com/gabrielspmoreira/kaggle_outbrain_click_prediction_google_cloud_ml_engine) by the 19th place finisher in the [Kaggle Outbrain Click Prediction Challenge](https://www.kaggle.com/c/outbrain-click-prediction/). This competition challenged competitors to predict the likelihood with which a particular ad on a website's display would be clicked on. Competitors were given information about the user, display, document, and ad in order to train their models. More information can be found [here](https://www.kaggle.com/c/outbrain-click-prediction/data).
### Default configuration
For reference, and to give context to the acceleration numbers described below, some important properties of our features and model are as follows:
- Features
  - Request Level
    - 16 scalar numeric features (`shape=(1,)`)
    - 12 one-hot categorical features (all `int` dtype)
      - 5 indicator embeddings
        - sizes 2, 2, 3, 3, 6
      - 7 trainable embeddings
        - all except two have an embedding size of 64 (the remaining two have 128); note that for *all* categorical features we *do not* exploit equal embedding sizes to short-circuit the lookups by treating them as a single multi-hot lookup. Our API is fully general to any combination of embedding sizes
        - all use hash bucketing with `num_buckets=` 300k, 100k, 4k, 2.5k, 2k, 1k, and 300 respectively
    - 3 multi-hot categorical features (all `int` dtype)
      - all trainable embeddings
      - all with embedding size 64
      - all use hash bucketing with `num_buckets=` 10k, 350, and 100 respectively
  - Item Level
    - 16 scalar numeric features
    - 4 one-hot categorical features (all `int` dtype)
      - embedding sizes of 128, 64, 64, 64 respectively
      - hash bucketing with `num_buckets=` 250k, 4k, 2.5k, and 1k respectively
    - 3 multi-hot categorical features (all `int` dtype)
      - all with embedding size 64
      - hash bucketing with `num_buckets=` 10k, 350, and 100 respectively
  - All features are used in both wide *and* deep branches of the network (see the `tf.feature_column` sketch below)
- Model
  - Total embedding dimension is 1328
  - 5 hidden layers each with size 1024
  - Output dimension is 1 (probability of click)
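As an illustration of the transform types listed above, here is a hedged `tf.feature_column` sketch. The column names come from the dataset, but the bucket and embedding sizes are illustrative; the real definitions live in `trainer/features.py`:
```python
import tensorflow as tf

# numeric (scalar) feature
numeric = tf.feature_column.numeric_column('ad_views', shape=(1,))

# one-hot categorical feature: hash bucketing followed by a trainable embedding
hashed = tf.feature_column.categorical_column_with_hash_bucket(
    'doc_event_source_id', hash_bucket_size=4000, dtype=tf.int64)
embedded = tf.feature_column.embedding_column(hashed, dimension=64)

# a small-vocabulary feature represented as an indicator (one-hot) embedding
small = tf.feature_column.categorical_column_with_identity('event_weekend', num_buckets=2)
onehot = tf.feature_column.indicator_column(small)

# the wide branch consumes sparse columns; the deep branch consumes dense ones
wide_columns = [hashed, small]
deep_columns = [numeric, embedded, onehot]
```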
### Feature support matrix
The following features are supported by this model:
| Feature | Wide & Deep |
|-----------------------|-------------|
| Horovod Multi-GPU | Yes |
| Automatic mixed precision (AMP) | Yes |
#### Features
Horovod
Horovod is a distributed training framework for TensorFlow, Keras, PyTorch and MXNet. The goal of Horovod is to make distributed deep learning fast and easy to use. For more information about how to get started with Horovod, see the [Horovod: Official repository](https://github.com/horovod/horovod).
Multi-GPU training with Horovod
Our model uses Horovod to implement efficient multi-GPU training with NCCL. For details, see example sources in this repository or see the [TensorFlow tutorial](https://github.com/horovod/horovod/#usage).
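The following is a minimal sketch of the standard Horovod TF1 recipe (initialize, pin one GPU per process, scale the learning rate, wrap the optimizer); the exact wiring in `trainer/task.py` may differ:
```python
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# pin each process to a single GPU
config = tf.ConfigProto()
config.gpu_options.visible_device_list = str(hvd.local_rank())

# scale the learning rate by the number of workers, then wrap the optimizer
# so gradients are averaged across workers with NCCL all-reduce
optimizer = tf.train.AdagradOptimizer(learning_rate=0.05 * hvd.size())
optimizer = hvd.DistributedOptimizer(optimizer)

# broadcast initial variables from rank 0 so all workers start identically
hooks = [hvd.BroadcastGlobalVariablesHook(0)]
```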
### Mixed precision
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
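In TF1 NGC containers, both steps are automated by TF-AMP. A minimal sketch of enabling it (this repository wires it up behind the `--amp` flag instead):
```python
import os
import tensorflow as tf

# Option 1: environment variable, no code changes required
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'

# Option 2: explicit graph rewrite around the optimizer,
# which also adds automatic loss scaling
optimizer = tf.train.ProximalAdagradOptimizer(learning_rate=0.05)
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
```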
The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
#### Enabling mixed precision
To enable mixed precision for Wide & Deep training, you don't need to perform input quantization; simply pass the additional `--amp` flag to the training script (see the [Quick Start Guide](#quick-start-guide)).
##### Impact of mixed precision on training accuracy
The accuracy of training, measured with the MAP@12 metric, was not impacted by enabling mixed precision. The obtained results were statistically similar (i.e. similar run-to-run variance was observed, with a standard deviation on the order of `0.002`).
##### Impact of mixed precision on inference accuracy
For our reference model, the average absolute error on the probability of interaction induced by reduced precision inference is `0.0002`, producing a near-perfect fit between predictions produced by full and mixed precision models. Moreover, this error is uncorrelated with the magnitude of the predicted value, which means for most predictions of interest (i.e. greater than `0.01` or `0.1` likelihood of interaction), the relative magnitude of the error is approaching the noise floor of the problem.
### Glossary
Request level features: Features that describe the person or object _to which_ we wish to make recommendations.
Item level features: Features that describe those objects which we are considering recommending.
## Setup
The following section lists the requirements that you need to meet in order to start training the Wide & Deep model.
### Requirements
This repository contains a Dockerfile that extends the TensorFlow NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [20.02-tf1-py3](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) NGC container
- [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU
For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
- [Running TensorFlow](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running)
If you cannot use the TensorFlow NGC container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) to set up the required environment or create your own container.
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the Wide & Deep model on the Outbrain dataset. For the specifics concerning training and inference, see the [Advanced](#advanced) section.
1. Clone the repository.
```
git clone https://github.com/NVIDIA/DeepLearningExamples
cd DeepLearningExamples/TensorFlow/Recommendation/WideDeep
```
2. Download the Outbrain dataset.
The Outbrain dataset can be downloaded from [Kaggle](https://www.kaggle.com/c/outbrain-click-prediction/data) (requires Kaggle account).
Unzip the downloaded archive, e.g. to `/raid/outbrain/orig`, and set the `HOST_OUTBRAIN_PATH` variable to the parent directory:
```bash
HOST_OUTBRAIN_PATH=/raid/outbrain
```
3. Build the Wide & Deep TensorFlow NGC container.
```bash
docker build . -t wide_deep
```
4. Start an interactive session in the NGC container to run preprocessing/training/inference.
```bash
docker run --runtime=nvidia --rm -ti -v ${HOST_OUTBRAIN_PATH}:/outbrain wide_deep /bin/bash
```
5. Start preprocessing.
```bash
bash scripts/preproc.sh 4096
```
The result of the preprocessing script is a set of prebatched TFRecords. The argument to the script is the prebatch size (4096 is the default).
6. Start training.
Single GPU:
```bash
python -m trainer.task --gpu --amp --batch_size 131072 --num_epochs 100
```
8 GPUs:
```bash
mpiexec --allow-run-as-root --bind-to socket -np 8 python -m trainer.task --gpu --amp --hvd --batch_size 16384 --num_epochs 20
```
If you want to run validation or inference, you can either use the checkpoint obtained from the training
commands above, or download the pretrained checkpoint from NGC.
To download the checkpoint, visit the [ngc.nvidia.com](https://ngc.nvidia.com) website and
browse the available models.
Download the checkpoint files and unzip them to some path, e.g. `/raid/outbrain/checkpoints/`
(which is the default path for storing the checkpoints during training).
7. Start validation/evaluation.
To validate the checkpoint on the evaluation set, run the `task.py` script with the `--evaluate` flag:
```bash
python -m trainer.task --gpu --amp --evaluate --model_dir /outbrain/checkpoints
```
8. Start inference/predictions.
To run inference and predict the results, run the `task.py` script with the `--predict` flag:
```bash
python -m trainer.task --gpu --amp --predict --model_dir /outbrain/checkpoints
```
## Advanced
The following sections provide greater details of the dataset, running training, and the training results.
### Scripts and sample code
These are the important scripts in this repository:
* `trainer/task.py` - Python script for training the Wide & Deep recommender model
* `trainer/features.py` - Python file describing the request and item level features
### Parameters
These are the important parameters in the `trainer/task.py` script:
```
--model_dir: Path to model checkpoint directory
--deep_hidden_units: [DEEP_LAYER1 DEEP_LAYER2 ...] hidden units per layer, separated by spaces
--prebatch_size: Number of samples in each pre-batch in tfrecords
--batch_size: Training batch size (must be a multiple of prebatch_size)
--eval_batch_size: Evaluation batch size (must be a multiple of prebatch_size)
--num_epochs: Number of epochs to train
--linear_learning_rate: Learning rate for the wide part of the model
--linear_l1_regularization: L1 regularization for the wide part of the model
--linear_l2_regularization: L2 regularization for the wide part of the model
--deep_learning_rate: Learning rate for the deep part of the model
--deep_l1_regularization: L1 regularization for the deep part of the model
--deep_l2_regularization: L2 regularization for the deep part of the model
--deep_dropout: Dropout probability for deep model
--predict: Perform only the prediction on the validation set, do not train
--evaluate: Perform only the evaluation on the validation set, do not train
--gpu: Run computations on GPU
--amp: Enable Automatic Mixed Precision
--xla: Enable XLA
--hvd: Use Horovod for multi-GPU training
```
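For example, the default configuration described above corresponds roughly to the following invocation (values shown for illustration; consult `--help` for the actual defaults):
```bash
python -m trainer.task --gpu --amp \
  --model_dir /outbrain/checkpoints \
  --deep_hidden_units 1024 1024 1024 1024 1024 \
  --batch_size 131072 \
  --num_epochs 100
```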
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option:
```bash
python -m trainer.task --help
```
### Getting the data
The Outbrain dataset can be downloaded from [Kaggle](https://www.kaggle.com/c/outbrain-click-prediction/data) (requires Kaggle account).
#### Dataset guidelines
The dataset contains a sample of users' page views and clicks, as observed on multiple publisher sites. Viewed pages and clicked recommendations are accompanied by further semantic attributes of the documents.
The dataset contains sets of content recommendations served to a specific user in a specific context. Each context (i.e. a set of recommendations) is given a `display_id`. In each such set, the user has clicked on at least one recommendation. The page view logs originally have more than 2 billion rows (around 100 GB uncompressed).
During preprocessing, the data are transformed into tabular form with 54 features; the resulting training set has about 55 million rows.
### Training process
The training can be started by running the `trainer/task.py` script. By default, the script is in train mode. Other training-related
options are also present in `trainer/task.py` and can be viewed with the command `python -m trainer.task --help`. Training runs for `--num_epochs` epochs with a custom Estimator for the model. The model has a wide linear part and a deep feed-forward network, and the networks are built according to the default configuration.
Two separate optimizers are used to optimize the wide and the deep part of the network:
- FTRL (Follow The Regularized Leader) optimizer is used to optimize the wide part of the network.
- Proximal Adagrad optimizer is used to optimize the deep part of the network.
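For orientation, this is how the two optimizers would be wired into the canned TF1 estimator. This is a hedged sketch only: `trainer/task.py` uses a custom estimator, and the feature columns, learning rates, and regularization strengths here are illustrative.
```python
import tensorflow as tf

# placeholder feature columns; the real ones are built from trainer/features.py
wide_columns = [tf.feature_column.categorical_column_with_hash_bucket(
    'ad_id', hash_bucket_size=250000, dtype=tf.int64)]
deep_columns = [tf.feature_column.numeric_column('ad_views')]

model = tf.estimator.DNNLinearCombinedClassifier(
    model_dir='/outbrain/checkpoints',
    linear_feature_columns=wide_columns,
    linear_optimizer=tf.train.FtrlOptimizer(          # wide part: FTRL
        learning_rate=0.2,
        l1_regularization_strength=0.0,
        l2_regularization_strength=0.0),
    dnn_feature_columns=deep_columns,
    dnn_optimizer=tf.train.ProximalAdagradOptimizer(  # deep part: Proximal Adagrad
        learning_rate=0.05,
        l1_regularization_strength=0.0,
        l2_regularization_strength=0.0),
    dnn_hidden_units=[1024, 1024, 1024, 1024, 1024],
    dnn_dropout=0.0)
```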
The training log will contain information about:
- Loss value after every 100 steps.
- Training throughput if `--benchmark` option is selected.
- Evaluation metrics after every evaluation cycle at the end of every epoch.
Checkpoints are stored every `--save_checkpoints_steps` steps in the `--model_dir` location.
### Deploying the Wide & Deep model using Triton Inference Server
This repository does not contain code for deploying the model using Triton Inference Server. The details of such a deployment, together with the obtained performance numbers, are discussed in the [blog post](https://devblogs.nvidia.com/accelerating-wide-deep-recommender-inference-on-gpus/).
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training mode.
#### Training performance benchmark
We provide 6 scripts to benchmark the performance of training:
```bash
bash scripts/benchmark_training_fp32_1gpu.sh
bash scripts/benchmark_training_fp16_1gpu.sh
bash scripts/benchmark_training_fp32_4gpu.sh
bash scripts/benchmark_training_fp16_4gpu.sh
bash scripts/benchmark_training_fp32_8gpu.sh
bash scripts/benchmark_training_fp16_8gpu.sh
```
### Results
The following sections provide details on how we achieved our performance and
accuracy in training.
#### Training accuracy results
##### Training accuracy: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the benchmark scripts from the `scripts` directory in the TensorFlow NGC container on an NVIDIA DGX-1 with 8x V100 16G GPUs.
|**GPUs**|**Batch Size / GPU**|**Accuracy - FP32 (MAP@12)**|**Accuracy - Mixed precision (MAP@12)**|**Time to Train - FP32 (minutes)**|**Time to Train - Mixed precision (minutes)**|**Time to Train Speedup (FP32 to Mixed precision)**|
|-------:|-------------------:|----------------------------:|---------------------------------------:|-----------------------------------------------:|----------------------:|---------------------------------:|
| 1 | 131,072 | 0.67689 | 0.67542 | 546 | 414 | 1.32 |
| 4 | 32,768 | 0.67677 | 0.67647 | 78 | 66 | 1.18 |
| 8 | 16,384 | 0.67669 | 0.67594 | 30 | 24 | 1.25 |
To achieve the same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Training accuracy plots
![MAP12](img/map12_WnD.png)
##### Training stability test
The Wide and Deep model was trained for 72,951 training steps, starting
from 20 different initial random seeds. The training was performed in the 20.02-tf1-py3-stage NGC container on
NVIDIA DGX-1 with 8x V100 16G GPUs with mixed precision enabled.
After training, the models were evaluated on the test dataset. The following
table summarizes the final MAP@12 score on the test set.
|**Average MAP@12**|**Standard deviation**|**Minimum**|**Maximum**|
|---------------------:|---------------------:|----------:|----------:|
| 0.67594 | 0.00204 | 0.66906 | 0.67785 |
#### Training performance results
##### Training performance: NVIDIA DGX-1 (8x V100 16G)
Our results were obtained by running the `trainer/task.py` training script in the TensorFlow NGC container on an NVIDIA DGX-1 with 8x V100 16G GPUs. Performance numbers (in samples per second) were averaged over 50 training iterations. Improving multi-GPU scaling is planned; see [known issues](#known-issues).
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
|**GPUs**|**Batch Size / GPU**|**Throughput - FP32 (samples/s)**|**Throughput - Mixed precision (samples/s)**|**Throughput speedup (FP32 to Mixed precision)**|**Weak Scaling - FP32**|**Weak Scaling - Mixed precision**|
|-------:|-------------------:|----------------------------:|---------------------------------------:|-----------------------------------------------:|----------------------:|---------------------------------:|
| 1 | 131,072 | 167,875 | 221,550 | 1.320 | 1.000 | 1.000 |
| 4 | 131,072 | 485,242 | 547,683 | 1.129 | 2.472 | 2.890 |
| 8 | 131,072 | 655,665 | 688,481 | 1.050 | 3.108 | 3.906 |
## Release notes
### Changelog
March 2020
- Initial release
### Known issues
- Limited tf.feature_column support
- Limited scaling for multi-GPU because of inefficient handling of embedding operations (multiple memory transfers between CPU and GPU); work is in progress to run all of these operations on the GPU.


@ -0,0 +1,13 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@ -0,0 +1,155 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import datetime
import os
import random
import subprocess
import sys
from joblib import Parallel, delayed
import outbrain_transform
import path_constants
import tensorflow as tf
import glob
import pandas as pd
import trainer.features
def parse_arguments(argv):
  """Parse command line arguments.
  Args:
    argv: list of command line arguments including program name.
  Returns:
    The parsed arguments as returned by argparse.ArgumentParser.
  """
  parser = argparse.ArgumentParser(
      description='Runs Transformation on the Outbrain Click Prediction model data.')
  parser.add_argument(
      '--training_data',
      default='',
      help='Data to analyze and encode as training features.')
  parser.add_argument(
      '--eval_data',
      default='',
      help='Data to encode as evaluation features.')
  parser.add_argument(
      '--output_dir',
      default=None,
      required=True,
      help=('Google Cloud Storage or Local directory in which '
            'to place outputs.'))
  parser.add_argument('--batch_size', default=None, type=int, help='Size of batches to create.')
  parser.add_argument('--submission', default=False, action='store_true', help='Use real test set for submission')
  args, _ = parser.parse_known_args(args=argv[1:])
  return args
# a version of this method that prefers pandas methods
def local_transform_chunk(nr, csv, output_prefix, min_logs, max_logs, batch_size=None, remainder=None):
  # put any remainder at the front of the line, with the new rows after
  if remainder is not None:
    csv = remainder.append(csv)
  # for each batch, slice into the dataframe to retrieve the corresponding data
  print(str(datetime.datetime.now()) + '\tWriting rows...')
  num_rows = len(csv.index)
  with tf.python_io.TFRecordWriter(output_prefix + str(nr).zfill(3) + '.tfrecord') as writer:
    for start_ind in range(0, num_rows, batch_size if batch_size is not None else 1):  # for each batch
      if start_ind + batch_size - 1 > num_rows:  # if we'd run out of rows
        return csv.iloc[start_ind:]  # return the remainder for use with the next file
      # otherwise write this batch to TFRecord
      csv_slice = csv.iloc[start_ind:start_ind + (batch_size if batch_size is not None else 1)]
      example = outbrain_transform.create_tf_example(csv_slice, min_logs, max_logs)
      writer.write(example.SerializeToString())
# calculate min and max stats for the given dataframes all in one go
def compute_min_max_logs(dataframes):
  print(str(datetime.datetime.now()) + '\tComputing min and max')
  min_logs = {}
  max_logs = {}
  df = pd.concat(dataframes)  # concatenate all dataframes, to process at once
  for name in trainer.features.FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
    feature_series = df[name]
    min_logs[name + '_log_01scaled'] = outbrain_transform.log2_1p(feature_series.min(axis=0) * 1000)
    max_logs[name + '_log_01scaled'] = outbrain_transform.log2_1p(feature_series.max(axis=0) * 1000)
  for name in trainer.features.INT_COLUMNS:
    feature_series = df[name]
    min_logs[name + '_log_01scaled'] = outbrain_transform.log2_1p(feature_series.min(axis=0))
    max_logs[name + '_log_01scaled'] = outbrain_transform.log2_1p(feature_series.max(axis=0))
  return min_logs, max_logs
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)
  # Retrieve and sort training and eval data (to ensure consistent order).
  # Order is important so that the right data will get sorted together for MAP.
  training_data = sorted(glob.glob(args.training_data))
  eval_data = sorted(glob.glob(args.eval_data))
  print('Training data:\n{}\nFound:\n{}'.format(args.training_data, training_data))
  print('Evaluation data:\n{}\nFound:\n{}'.format(args.eval_data, eval_data))
  outbrain_transform.make_spec(args.output_dir + '/transformed_metadata', batch_size=args.batch_size)
  # read all dataframes
  print('\n' + str(datetime.datetime.now()) + '\tReading input files')
  eval_dataframes = [pd.read_csv(filename, header=None, names=outbrain_transform.CSV_ORDERED_COLUMNS)
                     for filename in eval_data]
  train_dataframes = [pd.read_csv(filename, header=None, names=outbrain_transform.CSV_ORDERED_COLUMNS)
                      for filename in training_data]
  # calculate stats once over all records given
  min_logs, max_logs = compute_min_max_logs(eval_dataframes + train_dataframes)
  if args.submission:
    train_output_string = '/sub_train_'
    eval_output_string = '/test_'
  else:
    train_output_string = '/train_'
    eval_output_string = '/eval_'
  # process eval files
  print('\n' + str(datetime.datetime.now()) + '\tWorking on evaluation data')
  eval_remainder = None  # remainder when a file's records don't divide evenly into batches
  for i, df in enumerate(eval_dataframes):
    print(eval_data[i])
    eval_remainder = local_transform_chunk(i, df, args.output_dir + eval_output_string, min_logs, max_logs,
                                           batch_size=args.batch_size, remainder=eval_remainder)
  if eval_remainder is not None:
    print('Dropping {} records (eval) on the floor'.format(len(eval_remainder)))
  # process train files
  print('\n' + str(datetime.datetime.now()) + '\tWorking on training data')
  train_remainder = None
  for i, df in enumerate(train_dataframes):
    print(training_data[i])
    train_remainder = local_transform_chunk(i, df, args.output_dir + train_output_string, min_logs, max_logs,
                                            batch_size=args.batch_size, remainder=train_remainder)
  if train_remainder is not None:
    print('Dropping {} records (train) on the floor'.format(len(train_remainder)))

if __name__ == '__main__':
  main()

Binary file not shown (new image, 60 KiB)


@ -0,0 +1,220 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import metadata_io
import numpy as np
import pandas as pd
from trainer.features import LABEL_COLUMN, DISPLAY_ID_COLUMN, AD_ID_COLUMN, IS_LEAK_COLUMN, DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN, CATEGORICAL_COLUMNS, DOC_CATEGORICAL_MULTIVALUED_COLUMNS, BOOL_COLUMNS, INT_COLUMNS, FLOAT_COLUMNS, FLOAT_COLUMNS_LOG_BIN_TRANSFORM, FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM
RENAME_COLUMNS = False
CSV_ORDERED_COLUMNS = ['label','display_id','ad_id','doc_id','doc_event_id','is_leak','event_weekend',
'user_has_already_viewed_doc','user_views','ad_views','doc_views',
'doc_event_days_since_published','doc_event_hour','doc_ad_days_since_published',
'pop_ad_id','pop_ad_id_conf',
'pop_ad_id_conf_multipl','pop_document_id','pop_document_id_conf',
'pop_document_id_conf_multipl','pop_publisher_id','pop_publisher_id_conf',
'pop_publisher_id_conf_multipl','pop_advertiser_id','pop_advertiser_id_conf',
'pop_advertiser_id_conf_multipl','pop_campain_id','pop_campain_id_conf',
'pop_campain_id_conf_multipl','pop_doc_event_doc_ad','pop_doc_event_doc_ad_conf',
'pop_doc_event_doc_ad_conf_multipl','pop_source_id','pop_source_id_conf',
'pop_source_id_conf_multipl','pop_source_id_country','pop_source_id_country_conf',
'pop_source_id_country_conf_multipl','pop_entity_id','pop_entity_id_conf',
'pop_entity_id_conf_multipl','pop_entity_id_country','pop_entity_id_country_conf',
'pop_entity_id_country_conf_multipl','pop_topic_id','pop_topic_id_conf',
'pop_topic_id_conf_multipl','pop_topic_id_country','pop_topic_id_country_conf',
'pop_topic_id_country_conf_multipl','pop_category_id','pop_category_id_conf',
'pop_category_id_conf_multipl','pop_category_id_country','pop_category_id_country_conf',
'pop_category_id_country_conf_multipl','user_doc_ad_sim_categories',
'user_doc_ad_sim_categories_conf','user_doc_ad_sim_categories_conf_multipl',
'user_doc_ad_sim_topics','user_doc_ad_sim_topics_conf','user_doc_ad_sim_topics_conf_multipl',
'user_doc_ad_sim_entities','user_doc_ad_sim_entities_conf','user_doc_ad_sim_entities_conf_multipl',
'doc_event_doc_ad_sim_categories','doc_event_doc_ad_sim_categories_conf',
'doc_event_doc_ad_sim_categories_conf_multipl','doc_event_doc_ad_sim_topics',
'doc_event_doc_ad_sim_topics_conf','doc_event_doc_ad_sim_topics_conf_multipl',
'doc_event_doc_ad_sim_entities','doc_event_doc_ad_sim_entities_conf',
'doc_event_doc_ad_sim_entities_conf_multipl','ad_advertiser','doc_ad_category_id_1',
'doc_ad_category_id_2','doc_ad_category_id_3','doc_ad_topic_id_1','doc_ad_topic_id_2',
'doc_ad_topic_id_3','doc_ad_entity_id_1','doc_ad_entity_id_2','doc_ad_entity_id_3',
'doc_ad_entity_id_4','doc_ad_entity_id_5','doc_ad_entity_id_6','doc_ad_publisher_id',
'doc_ad_source_id','doc_event_category_id_1','doc_event_category_id_2','doc_event_category_id_3',
'doc_event_topic_id_1','doc_event_topic_id_2','doc_event_topic_id_3','doc_event_entity_id_1',
'doc_event_entity_id_2','doc_event_entity_id_3','doc_event_entity_id_4','doc_event_entity_id_5',
'doc_event_entity_id_6','doc_event_publisher_id','doc_event_source_id','event_country',
'event_country_state','event_geo_location','event_hour','event_platform','traffic_source']
def make_spec(output_dir, batch_size=None):
  fixed_shape = [batch_size, 1] if batch_size is not None else []
  spec = {}
  spec[LABEL_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
  spec[DISPLAY_ID_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
  spec[IS_LEAK_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
  spec[DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
  for name in BOOL_COLUMNS:
    spec[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
  for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
    spec[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)
  for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
    spec[name + '_binned'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
  for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
    spec[name + '_binned'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
    spec[name + '_log_01scaled'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)
  for name in INT_COLUMNS:
    spec[name + '_log_int'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
    spec[name + '_log_01scaled'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)
  for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
    spec[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
  for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
    #spec[multi_category] = tf.VarLenFeature(dtype=tf.int64)
    shape = fixed_shape[:-1] + [len(DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category])]
    spec[multi_category] = tf.FixedLenFeature(shape=shape, dtype=tf.int64)
  metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(spec))
  metadata_io.write_metadata(metadata, output_dir)
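# Example usage (mirrors the call in the preprocessing driver; path illustrative):
#   make_spec('/outbrain/tfrecords/transformed_metadata', batch_size=4096)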
def make_input_schema(mode=tf.contrib.learn.ModeKeys.TRAIN, batch_size=None):
  """Input schema definition.
  Args:
    mode: tf.contrib.learn.ModeKeys specifying if the schema is being used for
      train/eval or prediction.
    batch_size: None if not explicitly batched (for FixedLenFeature size of []),
      otherwise the number of elements to assume will be grouped (size of [batch_size])
  Returns:
    A `Schema` object.
  """
  fixed_shape = [batch_size] if batch_size is not None else []
  result = {}
  result[LABEL_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64)
  result[DISPLAY_ID_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32)
  #result[AD_ID_COLUMN] = tf.VarLenFeature(dtype=tf.float32)
  result[IS_LEAK_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64)
  for name in BOOL_COLUMNS:
    #result[name] = tf.VarLenFeature(dtype=tf.int64)
    result[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=0)
  #TODO: Create dummy features that indicate whether any of the numeric features is null
  #(currently the default 0 value might introduce noise)
  for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
    result[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=0.0)
  for name in INT_COLUMNS:
    result[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=0.0)
  for name in CATEGORICAL_COLUMNS:
    result[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=0.0)
    #result[name] = tf.VarLenFeature(dtype=tf.float32)
  for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
    for category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category]:
      result[category] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=0.0)
      #result[category] = tf.VarLenFeature(dtype=tf.float32)
  return dataset_schema.from_feature_spec(result)
def tf_log2_1p(x):
  return tf.log1p(x) / tf.log(2.0)

def log2_1p(x):
  return np.log1p(x) / np.log(2.0)
def compute_min_max_logs(rows):
  min_logs = {}
  max_logs = {}
  for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + INT_COLUMNS:
    min_logs[name + '_log_01scaled'] = float("inf")
    max_logs[name + '_log_01scaled'] = float("-inf")
  for row in rows:
    names = CSV_ORDERED_COLUMNS
    columns_dict = dict(zip(names, row))
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
      nn = name + '_log_01scaled'
      min_logs[nn] = min(min_logs[nn], log2_1p(columns_dict[name] * 1000))
      max_logs[nn] = max(max_logs[nn], log2_1p(columns_dict[name] * 1000))
    for name in INT_COLUMNS:
      nn = name + '_log_01scaled'
      min_logs[nn] = min(min_logs[nn], log2_1p(columns_dict[name]))
      max_logs[nn] = max(max_logs[nn], log2_1p(columns_dict[name]))
  return min_logs, max_logs
def scale_to_0_1(val, minv, maxv):
  return (val - minv) / (maxv - minv)
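# Worked example (illustrative values) of the log-and-scale pipeline used by
# create_tf_example below: a raw count of 7 maps to log2(1 + 7) = 3.0 and is
# then rescaled to [0, 1] using dataset-wide min/max log statistics.
def _demo_log_scale():
  min_log, max_log = 0.0, 12.0  # assumed dataset-wide stats for this feature
  return scale_to_0_1(log2_1p(7), min_log, max_log)  # == 3.0 / 12.0 == 0.25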
def create_tf_example(df, min_logs, max_logs):
  names = CSV_ORDERED_COLUMNS
  #columns_dict = dict(zip(names, row))
  result = {}
  result[LABEL_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[LABEL_COLUMN].to_list()))
  result[DISPLAY_ID_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[DISPLAY_ID_COLUMN].to_list()))
  result[IS_LEAK_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[IS_LEAK_COLUMN].to_list()))
  # encode display_id * 10 + (0 if is_leak < 0 else is_leak) in one vectorized pass
  encoded_value = df[DISPLAY_ID_COLUMN].multiply(10).add(df[IS_LEAK_COLUMN].clip(lower=0)).to_list()
  result[DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=encoded_value))
  for name in FLOAT_COLUMNS:
    result[name] = tf.train.Feature(float_list=tf.train.FloatList(value=df[name].to_list()))
  for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
    # equivalent to int(columns_dict[name] * 10) per row
    value = df[name].multiply(10).astype('int64').to_list()
    result[name + '_binned'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
  for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
    # equivalent to int(log2_1p(columns_dict[name] * 1000)) per row
    value_prelim = df[name].multiply(1000).apply(np.log1p).multiply(1. / np.log(2.0))
    value = value_prelim.astype('int64').to_list()
    result[name + '_binned'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
    nn = name + '_log_01scaled'
    # equivalent to scale_to_0_1(log2_1p(columns_dict[name] * 1000), min_logs[nn], max_logs[nn])
    value = value_prelim.add(-min_logs[nn]).multiply(1. / (max_logs[nn] - min_logs[nn])).to_list()
    result[nn] = tf.train.Feature(float_list=tf.train.FloatList(value=value))
  for name in INT_COLUMNS:
    # equivalent to int(log2_1p(columns_dict[name])) per row
    value_prelim = df[name].apply(np.log1p).multiply(1. / np.log(2.0))
    value = value_prelim.astype('int64').to_list()
    result[name + '_log_int'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
    nn = name + '_log_01scaled'
    # equivalent to scale_to_0_1(log2_1p(columns_dict[name]), min_logs[nn], max_logs[nn])
    value = value_prelim.add(-min_logs[nn]).multiply(1. / (max_logs[nn] - min_logs[nn])).to_list()
    result[nn] = tf.train.Feature(float_list=tf.train.FloatList(value=value))
  for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
    result[name] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[name].to_list()))
  for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
    values = []
    for category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category]:
      values = values + [df[category].to_numpy()]
    # need to transpose the series so they will be parsed correctly by the FixedLenFeature;
    # we can pass in a single series here; they'll be reshaped to [batch_size, num_values]
    # when parsed from the TFRecord
    value = np.stack(values, axis=1).flatten().tolist()
    result[multi_category] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
  tf_example = tf.train.Example(features=tf.train.Features(feature=result))
  return tf_example
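# Hedged round-trip sketch: each prebatched tf.train.Example written above
# holds `prebatch_size` rows per feature and can be parsed back with the spec
# declared by make_spec (fixed shape [prebatch_size, 1] per scalar feature).
def _parse_prebatched(serialized_example, spec):
  # returns a dict of dense tensors shaped [prebatch_size, 1], or
  # [prebatch_size, k] for the multivalued columns
  return tf.parse_single_example(serialized_example, features=spec)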


@ -0,0 +1,47 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""File paths for the Criteo Classification pipeline.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
TEMP_DIR = 'tmp'
TRANSFORM_FN_DIR = 'transform_fn'
RAW_METADATA_DIR = 'raw_metadata'
TRANSFORMED_METADATA_DIR = 'transformed_metadata'
TRANSFORMED_TRAIN_DATA_FILE_PREFIX = 'features_train'
TRANSFORMED_EVAL_DATA_FILE_PREFIX = 'features_eval'
TRANSFORMED_PREDICT_DATA_FILE_PREFIX = 'features_predict'
TRAIN_RESULTS_FILE = 'train_results'
DEPLOY_SAVED_MODEL_DIR = 'saved_model'
MODEL_EVALUATIONS_FILE = 'model_evaluations'
BATCH_PREDICTION_RESULTS_FILE = 'batch_prediction_results'

View file

@@ -0,0 +1,88 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import pandas as pd
import os
import glob
import tqdm
import argparse
from joblib import Parallel, delayed
parser = argparse.ArgumentParser()
parser.add_argument('--train_files_pattern', default='train_feature_vectors_integral_eval.csv/part-*')
parser.add_argument('--valid_files_pattern', default='validation_feature_vectors_integral.csv/part-*')
parser.add_argument('--train_dst_dir', default='train_feature_vectors_integral_eval_imputed.csv')
parser.add_argument('--valid_dst_dir', default='validation_feature_vectors_integral_imputed.csv')
parser.add_argument('--header_path', default='train_feature_vectors_integral_eval.csv.header')
parser.add_argument('--num_workers', type=int, default=4)
args = parser.parse_args()
header = pd.read_csv(args.header_path, header=None)
columns = header[0].to_list()
train_files = glob.glob(args.train_files_pattern)
print('train files: ', train_files)
def get_counts(f):
df = pd.read_csv(f, header=None, dtype=object, names=columns, na_values='None')
counts = {}
for c in df:
counts[c] = df[c].value_counts()
return counts
all_counts = Parallel(n_jobs=args.num_workers)(delayed(get_counts)(f) for f in train_files)
cols = len(all_counts[0])
imputation_dict = {}
for c in tqdm.tqdm(columns):
    temp = None
    for i in range(len(all_counts)):
        if temp is None:
            temp = pd.Series(all_counts[i][c])
        else:
            # align on the union of categories; fill_value=0 keeps categories
            # missing from one part from becoming NaN
            temp = temp.add(pd.Series(all_counts[i][c]), fill_value=0)
    if temp is None or len(temp) == 0:
        imputation_dict[c] = 0
    else:
        # impute with the most frequent value across all training parts
        imputation_dict[c] = temp.idxmax()
print('imputation_dict: ', imputation_dict)
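# For example (editor's illustration):
#   pd.Series({'a': 3}).add(pd.Series({'a': 1, 'b': 5}), fill_value=0).idxmax()  # -> 'b'
# whereas a plain `+` would turn 'b' into NaN through index alignment.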
if not os.path.exists(args.train_dst_dir):
os.mkdir(args.train_dst_dir)
def impute_part(src_path, dst_dir):
print('imputing: ', src_path, ' to: ', dst_dir)
filename = os.path.basename(src_path)
dst_path = os.path.join(dst_dir, filename)
df = pd.read_csv(src_path, header=None, dtype=object, names=columns, na_values='None')
df2 = df.fillna(imputation_dict)
df2.to_csv(dst_path, header=None, index=None)
print('launching imputation for train CSVs')
Parallel(n_jobs=args.num_workers)(delayed(impute_part)(f, args.train_dst_dir) for f in train_files)
valid_files = glob.glob(args.valid_files_pattern)
if not os.path.exists(args.valid_dst_dir):
os.mkdir(args.valid_dst_dir)
print('launching imputation for validation CSVs')
Parallel(n_jobs=args.num_workers)(delayed(impute_part)(f, args.valid_dst_dir) for f in valid_files)
print('Done!')

View file

@@ -0,0 +1,13 @@
state_abb,utc_dst_time_offset_cleaned
AB,-6.0
BC,-7.0
MB,-5.0
NB,-3.0
NL,-3.0
NS,-3.0
NU,-5.0
ON,-4.0
PE,-3.0
QC,-4.0
SK,-6.0
YT,-7.0

View file

@@ -0,0 +1,247 @@
country_code,utc_dst_time_offset_cleaned
AX,3.0
AF,4.5
AL,2.0
DZ,1.0
AD,2.0
AO,1.0
AI,-4.0
AG,-4.0
AR,-3.0
AM,4.0
AW,-4.0
AU,10.0
AT,2.0
AZ,4.0
BS,-4.0
BH,3.0
BD,6.0
BB,-4.0
BY,3.0
BE,2.0
BZ,-6.0
BJ,1.0
BM,-3.0
BT,6.0
BO,-4.0
BA,2.0
BW,2.0
BR,-3.0
IO,6.0
BN,8.0
BG,3.0
BF,0.0
BI,2.0
KH,7.0
CM,1.0
CA,-5.0
BQ,-5.0
KY,-5.0
CF,1.0
TD,1.0
CL,-3.0
CN,8.0
CX,7.0
CC,6.5
CO,-5.0
KM,3.0
CD,1.0
CG,1.0
CK,-10.0
CR,-6.0
CI,0.0
HR,2.0
CW,-4.0
CY,3.0
CZ,2.0
DK,2.0
DJ,3.0
DM,-4.0
DO,-4.0
TL,9.0
EC,-5.0
EG,2.0
SV,-6.0
GQ,1.0
ER,3.0
EE,3.0
ET,3.0
FK,-3.0
FO,1.0
FJ,12.0
FI,3.0
FR,2.0
GF,-3.0
PF,-10.0
GA,1.0
GM,0.0
GE,4.0
DE,2.0
GH,0.0
GI,2.0
GR,3.0
GL,-2.0
GD,-4.0
GP,-4.0
GU,10.0
GT,-6.0
GG,1.0
GN,0.0
GW,0.0
GY,-4.0
HT,-5.0
HN,-6.0
HK,8.0
HU,2.0
IS,0.0
IN,5.5
ID,8.0
IR,4.5
IQ,3.0
IE,1.0
IM,1.0
IL,3.0
IT,2.0
JM,-5.0
JP,9.0
JE,1.0
JO,3.0
KZ,5.0
KE,3.0
KI,13.0
KP,-4.0
KR,-4.0
KP,8.5
KR,8.5
KP,9.0
KR,9.0
KW,3.0
KG,6.0
LA,7.0
LV,3.0
LB,3.0
LS,2.0
LR,0.0
LY,2.0
LI,2.0
LT,3.0
LU,2.0
MO,8.0
MK,2.0
MG,3.0
MW,2.0
MY,8.0
MV,5.0
ML,0.0
MT,2.0
MH,12.0
MQ,-4.0
MR,0.0
MU,4.0
YT,3.0
MX,-5.0
FM,10.0
MD,3.0
MC,2.0
MN,9.0
ME,2.0
MS,-4.0
MA,1.0
MZ,2.0
MM,6.5
NA,1.0
NR,12.0
NP,5.0
NL,2.0
NC,11.0
NZ,12.0
NI,-6.0
NE,1.0
NG,1.0
NU,-11.0
NF,11.0
MP,10.0
NO,2.0
OM,4.0
PK,5.0
PW,9.0
PS,3.0
PA,-5.0
PG,10.0
PY,-4.0
PE,-5.0
PH,8.0
PN,-8.0
PL,2.0
PT,1.0
PR,-4.0
QA,3.0
RE,4.0
RO,3.0
RU,7.0
RW,2.0
BL,-4.0
AS,-11.0
WS,-11.0
AS,13.0
WS,13.0
SM,2.0
ST,0.0
SA,3.0
SN,0.0
RS,2.0
SC,4.0
SL,0.0
SG,8.0
SK,2.0
SI,2.0
SB,11.0
SO,3.0
ZA,2.0
GS,-2.0
SS,3.0
ES,2.0
LK,5.5
SH,0.0
KN,-4.0
SX,-4.0
MF,-4.0
SD,3.0
SR,-3.0
SJ,2.0
SZ,2.0
SE,2.0
CH,2.0
SY,3.0
TW,8.0
TJ,5.0
TZ,3.0
TH,7.0
TG,0.0
TK,13.0
TO,13.0
TT,-4.0
TN,1.0
TR,3.0
TM,5.0
TC,-4.0
TV,12.0
UG,3.0
UA,3.0
AE,4.0
GB,1.0
US,-7.0
UY,-3.0
UZ,5.0
VU,11.0
VA,2.0
VE,-4.0
VN,7.0
VG,-4.0
VI,-4.0
VG,-4.0
VI,-4.0
WF,12.0
YE,3.0
ZM,2.0
ZW,2.0

View file

@@ -0,0 +1,52 @@
state_abb,utc_dst_time_offset_cleaned
AL,-5.0
AK,-8.0
AZ,-7.0
AR,-5.0
CA,-7.0
CO,-6.0
CT,-4.0
DE,-4.0
DC,-4.0
FL,-4.0
GA,-4.0
HI,-10.0
ID,-6.0
IL,-5.0
IN,-4.0
IA,-5.0
KS,-5.0
KY,-4.0
LA,-5.0
ME,-4.0
MD,-4.0
MA,-4.0
MI,-4.0
MN,-5.0
MS,-5.0
MO,-5.0
MT,-6.0
NE,-5.0
NV,-7.0
NH,-4.0
NJ,-4.0
NM,-6.0
NY,-4.0
NC,-4.0
ND,-5.0
OH,-4.0
OK,-5.0
OR,-7.0
PA,-4.0
RI,-4.0
SC,-4.0
SD,-5.0
TN,-5.0
TX,-5.0
UT,-6.0
VT,-4.0
VA,-4.0
WA,-7.0
WV,-4.0
WI,-5.0
WY,-6.0

View file

@@ -0,0 +1,107 @@
#!/usr/bin/env python
# coding: utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
DATA_BUCKET_FOLDER = "/outbrain/orig/"
SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.context import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
conf = SparkConf().setMaster('local[*]').set('spark.executor.memory', '256g').set('spark.driver.memory', '126g').set("spark.local.dir", SPARK_TEMP_FOLDER)
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
print('Loading data...')
truncate_day_from_timestamp_udf = F.udf(lambda ts: int(ts / 1000 / 60 / 60 / 24), IntegerType())
events_schema = StructType(
[StructField("display_id", IntegerType(), True),
StructField("uuid_event", StringType(), True),
StructField("document_id_event", IntegerType(), True),
StructField("timestamp_event", IntegerType(), True),
StructField("platform_event", IntegerType(), True),
StructField("geo_location_event", StringType(), True)]
)
events_df = spark.read.schema(events_schema) \
.options(header='true', inferschema='false', nullValue='\\N') \
.csv(DATA_BUCKET_FOLDER + "events.csv") \
.withColumn('day_event', truncate_day_from_timestamp_udf('timestamp_event')) \
.alias('events')
events_df.count()
print('Drop rows with empty "geo_location"...')
events_df = events_df.dropna(subset="geo_location_event")
events_df.count()
print('Drop rows with empty "platform"...')
events_df = events_df.dropna(subset="platform_event")
events_df.count()
promoted_content_schema = StructType(
[StructField("ad_id", IntegerType(), True),
StructField("document_id_promo", IntegerType(), True),
StructField("campaign_id", IntegerType(), True),
StructField("advertiser_id", IntegerType(), True)]
)
promoted_content_df = spark.read.schema(promoted_content_schema) \
.options(header='true', inferschema='false', nullValue='\\N') \
.csv(DATA_BUCKET_FOLDER+"promoted_content.csv") \
.alias('promoted_content')
clicks_train_schema = StructType(
[StructField("display_id", IntegerType(), True),
StructField("ad_id", IntegerType(), True),
StructField("clicked", IntegerType(), True)]
)
clicks_train_df = spark.read.schema(clicks_train_schema) \
.options(header='true', inferschema='false', nullValue='\\N') \
.csv(DATA_BUCKET_FOLDER+"clicks_train.csv") \
.alias('clicks_train')
clicks_train_joined_df = clicks_train_df \
.join(promoted_content_df, on='ad_id', how='left') \
.join(events_df, on='display_id', how='left')
clicks_train_joined_df.createOrReplaceTempView('clicks_train_joined')
validation_display_ids_df = clicks_train_joined_df.select('display_id','day_event') \
.distinct() \
.sampleBy("day_event", fractions={0: 0.2, 1: 0.2, 2: 0.2, 3: 0.2, 4: 0.2, \
5: 0.2, 6: 0.2, 7: 0.2, 8: 0.2, 9: 0.2, 10: 0.2, 11: 1.0, 12: 1.0}, seed=0)
validation_display_ids_df.createOrReplaceTempView("validation_display_ids")
validation_set_df = spark.sql('''SELECT display_id, ad_id, uuid_event, day_event,
timestamp_event, document_id_promo, platform_event, geo_location_event
FROM clicks_train_joined t
WHERE EXISTS (SELECT display_id FROM validation_display_ids
WHERE display_id = t.display_id)''')
validation_set_gcs_output = "validation_set.parquet"
validation_set_df.write.parquet(OUTPUT_BUCKET_FOLDER+validation_set_gcs_output, mode='overwrite')
print(validation_set_df.take(5))
spark.stop()
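As a quick sanity check on the stratified split above (a 20% sample of days 0-10 and all of days 11-12), the written parquet can be inspected in a fresh session, for example:

# editor's sketch, illustrative only
check_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER + "validation_set.parquet")
check_df.groupBy('day_event').count().orderBy('day_event').show()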

View file

@@ -0,0 +1,414 @@
#!/usr/bin/env python
# coding: utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
DATA_BUCKET_FOLDER = "/outbrain/orig/"
SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"
from IPython.display import display
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import DataFrameWriter
import numpy as np
import math
import datetime
import time
import random
random.seed(42)
from pyspark.context import SparkContext, SparkConf
from pyspark.sql.session import SparkSession
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
'--submission',
action='store_true',
default=False
)
args = parser.parse_args()
evaluation = not args.submission
conf = SparkConf().setMaster('local[*]').set('spark.executor.memory', '256g').set('spark.driver.memory', '126g').set("spark.local.dir", SPARK_TEMP_FOLDER)
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
start_time = time.time()
print('Loading data...')
truncate_day_from_timestamp_udf = F.udf(lambda ts: int(ts / 1000 / 60 / 60 / 24), IntegerType())
extract_country_udf = F.udf(lambda geo: geo.strip()[:2] if geo is not None else '', StringType())
documents_meta_schema = StructType(
[StructField("document_id_doc", IntegerType(), True),
StructField("source_id", IntegerType(), True),
StructField("publisher_id", IntegerType(), True),
StructField("publish_time", TimestampType(), True)]
)
documents_meta_df = spark.read.schema(documents_meta_schema) \
.options(header='true', inferschema='false', nullValue='\\N') \
.csv(DATA_BUCKET_FOLDER+"documents_meta.csv") \
.withColumn('dummyDocumentsMeta', F.lit(1)).alias('documents_meta')
documents_meta_df.count()
print('Drop rows with empty "source_id"...')
documents_meta_df = documents_meta_df.dropna(subset="source_id")
documents_meta_df.count()
source_publishers_df = documents_meta_df.select(["source_id", "publisher_id"]).dropDuplicates()
source_publishers_df.count()
print('Get list of source_ids without publisher_id...')
rows_no_pub = source_publishers_df.filter("publisher_id is NULL")
source_ids_without_publisher = [row['source_id'] for row in rows_no_pub.collect()]
len(source_ids_without_publisher)
print('Maximum value of publisher_id used so far...')
max_pub = max(source_publishers_df.select(["publisher_id"]).dropna().collect())['publisher_id']
max_pub
print('Rows filled with new publisher_ids')
new_publishers = [(source, max_pub + 1 + nr) for nr, source in enumerate(source_ids_without_publisher)]
new_publishers_df = spark.createDataFrame(new_publishers, ("source_id", "publisher_id"))
new_publishers_df.take(10)
# old and new publishers merged
fixed_source_publishers_df = source_publishers_df.dropna().union(new_publishers_df)
fixed_source_publishers_df.collect()[-30:]
print('Update documents_meta with new publishers...')
documents_meta_df = documents_meta_df.drop('publisher_id').join(fixed_source_publishers_df, on='source_id')
documents_meta_df.count()
documents_categories_schema = StructType(
[StructField("document_id_cat", IntegerType(), True),
StructField("category_id", IntegerType(), True),
StructField("confidence_level_cat", FloatType(), True)]
)
documents_categories_df = spark.read.schema(documents_categories_schema) \
.options(header='true', inferschema='false', nullValue='\\N') \
.csv(DATA_BUCKET_FOLDER+"documents_categories.csv") \
.alias('documents_categories')
documents_categories_grouped_df = documents_categories_df.groupBy('document_id_cat') \
.agg(F.collect_list('category_id').alias('category_id_list'),
F.collect_list('confidence_level_cat').alias('cat_confidence_level_list')) \
.withColumn('dummyDocumentsCategory', F.lit(1)) \
.alias('documents_categories_grouped')
documents_topics_schema = StructType(
[StructField("document_id_top", IntegerType(), True),
StructField("topic_id", IntegerType(), True),
StructField("confidence_level_top", FloatType(), True)]
)
documents_topics_df = spark.read.schema(documents_topics_schema) \
.options(header='true', inferschema='false', nullValue='\\N') \
.csv(DATA_BUCKET_FOLDER+"documents_topics.csv") \
.alias('documents_topics')
documents_topics_grouped_df = documents_topics_df.groupBy('document_id_top') \
.agg(F.collect_list('topic_id').alias('topic_id_list'),
F.collect_list('confidence_level_top').alias('top_confidence_level_list')) \
.withColumn('dummyDocumentsTopics', F.lit(1)) \
.alias('documents_topics_grouped')
documents_entities_schema = StructType(
[StructField("document_id_ent", IntegerType(), True),
StructField("entity_id", StringType(), True),
StructField("confidence_level_ent", FloatType(), True)]
)
documents_entities_df = spark.read.schema(documents_entities_schema) \
.options(header='true', inferschema='false', nullValue='\\N') \
.csv(DATA_BUCKET_FOLDER+"documents_entities.csv") \
.alias('documents_entities')
documents_entities_grouped_df = documents_entities_df.groupBy('document_id_ent') \
.agg(F.collect_list('entity_id').alias('entity_id_list'),
F.collect_list('confidence_level_ent').alias('ent_confidence_level_list')) \
.withColumn('dummyDocumentsEntities', F.lit(1)) \
.alias('documents_entities_grouped')
documents_df = documents_meta_df.join(
documents_categories_grouped_df,
on=F.col("document_id_doc") == F.col("documents_categories_grouped.document_id_cat"),
how='left') \
.join(documents_topics_grouped_df,
on=F.col("document_id_doc") == F.col("documents_topics_grouped.document_id_top"),
how='left') \
.join(documents_entities_grouped_df,
on=F.col("document_id_doc") == F.col("documents_entities_grouped.document_id_ent"),
how='left') \
.cache()
documents_df.count()
if evaluation:
validation_set_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+"validation_set.parquet") \
.alias('validation_set')
validation_set_df.select('uuid_event').distinct().createOrReplaceTempView('users_to_profile')
validation_set_df.select('uuid_event','document_id_promo').distinct() \
.createOrReplaceTempView('validation_users_docs_to_ignore')
else:
events_schema = StructType(
[StructField("display_id", IntegerType(), True),
StructField("uuid_event", StringType(), True),
StructField("document_id_event", IntegerType(), True),
StructField("timestamp_event", IntegerType(), True),
StructField("platform_event", IntegerType(), True),
StructField("geo_location_event", StringType(), True)]
)
events_df = spark.read.schema(events_schema) \
.options(header='true', inferschema='false', nullValue='\\N') \
.csv(DATA_BUCKET_FOLDER+"events.csv") \
.withColumn('dummyEvents', F.lit(1)) \
.withColumn('day_event', truncate_day_from_timestamp_udf('timestamp_event')) \
.withColumn('event_country', extract_country_udf('geo_location_event')) \
.alias('events')
# Drop rows with empty "geo_location"
events_df = events_df.dropna(subset="geo_location_event")
# Drop rows with empty "platform"
events_df = events_df.dropna(subset="platform_event")
events_df.createOrReplaceTempView('events')
promoted_content_schema = StructType(
[StructField("ad_id", IntegerType(), True),
StructField("document_id_promo", IntegerType(), True),
StructField("campaign_id", IntegerType(), True),
StructField("advertiser_id", IntegerType(), True)]
)
promoted_content_df = spark.read.schema(promoted_content_schema) \
.options(header='true', inferschema='false', nullValue='\\N') \
.csv(DATA_BUCKET_FOLDER+"promoted_content.csv") \
.withColumn('dummyPromotedContent', F.lit(1)).alias('promoted_content')
clicks_test_schema = StructType(
[StructField("display_id", IntegerType(), True),
StructField("ad_id", IntegerType(), True)]
)
clicks_test_df = spark.read.schema(clicks_test_schema) \
.options(header='true', inferschema='false', nullValue='\\N') \
.csv(DATA_BUCKET_FOLDER+"clicks_test.csv") \
.withColumn('dummyClicksTest', F.lit(1)).alias('clicks_test')
test_set_df = clicks_test_df.join(promoted_content_df, on='ad_id', how='left') \
.join(events_df, on='display_id', how='left')
test_set_df.select('uuid_event').distinct().createOrReplaceTempView('users_to_profile')
test_set_df.select('uuid_event','document_id_promo', 'timestamp_event').distinct() \
.createOrReplaceTempView('test_users_docs_timestamp_to_ignore')
page_views_schema = StructType(
[StructField("uuid_pv", StringType(), True),
StructField("document_id_pv", IntegerType(), True),
StructField("timestamp_pv", IntegerType(), True),
StructField("platform_pv", IntegerType(), True),
StructField("geo_location_pv", StringType(), True),
StructField("traffic_source_pv", IntegerType(), True)]
)
page_views_df = spark.read.schema(page_views_schema) \
.options(header='true', inferschema='false', nullValue='\\N') \
.csv(DATA_BUCKET_FOLDER+"page_views.csv") \
.alias('page_views')
page_views_df.createOrReplaceTempView('page_views')
additional_filter = ''
if evaluation:
additional_filter = '''
AND NOT EXISTS (SELECT uuid_event FROM validation_users_docs_to_ignore
WHERE uuid_event = p.uuid_pv
AND document_id_promo = p.document_id_pv)
'''
else:
additional_filter = '''
AND NOT EXISTS (SELECT uuid_event FROM test_users_docs_timestamp_to_ignore
WHERE uuid_event = p.uuid_pv
AND document_id_promo = p.document_id_pv
AND p.timestamp_pv >= timestamp_event)
'''
page_views_train_df = spark.sql('''
SELECT * FROM page_views p
WHERE EXISTS (SELECT uuid_event FROM users_to_profile
WHERE uuid_event = p.uuid_pv)
''' + additional_filter).alias('views') \
.join(documents_df, on=F.col("document_id_pv") == F.col("document_id_doc"), how='left') \
.filter('dummyDocumentsEntities is not null OR dummyDocumentsTopics is not null OR dummyDocumentsCategory is not null')
print('Processing document frequencies...')
import pickle
documents_total = documents_meta_df.count()
documents_total
categories_docs_counts = documents_categories_df.groupBy('category_id').count().rdd.collectAsMap()
len(categories_docs_counts)
df_filenames_suffix = ''
if evaluation:
df_filenames_suffix = '_eval'
with open(OUTPUT_BUCKET_FOLDER+'categories_docs_counts'+df_filenames_suffix+'.pickle', 'wb') as output:
pickle.dump(categories_docs_counts, output)
topics_docs_counts = documents_topics_df.groupBy('topic_id').count().rdd.collectAsMap()
len(topics_docs_counts)
with open(OUTPUT_BUCKET_FOLDER+'topics_docs_counts'+df_filenames_suffix+'.pickle', 'wb') as output:
pickle.dump(topics_docs_counts, output)
entities_docs_counts = documents_entities_df.groupBy('entity_id').count().rdd.collectAsMap()
len(entities_docs_counts)
with open(OUTPUT_BUCKET_FOLDER+'entities_docs_counts'+df_filenames_suffix+'.pickle', 'wb') as output:
pickle.dump(entities_docs_counts, output)
print('Processing user profiles...')
int_null_to_minus_one_udf = F.udf(lambda x: x if x is not None else -1, IntegerType())
int_list_null_to_empty_list_udf = F.udf(lambda x: x if x is not None else [], ArrayType(IntegerType()))
float_list_null_to_empty_list_udf = F.udf(lambda x: x if x is not None else [], ArrayType(FloatType()))
str_list_null_to_empty_list_udf = F.udf(lambda x: x if x is not None else [], ArrayType(StringType()))
page_views_by_user_df = page_views_train_df \
.select(
'uuid_pv',
'document_id_pv',
int_null_to_minus_one_udf('timestamp_pv').alias('timestamp_pv'),
int_list_null_to_empty_list_udf('category_id_list').alias('category_id_list'),
float_list_null_to_empty_list_udf('cat_confidence_level_list').alias('cat_confidence_level_list'),
int_list_null_to_empty_list_udf('topic_id_list').alias('topic_id_list'),
float_list_null_to_empty_list_udf('top_confidence_level_list').alias('top_confidence_level_list'),
str_list_null_to_empty_list_udf('entity_id_list').alias('entity_id_list'),
float_list_null_to_empty_list_udf('ent_confidence_level_list').alias('ent_confidence_level_list')) \
.groupBy('uuid_pv') \
.agg(F.collect_list('document_id_pv').alias('document_id_pv_list'),
F.collect_list('timestamp_pv').alias('timestamp_pv_list'),
F.collect_list('category_id_list').alias('category_id_lists'),
F.collect_list('cat_confidence_level_list').alias('cat_confidence_level_lists'),
F.collect_list('topic_id_list').alias('topic_id_lists'),
F.collect_list('top_confidence_level_list').alias('top_confidence_level_lists'),
F.collect_list('entity_id_list').alias('entity_id_lists'),
F.collect_list('ent_confidence_level_list').alias('ent_confidence_level_lists'))
from collections import defaultdict
def get_user_aspects(docs_aspects, aspect_docs_counts):
docs_aspects_merged_lists = defaultdict(list)
for doc_aspects in docs_aspects:
for key in doc_aspects.keys():
docs_aspects_merged_lists[key].append(doc_aspects[key])
docs_aspects_stats = {}
for key in docs_aspects_merged_lists.keys():
aspect_list = docs_aspects_merged_lists[key]
tf = len(aspect_list)
idf = math.log(documents_total / float(aspect_docs_counts[key]))
confid_mean = sum(aspect_list) / float(len(aspect_list))
docs_aspects_stats[key] = [tf*idf, confid_mean]
return docs_aspects_stats
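# Editor's illustration: with documents_total == 100 and an aspect occurring in
# 10 documents (idf = ln(100 / 10) ~= 2.303), two page views with confidences
# 0.4 and 0.6 give tf == 2 and confid_mean == 0.5, so
# docs_aspects_stats[key] == [4.605..., 0.5], i.e. a TF-IDF weight plus a mean confidence.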
def generate_user_profile(docs_aspects_list, docs_aspects_confidence_list, aspect_docs_counts):
docs_aspects = []
for doc_aspects_list, doc_aspects_confidence_list in zip(docs_aspects_list, docs_aspects_confidence_list):
doc_aspects = dict(zip(doc_aspects_list, doc_aspects_confidence_list))
docs_aspects.append(doc_aspects)
user_aspects = get_user_aspects(docs_aspects, aspect_docs_counts)
return user_aspects
get_list_len_udf = F.udf(lambda docs_list: len(docs_list), IntegerType())
generate_categories_user_profile_map_udf = F.udf(
lambda docs_aspects_list, docs_aspects_confidence_list: \
generate_user_profile(docs_aspects_list,
docs_aspects_confidence_list,
categories_docs_counts),
MapType(IntegerType(), ArrayType(FloatType()), False))
generate_topics_user_profile_map_udf = F.udf(
lambda docs_aspects_list, docs_aspects_confidence_list: \
generate_user_profile(docs_aspects_list,
docs_aspects_confidence_list,
topics_docs_counts),
MapType(IntegerType(), ArrayType(FloatType()), False))
generate_entities_user_profile_map_udf = F.udf(
lambda docs_aspects_list, docs_aspects_confidence_list: \
generate_user_profile(docs_aspects_list,
docs_aspects_confidence_list,
entities_docs_counts),
MapType(StringType(), ArrayType(FloatType()), False))
users_profile_df = page_views_by_user_df \
.withColumn('views', get_list_len_udf('document_id_pv_list')) \
.withColumn('categories', generate_categories_user_profile_map_udf('category_id_lists',
'cat_confidence_level_lists')) \
.withColumn('topics', generate_topics_user_profile_map_udf('topic_id_lists',
'top_confidence_level_lists')) \
.withColumn('entities', generate_entities_user_profile_map_udf('entity_id_lists',
'ent_confidence_level_lists')) \
.select(
F.col('uuid_pv').alias('uuid'),
F.col('document_id_pv_list').alias('doc_ids'),
'views', 'categories', 'topics', 'entities')
if evaluation:
table_name = 'user_profiles_eval'
else:
table_name = 'user_profiles'
users_profile_df.write.parquet(OUTPUT_BUCKET_FOLDER+table_name, mode='overwrite')
finish_time = time.time()
print("Elapsed min: ", (finish_time-start_time)/60)
spark.stop()

File diff suppressed because it is too large

View file

@@ -0,0 +1,24 @@
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
input=$1
output=$2
mkdir -p ${output}
for f in ${input}/*
do
filename=${f##*/}
sort -n -k2 -t ',' $f > ${output}/${filename}
done
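# Illustrative invocation (matching how the preprocessing driver script below calls it):
#   preproc/sort_csv.sh ${LOCAL_DATA_DIR}/${VALID_IMPUTED_DIR} ${LOCAL_DATA_DIR}/${VALID_IMPUTED_DIR}_sorted
# Each part file is sorted numerically on its second comma-separated field.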

View file

@@ -0,0 +1,20 @@
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -x
set -e
python -m trainer.task --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark --amp

View file

@@ -0,0 +1,20 @@
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -x
set -e
mpiexec --allow-run-as-root --bind-to socket -np 4 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark --amp

View file

@@ -0,0 +1,20 @@
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -x
set -e
mpiexec --allow-run-as-root --bind-to socket -np 8 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark --amp

View file

@@ -0,0 +1,20 @@
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -x
set -e
python -m trainer.task --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark

View file

@@ -0,0 +1,20 @@
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -x
set -e
mpiexec --allow-run-as-root --bind-to socket -np 4 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark

View file

@@ -0,0 +1,20 @@
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -x
set -e
mpiexec --allow-run-as-root --bind-to socket -np 8 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark

View file

@@ -0,0 +1,54 @@
#!/bin/bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if [ $# -ge 1 ]
then
PREBATCH_SIZE=$1
else
PREBATCH_SIZE=4096
fi
python preproc/preproc1.py
python preproc/preproc2.py
python preproc/preproc3.py
export CUDA_VISIBLE_DEVICES=
LOCAL_DATA_DIR=/outbrain/preprocessed
LOCAL_DATA_TFRECORDS_DIR=/outbrain/tfrecords
TRAIN_DIR=train_feature_vectors_integral_eval.csv
VALID_DIR=validation_feature_vectors_integral.csv
TRAIN_IMPUTED_DIR=train_feature_vectors_integral_eval_imputed.csv
VALID_IMPUTED_DIR=validation_feature_vectors_integral_imputed.csv
HEADER_PATH=train_feature_vectors_integral_eval.csv.header
cd ${LOCAL_DATA_DIR}
python /wd/preproc/csv_data_imputation.py --num_workers 40 \
--train_files_pattern 'train_feature_vectors_integral_eval.csv/part-*' \
--valid_files_pattern 'validation_feature_vectors_integral.csv/part-*' \
--train_dst_dir ${TRAIN_IMPUTED_DIR} \
--valid_dst_dir ${VALID_IMPUTED_DIR} \
--header_path ${HEADER_PATH}
cd -
time preproc/sort_csv.sh ${LOCAL_DATA_DIR}/${VALID_IMPUTED_DIR} ${LOCAL_DATA_DIR}/${VALID_IMPUTED_DIR}_sorted
python dataflow_preprocess.py \
--eval_data "${LOCAL_DATA_DIR}/${VALID_IMPUTED_DIR}_sorted/part-*" \
--training_data "${LOCAL_DATA_DIR}/${TRAIN_IMPUTED_DIR}/part-*" \
--output_dir ${LOCAL_DATA_TFRECORDS_DIR} \
--batch_size ${PREBATCH_SIZE}
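# Editor's note: the prebatch size chosen here must match the --prebatch_size later
# passed to trainer.task (4096 in the training scripts above). A hypothetical run
# with a non-default value (script name assumed):
#   bash preproc.sh 8192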

View file

@@ -0,0 +1,39 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import setuptools
NAME = 'trainer'
VERSION = '1.0'
TENSORFLOW_TRANSFORM = 'tensorflow-transform==0.1.8'
if __name__ == '__main__':
setuptools.setup(name=NAME, version=VERSION, packages=['trainer'],
install_requires=[TENSORFLOW_TRANSFORM])

View file

@@ -0,0 +1,13 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View file

@@ -0,0 +1,293 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
AD_KEYWORDS = ['ad', 'advertiser', 'campain']
AD_REs = ['(^|_){}_'.format(kw) for kw in AD_KEYWORDS]
AD_RE = re.compile('|'.join(AD_REs))
def level_map(name):
return 0 if re.search(AD_RE, name) is None else 1
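# Editor's note: 'campain' (sic) above intentionally matches the misspelled feature
# names used throughout this module (e.g. pop_campain_id). For illustration:
#   level_map('doc_event_id') == 0      # request-level feature
#   level_map('doc_ad_source_id') == 1  # item-level: '_ad_' matches AD_RE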
LABEL_COLUMN = "label"
DISPLAY_ID_COLUMN = 'display_id'
AD_ID_COLUMN = 'ad_id'
IS_LEAK_COLUMN = 'is_leak'
DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN = 'display_ad_and_is_leak'
CATEGORICAL_COLUMNS = [
'ad_id',
'doc_id',
'doc_event_id',
'ad_advertiser',
'doc_ad_source_id',
'doc_ad_publisher_id',
'doc_event_publisher_id',
'doc_event_source_id',
'event_country',
'event_country_state',
'event_geo_location',
'event_hour',
'event_platform',
'traffic_source']
DOC_CATEGORICAL_MULTIVALUED_COLUMNS = {
'doc_ad_category_id': ['doc_ad_category_id_1',
'doc_ad_category_id_2',
'doc_ad_category_id_3'],
'doc_ad_topic_id': ['doc_ad_topic_id_1',
'doc_ad_topic_id_2',
'doc_ad_topic_id_3'],
'doc_ad_entity_id': ['doc_ad_entity_id_1',
'doc_ad_entity_id_2',
'doc_ad_entity_id_3',
'doc_ad_entity_id_4',
'doc_ad_entity_id_5',
'doc_ad_entity_id_6'],
'doc_event_category_id': ['doc_event_category_id_1',
'doc_event_category_id_2',
'doc_event_category_id_3'],
'doc_event_topic_id': ['doc_event_topic_id_1',
'doc_event_topic_id_2',
'doc_event_topic_id_3'],
'doc_event_entity_id': ['doc_event_entity_id_1',
'doc_event_entity_id_2',
'doc_event_entity_id_3',
'doc_event_entity_id_4',
'doc_event_entity_id_5',
'doc_event_entity_id_6']
}
BOOL_COLUMNS = [
'event_weekend',
'user_has_already_viewed_doc']
INT_COLUMNS = [
'user_views',
'ad_views',
'doc_views',
'doc_event_days_since_published',
'doc_event_hour',
'doc_ad_days_since_published']
FLOAT_COLUMNS_LOG_BIN_TRANSFORM = [
'pop_ad_id',
'pop_ad_id_conf_multipl',
'pop_document_id',
'pop_document_id_conf_multipl',
'pop_publisher_id',
'pop_publisher_id_conf_multipl',
'pop_advertiser_id',
'pop_advertiser_id_conf_multipl',
'pop_campain_id',
'pop_campain_id_conf_multipl',
'pop_doc_event_doc_ad',
'pop_doc_event_doc_ad_conf_multipl',
'pop_source_id',
'pop_source_id_conf_multipl',
'pop_source_id_country',
'pop_source_id_country_conf_multipl',
'pop_entity_id',
'pop_entity_id_conf_multipl',
'pop_entity_id_country',
'pop_entity_id_country_conf_multipl',
'pop_topic_id',
'pop_topic_id_conf_multipl',
'pop_topic_id_country',
'pop_topic_id_country_conf_multipl',
'pop_category_id',
'pop_category_id_conf_multipl',
'pop_category_id_country',
'pop_category_id_country_conf_multipl',
'user_doc_ad_sim_categories',
'user_doc_ad_sim_categories_conf_multipl',
'user_doc_ad_sim_topics',
'user_doc_ad_sim_topics_conf_multipl',
'user_doc_ad_sim_entities',
'user_doc_ad_sim_entities_conf_multipl',
'doc_event_doc_ad_sim_categories',
'doc_event_doc_ad_sim_categories_conf_multipl',
'doc_event_doc_ad_sim_topics',
'doc_event_doc_ad_sim_topics_conf_multipl',
'doc_event_doc_ad_sim_entities',
'doc_event_doc_ad_sim_entities_conf_multipl']
FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM = [
'pop_ad_id_conf',
'pop_document_id_conf',
'pop_publisher_id_conf',
'pop_advertiser_id_conf',
'pop_campain_id_conf',
'pop_doc_event_doc_ad_conf',
'pop_source_id_conf',
'pop_source_id_country_conf',
'pop_entity_id_conf',
'pop_entity_id_country_conf',
'pop_topic_id_conf',
'pop_topic_id_country_conf',
'pop_category_id_conf',
'pop_category_id_country_conf',
'user_doc_ad_sim_categories_conf',
'user_doc_ad_sim_topics_conf',
'user_doc_ad_sim_entities_conf',
'doc_event_doc_ad_sim_categories_conf',
'doc_event_doc_ad_sim_topics_conf',
'doc_event_doc_ad_sim_entities_conf']
FLOAT_COLUMNS = FLOAT_COLUMNS_LOG_BIN_TRANSFORM + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM
# Let's define the columns we're actually going to use
# during training
REQUEST_SINGLE_HOT_COLUMNS = [
"doc_event_id",
"doc_id",
"doc_event_source_id",
"event_geo_location",
"event_country_state",
"doc_event_publisher_id",
"event_country",
"event_hour",
"event_platform",
"traffic_source",
"event_weekend",
"user_has_already_viewed_doc"]
REQUEST_MULTI_HOT_COLUMNS = [
"doc_event_entity_id",
"doc_event_topic_id",
"doc_event_category_id"]
REQUEST_NUMERIC_COLUMNS = [
"pop_document_id_conf",
"pop_publisher_id_conf",
"pop_source_id_conf",
"pop_entity_id_conf",
"pop_topic_id_conf",
"pop_category_id_conf",
"pop_document_id",
"pop_publisher_id",
"pop_source_id",
"pop_entity_id",
"pop_topic_id",
"pop_category_id",
"user_views",
"doc_views",
"doc_event_days_since_published",
"doc_event_hour"]
ITEM_SINGLE_HOT_COLUMNS = [
"ad_id",
"doc_ad_source_id",
"ad_advertiser",
"doc_ad_publisher_id"]
ITEM_MULTI_HOT_COLUMNS = [
"doc_ad_topic_id",
"doc_ad_entity_id",
"doc_ad_category_id"]
ITEM_NUMERIC_COLUMNS = [
"pop_ad_id_conf",
"user_doc_ad_sim_categories_conf",
"user_doc_ad_sim_topics_conf",
"pop_advertiser_id_conf",
"pop_campain_id_conf_multipl",
"pop_ad_id",
"pop_advertiser_id",
"pop_campain_id",
"user_doc_ad_sim_categories",
"user_doc_ad_sim_topics",
"user_doc_ad_sim_entities",
"doc_event_doc_ad_sim_categories",
"doc_event_doc_ad_sim_topics",
"doc_event_doc_ad_sim_entities",
"ad_views",
"doc_ad_days_since_published"]
num_hot_map = {
name: 1 for name in
(REQUEST_SINGLE_HOT_COLUMNS + ITEM_SINGLE_HOT_COLUMNS)}
num_hot_map.update({
name: -1 for name in
(REQUEST_MULTI_HOT_COLUMNS + ITEM_MULTI_HOT_COLUMNS)})
NV_TRAINING_COLUMNS = (
REQUEST_SINGLE_HOT_COLUMNS +
REQUEST_MULTI_HOT_COLUMNS +
REQUEST_NUMERIC_COLUMNS +
ITEM_SINGLE_HOT_COLUMNS +
ITEM_MULTI_HOT_COLUMNS +
ITEM_NUMERIC_COLUMNS)
ALL_TRAINING_COLUMNS = (
CATEGORICAL_COLUMNS +
BOOL_COLUMNS +
INT_COLUMNS +
FLOAT_COLUMNS
)
for v in DOC_CATEGORICAL_MULTIVALUED_COLUMNS.values():
ALL_TRAINING_COLUMNS.extend(v)
HASH_BUCKET_SIZES = {
'doc_event_id': 300000,
'ad_id': 250000,
'doc_id': 100000,
'doc_ad_entity_id': 10000,
'doc_event_entity_id': 10000,
'doc_ad_source_id': 4000,
'doc_event_source_id': 4000,
'event_geo_location': 2500,
'ad_advertiser': 2500,
'event_country_state': 2000,
'doc_ad_publisher_id': 1000,
'doc_event_publisher_id': 1000,
'doc_ad_topic_id': 350,
'doc_event_topic_id': 350,
'event_country': 300,
'doc_ad_category_id': 100,
'doc_event_category_id': 100}
IDENTITY_NUM_BUCKETS = {
'event_hour': 6,
'event_platform': 3,
'traffic_source': 3,
'event_weekend': 2,
'user_has_already_viewed_doc': 2}
EMBEDDING_DIMENSIONS = {
'doc_event_id': 128,
'ad_id': 128,
'doc_id': 128,
'doc_ad_entity_id': 64,
'doc_event_entity_id': 64,
'doc_ad_source_id': 64,
'doc_event_source_id': 64,
'event_geo_location': 64,
'ad_advertiser': 64,
'event_country_state': 64,
'doc_ad_publisher_id': 64,
'doc_event_publisher_id': 64,
'doc_ad_topic_id': 64,
'doc_event_topic_id': 64,
'event_country': 64,
'doc_ad_category_id': 64,
'doc_event_category_id': 64}
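A rough editor's sketch (not part of the commit) of the deep-input width implied by these dictionaries, counting embedding dimensions plus one-hot identity buckets; each numeric feature then adds one more unit:

deep_width = sum(EMBEDDING_DIMENSIONS.values()) + sum(IDENTITY_NUM_BUCKETS.values())
print(deep_width)  # 1280 + 16 = 1296, before numeric features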

View file

@@ -0,0 +1,800 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import argparse
import json
import os
import sys
import pickle
import tensorflow as tf
import tensorflow_transform as tft
from trainer import features
from utils.hooks.benchmark_hooks import BenchmarkLoggingHook
import horovod.tensorflow as hvd
import dllogger
MODEL_TYPES = ['wide', 'deep', 'wide_n_deep']
WIDE, DEEP, WIDE_N_DEEP = MODEL_TYPES
# Default train dataset size
TRAIN_DATASET_SIZE = 59761827
def create_parser():
"""Initialize command line parser using arparse.
Returns:
An argparse.ArgumentParser.
"""
parser = argparse.ArgumentParser()
parser.add_argument(
'--model_type',
help='Model type to train on',
choices=MODEL_TYPES,
default=WIDE_N_DEEP)
parser.add_argument(
'--canned_estimator',
help='Use canned estimator instead of the experimental custom estimator',
action='store_true',
default=False)
parser.add_argument(
'--train_data_pattern',
help='Pattern of training file names. For example if training files are train_000.tfrecord, \
train_001.tfrecord then --train_data_pattern is train_*',
type=str,
default='/outbrain/tfrecords/train_*',
nargs='+'
)
parser.add_argument(
'--eval_data_pattern',
help='Pattern of eval file names. For example if eval files are eval_000.tfrecord, \
eval_001.tfrecord then --eval_data_pattern is eval_*',
type=str,
default='/outbrain/tfrecords/eval_*',
nargs='+'
)
parser.add_argument(
'--model_dir',
help='Model Checkpoint will be saved here',
type=str,
default='/outbrain/checkpoints'
)
parser.add_argument(
'--transformed_metadata_path',
help='Path to transformed_metadata.',
type=str,
default='/outbrain/tfrecords'
)
parser.add_argument(
'--deep_hidden_units',
help='hidden units per layer, separated by spaces',
default=[1024, 1024, 1024, 1024, 1024],
type=int,
nargs="+")
parser.add_argument(
'--prebatch_size',
help='Size of the pre-batches in the tfrecords',
default=4096,
type=int)
parser.add_argument(
'--batch_size',
help='Training batch size',
default=131072,
type=int)
parser.add_argument(
'--eval_batch_size',
help='Evaluation batch size',
default=32768,
type=int)
parser.add_argument(
'--eval_steps',
help='Number of evaluation steps to perform',
default=8,
type=int)
parser.add_argument(
'--training_set_size',
help='Number of samples in the training set',
default=TRAIN_DATASET_SIZE,
type=int)
parser.add_argument(
'--num_epochs',
help='Number of epochs',
default=100,
type=int)
parser.add_argument(
'--save_checkpoints_secs',
help='Minimum number of seconds between checkpoint saves',
default=600,
type=int)
parser.add_argument(
'--save_checkpoints_steps',
help='Training steps between saving checkpoints. If 0, then save_checkpoints_secs applies',
default=0,
type=int)
parser.add_argument(
'--xla',
help='Enable XLA',
default=False,
action='store_true')
parser.add_argument(
'--gpu',
help='Run computations on the GPU',
default=False,
action='store_true')
parser.add_argument(
'--amp',
help='Attempt automatic mixed precision conversion',
default=False,
action='store_true')
parser.add_argument(
'--hvd',
help='Use Horovod',
action='store_true',
default=False)
# hyperparameters for linear part
parser.add_argument(
'--linear_l1_regularization',
help='L1 regularization for linear model',
type=float,
default=0.0)
parser.add_argument(
'--linear_l2_regularization',
help='L2 regularization for linear model',
type=float,
default=0.0)
parser.add_argument(
'--linear_learning_rate',
help='Learning rate for linear model',
type=float,
default=0.2)
# hyperparameters for deep part
parser.add_argument(
'--deep_l1_regularization',
help='L1 regularization for deep model',
type=float,
default=0.0)
parser.add_argument(
'--deep_l2_regularization',
help='L2 regularization for deep model',
type=float,
default=0.00)
parser.add_argument(
'--deep_learning_rate',
help='Learning rate for deep model',
type=float,
default=1.0)
parser.add_argument(
'--deep_dropout',
help='Dropout regularization for deep model',
type=float,
default=0.0)
parser.add_argument(
'--log_device_placement',
help='Ask Tensorflow (via ConfigProto) to print device placement of nodes',
default=False,
action='store_true')
parser.add_argument(
'--predict',
help='Only perform a prediction on the validation dataset, don\'t train',
default=False,
action='store_true')
parser.add_argument(
'--evaluate',
help='Only perform an evaluation on the validation dataset, don\'t train',
default=False,
action='store_true')
parser.add_argument(
'--results_dir',
type=str,
help='Directory to store training results',
default='/results')
parser.add_argument(
'--log_filename',
type=str,
help='Name of the file to store dllogger output',
default='log.json')
parser.add_argument(
'--use_all_columns',
help='Force using all features defined in the features.py file',
default=False,
action='store_true')
parser.add_argument(
'--shuffle_percentage',
type=float,
default=0.001,
help='Size of the shuffle buffer from 0 to 1. 1 means that the shuffle buffer size will be equal to the size of the training dataset.')
parser.add_argument(
'--print_display_ids',
help='Print the display ids processed by the input pipeline',
default=False,
action='store_true')
parser.add_argument(
'--eval_throttle_secs',
help='Minimum number of seconds between evaluations',
default=600,
type=int)
parser.add_argument(
'--reader_num_threads',
default=12,
type=int)
parser.add_argument(
'--parser_num_threads',
default=3,
type=int)
parser.add_argument(
'--prefetch_buffer_size',
default=1,
type=int)
parser.add_argument(
'--submission',
action='store_true',
default=False)
parser.add_argument(
'--benchmark',
help='Collect performance metrics during training',
action='store_true',
default=False)
parser.add_argument(
'--benchmark_warmup_steps',
help='Number of warmup steps before benchmarking the training',
type=int,
default=50)
parser.add_argument(
'--benchmark_steps',
help='Number of steps for train performance benchmark',
type=int,
default=100)
return parser
def get_feature_columns(use_all_columns=False, force_subset=None):
# adding the force_subset as a way to directly pass in column changes for testing/profiling
assert not use_all_columns or force_subset is None, \
'Cannot both use all columns and use only a subset; give only one argument'
deep_columns, wide_columns = [], []
if use_all_columns:
training_columns = features.ALL_TRAINING_COLUMNS
elif force_subset is not None:
training_columns = force_subset
else:
training_columns = features.NV_TRAINING_COLUMNS
tf.compat.v1.logging.warn('number of features: {}'.format(len(training_columns)))
for column_name in training_columns:
if column_name in features.HASH_BUCKET_SIZES:
categorical_column = tf.feature_column.categorical_column_with_hash_bucket(
column_name,
hash_bucket_size=features.HASH_BUCKET_SIZES[column_name],
dtype=tf.int32)
wide_columns.append(categorical_column)
elif column_name in features.IDENTITY_NUM_BUCKETS:
categorical_column = tf.feature_column.categorical_column_with_identity(
column_name, num_buckets=features.IDENTITY_NUM_BUCKETS[column_name])
wide_columns.append(categorical_column)
else:
columns = []
if column_name in features.FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
# add a categorical_column for column_name + "_binned"
# just add the regular float column for now
columns.append(tf.feature_column.numeric_column(
column_name, shape=(1,)))
elif column_name in features.FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
# add a categorical_column for column_name + "_log_binned")
columns.append(tf.feature_column.numeric_column(
column_name + "_log_01scaled", shape=(1,)))
elif column_name in features.INT_COLUMNS:
# add a categorical_column for column_name + "_log_int"
columns.append(tf.feature_column.numeric_column(
column_name+"_log_01scaled", shape=(1,)))
for column in columns:
wide_columns.append(column)
deep_columns.append(column)
continue
if column_name in features.EMBEDDING_DIMENSIONS:
column = tf.feature_column.embedding_column(
categorical_column,
dimension=features.EMBEDDING_DIMENSIONS[column_name],
combiner='mean')
else:
column = tf.feature_column.indicator_column(categorical_column)
deep_columns.append(column)
tf.compat.v1.logging.warn('deep columns: {}'.format(len(deep_columns)))
tf.compat.v1.logging.warn('wide columns: {}'.format(len(wide_columns)))
tf.compat.v1.logging.warn('wide&deep intersection: {}'.format(len(set(wide_columns).intersection(set(deep_columns)))))
return wide_columns, deep_columns
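# Sketch (unused by the trainer) of the three column families built above:
# hash-bucket and identity columns go to the wide part and reach the deep part
# as embedding or indicator columns; scaled numeric columns go to both. The
# column names and sizes here are hypothetical, not taken from features.py.
def _example_feature_columns():
    hashed = tf.feature_column.categorical_column_with_hash_bucket(
        'example_hashed_id', hash_bucket_size=1000, dtype=tf.int32)
    identity = tf.feature_column.categorical_column_with_identity(
        'example_small_id', num_buckets=100)
    numeric = tf.feature_column.numeric_column('example_ctr_log_01scaled', shape=(1,))
    wide = [hashed, identity, numeric]
    deep = [tf.feature_column.embedding_column(hashed, dimension=64, combiner='mean'),
            tf.feature_column.indicator_column(identity),
            numeric]
    return wide, deep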
def separate_input_fn(
tf_transform_output,
transformed_examples,
batch_size,
mode,
reader_num_threads=1,
parser_num_threads=2,
shuffle_buffer_size=10,
prefetch_buffer_size=1,
print_display_ids=False):
"""
A version of the training + eval input function that uses dataset operations.
(For more straightforward tweaking.)
"""
tf.compat.v1.logging.warn('Shuffle buffer size: {}'.format(shuffle_buffer_size))
filenames_dataset = tf.data.Dataset.list_files(transformed_examples, shuffle=False)
raw_dataset = tf.data.TFRecordDataset(filenames_dataset,
num_parallel_reads=reader_num_threads)
raw_dataset = raw_dataset.shuffle(shuffle_buffer_size) \
if (mode==tf.estimator.ModeKeys.TRAIN and shuffle_buffer_size > 1) \
else raw_dataset
raw_dataset = raw_dataset.repeat()
raw_dataset = raw_dataset.batch(batch_size)
# this function appears to require each element to be a vector
# batching should mean that this is always true
# one possible alternative for any problematic case is tf.io.parse_single_example
parsed_dataset = raw_dataset.apply(tf.data.experimental.parse_example_dataset(
tf_transform_output.transformed_feature_spec(),
num_parallel_calls=parser_num_threads))
# A function mapped over each dataset element: separates out the label,
# reshapes elements to two dimensions (batch size, elements per record),
# and optionally injects tf.Print ops for the display/ad/leak ids
def consolidate_batch(elem):
    label = elem.pop('label')
    reshaped_label = tf.reshape(label, [-1, label.shape[-1]])
    if print_display_ids:
        elem['ad_id'] = tf.Print(input_=elem['ad_id'],
                                 data=[tf.reshape(elem['display_id'], [-1])],
                                 message='display_id', name='print_display_ids', summarize=FLAGS.eval_batch_size)
        elem['ad_id'] = tf.Print(input_=elem['ad_id'],
                                 data=[tf.reshape(elem['ad_id'], [-1])],
                                 message='ad_id', name='print_ad_ids', summarize=FLAGS.eval_batch_size)
        elem['ad_id'] = tf.Print(input_=elem['ad_id'],
                                 data=[tf.reshape(elem['is_leak'], [-1])],
                                 message='is_leak', name='print_is_leak', summarize=FLAGS.eval_batch_size)
    # reshape after the prints so the tf.Print ops stay on the data path
    reshaped_elem = {key: tf.reshape(elem[key], [-1, elem[key].shape[-1]]) for key in elem}
    return reshaped_elem, reshaped_label
if mode == tf.estimator.ModeKeys.EVAL:
parsed_dataset = parsed_dataset.map(consolidate_batch, num_parallel_calls=None)
else:
parsed_dataset = parsed_dataset.map(consolidate_batch,
num_parallel_calls=parser_num_threads)
parsed_dataset = parsed_dataset.prefetch(prefetch_buffer_size)
return parsed_dataset
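# Sketch of probing the input function above outside an Estimator, e.g. to
# inspect one parsed batch; the TFRecord pattern is a placeholder.
def _example_input_fn_probe(tf_transform_output, pattern='/path/to/eval/part*'):
    dataset = separate_input_fn(tf_transform_output, pattern,
                                batch_size=16, mode=tf.estimator.ModeKeys.EVAL)
    elem, label = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
    with tf.compat.v1.Session() as sess:
        return sess.run([elem, label])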
# Rough approximation of the MAP metric for measuring ad quality; the
# roughness comes from batch boundaries falling between groups of display ids.
# The module-level alias works around the name clash between the `features`
# module and the `features` argument below; renaming one of them would be cleaner.
DISPLAY_ID_COLUMN = features.DISPLAY_ID_COLUMN
def map_custom_metric(features, labels, predictions):
display_ids = tf.reshape(features[DISPLAY_ID_COLUMN], [-1])
predictions = predictions['probabilities'][:, 1]
labels = labels[:, 0]
# Processing unique display_ids, indexes and counts
# Sorting needed in case the same display_id occurs in two different places
sorted_ids = tf.argsort(display_ids)
display_ids = tf.gather(display_ids, indices=sorted_ids)
predictions = tf.gather(predictions, indices=sorted_ids)
labels = tf.gather(labels, indices=sorted_ids)
_, display_ids_idx, display_ids_ads_count = tf.unique_with_counts(
display_ids, out_idx=tf.int64)
pad_length = 30 - tf.reduce_max(display_ids_ads_count)
pad_fn = lambda x: tf.pad(x, [(0, 0), (0, pad_length)])
preds = tf.RaggedTensor.from_value_rowids(
predictions, display_ids_idx).to_tensor()
labels = tf.RaggedTensor.from_value_rowids(
labels, display_ids_idx).to_tensor()
labels = tf.argmax(labels, axis=1)
return {
'map': tf.compat.v1.metrics.average_precision_at_k(
predictions=pad_fn(preds),
labels=labels,
k=12,
name="streaming_map")}
IS_LEAK_COLUMN = features.IS_LEAK_COLUMN
def map_custom_metric_with_leak(features, labels, predictions):
display_ids = features[DISPLAY_ID_COLUMN]
display_ids = tf.reshape(display_ids, [-1])
is_leak_tf = features[IS_LEAK_COLUMN]
is_leak_tf = tf.reshape(is_leak_tf, [-1])
predictions = predictions['probabilities'][:, 1]
predictions = predictions + tf.cast(is_leak_tf, tf.float32)
labels = labels[:, 0]
# Processing unique display_ids, indexes and counts
# Sorting needed in case the same display_id occurs in two different places
sorted_ids = tf.argsort(display_ids)
display_ids = tf.gather(display_ids, indices=sorted_ids)
predictions = tf.gather(predictions, indices=sorted_ids)
labels = tf.gather(labels, indices=sorted_ids)
_, display_ids_idx, display_ids_ads_count = tf.unique_with_counts(
display_ids, out_idx=tf.int64)
pad_length = 30 - tf.reduce_max(display_ids_ads_count)
pad_fn = lambda x: tf.pad(x, [(0, 0), (0, pad_length)])
preds = tf.RaggedTensor.from_value_rowids(predictions, display_ids_idx).to_tensor()
labels = tf.RaggedTensor.from_value_rowids(labels, display_ids_idx).to_tensor()
labels = tf.argmax(labels, axis=1)
return {
'map_with_leak': tf.compat.v1.metrics.average_precision_at_k(
predictions=pad_fn(preds),
labels=labels,
k=12,
name="streaming_map_with_leak")}
# A quick custom hook to access and log info about an Estimator graph
class InternalLoggerHook(tf.estimator.SessionRunHook):
def __init__(self, logfile='internal_log.txt'):
self.logfile = logfile
def after_create_session(self, session, coord): # runs once, after graph is finalized
log_writer = open(self.logfile, 'w')
# one pass through to record a dictionary with {input: output} pairs of nodes
# doesn't add much overhead that I can see, and makes it easier to trace dependencies
hold_outputs_dict = {}
for op in tf.get_default_graph().get_operations():
for tensor in op.inputs:
base_name = ''.join(tensor.name.split(':')[:-1]) if ':' in tensor.name else tensor.name
if base_name not in hold_outputs_dict:
hold_outputs_dict[base_name] = [op.name]
else:
hold_outputs_dict[base_name].append(op.name)
# record information for each op to file, this time, drawing on the above dictionary for outputs
for op in tf.get_default_graph().get_operations():
op_repr = ''
op_repr = op_repr + repr(op.node_def) # protobuf-style representation
outputs = hold_outputs_dict.pop(op.name, [])
op_repr = op_repr + '\n'.join(['output: ' + repr(o) for o in outputs] + \
['colocation_group: ' + repr(cg) for cg in op.colocation_groups()] + \
['control_input: ' + repr(ci) for ci in op.control_inputs])
op_repr = ' ' + '\n '.join(op_repr.split('\n')) # indented
op_repr = op.name + ' {\n' + op_repr + '\n}\n\n'
log_writer.write(op_repr)
# leave a warning at the end of the file if any outputs are left over
log_writer.write('Unclaimed outputs:\n' + '\n'.join([key + ': ' + repr(hold_outputs_dict[key]) \
for key in hold_outputs_dict]))
log_writer.close()
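# Sketch of wiring the hook above into a one-step run to dump the finalized
# graph; the output path and step count are illustrative.
def _example_dump_graph(estimator, train_input_fn):
    hook = InternalLoggerHook(logfile='/results/internal_log.txt')
    estimator.train(train_input_fn, hooks=[hook], steps=1)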
# function to create a wide & deep Estimator, with options to knock out parts (model_type)
def custom_estimator_model_fn(features, labels, mode, params, config):
with tf.compat.v1.variable_scope('deep', values=features) as scope:
    deep_absolute_scope = scope.name
    if params['model_type'] in [DEEP, WIDE_N_DEEP]:
        deep_features = features.copy()
        deep_current = tf.compat.v1.feature_column.input_layer(deep_features, params['deep_columns'])
        for num_neurons in params['layers']:
            deep_current = tf.keras.layers.Dense(num_neurons, activation=tf.nn.relu)(deep_current)
            deep_current = tf.keras.layers.Dropout(params['deep_dropout'])(deep_current)
        deep_logits = tf.keras.layers.Dense(1)(deep_current)
    else:
        deep_logits = None
with tf.compat.v1.variable_scope('wide', values=features) as scope:
wide_absolute_scope = scope.name
wide_logits = tf.compat.v1.feature_column.linear_model(features, params['wide_columns'], units=1, sparse_combiner='sum') \
if params['model_type'] in [WIDE, WIDE_N_DEEP] else None
if deep_logits is None and wide_logits is None: # with only the input pipeline, just return input features
assert mode == tf.estimator.ModeKeys.PREDICT, \
'Only the input pipeline is used; eval and train don\'t have meaning'
return tf.estimator.EstimatorSpec(mode, predictions=features)
else:
    if deep_logits is None:
        logits = wide_logits
    elif wide_logits is None:
        logits = deep_logits
    else:
        logits = wide_logits + deep_logits
head = tf.contrib.estimator.binary_classification_head(loss_reduction=tf.compat.v1.losses.Reduction.SUM_OVER_BATCH_SIZE)
def train_op_fn(loss):
global_step = tf.compat.v1.train.get_global_step()
deep_optimizer = params['deep_optimizer']
wide_optimizer = params['wide_optimizer']
# enable mixed precision if desired
if params['amp']:
deep_optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(deep_optimizer)
wide_optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(wide_optimizer)
deep_op = deep_optimizer.minimize(loss, var_list=tf.compat.v1.get_collection(
tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=deep_absolute_scope)) if deep_logits is not None else None
wide_op = wide_optimizer.minimize(loss, var_list=tf.compat.v1.get_collection(
tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=wide_absolute_scope)) if wide_logits is not None else None
train_op = tf.group(deep_op, wide_op) if deep_logits is not None and wide_logits is not None \
else deep_op if deep_logits is not None else wide_op
with tf.control_dependencies([train_op]): # increment global step once train op is done
return tf.compat.v1.assign_add(global_step, 1).op # this is how the canned estimator appears to do it
return head.create_estimator_spec(features, mode, logits, labels=labels, train_op_fn=train_op_fn)
# a helper function to create an estimator of the specified type, either custom or canned
# custom estimators are created with the custom_estimator_model_fn, and have some options working
# that the canned ones do not (AMP, knocking out parts of the model, NVTX)
def construct_estimator(model_type, custom_estimator, run_config,
wide_columns, wide_optimizer,
deep_columns, deep_hidden_units, deep_dropout, deep_optimizer,
amp=False):
if custom_estimator:
estimator = tf.estimator.Estimator(
model_fn=custom_estimator_model_fn,
config=run_config,
params={
'wide_columns': wide_columns,
'deep_columns': deep_columns,
'deep_dropout': deep_dropout,
'model_type': model_type,
'layers': deep_hidden_units,
'wide_optimizer': wide_optimizer,
'deep_optimizer': deep_optimizer,
'amp': amp,
})
else:
assert model_type in [WIDE, DEEP, WIDE_N_DEEP], 'Canned estimator only supports basic wide, deep, wnd'
assert not amp, 'AMP not functional for canned estimator' # AMP does not optimize the separate graph
if model_type == WIDE:
estimator = tf.estimator.LinearClassifier(
feature_columns=wide_columns,
config=run_config,
optimizer=wide_optimizer)
elif model_type == DEEP:
estimator = tf.estimator.DNNClassifier(
feature_columns=deep_columns,
hidden_units=deep_hidden_units,
dropout=deep_dropout,
config=run_config,
optimizer=deep_optimizer)
elif model_type == WIDE_N_DEEP:
estimator = tf.estimator.DNNLinearCombinedClassifier(
config=run_config,
linear_feature_columns=wide_columns,
linear_optimizer=wide_optimizer,
dnn_feature_columns=deep_columns,
dnn_optimizer=deep_optimizer,
dnn_hidden_units=deep_hidden_units,
dnn_dropout=deep_dropout,
linear_sparse_combiner='sum',
loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE)
return estimator
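# Sketch of building the custom wide & deep estimator with the same optimizer
# types main() uses below; all hyperparameter values here are placeholders.
def _example_construct_estimator(run_config, wide_columns, deep_columns):
    wide_optimizer = tf.compat.v1.train.FtrlOptimizer(learning_rate=0.2)
    deep_optimizer = tf.compat.v1.train.ProximalAdagradOptimizer(learning_rate=1.0)
    return construct_estimator(WIDE_N_DEEP, custom_estimator=True,
                               run_config=run_config,
                               wide_columns=wide_columns, wide_optimizer=wide_optimizer,
                               deep_columns=deep_columns,
                               deep_hidden_units=[1024, 1024, 1024],
                               deep_dropout=0.0, deep_optimizer=deep_optimizer,
                               amp=False)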
def main(FLAGS):
if FLAGS.hvd:
hvd.init()
if hvd.local_rank() == 0:
tf.logging.set_verbosity(tf.logging.INFO)
log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
os.makedirs(FLAGS.results_dir, exist_ok=True)
dllogger.init(backends=[
dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
filename=log_path),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
else:
tf.logging.set_verbosity(tf.logging.ERROR)
dllogger.init(backends=[])
num_gpus = hvd.size()
else:
tf.logging.set_verbosity(tf.logging.INFO)
log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
os.makedirs(FLAGS.results_dir, exist_ok=True)
dllogger.init(backends=[
dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
filename=log_path),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
num_gpus = 1
dllogger.log(data=vars(FLAGS), step='PARAMETER')
create_batches = FLAGS.batch_size // FLAGS.prebatch_size
wide_columns, deep_columns = get_feature_columns(use_all_columns=FLAGS.use_all_columns)
tf_transform_output = tft.TFTransformOutput(FLAGS.transformed_metadata_path)
if not FLAGS.hvd or hvd.local_rank() == 0:
tf.compat.v1.logging.warn('command line arguments: {}'.format(json.dumps(vars(FLAGS))))
if not os.path.exists(FLAGS.results_dir):
os.mkdir(FLAGS.results_dir)
with open('{}/args.json'.format(FLAGS.results_dir), 'w') as f:
json.dump(vars(FLAGS), f, indent=4)
if FLAGS.gpu:
session_config = tf.compat.v1.ConfigProto(log_device_placement=FLAGS.log_device_placement)
else:
session_config = tf.compat.v1.ConfigProto(device_count={'GPU': 0}, log_device_placement=FLAGS.log_device_placement)
if FLAGS.hvd:
session_config.gpu_options.visible_device_list = str(hvd.local_rank())
if FLAGS.xla:
session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
if FLAGS.benchmark:
model_dir = None
else:
model_dir = FLAGS.model_dir
if FLAGS.save_checkpoints_steps != 0:
run_config = tf.estimator.RunConfig(model_dir=model_dir).replace(session_config=session_config,
save_checkpoints_steps=FLAGS.save_checkpoints_steps,
keep_checkpoint_max=1)
else:
run_config = tf.estimator.RunConfig(model_dir=model_dir).replace(session_config=session_config,
save_checkpoints_secs=FLAGS.save_checkpoints_secs,
keep_checkpoint_max=1)
wide_optimizer = tf.compat.v1.train.FtrlOptimizer(
learning_rate=FLAGS.linear_learning_rate,
l1_regularization_strength=FLAGS.linear_l1_regularization,
l2_regularization_strength=FLAGS.linear_l2_regularization)
deep_optimizer = tf.compat.v1.train.ProximalAdagradOptimizer(
learning_rate=FLAGS.deep_learning_rate,
initial_accumulator_value=0.1,
l1_regularization_strength=FLAGS.deep_l1_regularization,
l2_regularization_strength=FLAGS.deep_l2_regularization,
use_locking=False)
if FLAGS.hvd:
wide_optimizer = hvd.DistributedOptimizer(wide_optimizer)
deep_optimizer = hvd.DistributedOptimizer(deep_optimizer)
stats_filename = os.path.join(FLAGS.transformed_metadata_path, 'stats.json')
embed_columns = None
# input functions to read data from disk
train_input_fn = lambda : separate_input_fn(
tf_transform_output,
FLAGS.train_data_pattern,
create_batches,
tf.estimator.ModeKeys.TRAIN,
reader_num_threads=FLAGS.reader_num_threads,
parser_num_threads=FLAGS.parser_num_threads,
shuffle_buffer_size=int(FLAGS.shuffle_percentage*create_batches),
prefetch_buffer_size=FLAGS.prefetch_buffer_size,
print_display_ids=FLAGS.print_display_ids)
eval_input_fn = lambda : separate_input_fn(
tf_transform_output,
FLAGS.eval_data_pattern,
(FLAGS.eval_batch_size // FLAGS.prebatch_size),
tf.estimator.ModeKeys.EVAL,
reader_num_threads=1,
parser_num_threads=1,
shuffle_buffer_size=int(FLAGS.shuffle_percentage*create_batches),
prefetch_buffer_size=FLAGS.prefetch_buffer_size,
print_display_ids=FLAGS.print_display_ids)
estimator = construct_estimator(FLAGS.model_type, not FLAGS.canned_estimator, run_config,
wide_columns, wide_optimizer,
deep_columns, FLAGS.deep_hidden_units, FLAGS.deep_dropout, deep_optimizer,
amp=FLAGS.amp)
estimator = tf.estimator.add_metrics(estimator, map_custom_metric)
estimator = tf.estimator.add_metrics(estimator, map_custom_metric_with_leak)
steps_per_epoch = FLAGS.training_set_size / FLAGS.batch_size
print('Steps per epoch: {}'.format(steps_per_epoch))
max_steps = int(FLAGS.num_epochs * steps_per_epoch)
hooks = []
if FLAGS.hvd:
hooks.append(hvd.BroadcastGlobalVariablesHook(0))
if FLAGS.predict or FLAGS.evaluate: # inference
if FLAGS.benchmark:
benchmark_hook = BenchmarkLoggingHook(global_batch_size=num_gpus * FLAGS.eval_batch_size, warmup_steps=FLAGS.benchmark_warmup_steps)
hooks.append(benchmark_hook)
eval_steps = FLAGS.benchmark_steps
else:
eval_steps = FLAGS.eval_steps
predict_result_iter = estimator.predict(input_fn=eval_input_fn, hooks=hooks, yield_single_examples=False)
results = []
for i, r in enumerate(predict_result_iter):
print('predicting batch: ', i)
results.append(r)
# TODO: use eval_steps
if i >= eval_steps - 1:
break
if FLAGS.benchmark:
    infer_throughput = benchmark_hook.mean_throughput.value()
    dllogger.log(data={'infer_throughput': infer_throughput}, step=tuple())
elif FLAGS.evaluate:
print('evaluating using estimator.evaluate with eval_batch_size = ',
FLAGS.eval_batch_size, ' and eval_steps = ', FLAGS.eval_steps)
result = estimator.evaluate(eval_input_fn, hooks=hooks, steps=FLAGS.eval_steps)
dllogger.log(step=(), data={'map_infer': float(result['map']), 'map_with_leak_infer': float(result['map_with_leak'])})
elif FLAGS.predict:
scores = [r['probabilities'][:, 1] for r in results]
scores = np.hstack(scores)
scores_path = os.path.join(FLAGS.model_dir, 'scores.txt')
print('saving the numpy scores array to: ', scores_path)
np.savetxt(scores_path, scores, fmt="%f", delimiter='\n')
else: # training
if FLAGS.benchmark:
benchmark_hook = BenchmarkLoggingHook(global_batch_size=num_gpus * FLAGS.batch_size,
warmup_steps=FLAGS.benchmark_warmup_steps)
hooks.append(benchmark_hook)
estimator.train(train_input_fn, hooks=hooks, steps=FLAGS.benchmark_steps)
train_throughput = benchmark_hook.mean_throughput.value()
dllogger.log(data={'train_throughput': train_throughput}, step=tuple())
else:
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=max_steps, hooks=hooks)
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
throttle_secs=FLAGS.eval_throttle_secs, steps=FLAGS.eval_steps)
result = tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
if result:
dllogger.log(step=(), data={'map': float(result[0]['map']),
'map_with_leak': float(result[0]['map_with_leak'])})
if __name__ == '__main__':
FLAGS = create_parser().parse_args()
main(FLAGS)

View file

@ -0,0 +1,49 @@
#! /usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import tensorflow as tf
import dllogger
from .training_hooks import MeanAccumulator
__all__ = ['BenchmarkLoggingHook']
class BenchmarkLoggingHook(tf.train.SessionRunHook):
def __init__(self, global_batch_size, warmup_steps=100):
self.warmup_steps = warmup_steps
self.global_batch_size = global_batch_size
self.current_step = 0
self.t0 = None
self.mean_throughput = MeanAccumulator()
def before_run(self, run_context):
self.t0 = time.time()
def after_run(self, run_context, run_values):
batch_time = time.time() - self.t0
samplesps = self.global_batch_size / batch_time
if self.current_step >= self.warmup_steps:
self.mean_throughput.consume(samplesps)
dllogger.log(data={"samplesps" : samplesps}, step=(0, self.current_step))
self.current_step += 1
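# Usage sketch (illustrative values): with an Estimator, pass the hook to
# train()/predict() and read the averaged post-warmup throughput afterwards.
#   hook = BenchmarkLoggingHook(global_batch_size=131072, warmup_steps=50)
#   estimator.train(input_fn, hooks=[hook], steps=150)
#   print(hook.mean_throughput.value())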

View file

@ -0,0 +1,36 @@
#! /usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import tensorflow as tf
import dllogger
class MeanAccumulator:
def __init__(self):
self.sum = 0
self.count = 0
def consume(self, value):
self.sum += value
self.count += 1
def value(self):
return self.sum / self.count
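# Usage sketch: value() is the arithmetic mean of everything consumed so far;
# it assumes consume() has been called at least once, otherwise the division
# by zero raises.
#   acc = MeanAccumulator()
#   for throughput in (10.0, 20.0, 30.0):
#       acc.consume(throughput)
#   assert acc.value() == 20.0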