[WIDENDEEP/TF2] Update NVTabular version and add various model optimizations

This commit is contained in:
Dawid Majchrowski 2021-11-08 14:10:12 -08:00 committed by Krzysztof Kudrynski
parent 2592d5a02c
commit 01a9f5b48c
49 changed files with 7022 additions and 19855 deletions

View File

@ -1 +0,0 @@
Dockerfile-train

View File

@ -0,0 +1,32 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Single image used for both NVTabular preprocessing and Wide & Deep training.
# The base image can be overridden at build time with
# `--build-arg FROM_IMAGE_NAME=...`; it defaults to the Merlin TensorFlow
# training container, release 21.09.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/merlin/merlin-tensorflow-training:21.09
FROM ${FROM_IMAGE_NAME}
# Horovod runtime tuning knobs baked into the image environment.
# NOTE(review): per the Horovod docs the cycle time is in milliseconds and the
# fusion threshold is in bytes (67108864 = 64 MiB) - confirm against the
# Horovod version shipped in the base image.
ENV HOROVOD_CYCLE_TIME=0.1
ENV HOROVOD_FUSION_THRESHOLD=67108864
ENV HOROVOD_NUM_STREAMS=2
# The remaining build steps run as root.
USER root
# Install dllogger straight from GitHub in editable mode, without keeping a
# pip download cache in the layer.
RUN pip install --no-cache-dir -e git+https://github.com/NVIDIA/dllogger#egg=dllogger
# Copy the whole build context into /wd, which is also the default working
# directory of the container.
WORKDIR /wd
COPY . .
# Switch the git checkout at /nvtabular to the v0.6.1 tag.
# NOTE(review): presumably the base image ships the NVTabular sources there and
# uses them in place - confirm, otherwise this checkout has no effect.
RUN cd /nvtabular && git checkout v0.6.1

View File

@ -1,54 +0,0 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Preprocessing image: NVTabular base plus a standalone Spark installation.
# The base image can be overridden with `--build-arg FROM_IMAGE_NAME=...`.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/nvtabular:0.3
FROM ${FROM_IMAGE_NAME}
USER root
# Spark dependencies
ENV APACHE_SPARK_VERSION 2.3.1
ENV HADOOP_VERSION 2.7
# Java runtime for Spark plus the `time` utility; the apt lists are removed in
# the same layer to keep the image small.
RUN apt-get -y update && \
    apt-get install --no-install-recommends -y openjdk-8-jre-headless ca-certificates-java time && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Download the Spark binary distribution, verify its SHA-512 checksum before
# unpacking it into /usr/local, then delete the archive.
RUN cd /tmp && \
    wget -q http://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \
    echo "DC3A97F3D99791D363E4F70A622B84D6E313BD852F6FDBC777D31EAB44CBC112CEEAA20F7BF835492FB654F48AE57E9969F93D3B0E6EC92076D1C5E1B40B4696 *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \
    tar xzf spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /usr/local --owner root --group root --no-same-owner && \
    rm spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
# Version-independent path used by SPARK_HOME below.
RUN cd /usr/local && ln -s spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark
# Spark config
ENV SPARK_HOME /usr/local/spark
ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip:/wd
ENV SPARK_OPTS --driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info
ENV PYSPARK_PYTHON /conda/envs/rapids/bin/python
ENV PYSPARK_DRIVER_PYTHON /conda/envs/rapids/bin/python
# bash is required for `source activate` in the next RUN instruction.
SHELL ["/bin/bash", "-c"]
# Python dependencies, installed into the `rapids` conda environment.
# Each `pip install` must be chained with `&&`: a bare line continuation would
# merge the dllogger install into the previous command, turning `pip` and
# `install` into package names and applying `--no-deps` to dllogger.
RUN source activate rapids && \
    pip install --upgrade pip && \
    pip install --no-cache-dir pyspark==2.3.1 && \
    pip install --no-cache-dir --no-deps tensorflow-transform==0.24.1 apache-beam==2.14 tensorflow-metadata==0.14.0 pydot dill && \
    pip install --no-cache-dir -e git+https://github.com/NVIDIA/dllogger#egg=dllogger
# Copy the whole build context into /wd (also on PYTHONPATH above).
WORKDIR /wd
COPY . .

View File

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2021 NVIDIA Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -25,27 +25,24 @@ The content of the repository is tested and maintained by NVIDIA.
* [Getting the data](#getting-the-data)
+ [Dataset guidelines](#dataset-guidelines)
+ [Dataset preprocessing](#dataset-preprocessing)
- [Spark CPU Dataset preprocessing](#spark-cpu-dataset-preprocessing)
- [NVTabular GPU preprocessing](#nvtabular-gpu-preprocessing)
* [Training process](#training-process)
* [Evaluation process](#evaluation-process)
- [Performance](#performance)
* [Benchmarking](#benchmarking)
+ [NVTabular and Spark CPU Preprocessing comparison](#nvtabular-and-spark-cpu-preprocessing-comparison)
+ [Training and inference performance benchmark](#training-and-inference-performance-benchmark)
* [Results](#results)
+ [Training accuracy results](#training-accuracy-results)
- [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
- [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb)
- [Training accuracy: NVIDIA DGX-1 (8x V100 32GB)](#training-accuracy-nvidia-dgx-1-8x-v100-32gb)
- [Training accuracy plots](#training-accuracy-plots)
- [Training stability test](#training-stability-test)
- [Impact of mixed precision on training accuracy](#impact-of-mixed-precision-on-training-accuracy)
+ [Training performance results](#training-performance-results)
- [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
- [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
+ [Inference performance results](#inference-performance-results)
- [Inference performance: NVIDIA DGX A100 (8x A100 80GB)](#inference-performance-nvidia-dgx-a100-8x-a100-80gb)
- [Inference performance: NVIDIA DGX-1 (8x V100 16GB)](#inference-performance-nvidia-dgx-1-8x-v100-16gb)
- [Training performance: NVIDIA DGX-1 (8x V100 32GB)](#training-performance-nvidia-dgx-1-8x-v100-32gb)
+ [Evaluation performance results](#evaluation-performance-results)
- [Evaluation performance: NVIDIA DGX A100 (8x A100 80GB)](#evaluation-performance-nvidia-dgx-a100-8x-a100-80gb)
- [Evaluation performance: NVIDIA DGX-1 (8x V100 32GB)](#evaluation-performance-nvidia-dgx-1-8x-v100-32gb)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
@ -86,21 +83,21 @@ The Outbrain Dataset is preprocessed in order to get features input to the model
Features:
- Request Level:
* 5 scalar numeric features `dtype=float32`
* 8 categorical features (all INT32 `dtype`)
* 8 trainable embeddings of (dimension, cardinality of categorical variable): (128,300000), (16,4), (128,100000), (64 ,4000), (64,1000), (64,2500), (64,300), (64,2000)
* 8 categorical features `dtype=int32`
* 8 trainable embeddings of (dimension, cardinality of categorical variable): (128,300000), (19,4), (128,100000), (64,4000), (64,1000), (64,2500), (64,300), (64,2000)
* 8 trainable embeddings for wide part of size 1 (serving as an embedding from the categorical to scalar space for input to the wide portion of the model)
- Item Level:
* 8 scalar numeric features `dtype=float32`
* 5 categorical features (all INT32 `dtype`)
* 5 trainable embeddings of dimensions (cardinality of categorical variable): 128 (250000), 64 (2500), 64 (4000), 64 (1000),64 (5000)
* 5 categorical features `dtype=int32`
* 5 trainable embeddings of (dimension, cardinality of categorical variable): (128,250000), (64,2500), (64,4000), (64,1000), (128,5000)
* 5 trainable embeddings for wide part of size 1 (working as trainable one-hot embeddings)
Features describe both the user (Request Level features) and Item (Item Level Features).
- Model:
* Input dimension is 26 (13 categorical and 13 numerical features)
* Total embedding dimension is 976
* Total embedding dimension is 1043
* 5 hidden layers each with size 1024
* Total number of model parameters is ~90M
* Output dimension is 1 (`y` is the probability of click given Request-level and Item-level features)
@ -112,7 +109,7 @@ For more information about feature preprocessing, go to [Dataset preprocessing](
Model accuracy is defined with the [MAP@12](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision) metric. This metric follows the way of assessing model accuracy in the original [Kaggle Outbrain Click Prediction Challenge](https://www.kaggle.com/c/outbrain-click-prediction/). In this repository, the leaked clicked ads are not taken into account since in industrial setup Data Scientists do not have access to leaked information when training the model. For more information about data leak in Kaggle Outbrain Click Prediction challenge, visit this [blogpost](https://medium.com/unstructured/how-feature-engineering-can-help-you-do-well-in-a-kaggle-competition-part-ii-3645d92282b8) by the 19th place finisher in that competition.
Training and inference script also reports AUC ROC, binary accuracy, and Loss (BCE) values.
Training and evaluation script also reports Loss (BCE) values.
### Feature support matrix
@ -177,7 +174,7 @@ The following section lists the requirements that you need to meet in order to s
This repository contains Dockerfile which extends the TensorFlow2 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- [20.12-tf2-py3](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) NGC container
- [21.09 Merlin Tensorflow Training](https://ngc.nvidia.com/catalog/containers/nvidia:merlin:merlin-tensorflow-training) NGC container
Supported GPUs:
- [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
@ -213,57 +210,44 @@ The Outbrain dataset can be downloaded from Kaggle (requires Kaggle account). Un
HOST_OUTBRAIN_PATH=/raid/outbrain
```
4. Preprocess the Outbrain dataset.
4.1. Build the Wide & Deep Preprocessing Container.
4. Build the Wide & Deep Container.
```
cd DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep
docker build -f Dockerfile-preproc . -t wd2-prep
docker build . -t wd2
```
4.2. Start an interactive session in the Wide&Deep Preprocessing Container. Run preprocessing against the original Outbrain dataset to `tf_records`. You can run preprocessing using Spark (CPU) or NVTabular preprocessing (GPU).
5. Preprocess the Outbrain dataset.
5.1. Start an interactive session in the Wide & Deep Container. Preprocessing converts the original Outbrain dataset into Parquet files and runs on the GPU using NVTabular.
```
nvidia-docker run --rm -it --ipc=host -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2-prep bash
docker run --runtime=nvidia --gpus=all --rm -it --ipc=host -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2 bash
```
4.3. Start preprocessing.
You can preprocess the data using either Spark on CPU or NVTabular on GPU. For more information, go to the [Dataset preprocessing](#dataset-preprocessing) section.
4.3.1. CPU Preprocessing (Spark).
```
cd /wd && bash scripts/preproc.sh spark 40
```
4.3.2. GPU Preprocessing (NVTabular).
```
cd /wd && bash scripts/preproc.sh nvtabular 40
```
The result of preprocessing scripts are prebatched TFRecords. The argument to the script is the number of TFRecords files that will be generated by the script (here 40). TFRecord files are generated in `${HOST_OUTBRAIN_PATH}/tfrecords`.
4.4. Training of the model
4.4.1. Build the Wide&Deep Training Container
```
cd DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep
docker build -f Dockerfile-train . -t wd2-train
```
4.4.2. Start an interactive session in the Wide&Deep Training Container
```
nvidia-docker run --rm -it --privileged --ipc=host -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2-train bash
```
4.4.3. Run training
For 1 GPU:
5.2. Start NVTabular GPU preprocessing. For more information, go to the [Dataset preprocessing](#dataset-preprocessing) section.
```
python main.py
bash scripts/preproc.sh
```
For 1 GPU with Mixed Precision training with XLA:
The result of the preprocessing script is an NVTabular dataset stored in Parquet files. Files are generated in `${HOST_OUTBRAIN_PATH}/data`.
6. Train the model
6.1. Start an interactive session in the Wide & Deep Container
```
docker run --runtime=nvidia --gpus=all --rm -it --ipc=host -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2 bash
```
6.2. Run training (`${GPU}` is an arbitrary number of GPUs to be used)
```
python main.py --xla --amp
horovodrun -np ${GPU} sh hvd_wrapper.sh python main.py
```
Training with Mixed Precision training with XLA:
```
horovodrun -np ${GPU} sh hvd_wrapper.sh python main.py --xla --amp
```
@ -272,39 +256,22 @@ For complete usage, run:
python main.py -h
```
For 8 GPUs:
```
mpiexec --allow-run-as-root --bind-to socket -np 8 python main.py
```
For 8 GPU with Mixed Precision training with XLA:
```
mpiexec --allow-run-as-root --bind-to socket -np 8 python main.py --xla --amp
```
5. Run validation or evaluation.
7. Run validation or evaluation.
If you want to run validation or evaluation, you can either:
* use the checkpoint obtained from the training commands above, or
* download the pretrained checkpoint from NGC.
In order to download the checkpoint from NGC, visit [ngc.nvidia.com](https://ngc.nvidia.com) website and browse the available models. Download the checkpoint files and unzip them to some path, for example, to `$HOST_OUTBRAIN_PATH/checkpoints/` (which is the default path for storing the checkpoints during training). The checkpoint requires around 700MB disk space.
6. Start validation/evaluation.
8. Start validation/evaluation.
In order to validate the checkpoint on the evaluation set, run the `main.py` script with the `--evaluate` and `--use_checkpoint` flags.
For 1 GPU:
```
python main.py --evaluate --use_checkpoint
horovodrun -np ${GPU} sh hvd_wrapper.sh python main.py --evaluate --use_checkpoint
```
For 8 GPUs:
```
mpiexec --allow-run-as-root --bind-to socket -np 8 python main.py --evaluate --use_checkpoint
```
Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark yours performance to [Training and inference performance benchmark](#training-and-inference-performance-benchmark). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
Now that you have your model trained and evaluated, you can choose to compare your training results with our [Training accuracy results](#training-accuracy-results). You can also choose to benchmark your performance against the [Training and evaluation performance benchmark](#training-and-evaluation-performance-benchmark). Following the steps in these sections will ensure that you achieve the same accuracy and performance results as stated in the [Results](#results) section.
## Advanced
@ -313,29 +280,29 @@ The following sections provide greater details of the dataset, running training,
### Scripts and sample code
These are the important scripts in this repository:
* `main.py` - Python script for training the Wide & Deep recommender model. This script is run inside the training container (named `wd-train` in the [Quick Start Guide](#quick-start-guide)).
* `scripts/preproc.sh` - Bash script for Outbrain dataset preparation for training, preprocessing and saving into TFRecords format. This script is run inside a preprocessing container (named `wd-prep` in the [Quick Start Guide](#quick-start-guide)).
* `data/outbrain/dataloader.py` - Python file containing data loaders for training and evaluation set.
* `main.py` - Python script for training the Wide & Deep recommender model.
* `scripts/preproc.sh` - Bash script for Outbrain dataset preparation for training, preprocessing and saving into NVTabular format.
* `data/outbrain/dataloader.py` - Python file containing NVTabular data loaders for train and evaluation set.
* `data/outbrain/features.py` - Python file describing the request and item level features as well as embedding dimensions and hash buckets sizes.
* `trainer/model/widedeep.py` - Python file with model definition.
* `trainer/utils/run.py` - Python file with training loop.
* `trainer/run.py` - Python file with training and evaluation setup.
### Parameters
These are the important parameters in the `main.py` script:
These are model parameters in the `main.py` script:
| Scope| parameter| Comment| Default Value |
| -------------------- | ----------------------------------------------------- | ------------------------------------------------------------ | ------------- |
| location of datasets | --transformed_metadata_path TRANSFORMED_METADATA_PATH | Path to transformed_metadata for feature specification reconstruction | |
|location of datasets| --use_checkpoint|Use checkpoint stored in model_dir path |False
|location of datasets |--train_data_pattern TRAIN_DATA_PATTERN |Pattern of training file names |/outbrain/data/train/*.parquet |
|location of datasets |--eval_data_pattern EVAL_DATA_PATTERN |Pattern of eval file names |/outbrain/data/valid/*.parquet |
|location of datasets|--use_checkpoint|Use checkpoint stored in model_dir path |False
|location of datasets|--model_dir MODEL_DIR|Destination where model checkpoint will be saved |/outbrain/checkpoints
|location of datasets|--results_dir RESULTS_DIR|Directory to store training results | /results
|location of datasets|--log_filename LOG_FILENAME|Name of the file to store dllogger output |log.json|
|training parameters|--training_set_size TRAINING_SET_SIZE|Number of samples in the training set | 59761827
|training parameters|--global_batch_size GLOBAL_BATCH_SIZE|Total size of training batch | 131072
|training parameters|--eval_batch_size EVAL_BATCH_SIZE|Total size of evaluation batch | 131072
|training parameters|--num_epochs NUM_EPOCHS|Number of training epochs | 20
|training parameters|--cpu|Run computations on the CPU | False
|training parameters|--cpu|Run computations on the CPU | Currently not supported
|training parameters|--amp|Enable automatic mixed precision conversion | False
|training parameters|--xla|Enable XLA conversion | False
|training parameters|--linear_learning_rate LINEAR_LEARNING_RATE|Learning rate for linear model | 0.02
@ -350,8 +317,6 @@ These are the important parameters in the `main.py` script:
|run mode parameters|--affinity{socket,single,single_unique,<br>socket_unique_interleaved,<br>socket_unique_continuous,disabled}|Type of CPU affinity | socket_unique_interleaved
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option:
```
@ -374,38 +339,15 @@ The original data is stored in several separate files:
* `promoted_content.csv` - metadata about the ads
* `document_meta.csv`, `document_topics.csv`, `document_entities.csv`, `document_categories.csv` - metadata about the documents
During the preprocessing stage, the data is transformed into 87M rows tabular data of 26 features. The dataset is split into training and evaluation parts that have approx 60M and approx 27M rows, respectively. Splitting into train and eval is done in this way so that random 80% of daily events for the first 10 days of the dataset form a training set and remaining part (20% of events daily for the first 10 days and all events in the last two days) form an evaluation set. Eventually the dataset is saved in pre-batched TFRecord format.
During the preprocessing stage, the data is transformed into 87M rows tabular data of 26 features. The dataset is split into training and evaluation parts that have approx 60M and approx 27M rows, respectively. Splitting into train and eval is done in this way so that random 80% of daily events for the first 10 days of the dataset form a training set and remaining part (20% of events daily for the first 10 days and all events in the last two days) form an evaluation set. Eventually the dataset is saved in NVTabular parquet format.
#### Dataset preprocessing
Dataset preprocessing aims in creating in total 26 features: 13 categorical and 13 numerical. These features are obtained from the original Outbrain dataset in preprocessing. There are 2 types of preprocessing available for the model:
Spark CPU preprocessing
[NVTabular](https://nvidia.github.io/NVTabular/v0.3.0/index.html) GPU preprocessing
Both split the dataset into train and evaluation sets and produce the same feature set, therefore, the training is agnostic to the preprocessing step.
For comparison of Spark CPU and NVTabular preprocessing go to [NVTabular and Spark CPU Preprocessing comparison](#nvtabular-and-spark-cpu-preprocessing-comparison)
##### Spark CPU Dataset preprocessing
The original dataset is preprocessed using the scripts provided in `data/outbrain/spark`. Preprocessing is split into 3 preprocessing steps: `preproc1.py`, `preproc2.py`, and `preproc3.py` that form a complete workflow. The workflow consists of the following operations:
* separating out the validation set for cross-validation
* filling missing data with mode, median, or imputed values
* joining click data, ad metadata, and document category, topic and entity tables to create an enriched table
* computing 7 click-through rates (CTR) for ads grouped by 7 features
* computing attribute cosine similarity between the landing page and ad to be featured on the page
* math transformations of the numeric features (logarithmic, scaling, binning)
* categorifying data using hash-bucketing
* storing the resulting set of features in pre-batched TFRecord format
The `preproc1-3.py` preprocessing scripts use PySpark. In the Docker image, we have installed Spark 2.3.1 as a standalone cluster of Spark. The `preproc1.py` script splits the data into a training set and a validation set. The `preproc2.py` script computes the click-through rates (CTR) and cosine similarities between the features. The `preproc3.py` script performs the math transformations and generates the final TFRecord files. The data in the output files is pre-batched (with the default batch size of 4096) to avoid the overhead of the TFRecord format, which otherwise is not suitable for the tabular data.
The preprocessing includes some very resource-exhaustive operations including joining tables having over 2 billions of rows. Such operations may not fit into the RAM memory, and therefore we use Spark which is well suited for handling tabular operations on large data with limited RAM. Note that the Spark job requires about 500 GB disk space and 300 GB RAM to perform the preprocessing.
For more information about Spark, refer to the [Spark documentation](https://spark.apache.org/docs/2.3.1/).
Dataset preprocessing aims to create a total of 26 features: 13 categorical and 13 numerical. These features are obtained from the original Outbrain dataset using [NVTabular](https://nvidia.github.io/NVTabular/v0.6.1/index.html) preprocessing.
##### NVTabular GPU preprocessing
The NVTabular dataset is preprocessed using the script provided in `data/outbrain/nvtabular`. The workflow consists of most of the same operations in the Spark pipeline:
The NVTabular dataset is preprocessed using the script provided in `data/outbrain/nvtabular`. The workflow consists of:
* separating out the validation set for cross-validation
* filling missing data with the mode (most frequent value), median, or imputed values
* joining click data, ad metadata, and document category, topic, and entity tables to create an enriched table for the ad clicks data
@ -415,16 +357,14 @@ The NVTabular dataset is preprocessed using the script provided in `data/outbrai
* categorifying data using hash-bucketing
* storing the result in a Parquet format
**Transforming the result into the pre-batched TFRecord format**
Most of the code describing operations in this workflow are in `data/outbrain/nvtabular/utils/workflow.py` and leverage NVTabular v0.3. As stated in its repository, [NVTabular](https://github.com/NVIDIA/NVTabular), a component of [NVIDIA Merlin Open Beta](https://developer.nvidia.com/nvidia-merlin), is a feature engineering and preprocessing library for tabular data that is designed to quickly and easily manipulate terabyte scale datasets and train deep learning based recommender systems. It provides a high-level abstraction to simplify code and accelerates computation on the GPU using the [RAPIDS Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) library. The code to transform the NVTabular Parquet output into TFRecords is in `data/outbrain/nvtabular/utils/converter.py`.
The NVTabular version of preprocessing is not subject to the same memory and storage constraints as its Spark counterpart, since NVTabular is able to manipulate tables on GPU and work with tables much larger than even physical RAM memory. The NVTabular Outbrain workflow has been successfully tested on DGX-1 V100 and DGX A100 for single and multigpu preprocessing.
Most of the code describing operations in this workflow is in `data/outbrain/nvtabular/utils/workflow.py` and leverages NVTabular v0.6.1. As stated in its repository, [NVTabular](https://github.com/NVIDIA/NVTabular), a component of [NVIDIA Merlin Open Beta](https://developer.nvidia.com/nvidia-merlin), is a feature engineering and preprocessing library for tabular data that is designed to quickly and easily manipulate terabyte scale datasets and train deep learning based recommender systems. It provides a high-level abstraction to simplify code and accelerates computation on the GPU using the [RAPIDS Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) library.
The NVTabular Outbrain workflow has been successfully tested on DGX-1 V100 and DGX A100 for single- and multi-GPU preprocessing.
For more information about NVTabular, refer to the [NVTabular documentation](https://github.com/NVIDIA/NVTabular).
### Training process
The training can be started by running the `main.py` script. By default, the script is in train mode. Other training related configs are also present in the `trainer/utils/arguments.py` and can be seen using the command `python main.py -h`. Training happens on a TFRecords training dataset files that match `--train_data_pattern`. Training is run for `--num_epochs` epochs with a global batch size of `--global_batch_size` in strong scaling mode (i.e. the effective batch size per GPU equals `global_batch_size/gpu_count`).
The training can be started by running the `main.py` script. By default, the script is in train mode. Other training related configs are also present in the `trainer/utils/arguments.py` and can be seen using the command `python main.py -h`. Training uses the NVTabular data loader on NVTabular training dataset files that match `--train_data_pattern`. Training is run for `--num_epochs` epochs with a global batch size of `--global_batch_size` in strong scaling mode (i.e. the effective batch size per GPU equals `global_batch_size/gpu_count`).
The model:
`tf.keras.experimental.WideDeepModel` consists of a wide part and a deep part with a sigmoid activation in the output layer (see [Figure 1](#model-architecture) for reference and `trainer/model/widedeep.py` for model definition).
@ -435,89 +375,46 @@ Two separate optimizers are used to optimize the wide and the deep part of the n
* RMSProp optimizer is used to optimize the deep part of the network.
Checkpoint of the model:
* Can be loaded at the beginning of training when `--use_checkpoint` is set
* Can be loaded at the beginning of training when `--use_checkpoint` is set.
* Is saved into `--model_dir` after each training epoch. Only the last checkpoint is kept.
* Contains information about number of training epochs
* Contains information about number of training epochs.
The model is evaluated on an evaluation dataset after every training epoch. The training log is displayed in the console and stored in `--log_filename`.
Every 100 batches with training metrics:
loss, binary accuracy, AUC ROC, MAP@12 value
Every 100 batches, training metrics are logged: bce loss
Every epoch after training, evaluation metrics are logged:
loss, binary accuracy, AUC ROC, MAP@12 value
Every epoch after training, evaluation metrics are logged: bce loss and MAP@12 value
### Evaluation process
The evaluation can be started by running the `main.py --evaluation` script. Evaluation is done for TFRecords dataset stored in `--eval_data_pattern`. Other evaluation related configs are also present in the `trainer/utils/arguments.py` and can be seen using the command `python main.py -h`.
The evaluation can be started by running the `main.py --evaluation` script. Evaluation is done on NVTabular parquet dataset stored in `--eval_data_pattern`. Other evaluation related configs are also present in the `trainer/utils/arguments.py` and can be seen using the command `python main.py -h`.
During evaluation (`--evaluation` flag):
* Model is restored from checkpoint in `--model_dir` if `--use_checkpoint` is set
* Evaluation log is displayed in console and stored in `--log_filename`
* Every 100 batches evaluation metrics are logged - loss, binary accuracy, AUC ROC, MAP@12 value
* Model is restored from checkpoint in `--model_dir` if `--use_checkpoint` is set.
* Evaluation log is displayed in console and stored in `--log_filename`.
* Every 100 batches evaluation metrics are logged: bce loss.
After the whole evaluation, the total evaluation metrics are logged, loss, binary accuracy, AUC ROC, MAP@12 value.
After the whole evaluation, the total evaluation metrics are logged: bce loss and MAP@12 value.
## Performance
The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA's latest software release. For the most up-to-date performance measurements, go to [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference).
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training mode.
#### NVTabular and Spark CPU Preprocessing comparison
Two types of dataset preprocessing are presented in Spark-CPU and NVTabular on GPU repositories. Both of these preprocess return prebatched TFRecords files with the same structure. The following table shows the comparison of both preprocessing in terms of code complication (Lines of code), top RAM consumption, and preprocessing time.
| |CPU preprocessing | CPU Preprocessing | GPU preprocessing | GPU Preprocessing | GPU preprocessing | GPU Preprocessing |
| -------------------------- | ----- | --------------------- | ------------------------ | ------------------------ | ------------------------ | ------------------------|
|| Spark on NVIDIA DGX-1 | Spark on NVIDIA DGX A100 | NVTabular on DGX-1 1 GPU | NVTabular on DGX-1 8 GPU | NVTabular DGX A100 1 GPU | NVTabular DGX A100 8 GPU | |
| Lines of code* | ~1500 | ~1500| ~500| ~500| ~500| ~500|
| Top RAM consumption \[GB\] | 167.0 | 223.4| 34.3| 48.7| 37.7 | 50.6|
| Top VRAM consumption per GPU \[GB\] | 0 | 0 | 16 | 13 | 45 | 67|
| Preprocessing time \[min\] |45.6|38.5|4.4|3.9|2.6| 2.3|
To achieve the same results for Top RAM consumption and preprocessing time, run a preprocessing container (`${HOST_OUTBRAIN_PATH}` is the path with Outbrain dataset).
```
nvidia-docker run --rm -it --ipc=host -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2-prep bash
```
In the preprocessing container, run the preprocessing benchmark.
For Spark CPU preprocessing:
```
cd /wd && bash scripts/preproc_benchmark.sh -m spark
```
For GPU NVTabular preprocessing:
```
cd /wd && bash scripts/preproc_benchmark.sh -m nvtabular
```
#### Training and inference performance benchmark
#### Training and evaluation performance benchmark
Benchmark script is prepared to measure performance of the model during training (default configuration) and evaluation (`--evaluation`). Benchmark runs training or evaluation for `--benchmark_steps` batches, however measurement of performance starts after `--benchmark_warmup_steps`. Benchmark can be run for single and 8 GPUs and with a combination of XLA (`--xla`), AMP (`--amp`), batch sizes (`--global_batch_size` , `--eval_batch_size`) and affinity (`--affinity`).
In order to run benchmark follow these steps:
Run training container (`${HOST_OUTBRAIN_PATH}` is the path with Outbrain dataset):
```
nvidia-docker run --rm -it --ipc=host --privileged -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2-train bash
```
Run Wide & Deep Container (`${HOST_OUTBRAIN_PATH}` is the path with Outbrain dataset):
```
docker run --runtime=nvidia --gpus=all --rm -it --ipc=host -v ${HOST_OUTBRAIN_PATH}:/outbrain wd2 bash
```
Run the benchmark script:
For 1 GPU:
```
python main.py --benchmark
```
The benchmark will be run for training with default training parameters.
For 8 GPUs:
```
mpiexec --allow-run-as-root --bind-to socket -np 8 python main.py --benchmark
horovodrun -np ${GPU} sh hvd_wrapper.sh python main.py --benchmark
```
### Results
@ -530,27 +427,27 @@ The following sections provide details on how we achieved our performance and ac
Our results were obtained by running the `main.py` training script in the TensorFlow2 NGC container on NVIDIA DGX A100 with (8x A100 80GB) GPUs.
| GPUs | Batch size / GPU | XLA | Accuracy - TF32 (MAP@12), Spark dataset | Accuracy - mixed precision (MAP@12),Spark Dataset | Accuracy - TF32 (MAP@12), NVTabular dataset | Accuracy - mixed precision (MAP@12), NVTabular Dataset | Time to train - TF32 (minutes) | Time to train - mixed precision (minutes) | Time to train speedup (TF32 to mixed precision) |
| ---- | ---------------- | --- | --------------|---|------- | ----------------------------------- | ------------------------------ | ----------------------------------------- | ----------------------------------------------- |
1|131072|Yes|0.65536|0.65537|0.65537|0.65646|16.40|13.71|1.20
1|131072|No|0.65538|0.65533|0.65533|0.65643|19.58|18.49|1.06
8|16384|Yes|0.65527|0.65525|0.65525|0.65638|7.77|9.71|0.80
8|16384|No|0.65517|0.65525|0.65525|0.65638|7.84|9.48|0.83
| GPUs | Batch size / GPU | XLA | Accuracy - TF32 (MAP@12) | Accuracy - mixed precision (MAP@12) | Time to train - TF32 (minutes) | Time to train - mixed precision (minutes) | Time to train speedup (TF32 to mixed precision) |
| ---- | ---------------- | --- | --------------|---|------- | ----------------------------------- | ----------------------------------------------- |
1|131072|Yes|0.65656|0.65654|13.40|9.48|1.41
1|131072|No |0.65662|0.65656|17.75|13.38|1.33
8|16384|Yes |0.65672|0.65665|4.82|4.50|1.07
8|16384|No |0.65671|0.65655|5.71|5.72|1.00
To achieve the same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
##### Training accuracy: NVIDIA DGX-1 (8x V100 32GB)
Our results were obtained by running the main.py training script in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs.
Our results were obtained by running the main.py training script in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 32GB) GPUs.
| GPUs | Batch size / GPU | XLA | Accuracy - TF32 (MAP@12), Spark dataset | Accuracy - mixed precision (MAP@12),Spark Dataset | Accuracy - TF32 (MAP@12), NVTabular dataset | Accuracy - mixed precision (MAP@12), NVTabular Dataset | Time to train - TF32 (minutes) | Time to train - mixed precision (minutes) | Time to train speedup (TF32 to mixed precision) |
| ---- | ---------------- | --- | --------------|---|------- | ----------------------------------- | ------------------------------ | ----------------------------------------- | ----------------------------------------------- |
1|131072|Yes|0.65531|0.65529|0.65529|0.65651|66.01|23.66|2.79
1|131072|No|0.65542|0.65534|0.65534|0.65641|72.68|29.18|2.49|
8|16384|Yes|0.65544|0.65547|0.65547|0.65642|16.28|13.90|1.17|
8|16384|No|0.65548|0.65540|0.65540|0.65633|16.34|12.65|1.29| |
| GPUs | Batch size / GPU | XLA | Accuracy - FP32 (MAP@12) | Accuracy - mixed precision (MAP@12) | Time to train - FP32 (minutes) | Time to train - mixed precision (minutes) | Time to train speedup (FP32 to mixed precision) |
| ---- | ---------------- | --- | --------------|---|------- | ----------------------------------------- | ----------------------------------------------- |
1|131072|Yes |0.65658|0.65664|62.89|18.65|3.37
1|131072|No |0.65662|0.65658|71.53|25.18|2.84
8|16384|Yes |0.65668|0.65655|12.21|8.89|1.37
8|16384|No |0.65665|0.65654|14.38|7.17|2.01
To achieve the same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
@ -559,132 +456,80 @@ To achieve the same results, follow the steps in the [Quick Start Guide](#quick-
Models trained with FP32, TF32 and Automatic Mixed Precision (AMP), with and without XLA enabled achieve similar accuracy.
The plot represents MAP@12 in a function of steps (step is single batch) during training for default precision (FP32 for Volta architecture (DGX-1) and TF32 for Ampere GPU architecture (DGX-A100)) and AMP for XLA and without it for both datasets. All other parameters of training are default.
The plot represents MAP@12 as a function of steps (a step is a single batch) during training for default precision (FP32 for Volta architecture (DGX-1) and TF32 for Ampere GPU architecture (DGX-A100)) and AMP, with and without XLA, for the NVTabular dataset. All other training parameters are default.
<p align="center">
<img width="100%" src="./img/leraning_curve_spark.svg" />
<img width="100%" src="./img/learning_curve.svg" />
<br>
Figure 2. Learning curves for Spark dataset for different configurations.</a>
Figure 2. Learning curves for different configurations on a single GPU.
</p>
<p align="center">
<img width="100%" src="./img/learning_curve_nvt.svg" />
<br>
Figure 3. Learning curves for NVTabular dataset for different configurations.</a>
</p>
##### Training stability test
Training of the model is stable for multiple configurations achieving the standard deviation of 10e-4. The model achieves similar MAP@12 scores for A100 and V100, training precisions, XLA usage and single/multi GPU. The Wide and Deep model was trained for 9100 training steps (20 epochs, 455 batches in each epoch, every batch containing 131072), starting from 20 different initial random seeds for each setup. The training was performed in the 20.12-tf1-py3 NGC container on NVIDIA DGX A100 80GB and DGX-1 16GB machines with and without mixed precision enabled, with and without XLA enabled for Spark- and NVTabular generated datasets. The provided charts and numbers consider single and 8 GPU training. After training, the models were evaluated on the validation set. The following plots compare distributions of MAP@12 on the evaluation set. In columns there is single vs 8 GPU training, in rows DGX A100 and DGX-1 V100.
Training of the model is stable across multiple configurations, achieving a standard deviation on the order of 1e-4. The model achieves similar MAP@12 scores for A100 and V100, training precisions, XLA usage and single/multi GPU. The Wide and Deep model was trained for 9140 training steps (20 epochs, 457 batches in each epoch, every batch containing 131072 samples), starting from 20 different initial random seeds for each setup. The training was performed in the 21.09 Merlin TensorFlow Training NGC container on NVIDIA DGX A100 80GB and DGX-1 32GB machines with and without mixed precision enabled, with and without XLA enabled for the NVTabular-generated dataset. The provided charts and numbers consider single and 8 GPU training. After training, the models were evaluated on the validation set. The following plots compare distributions of MAP@12 on the evaluation set. Columns show single vs 8 GPU training; rows show DGX A100 and DGX-1 V100.
<p align="center">
<img width="100%" src="./img/training_stability_spark.svg" />
<img width="100%" src="./img/training_stability.svg" />
<br>
Figure 4. Training stability for Spark dataset: distribution of MAP@12 across different configurations. 'All configurations' refer to the distribution of MAP@12 for cartesian product of architecture, training precision, XLA usage, single/multi GPU. </a>
Figure 3. Training stability plot: distribution of MAP@12 across different configurations. 'All configurations' refers to the distribution of MAP@12 for the Cartesian product of architecture, training precision, XLA usage, and single/multi GPU.
</p>
<p align="center">
<img width="100%" src="./img/training_stability_nvtabular.svg" />
<br>
Figure 5. Training stability for NVtabular dataset: distribution of MAP@12 across different configurations. 'All configurations' refer to the distribution of MAP@12 for cartesian product of architecture, training precision, XLA usage, single/multi GPU.</a>
</p>
Training stability was also compared in terms of point statistics for MAP@12 distribution for multiple configurations. Refer to the expandable table below.
<details>
<summary>Full tabular data for training stability tests</summary>
||GPUs|Precision|Dataset|XLA|mean|std|Min|Max
|--------|-|---------|-----------|---|----|---|---|---
DGX A100|1|TF32|Spark preprocessed|Yes|0.65536|0.00016|0.65510|0.65560|
DGX A100|1|TF32|Spark preprocessed|No|0.65538|0.00013|0.65510|0.65570|
DGX A100|1|TF32|NVTabular preprocessed|Yes|0.65641|0.00038|0.65530|0.65680|
DGX A100|1|TF32|NVTabular preprocessed|No|0.65648|0.00024|0.65580|0.65690|
DGX A100|1|AMP|Spark preprocessed|Yes|0.65537|0.00013|0.65510|0.65550|
DGX A100|1|AMP|Spark preprocessed|No|0.65533|0.00016|0.65500|0.65550|
DGX A100|1|AMP|NVTabular preprocessed|Yes|0.65646|0.00036|0.65530|0.65690|
DGX A100|1|AMP|NVTabular preprocessed|No|0.65643|0.00027|0.65590|0.65690|
DGX A100|8|TF32|Spark preprocessed|Yes|0.65527|0.00013|0.65500|0.65560|
DGX A100|8|TF32|Spark preprocessed|No|0.65517|0.00025|0.65460|0.65560|
DGX A100|8|TF32|NVTabular preprocessed|Yes|0.65631|0.00038|0.65550|0.65690|
DGX A100|8|TF32|NVTabular preprocessed|No|0.65642|0.00022|0.65570|0.65680|
DGX A100|8|AMP|Spark preprocessed|Yes|0.65525|0.00018|0.65490|0.65550|
DGX A100|8|AMP|Spark preprocessed|No|0.65525|0.00016|0.65490|0.65550|
DGX A100|8|AMP|NVTabular preprocessed|Yes|0.65638|0.00026|0.65580|0.65680|
DGX A100|8|AMP|NVTabular preprocessed|No|0.65638|0.00031|0.65560|0.65700|
DGX-1 V100|1|FP32|Spark preprocessed|Yes|0.65531|0.00017|0.65490|0.65560|
DGX-1 V100|1|FP32|Spark preprocessed|No|0.65542|0.00012|0.65520|0.65560|
DGX-1 V100|1|FP32|NVTabular preprocessed|Yes|0.65651|0.00019|0.65610|0.65680|
DGX-1 V100|1|FP32|NVTabular preprocessed|No|0.65638|0.00035|0.65560|0.65680|
DGX-1 V100|1|AMP|Spark preprocessed|Yes|0.65529|0.00015|0.65500|0.65570|
DGX-1 V100|1|AMP|Spark preprocessed|No|0.65534|0.00015|0.65500|0.65560|
DGX-1 V100|1|AMP|NVTabular preprocessed|Yes|0.65651|0.00028|0.65560|0.65690|
DGX-1 V100|1|AMP|NVTabular preprocessed|No|0.65641|0.00032|0.65570|0.65680|
DGX-1 V100|8|FP32|Spark preprocessed|Yes|0.65544|0.00019|0.65500|0.65580|
DGX-1 V100|8|FP32|Spark preprocessed|No|0.65548|0.00013|0.65510|0.65560|
DGX-1 V100|8|FP32|NVTabular preprocessed|Yes|0.65645|0.00012|0.65630|0.65670|
DGX-1 V100|8|FP32|NVTabular preprocessed|No|0.65638|0.00015|0.65610|0.65670|
DGX-1 V100|8|AMP|Spark preprocessed|Yes|0.65547|0.00015|0.65520|0.65580|
DGX-1 V100|8|AMP|Spark preprocessed|No|0.65540|0.00019|0.65500|0.65580|
DGX-1 V100|8|AMP|NVTabular preprocessed|Yes|0.65642|0.00028|0.65580|0.65690|
DGX-1 V100|8|AMP|NVTabular preprocessed|No|0.65633|0.00037|0.65510|0.65680|
| | GPUs | Precision | XLA | Mean | Std | Min | Max |
| -------- | --- | --------- | ---- | ------ | ------ | ------ | ------ |
|DGX A100|1|TF32|Yes |0.65656|0.00016|0.6563|0.6569
|DGX A100|1|TF32|No |0.65662|0.00013|0.6563|0.6568
|DGX A100|1|AMP|Yes |0.65654|0.00010|0.6563|0.6567
|DGX A100|1|AMP|No |0.65656|0.00011|0.6564|0.6568
|DGX A100|8|TF32|Yes |0.65672|0.00012|0.6565|0.6570
|DGX A100|8|TF32|No |0.65671|0.00013|0.6565|0.6569
|DGX A100|8|AMP|Yes |0.65665|0.00014|0.6564|0.6569
|DGX A100|8|AMP|No |0.65655|0.00012|0.6564|0.6568
|DGX-1 V100|1|FP32|Yes |0.65658|0.00013|0.6563|0.6568
|DGX-1 V100|1|FP32|No |0.65662|0.00011|0.6564|0.6568
|DGX-1 V100|1|AMP|Yes |0.65664|0.00011|0.6564|0.6568
|DGX-1 V100|1|AMP|No |0.65658|0.00011|0.6564|0.6568
|DGX-1 V100|8|FP32|Yes |0.65668|0.00016|0.6564|0.6570
|DGX-1 V100|8|FP32|No |0.65665|0.00019|0.6564|0.6570
|DGX-1 V100|8|AMP|Yes |0.65655|0.00012|0.6563|0.6567
|DGX-1 V100|8|AMP|No |0.65654|0.00013|0.6563|0.6567
</details>
##### Impact of mixed precision on training accuracy
The accuracy of training, measured with [MAP@12](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision) on the evaluation set after the final epoch metric was not impacted by enabling mixed precision. The obtained results were statistically similar. The similarity was measured according to the following procedure:
The model was trained 20 times for default settings (FP32 or TF32 for Volta and Ampere architecture respectively) and 20 times for AMP. After the last epoch, the accuracy score MAP@12 was calculated on the evaluation set.
Distributions for four configurations: architecture (A100, V100) and single/multi gpu for 2 datasets are presented below.
Distributions for four configurations: architecture (A100, V100) and single/multi gpu for NVTabular dataset are presented below.
<p align="center">
<img width="100%" src="./img/amp_influence_spark.svg" />
<img width="100%" src="./img/amp_influence.svg" />
<br>
Figure 6. Influence of AMP on MAP@12 distribution for DGX A100 and DGX-1 V100 for single and multi gpu training on Spark dataset. </a>
Figure 4. Influence of AMP on MAP@12 distribution for DGX A100 and DGX-1 V100 for single- and multi-GPU training.
</p>
<p align="center">
<img width="100%" src="./img/amp_influence_nvtabular.svg" />
<br>
Figure 7. Influence of AMP on MAP@12 distribution for DGX A100 and DGX-1 V100 for single and multi gpu training on NVTabular dataset.
</p>
Distribution scores for full precision training and AMP training were compared in terms of mean, variance and the [Kolmogorov–Smirnov test](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test) to assess the statistical difference between full precision and AMP results. Refer to the expandable table below.
<details>
<summary>Full tabular data for AMP influence on MAP@12</summary>
| |GPUs | Dataset | XLA | Mean MAP@12 for Full precision (TF32 for A100, FP32 for V100) | Std MAP@12 for Full precision (TF32 for A100, FP32 for V100) | Mean MAP@12 for AMP | Std MAP@12 for AMP | KS test value: statistics, p-value |
| ------------ | ---------------------- | ------- | ------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------- | ------------------ | ---------------------------------- |
| DGX A100 | 1 | NVTabular preprocessed | No | 0.6565 | 0.0002 | 0.6564 | 0.0003 | 0.2000 (0.8320) | |
| DGX A100 | 8 | NVTabular preprocessed | No | 0.6564 | 0.0002 | 0.6564 | 0.0003 | 0.1500 (0.9831) | |
| DGX A100 | 1 | Spark preprocessed | No | 0.6554 | 0.0001 | 0.6553 | 0.0002 | 0.2500 (0.5713) | |
| DGX A100 | 8 | Spark preprocessed | No | 0.6552 | 0.0002 | 0.6552 | 0.0002 | 0.3000 (0.3356) | |
| DGX A100 | 1 | NVTabular preprocessed | No | 0.6564 | 0.0004 | 0.6565 | 0.0004 | 0.1500 (0.9831) | |
| DGX A100 | 8 | NVTabular preprocessed | No | 0.6563 | 0.0004 | 0.6564 | 0.0003 | 0.2500 (0.5713) | |
| DGX A100 | 1 | Spark preprocessed | No | 0.6554 | 0.0002 | 0.6554 | 0.0001 | 0.1500 (0.9831) | |
| DGX A100 | 8 | Spark preprocessed | No | 0.6553 | 0.0001 | 0.6552 | 0.0002 | 0.1500 (0.9831)) | |
| DGX-1 V100 | 1 | NVTabular preprocessed | No | 0.6564 | 0.0004 | 0.6564 | 0.0003 | 0.1000 (1.0000) | |
| DGX-1 V100 | 8 | NVTabular preprocessed | No | 0.6564 | 0.0001 | 0.6563 | 0.0004 | 0.2500 (0.5713) | |
| DGX-1 V100 | 1 | Spark preprocessed | No | 0.6554 | 0.0001 | 0.6553 | 0.0001 | 0.2000 (0.8320) | |
| DGX-1 V100 | 8 | Spark preprocessed | No | 0.6555 | 0.0001 | 0.6554 | 0.0002 | 0.3500 (0.1745) | |
| DGX-1 V100 | 1 | NVTabular preprocessed | No | 0.6565 | 0.0002 | 0.6565 | 0.0003 | 0.1500 (0.9831) | |
| DGX-1 V100 | 8 | NVTabular preprocessed | No | 0.6564 | 0.0001 | 0.6564 | 0.0003 | 0.2000 (0.8320) | |
| DGX-1 V100 | 1 | Spark preprocessed | No | 0.6553 | 0.0002 | 0.6553 | 0.0002 | 0.2000 (0.8320) | |
| DGX-1 V100 | 8 | Spark preprocessed | No | 0.6554 | 0.0002 | 0.6555 | 0.0002 | 0.1500 (0.9831) | |
| | GPUs | XLA | Mean MAP@12 for Full precision (TF32 for A100, FP32 for V100) | Std MAP@12 for Full precision (TF32 for A100, FP32 for V100) | Mean MAP@12 for AMP | Std MAP@12 for AMP | KS test value: statistics, p-value |
| ------------ | ---------------------- | ------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------- | ------------------ | ---------------------------------- |
| DGX A100 | 1 | Yes |0.65656|0.00016|0.65654|0.00010|0.10000 (0.99999)
| DGX A100 | 8 | Yes |0.65672|0.00012|0.65665|0.00014|0.40000 (0.08106)
| DGX A100 | 1 | No |0.65662|0.00013|0.65656|0.00011|0.35000 (0.17453)
| DGX A100 | 8 | No |0.65671|0.00013|0.65655|0.00012|0.35000 (0.17453)
| DGX-1 V100 | 1 | Yes |0.65658|0.00013|0.65664|0.00011|0.25000 (0.57134)
| DGX-1 V100 | 8 | Yes |0.65668|0.00016|0.65655|0.00012|0.30000 (0.33559)
| DGX-1 V100 | 1 | No |0.65662|0.00011|0.65658|0.00011|0.20000 (0.83197)
| DGX-1 V100 | 8 | No |0.65665|0.00019|0.65654|0.00013|0.40000 (0.08106)
</details>
@ -696,132 +541,132 @@ Our results were obtained by running the benchmark script (`main.py --benchmark`
|GPUs | Batch size / GPU | XLA | Throughput - TF32 (samples/s)|Throughput - mixed precision (samples/s)|Throughput speedup (TF32 - mixed precision)| Strong scaling - TF32|Strong scaling - mixed precision
| ---- | ---------------- | --- | ----------------------------- | ---------------------------------------- | ------------------------------------------- | --------------------- | -------------------------------- |
|1|131,072|Yes|1642892|1997414|1.22|1.00|1.00|
|1|131,072|No|1269638|1355523|1.07|1.00|1.00|
|8|16,384|Yes|3376438|2508278|0.74|2.06|1.26|
|8|16,384|No|3351118|2643009|0.79|2.64|1.07|
|1|131,072|Yes|2026524|3069487|1.51|1.00|1.00
|1|131,072|No |1379960|1928375|1.40|1.00|1.00
|8|16,384|Yes |6892010|7574174|1.10|3.40|2.47
|8|16,384|No |5124054|5120040|1.00|3.71|2.66
##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
##### Training performance: NVIDIA DGX-1 (8x V100 32GB)
Our results were obtained by running the benchmark script (`main.py --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs.
Our results were obtained by running the benchmark script (`main.py --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 32GB) GPUs.
|GPUs | Batch size / GPU | XLA | Throughput - FP32 (samples/s)|Throughput - mixed precision (samples/s)|Throughput speedup (FP32 - mixed precision)| Strong scaling - FP32|Strong scaling - mixed precision
| ---- | ---------------- | --- | ----------------------------- | ---------------------------------------- | ------------------------------------------- | --------------------- | -------------------------------- |
|1|131,072|Yes|361202|1091584|3.02|1.00|1.00
|1|131,072|No|321816|847229|2.63|1.00|1.00
|8|16,384|Yes|1512691|1731391|1.14|4.19|1.59
|8|16,384|No|1490044|1837962|1.23|4.63|2.17
|1|131,072|Yes|378918|1405633|3.71|1.00|1.00
|1|131,072|No |323817|969824|2.99|1.00|1.00
|8|16,384|Yes |2196648|4332939|1.97|5.80|3.08
|8|16,384|No |1772485|3058944|1.73|5.47|3.15
#### Inference performance results
#### Evaluation performance results
##### Inference performance: NVIDIA DGX A100 (8x A100 80GB)
##### Evaluation performance: NVIDIA DGX A100 (8x A100 80GB)
Our results were obtained by running the benchmark script (`main.py --evaluate --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX A100 with 8x A100 80GB GPUs.
|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\]AMP|Throughput speedup AMP to TF32
|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\] AMP|Throughput speedup AMP to TF32
|----|----------------|---|------------------------------|-----------------------------|-------------------------------
|1|4096|NO|648058|614053|0.95|
|1|8192|NO|1063986|1063203|1.00|
|1|16384|NO|1506679|1573248|1.04|
|1|32768|NO|1983238|2088212|1.05|
|1|65536|NO|2280630|2523812|1.11|
|1|131072|NO|2568911|2915340|1.13|
|8|4096|NO|4516588|4374181|0.97|
|8|8192|NO|7715609|7718173|1.00|
|8|16384|NO|11296845|11624159|1.03|
|8|32768|NO|14957242|15904745|1.06|
|8|65536|NO|17671055|19332987|1.09|
|8|131072|NO|19779711|21761656|1.10|
|1|4096|NO |1107650|1028782|0.93|
|1|8192|NO |1783848|1856528|1.04|
|1|16384|NO |2295874|2409601|1.05|
|1|32768|NO |2367142|2583293|1.09|
|1|65536|NO |3044662|3471619|1.14|
|1|131072|NO |3229625|3823612|1.18|
|8|4096|NO |5503985|5333228|0.97|
|8|8192|NO |12251675|12386870|1.01|
|8|16384|NO |16020973|16438269|1.03|
|8|32768|NO |17225168|18667798|1.08|
|8|65536|NO |19969248|22270424|1.12|
|8|131072|NO |19929457|22496045|1.13|
For more results go to the expandable table below.
<details>
<summary>Full tabular data for inference performance results for DGX A100</summary>
<summary>Full tabular data for evaluation performance results for DGX A100</summary>
|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\]AMP|Throughput speedup AMP to TF32
|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\] AMP|Throughput speedup AMP to TF32
|----|----------------|---|------------------------------|-----------------------------|-------------------------------
|1|4096|YES|621024|648441|1.04|
|1|4096|NO|648058|614053|0.95|
|1|8192|YES|1068943|1045790|0.98|
|1|8192|NO|1063986|1063203|1.00|
|1|16384|YES|1554101|1710186|1.10|
|1|16384|NO|1506679|1573248|1.04|
|1|32768|YES|2014216|2363490|1.17|
|1|32768|NO|1983238|2088212|1.05|
|1|65536|YES|2010050|2450872|1.22|
|1|65536|NO|2280630|2523812|1.11|
|1|131072|YES|2321543|2885393|1.24|
|1|131072|NO|2568911|2915340|1.13|
|8|4096|YES|4328154|4445315|1.03|
|8|4096|NO|4516588|4374181|0.97|
|8|8192|YES|7410554|7640191|1.03|
|8|8192|NO|7715609|7718173|1.00|
|8|16384|YES|11412928|12422567|1.09|
|8|16384|NO|11296845|11624159|1.03|
|8|32768|YES|11428369|12525670|1.10|
|8|32768|NO|14957242|15904745|1.06|
|8|65536|YES|13453756|15308455|1.14|
|8|65536|NO|17671055|19332987|1.09|
|8|131072|YES|17047482|20930042|1.23|
|8|131072|NO|19779711|21761656|1.10|
|1|4096|YES |1344225|1501677|1.12|
|1|4096|NO |1107650|1028782|0.93|
|1|8192|YES |2220721|2545781|1.15|
|1|8192|NO |1783848|1856528|1.04|
|1|16384|YES |2730441|3230949|1.18|
|1|16384|NO |2295874|2409601|1.05|
|1|32768|YES |2527368|2974417|1.18|
|1|32768|NO |2367142|2583293|1.09|
|1|65536|YES |3163906|3935731|1.24|
|1|65536|NO |3044662|3471619|1.14|
|1|131072|YES |3171670|4064426|1.28|
|1|131072|NO |3229625|3823612|1.18|
|8|4096|YES |6243348|6553485|1.05|
|8|4096|NO |5503985|5333228|0.97|
|8|8192|YES |14995914|16222429|1.08|
|8|8192|NO |12251675|12386870|1.01|
|8|16384|YES |14584474|16543902|1.13|
|8|16384|NO |16020973|16438269|1.03|
|8|32768|YES |17840220|21537660|1.21|
|8|32768|NO |17225168|18667798|1.08|
|8|65536|YES |20732672|24082577|1.16|
|8|65536|NO |19969248|22270424|1.12|
|8|131072|YES |20104010|24157900|1.20|
|8|131072|NO |19929457|22496045|1.13|
</details>
##### Inference performance: NVIDIA DGX-1 (8x V100 16GB)
##### Evaluation performance: NVIDIA DGX-1 (8x V100 32GB)
Our results were obtained by running the benchmark script (`main.py --evaluate --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs.
Our results were obtained by running the benchmark script (`main.py --evaluate --benchmark`) in the TensorFlow2 NGC container on NVIDIA DGX-1 with (8x V100 32GB) GPUs.
|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\]AMP|Throughput speedup AMP to TF32
|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] FP32|Throughput \[samples/s\] AMP|Throughput speedup AMP to FP32
|----|----------------|---|------------------------------|-----------------------------|-------------------------------
|1|4096|NO|375928|439395|1.17|
|1|8192|NO|526780|754517|1.43|
|1|16384|NO|673971|1133696|1.68|
|1|32768|NO|791637|1470221|1.86|
|1|65536|NO|842831|1753500|2.08|
|1|131072|NO|892941|1990898|2.23|
|8|4096|NO|2893390|3278473|1.13|
|8|8192|NO|3881996|5337866|1.38|
|8|16384|NO|5003135|8086178|1.62|
|8|32768|NO|6124648|11087247|1.81|
|8|65536|NO|6631887|13233484|2.00|
|8|131072|NO|7030438|15081861|2.15|
|1|4096|NO |499442|718163|1.44|
|1|8192|NO |670906|1144640|1.71|
|1|16384|NO |802366|1599006|1.99|
|1|32768|NO |856130|1795285|2.10|
|1|65536|NO |934394|2221221|2.38|
|1|131072|NO |965293|2403829|2.49|
|8|4096|NO |2840155|3602516|1.27|
|8|8192|NO |4810100|7912019|1.64|
|8|16384|NO |5939908|10876135|1.83|
|8|32768|NO |6489446|12593087|1.94|
|8|65536|NO |6614453|14742844|2.23|
|8|131072|NO |7133219|15524549|2.18|
For more results go to the expandable table below.
<details>
<summary>Full tabular data for inference performance for DGX-1 V100 results</summary>
<summary>Full tabular data for evaluation performance results for DGX-1 V100</summary>
|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] TF32|Throughput \[samples/s\]AMP|Throughput speedup AMP to TF32
|GPUs|Batch size / GPU|XLA|Throughput \[samples/s\] FP32|Throughput \[samples/s\] AMP|Throughput speedup AMP to FP32
|----|----------------|---|------------------------------|-----------------------------|-------------------------------
|1|4096|YES|356963|459481|1.29|
|1|4096|NO|375928|439395|1.17|
|1|8192|YES|517016|734515|1.42|
|1|8192|NO|526780|754517|1.43|
|1|16384|YES|660772|1150292|1.74|
|1|16384|NO|673971|1133696|1.68|
|1|32768|YES|776357|1541699|1.99|
|1|32768|NO|791637|1470221|1.86|
|1|65536|YES|863311|1962275|2.27|
|1|65536|NO|842831|1753500|2.08|
|1|131072|YES|928290|2235968|2.41|
|1|131072|NO|892941|1990898|2.23|
|8|4096|YES|2680961|3182591|1.19|
|8|4096|NO|2893390|3278473|1.13|
|8|8192|YES|3738172|5185972|1.39|
|8|8192|NO|3881996|5337866|1.38|
|8|16384|YES|4961435|8170489|1.65|
|8|16384|NO|5003135|8086178|1.62|
|8|32768|YES|6218767|11658218|1.87|
|8|32768|NO|6124648|11087247|1.81|
|8|65536|YES|6808677|14921211|2.19|
|8|65536|NO|6631887|13233484|2.00|
|8|131072|YES|7205370|16923294|2.35|
|8|131072|NO|7030438|15081861|2.15|
|1|4096|YES |573285|919150|1.60|
|1|4096|NO |499442|718163|1.44|
|1|8192|YES |753993|1486867|1.97|
|1|8192|NO |670906|1144640|1.71|
|1|16384|YES |859699|1945700|2.26|
|1|16384|NO |802366|1599006|1.99|
|1|32768|YES |904255|1995194|2.21|
|1|32768|NO |856130|1795285|2.10|
|1|65536|YES |982448|2608010|2.65|
|1|65536|NO |934394|2221221|2.38|
|1|131072|YES |926734|2621095|2.83|
|1|131072|NO |965293|2403829|2.49|
|8|4096|YES |3102948|4083015|1.32|
|8|4096|NO |2840155|3602516|1.27|
|8|8192|YES |5536556|10094905|1.82|
|8|8192|NO |4810100|7912019|1.64|
|8|16384|YES |5722386|10524548|1.84|
|8|16384|NO |5939908|10876135|1.83|
|8|32768|YES |6813318|14356608|2.11|
|8|32768|NO |6489446|12593087|1.94|
|8|65536|YES |6918413|16227668|2.35|
|8|65536|NO |6614453|14742844|2.23|
|8|131072|YES |6910518|16423342|2.38|
|8|131072|NO |7133219|15524549|2.18|
</details>
## Release notes
@ -829,7 +674,15 @@ For more results go to the expandable table below.
### Changelog
February 2021
Initial release
- Initial release
November 2021
- Refresh release with performance optimizations
- Updated NVTabular to v0.6.1
- Replaced native TF dataloader with NVTabular counterpart
- Removed spark CPU preprocessing
- Updated readme numbers
- Changed V100 cards from 16GB to 32GB
### Known issues
* In this model, TF32 precision can in some cases be as fast as FP16 precision on Ampere GPUs. This is because TF32 also uses Tensor Cores and doesn't need any additional logic, such as maintaining FP32 master weights and casts. However, please note that W&D is, by modern recommender standards, a very small model. Larger models should still see significant benefits from using FP16 math.

View File

@ -12,128 +12,67 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
from multiprocessing import cpu_count
import cupy
import horovod.tensorflow as hvd
import tensorflow as tf
from data.outbrain.features import CATEGORICAL_COLUMNS, NUMERIC_COLUMNS
from nvtabular.loader.tensorflow import KerasSequenceLoader
from data.outbrain.features import get_features_keys
cupy.random.seed(None)
def _consolidate_batch(elem):
label = elem.pop('label')
reshaped_label = tf.reshape(label, [-1, label.shape[-1]])
features = get_features_keys()
def seed_fn():
min_int, max_int = tf.int32.limits
max_rand = max_int // hvd.size()
reshaped_elem = {
key: tf.reshape(elem[key], [-1, elem[key].shape[-1]])
for key in elem
if key in features
}
# Generate a seed fragment on each worker
seed_fragment = cupy.random.randint(0, max_rand).get()
return reshaped_elem, reshaped_label
# Aggregate seed fragments from all Horovod workers
seed_tensor = tf.constant(seed_fragment)
reduced_seed = hvd.allreduce(seed_tensor, name="shuffle_seed", op=hvd.mpi_ops.Sum)
def get_parse_function(feature_spec):
def _parse_function(example_proto):
return tf.io.parse_single_example(example_proto, feature_spec)
return _parse_function
return reduced_seed % max_rand
def train_input_fn(
filepath_pattern,
feature_spec,
records_batch_size,
num_gpus=1,
id=0):
_parse_function = get_parse_function(feature_spec)
dataset = tf.data.Dataset.list_files(
file_pattern=filepath_pattern
)
dataset = dataset.interleave(
lambda x: tf.data.TFRecordDataset(x),
cycle_length=cpu_count() // num_gpus,
block_length=1
)
dataset = dataset.map(
map_func=_parse_function,
num_parallel_calls=tf.data.experimental.AUTOTUNE
)
dataset = dataset.shard(num_gpus, id)
dataset = dataset.shuffle(records_batch_size * 8)
dataset = dataset.repeat(
count=None
)
dataset = dataset.batch(
train_paths, records_batch_size, buffer_size=0.1, parts_per_chunk=1, shuffle=True
):
train_dataset_tf = KerasSequenceLoader(
train_paths,
batch_size=records_batch_size,
drop_remainder=False
label_names=["clicked"],
cat_names=CATEGORICAL_COLUMNS,
cont_names=NUMERIC_COLUMNS,
engine="parquet",
shuffle=shuffle,
buffer_size=buffer_size,
parts_per_chunk=parts_per_chunk,
global_size=hvd.size(),
global_rank=hvd.rank(),
seed_fn=seed_fn,
)
dataset = dataset.map(
map_func=partial(
_consolidate_batch
),
num_parallel_calls=tf.data.experimental.AUTOTUNE
)
dataset = dataset.prefetch(
buffer_size=tf.data.experimental.AUTOTUNE
)
return dataset
return train_dataset_tf
def eval_input_fn(
filepath_pattern,
feature_spec,
records_batch_size,
num_gpus=1,
repeat=1,
id=0):
dataset = tf.data.Dataset.list_files(
file_pattern=filepath_pattern,
shuffle=False
)
dataset = tf.data.TFRecordDataset(
filenames=dataset,
num_parallel_reads=1
)
dataset = dataset.shard(num_gpus, id)
dataset = dataset.repeat(
count=repeat
)
dataset = dataset.batch(
valid_paths, records_batch_size, buffer_size=0.1, parts_per_chunk=1, shuffle=False
):
valid_dataset_tf = KerasSequenceLoader(
valid_paths,
batch_size=records_batch_size,
drop_remainder=False
label_names=["clicked"],
cat_names=CATEGORICAL_COLUMNS + ["display_id"],
cont_names=NUMERIC_COLUMNS,
engine="parquet",
shuffle=shuffle,
buffer_size=buffer_size,
parts_per_chunk=parts_per_chunk,
global_size=hvd.size(),
global_rank=hvd.rank(),
seed_fn=seed_fn,
)
dataset = dataset.apply(
transformation_func=tf.data.experimental.parse_example_dataset(
features=feature_spec,
num_parallel_calls=1
)
)
dataset = dataset.map(
map_func=partial(
_consolidate_batch
),
num_parallel_calls=None
)
dataset = dataset.prefetch(
buffer_size=1
)
return dataset
return valid_dataset_tf

View File

@ -16,83 +16,75 @@ import logging
import tensorflow as tf
PREBATCH_SIZE = 4096
DISPLAY_ID_COLUMN = 'display_id'
DISPLAY_ID_COLUMN = "display_id"
TIME_COLUMNS = [
'doc_event_days_since_published_log_01scaled',
'doc_ad_days_since_published_log_01scaled'
NUMERIC_COLUMNS = [
"document_id_document_id_promo_sim_categories",
"document_id_document_id_promo_sim_topics",
"document_id_document_id_promo_sim_entities",
"document_id_promo_ctr",
"publisher_id_promo_ctr",
"source_id_promo_ctr",
"document_id_promo_count",
"publish_time_days_since_published",
"ad_id_ctr",
"advertiser_id_ctr",
"campaign_id_ctr",
"ad_id_count",
"publish_time_promo_days_since_published",
]
GB_COLUMNS = [
'pop_document_id',
'pop_publisher_id',
'pop_source_id',
'pop_ad_id',
'pop_advertiser_id',
'pop_campain_id',
'doc_views_log_01scaled',
'ad_views_log_01scaled'
]
SIM_COLUMNS = [
'doc_event_doc_ad_sim_categories',
'doc_event_doc_ad_sim_topics',
'doc_event_doc_ad_sim_entities'
]
NUMERIC_COLUMNS = TIME_COLUMNS + SIM_COLUMNS + GB_COLUMNS
CATEGORICAL_COLUMNS = [
'ad_id',
'campaign_id',
'doc_event_id',
'event_platform',
'doc_id',
'ad_advertiser',
'doc_event_source_id',
'doc_event_publisher_id',
'doc_ad_source_id',
'doc_ad_publisher_id',
'event_geo_location',
'event_country',
'event_country_state',
"ad_id",
"document_id",
"platform",
"document_id_promo",
"campaign_id",
"advertiser_id",
"source_id",
"geo_location",
"geo_location_country",
"geo_location_state",
"publisher_id",
"source_id_promo",
"publisher_id_promo",
]
HASH_BUCKET_SIZES = {
'doc_event_id': 300000,
'ad_id': 250000,
'doc_id': 100000,
'doc_ad_source_id': 4000,
'doc_event_source_id': 4000,
'event_geo_location': 2500,
'ad_advertiser': 2500,
'event_country_state': 2000,
'doc_ad_publisher_id': 1000,
'doc_event_publisher_id': 1000,
'event_country': 300,
'event_platform': 4,
'campaign_id': 5000
"document_id": 300000,
"ad_id": 250000,
"document_id_promo": 100000,
"source_id_promo": 4000,
"source_id": 4000,
"geo_location": 2500,
"advertiser_id": 2500,
"geo_location_state": 2000,
"publisher_id_promo": 1000,
"publisher_id": 1000,
"geo_location_country": 300,
"platform": 4,
"campaign_id": 5000,
}
EMBEDDING_DIMENSIONS = {
'doc_event_id': 128,
'ad_id': 128,
'doc_id': 128,
'doc_ad_source_id': 64,
'doc_event_source_id': 64,
'event_geo_location': 64,
'ad_advertiser': 64,
'event_country_state': 64,
'doc_ad_publisher_id': 64,
'doc_event_publisher_id': 64,
'event_country': 64,
'event_platform': 16,
'campaign_id': 128
"document_id": 128,
"ad_id": 128,
"document_id_promo": 128,
"source_id_promo": 64,
"source_id": 64,
"geo_location": 64,
"advertiser_id": 64,
"geo_location_state": 64,
"publisher_id_promo": 64,
"publisher_id": 64,
"geo_location_country": 64,
"platform": 19,
"campaign_id": 128,
}
EMBEDDING_TABLE_SHAPES = {
column: (HASH_BUCKET_SIZES[column], EMBEDDING_DIMENSIONS[column]) for column in CATEGORICAL_COLUMNS
column: (HASH_BUCKET_SIZES[column], EMBEDDING_DIMENSIONS[column])
for column in CATEGORICAL_COLUMNS
}
@ -101,31 +93,40 @@ def get_features_keys():
def get_feature_columns():
logger = logging.getLogger('tensorflow')
logger = logging.getLogger("tensorflow")
wide_columns, deep_columns = [], []
for column_name in CATEGORICAL_COLUMNS:
if column_name in EMBEDDING_TABLE_SHAPES:
categorical_column = tf.feature_column.categorical_column_with_identity(
column_name, num_buckets=EMBEDDING_TABLE_SHAPES[column_name][0])
column_name, num_buckets=EMBEDDING_TABLE_SHAPES[column_name][0]
)
wrapped_column = tf.feature_column.embedding_column(
categorical_column,
dimension=EMBEDDING_TABLE_SHAPES[column_name][1],
combiner='mean')
combiner="mean",
)
else:
raise ValueError(f'Unexpected categorical column found {column_name}')
raise ValueError(f"Unexpected categorical column found {column_name}")
wide_columns.append(categorical_column)
deep_columns.append(wrapped_column)
numerics = [tf.feature_column.numeric_column(column_name, shape=(1,), dtype=tf.float32)
for column_name in NUMERIC_COLUMNS]
numerics = [
tf.feature_column.numeric_column(column_name, shape=(1,), dtype=tf.float32)
for column_name in NUMERIC_COLUMNS
if column_name != DISPLAY_ID_COLUMN
]
wide_columns.extend(numerics)
deep_columns.extend(numerics)
logger.warning('deep columns: {}'.format(len(deep_columns)))
logger.warning('wide columns: {}'.format(len(wide_columns)))
logger.warning('wide&deep intersection: {}'.format(len(set(wide_columns).intersection(set(deep_columns)))))
logger.warning("deep columns: {}".format(len(deep_columns)))
logger.warning("wide columns: {}".format(len(wide_columns)))
logger.warning(
"wide&deep intersection: {}".format(
len(set(wide_columns).intersection(set(deep_columns)))
)
)
return wide_columns, deep_columns

View File

@ -15,37 +15,28 @@
import logging
import os
os.environ['TF_MEMORY_ALLOCATION'] = "0.0"
from data.outbrain.nvtabular.utils.converter import nvt_to_tfrecords
os.environ["TF_MEMORY_ALLOCATION"] = "0.0"
from data.outbrain.nvtabular.utils.workflow import execute_pipeline
from data.outbrain.nvtabular.utils.arguments import parse_args
from data.outbrain.nvtabular.utils.setup import create_config
def is_empty(path):
    """Report whether ``path`` holds no data.

    A path counts as empty when it does not exist, or when it exists, is not
    a regular file, and ``os.listdir`` returns no entries for it. An existing
    regular file is never considered empty.
    """
    if not os.path.exists(path):
        return True
    if os.path.isfile(path):
        return False
    # Existing non-file path: empty exactly when it lists no entries.
    return not os.listdir(path)
def main():
args = parse_args()
config = create_config(args)
if is_empty(args.metadata_path):
logging.warning('Creating new stats data into {}'.format(config['stats_file']))
logging.warning(
"Creating parquets into {}".format(config["output_bucket_folder"])
)
execute_pipeline(config)
else:
logging.warning('Directory is not empty {args.metadata_path}')
logging.warning('Skipping NVTabular preprocessing')
if os.path.exists(config['output_train_folder']) and os.path.exists(config['output_valid_folder']):
if is_empty(config['tfrecords_path']):
logging.warning('Executing NVTabular parquets to TFRecords conversion')
nvt_to_tfrecords(config)
else:
logging.warning(f"Directory is not empty {config['tfrecords_path']}")
logging.warning('Skipping TFrecords conversion')
else:
logging.warning(f'Train and validation dataset not found in {args.metadata_path}')
logging.warning(f"Directory exists {args.metadata_path}")
logging.warning("Skipping NVTabular preprocessing")
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@ -14,39 +14,32 @@
import argparse
DEFAULT_DIR = "/outbrain"


def parse_args():
    """Parse the command-line options of the NVTabular preprocessing script.

    Returns:
        argparse.Namespace with ``data_path`` (str), ``metadata_path`` (str)
        and ``use_dask`` (bool).
    """
    parser = argparse.ArgumentParser()

    # Each path option takes exactly one value. With ``nargs="+"`` argparse
    # returned a list whenever the option was passed explicitly (but a plain
    # string for the default), and a list cannot be handed to the ``os.path``
    # functions these values are used with.
    parser.add_argument(
        "--data_path",
        help="Path with the data required for NVTabular preprocessing. "
        "If stats already exists under metadata_path preprocessing phase will be skipped.",
        type=str,
        default=f"{DEFAULT_DIR}/orig",
    )
    parser.add_argument(
        "--metadata_path",
        help="Path with preprocessed NVTabular stats",
        type=str,
        default=f"{DEFAULT_DIR}/data",
    )
    parser.add_argument(
        "--use_dask",
        default=False,
        action="store_true",
        help="Use multi-gpu preprocessing for nvTabular workflow",
    )

    return parser.parse_args()

View File

@ -1,158 +0,0 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from multiprocessing import Process
import pandas as pd
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import metadata_io
from data.outbrain.features import PREBATCH_SIZE
from data.outbrain.nvtabular.utils.feature_description import transform_nvt_to_spark, CATEGORICAL_COLUMNS, \
DISPLAY_ID_COLUMN, EXCLUDE_COLUMNS
def create_metadata(df, prebatch_size, output_path):
    """Write a TensorFlow Transform metadata schema describing ``df``.

    Every column of ``df`` becomes a ``FixedLenFeature`` of shape
    ``[prebatch_size, 1]``, keyed by its ``transform_nvt_to_spark`` name.
    Categorical columns and the display-id column are declared ``tf.int64``;
    every other column is declared ``tf.float32``.

    Args:
        df: Frame whose column names define the schema.
        prebatch_size: First dimension of every feature shape.
        output_path: Destination passed to ``metadata_io.write_metadata``.
    """
    integer_columns = CATEGORICAL_COLUMNS + [DISPLAY_ID_COLUMN]
    batch_shape = [prebatch_size, 1]

    feature_spec = {}
    for name in df:
        dtype = tf.int64 if name in integer_columns else tf.float32
        feature_spec[transform_nvt_to_spark(name)] = tf.io.FixedLenFeature(
            shape=batch_shape, dtype=dtype, default_value=None
        )

    schema = dataset_schema.from_feature_spec(feature_spec)
    metadata_io.write_metadata(dataset_metadata.DatasetMetadata(schema), output_path)
def create_tf_example(df, start_index, offset):
    """Pack ``offset`` consecutive rows of ``df`` into one ``tf.train.Example``.

    Rows are selected by label with ``df.loc`` from ``start_index`` through
    ``start_index + offset - 1`` inclusive. Each column becomes one feature
    keyed by its ``transform_nvt_to_spark`` name: an int64 list for the
    categorical and display-id columns, a float list for everything else.

    Returns:
        The assembled ``tf.train.Example``.
    """
    integer_columns = CATEGORICAL_COLUMNS + [DISPLAY_ID_COLUMN]
    window = df.loc[start_index:start_index + offset - 1]

    feature_map = {}
    for name in window:
        values = window[name].to_numpy()
        if name in integer_columns:
            packed = tf.train.Feature(int64_list=tf.train.Int64List(value=values))
        else:
            packed = tf.train.Feature(float_list=tf.train.FloatList(value=values))
        feature_map[transform_nvt_to_spark(name)] = packed

    return tf.train.Example(features=tf.train.Features(feature=feature_map))
def create_tf_records(df, prebatch_size, output_path):
    """Write ``df`` to a TFRecord file, one record per ``prebatch_size`` rows.

    Rows are consumed in consecutive label-based windows starting at the
    first index label; a trailing window shorter than ``prebatch_size`` is
    not written. For a frame with no rows the writer is still opened, but no
    record is written.

    Args:
        df: Frame to serialize; its index labels are used as integer row
            positions for the window arithmetic.
        prebatch_size: Number of rows packed into each record.
        output_path: Path of the TFRecord file to write.
    """
    with tf.io.TFRecordWriter(output_path) as out_file:
        # A frame without rows has no first index label, so ``df.index[0]``
        # would raise IndexError. There is nothing to serialize in that case.
        if df.shape[0] == 0:
            return
        start_index = df.index[0]
        # Last admissible window start + 1: only full windows are emitted.
        stop_index = df.shape[0] + start_index - prebatch_size + 1
        for index in range(start_index, stop_index, prebatch_size):
            example = create_tf_example(df, index, prebatch_size)
            out_file.write(example.SerializeToString())
def convert(path_to_nvt_dataset, output_path, prebatch_size, exclude_columns, workers=6):
    """Convert the NVTabular train/valid parquet datasets into TFRecord shards.

    Reads ``<path_to_nvt_dataset>/train`` and ``<path_to_nvt_dataset>/valid``,
    drops ``exclude_columns`` from each, and writes ``workers`` shard files
    per split under ``<output_path>/train`` and ``<output_path>/eval``. The
    schema metadata is written to ``<output_path>/transformed_metadata`` from
    the training frame. Metadata and every shard are produced in separate
    processes, all of which are joined before returning.

    Args:
        path_to_nvt_dataset: Directory containing the ``train`` and ``valid``
            parquet datasets.
        output_path: Root directory for metadata and TFRecord output.
        prebatch_size: Rows per TFRecord record.
        exclude_columns: Columns dropped from both frames before writing.
        workers: Number of shard files (and writer processes) per split.

    Returns:
        ``output_path``.
    """
    train_path = os.path.join(path_to_nvt_dataset, 'train')
    valid_path = os.path.join(path_to_nvt_dataset, 'valid')
    output_metadata_path = os.path.join(output_path, 'transformed_metadata')
    output_train_path = os.path.join(output_path, 'train')
    output_valid_path = os.path.join(output_path, 'eval')
    for directory in (output_metadata_path, output_train_path, output_valid_path):
        os.makedirs(directory, exist_ok=True)

    # One zero-padded shard name per worker, shared by both splits.
    part_names = [f'part-r-{index:05d}' for index in range(workers)]
    output_train_paths = [os.path.join(output_train_path, name) for name in part_names]
    output_valid_paths = [os.path.join(output_valid_path, name) for name in part_names]

    def load_split(label, path):
        # Read one split and strip the columns that must not reach the records.
        logging.warning(f'Reading {label} parquets from {path}')
        frame = pd.read_parquet(path, engine='pyarrow')
        logging.warning('Done')
        logging.warning(f'Removing {label} columns {exclude_columns}')
        frame = frame.drop(columns=exclude_columns)
        logging.warning('Done')
        return frame

    def launch_writers(frame, shard_paths):
        # Rows per shard: the even share, bumped up to a multiple of prebatch_size.
        rows_per_shard = frame.shape[0] // workers
        rows_per_shard = rows_per_shard + (prebatch_size - rows_per_shard % prebatch_size)
        processes = []
        for position in range(workers):
            first = position * rows_per_shard
            last = (position + 1) * rows_per_shard - 1
            processes.append(
                Process(
                    target=create_tf_records,
                    args=(frame.loc[first:last], prebatch_size, shard_paths[position]),
                )
            )
        # All writers are created before any of them is started.
        for process in processes:
            process.start()
        return processes

    logging.warning(f'Prebatch size set to {prebatch_size}')
    logging.warning(f'Number of TFRecords set to {workers}')

    df_train = load_split('training', train_path)

    logging.warning(f'Creating metadata in {output_metadata_path}')
    metadata_worker = Process(
        target=create_metadata, args=(df_train, prebatch_size, output_metadata_path)
    )
    metadata_worker.start()

    logging.warning(f'Creating training TFrecords to {output_train_paths}')
    train_workers = launch_writers(df_train, output_train_paths)

    df_valid = load_split('validation', valid_path)

    logging.warning(f'Creating validation TFrecords to {output_valid_paths}')
    valid_workers = launch_writers(df_valid, output_valid_paths)

    for train_worker, valid_worker in zip(train_workers, valid_workers):
        metadata_worker.join()
        train_worker.join()
        valid_worker.join()
    logging.warning('Done')

    del df_train
    del df_valid

    return output_path
def nvt_to_tfrecords(config):
    """Run ``convert`` with the locations and worker count taken from ``config``.

    Reads ``config['output_bucket_folder']`` (NVTabular dataset),
    ``config['tfrecords_path']`` (output root) and ``config['workers']``;
    the prebatch size and excluded columns come from the module-level
    ``PREBATCH_SIZE`` and ``EXCLUDE_COLUMNS``.
    """
    convert(
        path_to_nvt_dataset=config['output_bucket_folder'],
        output_path=config['tfrecords_path'],
        prebatch_size=PREBATCH_SIZE,
        exclude_columns=EXCLUDE_COLUMNS,
        workers=config['workers'],
    )

View File

@ -12,89 +12,105 @@
# See the License for the specific language governing permissions and
# limitations under the License.
DISPLAY_ID_COLUMN = 'display_id'
DISPLAY_ID_COLUMN = "display_id"
BASE_CONT_COLUMNS = ['publish_time', 'publish_time_promo', 'timestamp', 'document_id_promo_clicked_sum_ctr',
'publisher_id_promo_clicked_sum_ctr',
'source_id_promo_clicked_sum_ctr', 'document_id_promo_count', 'publish_time_days_since_published',
'ad_id_clicked_sum_ctr',
'advertiser_id_clicked_sum_ctr', 'campaign_id_clicked_sum_ctr', 'ad_id_count',
'publish_time_promo_days_since_published']
BASE_CONT_COLUMNS = [
"publish_time",
"publish_time_promo",
"timestamp",
"document_id_promo_clicked_sum_ctr",
"publisher_id_promo_clicked_sum_ctr",
"source_id_promo_clicked_sum_ctr",
"document_id_promo_count",
"publish_time_days_since_published",
"ad_id_clicked_sum_ctr",
"advertiser_id_clicked_sum_ctr",
"campaign_id_clicked_sum_ctr",
"ad_id_count",
"publish_time_promo_days_since_published",
]
SIM_COLUMNS = [
'doc_event_doc_ad_sim_categories',
'doc_event_doc_ad_sim_topics',
'doc_event_doc_ad_sim_entities'
"doc_event_doc_ad_sim_categories",
"doc_event_doc_ad_sim_topics",
"doc_event_doc_ad_sim_entities",
]
CONTINUOUS_COLUMNS = BASE_CONT_COLUMNS + SIM_COLUMNS + [DISPLAY_ID_COLUMN]
groupby_columns = ['ad_id_count', 'ad_id_clicked_sum', 'source_id_promo_count', 'source_id_promo_clicked_sum',
'document_id_promo_count', 'document_id_promo_clicked_sum',
'publisher_id_promo_count', 'publisher_id_promo_clicked_sum', 'advertiser_id_count',
'advertiser_id_clicked_sum',
'campaign_id_count', 'campaign_id_clicked_sum']
ctr_columns = ['advertiser_id_clicked_sum_ctr', 'document_id_promo_clicked_sum_ctr',
'publisher_id_promo_clicked_sum_ctr',
'source_id_promo_clicked_sum_ctr',
'ad_id_clicked_sum_ctr', 'campaign_id_clicked_sum_ctr']
exclude_conts = ['publish_time', 'publish_time_promo', 'timestamp']
exclude_conts = ["publish_time", "publish_time_promo", "timestamp"]
NUMERIC_COLUMNS = [col for col in CONTINUOUS_COLUMNS if col not in exclude_conts]
CATEGORICAL_COLUMNS = ['ad_id', 'document_id', 'platform', 'document_id_promo', 'campaign_id', 'advertiser_id',
'source_id',
'publisher_id', 'source_id_promo', 'publisher_id_promo', 'geo_location', 'geo_location_country',
'geo_location_state']
CATEGORICAL_COLUMNS = [
"ad_id",
"document_id",
"platform",
"document_id_promo",
"campaign_id",
"advertiser_id",
"source_id",
"publisher_id",
"source_id_promo",
"publisher_id_promo",
]
CTR_INPUTS = [
"ad_id",
"source_id_promo",
"document_id_promo",
"publisher_id_promo",
"advertiser_id",
"campaign_id",
]
EXCLUDE_COLUMNS = [
'publish_time',
'publish_time_promo',
'timestamp',
'ad_id_clicked_sum',
'source_id_promo_count',
'source_id_promo_clicked_sum',
'document_id_promo_clicked_sum',
'publisher_id_promo_count', 'publisher_id_promo_clicked_sum',
'advertiser_id_count',
'advertiser_id_clicked_sum',
'campaign_id_count',
'campaign_id_clicked_sum',
'uuid',
'day_event'
"publish_time",
"publish_time_promo",
"timestamp",
"ad_id_clicked_sum",
"source_id_promo_count",
"source_id_promo_clicked_sum",
"document_id_promo_clicked_sum",
"publisher_id_promo_count",
"publisher_id_promo_clicked_sum",
"advertiser_id_count",
"advertiser_id_clicked_sum",
"campaign_id_count",
"campaign_id_clicked_sum",
"uuid",
"day_event",
]
nvt_to_spark = {
'ad_id': 'ad_id',
'clicked': 'label',
'display_id': 'display_id',
'document_id': 'doc_event_id',
'platform': 'event_platform',
'document_id_promo': 'doc_id',
'campaign_id': 'campaign_id',
'advertiser_id': 'ad_advertiser',
'source_id': 'doc_event_source_id',
'publisher_id': 'doc_event_publisher_id',
'source_id_promo': 'doc_ad_source_id',
'publisher_id_promo': 'doc_ad_publisher_id',
'geo_location': 'event_geo_location',
'geo_location_country': 'event_country',
'geo_location_state': 'event_country_state',
'document_id_promo_clicked_sum_ctr': 'pop_document_id',
'publisher_id_promo_clicked_sum_ctr': 'pop_publisher_id',
'source_id_promo_clicked_sum_ctr': 'pop_source_id',
'document_id_promo_count': 'doc_views_log_01scaled',
'publish_time_days_since_published': 'doc_event_days_since_published_log_01scaled',
'ad_id_clicked_sum_ctr': 'pop_ad_id',
'advertiser_id_clicked_sum_ctr': 'pop_advertiser_id',
'campaign_id_clicked_sum_ctr': 'pop_campain_id',
'ad_id_count': 'ad_views_log_01scaled',
'publish_time_promo_days_since_published': 'doc_ad_days_since_published_log_01scaled',
'doc_event_doc_ad_sim_categories': 'doc_event_doc_ad_sim_categories',
'doc_event_doc_ad_sim_topics': 'doc_event_doc_ad_sim_topics',
'doc_event_doc_ad_sim_entities': 'doc_event_doc_ad_sim_entities'
"ad_id": "ad_id",
"clicked": "label",
"display_id": "display_id",
"document_id": "doc_event_id",
"platform": "event_platform",
"document_id_promo": "doc_id",
"campaign_id": "campaign_id",
"advertiser_id": "ad_advertiser",
"source_id": "doc_event_source_id",
"publisher_id": "doc_event_publisher_id",
"source_id_promo": "doc_ad_source_id",
"publisher_id_promo": "doc_ad_publisher_id",
"geo_location": "event_geo_location",
"geo_location_country": "event_country",
"geo_location_state": "event_country_state",
"document_id_promo_ctr": "pop_document_id",
"publisher_id_promo_ctr": "pop_publisher_id",
"source_id_promo_ctr": "pop_source_id",
"document_id_promo_count": "doc_views_log_01scaled",
"publish_time_days_since_published": "doc_event_days_since_published_log_01scaled",
"ad_id_ctr": "pop_ad_id",
"advertiser_id_ctr": "pop_advertiser_id",
"campaign_id_ctr": "pop_campain_id",
"ad_id_count": "ad_views_log_01scaled",
"publish_time_promo_days_since_published": "doc_ad_days_since_published_log_01scaled",
"document_id_document_id_promo_sim_categories": "doc_event_doc_ad_sim_categories",
"document_id_document_id_promo_sim_topics": "doc_event_doc_ad_sim_topics",
"document_id_document_id_promo_sim_entities": "doc_event_doc_ad_sim_entities",
}
spark_to_nvt = {item: key for key, item in nvt_to_spark.items()}

View File

@ -15,34 +15,30 @@
import os
from data.outbrain.features import HASH_BUCKET_SIZES
from data.outbrain.nvtabular.utils.feature_description import transform_spark_to_nvt
def create_config(args):
stats_file = os.path.join(args.metadata_path, 'stats_wnd_workflow')
data_bucket_folder = args.data_path
output_bucket_folder = args.metadata_path
output_train_folder = os.path.join(output_bucket_folder, 'train/')
temporary_folder = os.path.join('/tmp', 'preprocessed')
train_path = os.path.join(temporary_folder, 'train_gdf.parquet')
valid_path = os.path.join(temporary_folder, 'valid_gdf.parquet')
output_valid_folder = os.path.join(output_bucket_folder, 'valid/')
tfrecords_path = args.tfrecords_path
workers = args.workers
hash_spec = {transform_spark_to_nvt(column): hash for column, hash in HASH_BUCKET_SIZES.items()}
temporary_folder = os.path.join("/tmp", "preprocessed")
train_path = os.path.join(temporary_folder, "train_gdf.parquet")
valid_path = os.path.join(temporary_folder, "valid_gdf.parquet")
stats_file = os.path.join(temporary_folder, "stats_wnd_workflow")
output_train_folder = os.path.join(output_bucket_folder, "train/")
output_valid_folder = os.path.join(output_bucket_folder, "valid/")
hash_spec = HASH_BUCKET_SIZES
config = {
'stats_file': stats_file,
'data_bucket_folder': data_bucket_folder,
'output_bucket_folder': output_bucket_folder,
'output_train_folder': output_train_folder,
'temporary_folder': temporary_folder,
'train_path': train_path,
'valid_path': valid_path,
'output_valid_folder': output_valid_folder,
'tfrecords_path': tfrecords_path,
'workers': workers,
'hash_spec': hash_spec
"stats_file": stats_file,
"data_bucket_folder": data_bucket_folder,
"output_bucket_folder": output_bucket_folder,
"output_train_folder": output_train_folder,
"temporary_folder": temporary_folder,
"train_path": train_path,
"valid_path": valid_path,
"output_valid_folder": output_valid_folder,
"hash_spec": hash_spec,
"dask": args.use_dask
}
return config

View File

@ -21,10 +21,24 @@ import nvtabular as nvt
import rmm
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from data.outbrain.nvtabular.utils.feature_description import CATEGORICAL_COLUMNS, CONTINUOUS_COLUMNS, \
DISPLAY_ID_COLUMN, groupby_columns, ctr_columns
from data.outbrain.nvtabular.utils.feature_description import (
CATEGORICAL_COLUMNS,
DISPLAY_ID_COLUMN,
CTR_INPUTS,
)
from nvtabular import ColumnGroup
from nvtabular.io import Shuffle
from nvtabular.ops import Normalize, FillMedian, FillMissing, LogOp, LambdaOp, JoinGroupby, HashBucket
from nvtabular.ops import (
FillMedian,
LogOp,
Rename,
JoinGroupby,
LambdaOp,
FillMissing,
HashBucket,
Normalize,
)
from nvtabular.ops import Operator
from nvtabular.ops.column_similarity import ColumnSimilarity
from nvtabular.utils import device_mem_size, get_rmm_size
@ -33,24 +47,38 @@ TIMESTAMP_DELTA = 1465876799998
def get_devices():
    """Return the GPU indices to use as a list of ints.

    When ``CUDA_VISIBLE_DEVICES`` is set, its comma-separated entries are
    converted with ``int``. Otherwise NVML is initialised and every device it
    counts is returned as ``0 .. count - 1``.
    """
    visible = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible is not None:
        return [int(entry) for entry in visible.split(",")]

    # Variable not set: fall back to asking NVML how many devices exist.
    from pynvml import nvmlInit, nvmlDeviceGetCount

    nvmlInit()
    return list(range(nvmlDeviceGetCount()))
def _calculate_delta(col, gdf):
    """Return the whole-day gap between the event time and ``col``.

    The event time is ``gdf['timestamp'] + TIMESTAMP_DELTA`` cast to
    ``datetime64[ms]``; ``col`` is cast to ``datetime64[ns]`` after its
    empty-string entries are replaced with ``None`` (in place, via ``.loc``).
    Gaps that are negative or larger than ``10 * 365`` days are zeroed by
    multiplying with the two boolean range masks.
    """
    # Blank entries are turned into nulls before the datetime cast.
    col.loc[col == ''] = None
    published = col.astype('datetime64[ns]')
    event_time = (gdf['timestamp'] + TIMESTAMP_DELTA).astype('datetime64[ms]')
    days = (event_time - published).dt.days
    not_negative = days >= 0
    within_limit = days <= 10 * 365
    return days * not_negative * within_limit
class DaysSincePublished(Operator):
    """Operator adding a ``<column>_days_since_published`` column per input.

    The value is the whole-day gap between the event time (``timestamp``
    plus ``TIMESTAMP_DELTA``, cast to ``datetime64[ms]``) and the input
    column cast to ``datetime64[ns]``. Gaps that are negative or larger than
    ``10 * 365`` days are zeroed.
    """

    def transform(self, columns, gdf):
        """Add the derived column for each name in ``columns`` and return ``gdf``."""
        for name in columns:
            published = gdf[name]
            # Blank entries are turned into nulls before the datetime cast.
            published.loc[published == ""] = None
            published = published.astype("datetime64[ns]")
            event_time = (gdf["timestamp"] + TIMESTAMP_DELTA).astype("datetime64[ms]")
            days = (event_time - published).dt.days
            not_negative = days >= 0
            within_limit = days <= 10 * 365
            gdf[name + "_days_since_published"] = days * not_negative * within_limit
        return gdf

    def output_column_names(self, columns):
        """Return the names of the columns produced for ``columns``."""
        produced = []
        for name in columns:
            produced.append(name + "_days_since_published")
        return produced

    def dependencies(self):
        """Declare the extra column read by ``transform``."""
        return ["timestamp"]
def _df_to_coo(df, row="document_id", col=None, data="confidence_level"):
    """Build a ``cupy.sparse.coo_matrix`` from three columns of ``df``.

    Args:
        df: Frame providing the columns.
        row: Name of the column used as row coordinates.
        col: Name of the column used as column coordinates.
        data: Name of the column used as matrix values.
    """
    values = df[data].values
    row_index = df[row].values
    col_index = df[col].values
    return cupy.sparse.coo_matrix((values, (row_index, col_index)))
@ -71,7 +99,7 @@ def create_client(devices, local_directory):
n_workers=len(devices),
CUDA_VISIBLE_DEVICES=",".join(str(x) for x in devices),
device_memory_limit=device_limit,
local_directory=local_directory
local_directory=local_directory,
)
client = Client(cluster)
setup_rmm_pool(client, device_pool_size)
@ -79,86 +107,95 @@ def create_client(devices, local_directory):
return client
def create_workflow(data_bucket_folder, hash_spec, devices, local_directory, dask):
    """Build the NVTabular workflow that produces the model's input columns.

    Args:
        data_bucket_folder: Directory holding ``documents_categories.csv``,
            ``documents_topics.csv`` and ``documents_entities.csv``.
        hash_spec: Specification handed to ``HashBucket`` for the categorical
            and geo columns.
        devices: Forwarded to ``create_client``; only used when ``dask`` is
            truthy.
        local_directory: Forwarded to ``create_client``; only used when
            ``dask`` is truthy.
        dask: When truthy, a Dask client is created and attached to the
            workflow; otherwise the workflow is built with ``client=None``.

    Returns:
        An ``nvt.Workflow`` over the date, CTR, count, categorical and
        document-similarity column groups plus ``clicked`` and ``display_id``.
    """
    rmm.reinitialize(managed_memory=False)

    documents_categories_path = os.path.join(
        data_bucket_folder, "documents_categories.csv"
    )
    documents_topics_path = os.path.join(data_bucket_folder, "documents_topics.csv")
    documents_entities_path = os.path.join(data_bucket_folder, "documents_entities.csv")

    documents_categories_cudf = cudf.read_csv(documents_categories_path)
    documents_topics_cudf = cudf.read_csv(documents_topics_path)
    documents_entities_cudf = cudf.read_csv(documents_entities_path)
    # Entity ids are replaced by their category codes so they can serve as
    # integer column coordinates of the sparse matrix built below.
    documents_entities_cudf["entity_id"] = (
        documents_entities_cudf["entity_id"].astype("category").cat.codes
    )

    categories = _df_to_coo(documents_categories_cudf, col="category_id")
    topics = _df_to_coo(documents_topics_cudf, col="topic_id")
    entities = _df_to_coo(documents_entities_cudf, col="entity_id")

    del documents_categories_cudf, documents_topics_cudf, documents_entities_cudf

    # Minimum count a key needs before its CTR is kept; below it the CTR is 0.
    ctr_thresh = {
        "ad_id": 5,
        "source_id_promo": 10,
        "publisher_id_promo": 10,
        "advertiser_id": 10,
        "campaign_id": 10,
        "document_id_promo": 5,
    }

    ctr_inputs = ColumnGroup(CTR_INPUTS)
    cat_cols = ColumnGroup(CATEGORICAL_COLUMNS)

    # Country and state are the 2- and 5-character prefixes of geo_location.
    geo_location = ColumnGroup(["geo_location"])
    country = (
        geo_location >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    )
    state = (
        geo_location >> (lambda col: col.str.slice(0, 5)) >> Rename(postfix="_state")
    )
    geo_features = geo_location + country + state

    dates = ["publish_time", "publish_time_promo"]
    # LogOp is instantiated here like every other operator in this function.
    date_features = dates >> DaysSincePublished() >> FillMedian() >> LogOp()

    # The code below relies on JoinGroupby naming its outputs
    # "<column>_count" and "<column>_clicked_sum".
    stat_cols = ctr_inputs >> JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
    ctr_cols = (
        # Only the "_clicked_sum" columns are fed through the CTR lambda.
        stat_cols - [column + "_count" for column in ctr_inputs.flattened_columns]
        >> LambdaOp(
            f=lambda col, gdf: (
                (col) / (gdf[col.name.replace("_clicked_sum", "_count")])
            ).where(
                gdf[col.name.replace("_clicked_sum", "_count")]
                >= ctr_thresh[col.name.replace("_clicked_sum", "")],
                0,
            ),
            # The lambda reads the matching "_count" columns, so the
            # dependency is stat_cols minus the "_clicked_sum" columns. The
            # suffix needs its leading underscore to match those names;
            # without it nothing was removed from the group.
            dependency=stat_cols
            - [column + "_clicked_sum" for column in ctr_inputs.flattened_columns],
        )
        >> Rename(f=lambda x: x.replace("_clicked_sum", "_ctr"))
    )

    stat_cols = stat_cols >> FillMissing() >> LogOp() >> Normalize()
    ctr_cols = ctr_cols >> FillMissing()

    cat_cols = cat_cols + geo_features >> HashBucket(hash_spec)

    features = (
        date_features + ctr_cols + stat_cols + cat_cols + ["clicked", "display_id"]
    )

    # One tf-idf similarity column per document attribute matrix.
    sim_features_categ = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(categories, metric="tfidf", on_device=False)
        >> Rename(postfix="_categories")
    )
    sim_features_topics = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(topics, metric="tfidf", on_device=False)
        >> Rename(postfix="_topics")
    )
    sim_features_entities = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(entities, metric="tfidf", on_device=False)
        >> Rename(postfix="_entities")
    )
    sim_features = sim_features_categ + sim_features_topics + sim_features_entities

    client = create_client(devices=devices, local_directory=local_directory) if dask else None

    workflow = nvt.Workflow(column_group=features + sim_features, client=client)

    return workflow
@ -166,31 +203,52 @@ def create_workflow(data_bucket_folder, output_bucket_folder, hash_spec, devices
def create_parquets(data_bucket_folder, train_path, valid_path):
cupy.random.seed(seed=0)
rmm.reinitialize(managed_memory=True)
documents_meta_path = os.path.join(data_bucket_folder, 'documents_meta.csv')
clicks_train_path = os.path.join(data_bucket_folder, 'clicks_train.csv')
events_path = os.path.join(data_bucket_folder, 'events.csv')
promoted_content_path = os.path.join(data_bucket_folder, 'promoted_content.csv')
documents_meta_path = os.path.join(data_bucket_folder, "documents_meta.csv")
clicks_train_path = os.path.join(data_bucket_folder, "clicks_train.csv")
events_path = os.path.join(data_bucket_folder, "events.csv")
promoted_content_path = os.path.join(data_bucket_folder, "promoted_content.csv")
documents_meta = cudf.read_csv(documents_meta_path, na_values=['\\N', ''])
documents_meta = documents_meta.dropna(subset='source_id')
documents_meta['publisher_id'].fillna(
documents_meta['publisher_id'].isnull().cumsum() + documents_meta['publisher_id'].max() + 1, inplace=True)
merged = (cudf.read_csv(clicks_train_path, na_values=['\\N', ''])
.merge(cudf.read_csv(events_path, na_values=['\\N', '']), on=DISPLAY_ID_COLUMN, how='left',
suffixes=('', '_event'))
.merge(cudf.read_csv(promoted_content_path, na_values=['\\N', '']), on='ad_id',
how='left',
suffixes=('', '_promo'))
.merge(documents_meta, on='document_id', how='left')
.merge(documents_meta, left_on='document_id_promo', right_on='document_id', how='left',
suffixes=('', '_promo')))
merged['day_event'] = (merged['timestamp'] / 1000 / 60 / 60 / 24).astype(int)
merged['platform'] = merged['platform'].fillna(1)
merged['platform'] = merged['platform'] - 1
display_event = merged[[DISPLAY_ID_COLUMN, 'day_event']].drop_duplicates().reset_index()
documents_meta = cudf.read_csv(documents_meta_path, na_values=["\\N", ""])
documents_meta = documents_meta.dropna(subset="source_id")
documents_meta["publisher_id"].fillna(
documents_meta["publisher_id"].isnull().cumsum()
+ documents_meta["publisher_id"].max()
+ 1,
inplace=True,
)
merged = (
cudf.read_csv(clicks_train_path, na_values=["\\N", ""])
.merge(
cudf.read_csv(events_path, na_values=["\\N", ""]),
on=DISPLAY_ID_COLUMN,
how="left",
suffixes=("", "_event"),
)
.merge(
cudf.read_csv(promoted_content_path, na_values=["\\N", ""]),
on="ad_id",
how="left",
suffixes=("", "_promo"),
)
.merge(documents_meta, on="document_id", how="left")
.merge(
documents_meta,
left_on="document_id_promo",
right_on="document_id",
how="left",
suffixes=("", "_promo"),
)
)
merged["day_event"] = (merged["timestamp"] / 1000 / 60 / 60 / 24).astype(int)
merged["platform"] = merged["platform"].fillna(1)
merged["platform"] = merged["platform"] - 1
display_event = (
merged[[DISPLAY_ID_COLUMN, "day_event"]].drop_duplicates().reset_index()
)
random_state = cudf.Series(cupy.random.uniform(size=len(display_event)))
valid_ids, train_ids = display_event.scatter_by_map(
((display_event.day_event <= 10) & (random_state > 0.2)).astype(int))
((display_event.day_event <= 10) & (random_state > 0.2)).astype(int)
)
valid_ids = valid_ids[DISPLAY_ID_COLUMN].drop_duplicates()
train_ids = train_ids[DISPLAY_ID_COLUMN].drop_duplicates()
valid_set = merged[merged[DISPLAY_ID_COLUMN].isin(valid_ids)]
@ -201,27 +259,39 @@ def create_parquets(data_bucket_folder, train_path, valid_path):
del merged, train_set, valid_set
def save_stats(
    data_bucket_folder,
    output_train_folder,
    train_path,
    output_valid_folder,
    valid_path,
    stats_file,
    hash_spec,
    local_directory,
    dask,
):
    """Fit the preprocessing workflow on the train split and export both splits.

    The workflow is built by ``create_workflow``, fitted on the training
    dataset only, then applied to the training and validation datasets, whose
    transformed output is written as parquet. The fitted workflow is saved to
    ``stats_file`` and returned.

    Args:
        data_bucket_folder: Forwarded to ``create_workflow``.
        output_train_folder: Output path for the transformed training parquet.
        train_path: Input path of the training dataset.
        output_valid_folder: Output path for the transformed validation parquet.
        valid_path: Input path of the validation dataset.
        stats_file: Path handed to ``workflow.save``.
        hash_spec: Forwarded to ``create_workflow``.
        local_directory: Forwarded to ``create_workflow``.
        dask: Forwarded to ``create_workflow``.

    Returns:
        The fitted workflow object.
    """
    devices = get_devices()
    # Per-partition shuffling is only selected when more than one device is present.
    shuffle = Shuffle.PER_PARTITION if len(devices) > 1 else True

    workflow = create_workflow(
        data_bucket_folder=data_bucket_folder,
        hash_spec=hash_spec,
        devices=devices,
        local_directory=local_directory,
        dask=dask,
    )

    train_dataset = nvt.Dataset(train_path, part_size="1GB")
    valid_dataset = nvt.Dataset(valid_path, part_size="150MB")

    # Statistics come from the training split only; validation is just transformed.
    workflow.fit(train_dataset)
    workflow.transform(train_dataset).to_parquet(
        output_path=output_train_folder, shuffle=shuffle, out_files_per_proc=8
    )
    workflow.transform(valid_dataset).to_parquet(
        output_path=output_valid_folder, shuffle=None, output_files=8
    )

    workflow.save(stats_file)
    return workflow
@ -231,24 +301,30 @@ def clean(path):
def execute_pipeline(config):
    """Run the whole preprocessing pipeline described by ``config``.

    Creates the working/output folders, splits the raw data into train and
    validation parquet files, fits and applies the feature workflow, and
    finally removes the temporary folder and the ``./categories`` directory.

    Args:
        config: Mapping that provides the keys read below
            (``temporary_folder``, ``output_train_folder``,
            ``output_valid_folder``, ``data_bucket_folder``, ``train_path``,
            ``valid_path``, ``stats_file``, ``hash_spec`` and ``dask``).
    """
    required_folders = [
        config["temporary_folder"],
        config["output_train_folder"],
        config["output_valid_folder"],
    ]
    for folder in required_folders:
        os.makedirs(folder, exist_ok=True)

    create_parquets(
        data_bucket_folder=config["data_bucket_folder"],
        train_path=config["train_path"],
        valid_path=config["valid_path"],
    )
    save_stats(
        data_bucket_folder=config["data_bucket_folder"],
        output_train_folder=config["output_train_folder"],
        train_path=config["train_path"],
        output_valid_folder=config["output_valid_folder"],
        valid_path=config["valid_path"],
        stats_file=config["stats_file"],
        hash_spec=config["hash_spec"],
        local_directory=config["temporary_folder"],
        dask=config["dask"],
    )

    # Remove scratch data once the transformed datasets have been written.
    clean(config["temporary_folder"])
    clean("./categories")

View File

@ -1,13 +0,0 @@
state_abb,utc_dst_time_offset_cleaned
AB,-6.0
BC,-7.0
MB,-5.0
NB,-3.0
NL,-3.0
NS,-3.0
NU,-5.0
ON,-4.0
PE,-3.0
QC,-4.0
SK,-6.0
YT,-7.0
1 state_abb utc_dst_time_offset_cleaned
2 AB -6.0
3 BC -7.0
4 MB -5.0
5 NB -3.0
6 NL -3.0
7 NS -3.0
8 NU -5.0
9 ON -4.0
10 PE -3.0
11 QC -4.0
12 SK -6.0
13 YT -7.0

View File

@ -1,247 +0,0 @@
country_code,utc_dst_time_offset_cleaned
AX,3.0
AF,4.5
AL,2.0
DZ,1.0
AD,2.0
AO,1.0
AI,-4.0
AG,-4.0
AR,-3.0
AM,4.0
AW,-4.0
AU,10.0
AT,2.0
AZ,4.0
BS,-4.0
BH,3.0
BD,6.0
BB,-4.0
BY,3.0
BE,2.0
BZ,-6.0
BJ,1.0
BM,-3.0
BT,6.0
BO,-4.0
BA,2.0
BW,2.0
BR,-3.0
IO,6.0
BN,8.0
BG,3.0
BF,0.0
BI,2.0
KH,7.0
CM,1.0
CA,-5.0
BQ,-5.0
KY,-5.0
CF,1.0
TD,1.0
CL,-3.0
CN,8.0
CX,7.0
CC,6.5
CO,-5.0
KM,3.0
CD,1.0
CG,1.0
CK,-10.0
CR,-6.0
CI,0.0
HR,2.0
CW,-4.0
CY,3.0
CZ,2.0
DK,2.0
DJ,3.0
DM,-4.0
DO,-4.0
TL,9.0
EC,-5.0
EG,2.0
SV,-6.0
GQ,1.0
ER,3.0
EE,3.0
ET,3.0
FK,-3.0
FO,1.0
FJ,12.0
FI,3.0
FR,2.0
GF,-3.0
PF,-10.0
GA,1.0
GM,0.0
GE,4.0
DE,2.0
GH,0.0
GI,2.0
GR,3.0
GL,-2.0
GD,-4.0
GP,-4.0
GU,10.0
GT,-6.0
GG,1.0
GN,0.0
GW,0.0
GY,-4.0
HT,-5.0
HN,-6.0
HK,8.0
HU,2.0
IS,0.0
IN,5.5
ID,8.0
IR,4.5
IQ,3.0
IE,1.0
IM,1.0
IL,3.0
IT,2.0
JM,-5.0
JP,9.0
JE,1.0
JO,3.0
KZ,5.0
KE,3.0
KI,13.0
KP,-4.0
KR,-4.0
KP,8.5
KR,8.5
KP,9.0
KR,9.0
KW,3.0
KG,6.0
LA,7.0
LV,3.0
LB,3.0
LS,2.0
LR,0.0
LY,2.0
LI,2.0
LT,3.0
LU,2.0
MO,8.0
MK,2.0
MG,3.0
MW,2.0
MY,8.0
MV,5.0
ML,0.0
MT,2.0
MH,12.0
MQ,-4.0
MR,0.0
MU,4.0
YT,3.0
MX,-5.0
FM,10.0
MD,3.0
MC,2.0
MN,9.0
ME,2.0
MS,-4.0
MA,1.0
MZ,2.0
MM,6.5
NA,1.0
NR,12.0
NP,5.0
NL,2.0
NC,11.0
NZ,12.0
NI,-6.0
NE,1.0
NG,1.0
NU,-11.0
NF,11.0
MP,10.0
NO,2.0
OM,4.0
PK,5.0
PW,9.0
PS,3.0
PA,-5.0
PG,10.0
PY,-4.0
PE,-5.0
PH,8.0
PN,-8.0
PL,2.0
PT,1.0
PR,-4.0
QA,3.0
RE,4.0
RO,3.0
RU,7.0
RW,2.0
BL,-4.0
AS,-11.0
WS,-11.0
AS,13.0
WS,13.0
SM,2.0
ST,0.0
SA,3.0
SN,0.0
RS,2.0
SC,4.0
SL,0.0
SG,8.0
SK,2.0
SI,2.0
SB,11.0
SO,3.0
ZA,2.0
GS,-2.0
SS,3.0
ES,2.0
LK,5.5
SH,0.0
KN,-4.0
SX,-4.0
MF,-4.0
SD,3.0
SR,-3.0
SJ,2.0
SZ,2.0
SE,2.0
CH,2.0
SY,3.0
TW,8.0
TJ,5.0
TZ,3.0
TH,7.0
TG,0.0
TK,13.0
TO,13.0
TT,-4.0
TN,1.0
TR,3.0
TM,5.0
TC,-4.0
TV,12.0
UG,3.0
UA,3.0
AE,4.0
GB,1.0
US,-7.0
UY,-3.0
UZ,5.0
VU,11.0
VA,2.0
VE,-4.0
VN,7.0
VG,-4.0
VI,-4.0
VG,-4.0
VI,-4.0
WF,12.0
YE,3.0
ZM,2.0
ZW,2.0
1 country_code utc_dst_time_offset_cleaned
2 AX 3.0
3 AF 4.5
4 AL 2.0
5 DZ 1.0
6 AD 2.0
7 AO 1.0
8 AI -4.0
9 AG -4.0
10 AR -3.0
11 AM 4.0
12 AW -4.0
13 AU 10.0
14 AT 2.0
15 AZ 4.0
16 BS -4.0
17 BH 3.0
18 BD 6.0
19 BB -4.0
20 BY 3.0
21 BE 2.0
22 BZ -6.0
23 BJ 1.0
24 BM -3.0
25 BT 6.0
26 BO -4.0
27 BA 2.0
28 BW 2.0
29 BR -3.0
30 IO 6.0
31 BN 8.0
32 BG 3.0
33 BF 0.0
34 BI 2.0
35 KH 7.0
36 CM 1.0
37 CA -5.0
38 BQ -5.0
39 KY -5.0
40 CF 1.0
41 TD 1.0
42 CL -3.0
43 CN 8.0
44 CX 7.0
45 CC 6.5
46 CO -5.0
47 KM 3.0
48 CD 1.0
49 CG 1.0
50 CK -10.0
51 CR -6.0
52 CI 0.0
53 HR 2.0
54 CW -4.0
55 CY 3.0
56 CZ 2.0
57 DK 2.0
58 DJ 3.0
59 DM -4.0
60 DO -4.0
61 TL 9.0
62 EC -5.0
63 EG 2.0
64 SV -6.0
65 GQ 1.0
66 ER 3.0
67 EE 3.0
68 ET 3.0
69 FK -3.0
70 FO 1.0
71 FJ 12.0
72 FI 3.0
73 FR 2.0
74 GF -3.0
75 PF -10.0
76 GA 1.0
77 GM 0.0
78 GE 4.0
79 DE 2.0
80 GH 0.0
81 GI 2.0
82 GR 3.0
83 GL -2.0
84 GD -4.0
85 GP -4.0
86 GU 10.0
87 GT -6.0
88 GG 1.0
89 GN 0.0
90 GW 0.0
91 GY -4.0
92 HT -5.0
93 HN -6.0
94 HK 8.0
95 HU 2.0
96 IS 0.0
97 IN 5.5
98 ID 8.0
99 IR 4.5
100 IQ 3.0
101 IE 1.0
102 IM 1.0
103 IL 3.0
104 IT 2.0
105 JM -5.0
106 JP 9.0
107 JE 1.0
108 JO 3.0
109 KZ 5.0
110 KE 3.0
111 KI 13.0
112 KP -4.0
113 KR -4.0
114 KP 8.5
115 KR 8.5
116 KP 9.0
117 KR 9.0
118 KW 3.0
119 KG 6.0
120 LA 7.0
121 LV 3.0
122 LB 3.0
123 LS 2.0
124 LR 0.0
125 LY 2.0
126 LI 2.0
127 LT 3.0
128 LU 2.0
129 MO 8.0
130 MK 2.0
131 MG 3.0
132 MW 2.0
133 MY 8.0
134 MV 5.0
135 ML 0.0
136 MT 2.0
137 MH 12.0
138 MQ -4.0
139 MR 0.0
140 MU 4.0
141 YT 3.0
142 MX -5.0
143 FM 10.0
144 MD 3.0
145 MC 2.0
146 MN 9.0
147 ME 2.0
148 MS -4.0
149 MA 1.0
150 MZ 2.0
151 MM 6.5
152 NA 1.0
153 NR 12.0
154 NP 5.0
155 NL 2.0
156 NC 11.0
157 NZ 12.0
158 NI -6.0
159 NE 1.0
160 NG 1.0
161 NU -11.0
162 NF 11.0
163 MP 10.0
164 NO 2.0
165 OM 4.0
166 PK 5.0
167 PW 9.0
168 PS 3.0
169 PA -5.0
170 PG 10.0
171 PY -4.0
172 PE -5.0
173 PH 8.0
174 PN -8.0
175 PL 2.0
176 PT 1.0
177 PR -4.0
178 QA 3.0
179 RE 4.0
180 RO 3.0
181 RU 7.0
182 RW 2.0
183 BL -4.0
184 AS -11.0
185 WS -11.0
186 AS 13.0
187 WS 13.0
188 SM 2.0
189 ST 0.0
190 SA 3.0
191 SN 0.0
192 RS 2.0
193 SC 4.0
194 SL 0.0
195 SG 8.0
196 SK 2.0
197 SI 2.0
198 SB 11.0
199 SO 3.0
200 ZA 2.0
201 GS -2.0
202 SS 3.0
203 ES 2.0
204 LK 5.5
205 SH 0.0
206 KN -4.0
207 SX -4.0
208 MF -4.0
209 SD 3.0
210 SR -3.0
211 SJ 2.0
212 SZ 2.0
213 SE 2.0
214 CH 2.0
215 SY 3.0
216 TW 8.0
217 TJ 5.0
218 TZ 3.0
219 TH 7.0
220 TG 0.0
221 TK 13.0
222 TO 13.0
223 TT -4.0
224 TN 1.0
225 TR 3.0
226 TM 5.0
227 TC -4.0
228 TV 12.0
229 UG 3.0
230 UA 3.0
231 AE 4.0
232 GB 1.0
233 US -7.0
234 UY -3.0
235 UZ 5.0
236 VU 11.0
237 VA 2.0
238 VE -4.0
239 VN 7.0
240 VG -4.0
241 VI -4.0
242 VG -4.0
243 VI -4.0
244 WF 12.0
245 YE 3.0
246 ZM 2.0
247 ZW 2.0

View File

@ -1,52 +0,0 @@
state_abb,utc_dst_time_offset_cleaned
AL,-5.0
AK,-8.0
AZ,-7.0
AR,-5.0
CA,-7.0
CO,-6.0
CT,-4.0
DE,-4.0
DC,-4.0
FL,-4.0
GA,-4.0
HI,-10.0
ID,-6.0
IL,-5.0
IN,-4.0
IA,-5.0
KS,-5.0
KY,-4.0
LA,-5.0
ME,-4.0
MD,-4.0
MA,-4.0
MI,-4.0
MN,-5.0
MS,-5.0
MO,-5.0
MT,-6.0
NE,-5.0
NV,-7.0
NH,-4.0
NJ,-4.0
NM,-6.0
NY,-4.0
NC,-4.0
ND,-5.0
OH,-4.0
OK,-5.0
OR,-7.0
PA,-4.0
RI,-4.0
SC,-4.0
SD,-5.0
TN,-5.0
TX,-5.0
UT,-6.0
VT,-4.0
VA,-4.0
WA,-7.0
WV,-4.0
WI,-5.0
WY,-6.0
1 state_abb utc_dst_time_offset_cleaned
2 AL -5.0
3 AK -8.0
4 AZ -7.0
5 AR -5.0
6 CA -7.0
7 CO -6.0
8 CT -4.0
9 DE -4.0
10 DC -4.0
11 FL -4.0
12 GA -4.0
13 HI -10.0
14 ID -6.0
15 IL -5.0
16 IN -4.0
17 IA -5.0
18 KS -5.0
19 KY -4.0
20 LA -5.0
21 ME -4.0
22 MD -4.0
23 MA -4.0
24 MI -4.0
25 MN -5.0
26 MS -5.0
27 MO -5.0
28 MT -6.0
29 NE -5.0
30 NV -7.0
31 NH -4.0
32 NJ -4.0
33 NM -6.0
34 NY -4.0
35 NC -4.0
36 ND -5.0
37 OH -4.0
38 OK -5.0
39 OR -7.0
40 PA -4.0
41 RI -4.0
42 SC -4.0
43 SD -5.0
44 TN -5.0
45 TX -5.0
46 UT -6.0
47 VT -4.0
48 VA -4.0
49 WA -7.0
50 WV -4.0
51 WI -5.0
52 WY -6.0

View File

@ -1,104 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pyspark.context import SparkContext, SparkConf
from pyspark.sql.functions import col
from pyspark.sql.session import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
# Locations read and written by this job.
OUTPUT_BUCKET_FOLDER = "/tmp/spark/preprocessed/"
DATA_BUCKET_FOLDER = "/outbrain/orig/"
SPARK_TEMP_FOLDER = "/tmp/spark/spark-temp/"

# Local-mode Spark configuration; Spark scratch space lives in SPARK_TEMP_FOLDER.
conf = SparkConf()
conf.setMaster('local[*]')
conf.set('spark.executor.memory', '40g')
conf.set('spark.driver.memory', '200g')
conf.set("spark.local.dir", SPARK_TEMP_FOLDER)

sc = SparkContext(conf=conf)
spark = SparkSession(sc)
print('Loading data...')

# events.csv: one row per display event; `day_event` is derived from the
# millisecond timestamp by integer-casting timestamp / 1000 / 60 / 60 / 24.
events_schema = StructType([
    StructField("display_id", IntegerType(), True),
    StructField("uuid_event", StringType(), True),
    StructField("document_id_event", IntegerType(), True),
    StructField("timestamp_event", IntegerType(), True),
    StructField("platform_event", IntegerType(), True),
    StructField("geo_location_event", StringType(), True),
])

events_df = (
    spark.read.schema(events_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "events.csv")
    .withColumn('day_event', (col('timestamp_event') / 1000 / 60 / 60 / 24).cast("int"))
    .alias('events')
)
# The count results are discarded; each call only triggers evaluation.
events_df.count()

print('Drop rows with empty "geo_location"...')
events_df = events_df.dropna(subset="geo_location_event")
events_df.count()

print('Drop rows with empty "platform"...')
events_df = events_df.dropna(subset="platform_event")
events_df.count()

# promoted_content.csv: ad -> promoted document / campaign / advertiser.
promoted_content_schema = StructType([
    StructField("ad_id", IntegerType(), True),
    StructField("document_id_promo", IntegerType(), True),
    StructField("campaign_id", IntegerType(), True),
    StructField("advertiser_id", IntegerType(), True),
])

promoted_content_df = (
    spark.read.schema(promoted_content_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "promoted_content.csv")
    .alias('promoted_content')
)

# clicks_train.csv: (display, ad) pairs with the click label.
clicks_train_schema = StructType([
    StructField("display_id", IntegerType(), True),
    StructField("ad_id", IntegerType(), True),
    StructField("clicked", IntegerType(), True),
])

clicks_train_df = (
    spark.read.schema(clicks_train_schema)
    .options(header='true', inferschema='false', nullValue='\\N')
    .csv(DATA_BUCKET_FOLDER + "clicks_train.csv")
    .alias('clicks_train')
)
# Attach ad metadata and event metadata to every click row.
clicks_train_joined_df = (
    clicks_train_df
    .join(promoted_content_df, on='ad_id', how='left')
    .join(events_df, on='display_id', how='left')
)
clicks_train_joined_df.createOrReplaceTempView('clicks_train_joined')

# Stratified sample of display ids by day: 20% of days 0-10, all of days 11-12.
validation_fractions = {day: 0.2 for day in range(11)}
validation_fractions.update({11: 1.0, 12: 1.0})

validation_display_ids_df = (
    clicks_train_joined_df.select('display_id', 'day_event')
    .distinct()
    .sampleBy("day_event", fractions=validation_fractions, seed=0)
)
validation_display_ids_df.createOrReplaceTempView("validation_display_ids")

# Keep every click row whose display id was sampled above.
validation_set_df = spark.sql('''SELECT display_id, ad_id, uuid_event, day_event,
timestamp_event, document_id_promo, platform_event, geo_location_event
FROM clicks_train_joined t
WHERE EXISTS (SELECT display_id FROM validation_display_ids
WHERE display_id = t.display_id)''')

validation_set_gcs_output = "validation_set.parquet"
validation_set_df.write.parquet(OUTPUT_BUCKET_FOLDER + validation_set_gcs_output, mode='overwrite')

print(validation_set_df.take(5))

spark.stop()

View File

@ -1,474 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import datetime
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
import tensorflow as tf
from pyspark import TaskContext
from pyspark.context import SparkContext, SparkConf
from pyspark.sql.functions import col, udf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import ArrayType, DoubleType
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import metadata_io
from data.outbrain.features import PREBATCH_SIZE, HASH_BUCKET_SIZES
from data.outbrain.spark.utils.feature_description import LABEL_COLUMN, DISPLAY_ID_COLUMN, CATEGORICAL_COLUMNS, \
DOC_CATEGORICAL_MULTIVALUED_COLUMNS, BOOL_COLUMNS, INT_COLUMNS, FLOAT_COLUMNS, \
FLOAT_COLUMNS_LOG_BIN_TRANSFORM, FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM, FLOAT_COLUMNS_NO_TRANSFORM
pd.set_option('display.max_columns', 1000)

# Script-level switches.
evaluation = True
evaluation_verbose = False
TEST_SET_MODE = False

# Input/output locations.
OUTPUT_BUCKET_FOLDER = "/tmp/spark/preprocessed/"
DATA_BUCKET_FOLDER = "/data/orig/"
SPARK_TEMP_FOLDER = "/tmp/spark/spark-temp/"
LOCAL_DATA_TFRECORDS_DIR = "/outbrain/tfrecords"

TENSORFLOW_HADOOP = "data/outbrain/spark/data/tensorflow-hadoop-1.5.0.jar"

# Local-mode Spark session with the tensorflow-hadoop jar on the classpath.
conf = SparkConf()
conf.setMaster('local[*]')
conf.set('spark.executor.memory', '40g')
conf.set('spark.driver.memory', '200g')
conf.set("spark.local.dir", SPARK_TEMP_FOLDER)
conf.set("spark.jars", TENSORFLOW_HADOOP)
conf.set("spark.sql.files.maxPartitionBytes", 768 * 1024 * 1024)  # 805306368 bytes

sc = SparkContext(conf=conf)
spark = SparkSession(sc)

# Command-line options: partition counts for the train and validation outputs.
parser = argparse.ArgumentParser()
parser.add_argument('--num_train_partitions', help='number of train partitions', type=int, default=40)
parser.add_argument('--num_valid_partitions', help='number of validation partitions', type=int, default=40)
args = parser.parse_args()

num_train_partitions = args.num_train_partitions
num_valid_partitions = args.num_valid_partitions
batch_size = PREBATCH_SIZE
# # Feature Vector export

# Names of the exported features, grouped by value type.
bool_feature_names = []
int_feature_names = [
    'ad_views',
    'doc_views',
    'doc_event_days_since_published',
    'doc_ad_days_since_published',
]
float_feature_names = [
    'pop_ad_id', 'pop_document_id', 'pop_publisher_id',
    'pop_advertiser_id', 'pop_campain_id', 'pop_source_id',
    'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_topics', 'doc_event_doc_ad_sim_entities',
]

# Feature-vector names of the categorical features.
TRAFFIC_SOURCE_FV = 'traffic_source'
EVENT_HOUR_FV = 'event_hour'
EVENT_COUNTRY_FV = 'event_country'
EVENT_COUNTRY_STATE_FV = 'event_country_state'
EVENT_GEO_LOCATION_FV = 'event_geo_location'
EVENT_PLATFORM_FV = 'event_platform'
AD_ADVERTISER_FV = 'ad_advertiser'
DOC_AD_SOURCE_ID_FV = 'doc_ad_source_id'
DOC_AD_PUBLISHER_ID_FV = 'doc_ad_publisher_id'
DOC_EVENT_SOURCE_ID_FV = 'doc_event_source_id'
DOC_EVENT_PUBLISHER_ID_FV = 'doc_event_publisher_id'
DOC_AD_CATEGORY_ID_FV = 'doc_ad_category_id'
DOC_AD_TOPIC_ID_FV = 'doc_ad_topic_id'
DOC_AD_ENTITY_ID_FV = 'doc_ad_entity_id'
DOC_EVENT_CATEGORY_ID_FV = 'doc_event_category_id'
DOC_EVENT_TOPIC_ID_FV = 'doc_event_topic_id'
DOC_EVENT_ENTITY_ID_FV = 'doc_event_entity_id'

# ### Configuring feature vector
# Single-valued categorical features, in feature-vector order.
category_feature_names_integral = [
    AD_ADVERTISER_FV,
    DOC_AD_PUBLISHER_ID_FV,
    DOC_AD_SOURCE_ID_FV,
    DOC_EVENT_PUBLISHER_ID_FV,
    DOC_EVENT_SOURCE_ID_FV,
    EVENT_COUNTRY_FV,
    EVENT_COUNTRY_STATE_FV,
    EVENT_GEO_LOCATION_FV,
    EVENT_HOUR_FV,
    EVENT_PLATFORM_FV,
    TRAFFIC_SOURCE_FV,
]

# Full label list: bool, then int, then float, then categorical names.
feature_vector_labels_integral = (
    bool_feature_names
    + int_feature_names
    + float_feature_names
    + category_feature_names_integral
)

train_feature_vector_gcs_folder_name = 'train_feature_vectors_integral_eval'
# ## Exporting integral feature vectors to CSV
train_feature_vectors_exported_df = spark.read.parquet(
    OUTPUT_BUCKET_FOLDER + train_feature_vector_gcs_folder_name
)
train_feature_vectors_exported_df.take(3)

integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id'] + feature_vector_labels_integral

# Output names of the feature-vector entries, in positional order.
FEAT_CSV_ORDERED_COLUMNS = [
    'ad_views', 'campaign_id', 'doc_views',
    'doc_event_days_since_published', 'doc_ad_days_since_published',
    'pop_ad_id', 'pop_document_id', 'pop_publisher_id', 'pop_advertiser_id', 'pop_campain_id',
    'pop_source_id',
    'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_topics',
    'doc_event_doc_ad_sim_entities',
    'ad_advertiser', 'doc_ad_publisher_id', 'doc_ad_source_id',
    'doc_event_publisher_id', 'doc_event_source_id',
    'event_country', 'event_country_state', 'event_geo_location', 'event_platform',
    'traffic_source',
]

# Full row layout: the five identifier columns followed by the feature columns.
CSV_ORDERED_COLUMNS = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id'] + FEAT_CSV_ORDERED_COLUMNS
def to_array(col):
    """Return a column expression that turns ``col`` into an array of doubles.

    Each value is converted with ``value.toArray().tolist()``.
    """

    def _vector_to_list(vector):
        return vector.toArray().tolist()

    # Important: asNondeterministic requires Spark 2.3 or later.
    # It can be dropped, i.e.
    #     return udf(_vector_to_list, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance.
    vector_to_list_udf = udf(_vector_to_list, ArrayType(DoubleType()))
    return vector_to_list_udf.asNondeterministic()(col)
# Columns that format_number() casts to int: the numbered category/topic/entity
# slots of the ad document and of the event document, plus the ad source id.
CONVERT_TO_INT = (
    ['doc_ad_%s_id_%d' % (kind, slot)
     for kind, count in (('category', 3), ('topic', 3), ('entity', 6))
     for slot in range(1, count + 1)]
    + ['doc_ad_source_id']
    + ['doc_event_%s_id_%d' % (kind, slot)
       for kind, count in (('category', 3), ('topic', 3), ('entity', 6))
       for slot in range(1, count + 1)]
)
def format_number(element, name):
    """Cast ``element`` to int when ``name`` is a bool, categorical or CONVERT_TO_INT column.

    Any other column is returned unchanged.
    """
    needs_int_cast = name in BOOL_COLUMNS + CATEGORICAL_COLUMNS or name in CONVERT_TO_INT
    if needs_int_cast:
        return element.cast("int")
    return element
def to_array_with_none(col):
    """Return a column expression that densifies ``col`` into an array of doubles.

    A float64 buffer of length ``v.size`` is created with ``fill_value=None``
    and the entries at ``v.indices`` are overwritten with ``v.values``.
    """

    def _densify(vector):
        dense = np.full((vector.size,), fill_value=None, dtype=np.float64)
        dense[vector.indices] = vector.values
        return dense.tolist()

    # Important: asNondeterministic requires Spark 2.3 or later.
    # It can be dropped, i.e.
    #     return udf(_densify, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance.
    densify_udf = udf(_densify, ArrayType(DoubleType()))
    return densify_udf.asNondeterministic()(col)
@udf
def count_value(x):
    """Return the most frequent element of ``x`` as a float, or 0.

    0 is returned when ``x`` yields no elements or when the most frequent
    element is NaN.
    """
    from collections import Counter

    top_entries = Counter(x).most_common(2)
    if top_entries:
        most_common_value = top_entries[0][0]
        if not np.isnan(most_common_value):
            return float(most_common_value)
    return 0
def replace_with_most_frequent(most_value):
    """Build a UDF that substitutes ``most_value`` for falsy or NaN inputs."""

    def _fill(x):
        if not x or np.isnan(x):
            return most_value
        return x

    return udf(_fill)
def _expand_feature_vector(exported_df):
    """Explode the ``feature_vector`` column of ``exported_df`` into named columns.

    The identifier columns are kept, the vector is converted with
    ``to_array`` and each of its first ``len(feature_vector_labels_integral)``
    entries becomes a column named after the matching entry of
    ``FEAT_CSV_ORDERED_COLUMNS`` (cast by ``format_number``). NaN values are
    replaced by 0 in the result.
    """
    key_columns = ['label', 'display_id', 'ad_id', 'document_id', 'document_id_event']
    feature_columns = []
    for index in range(len(feature_vector_labels_integral)):
        name = FEAT_CSV_ORDERED_COLUMNS[index]
        feature_columns.append(format_number(col("featvec")[index], name).alias(name))
    return (
        exported_df.select(*key_columns, 'feature_vector')
        .withColumn("featvec", to_array("feature_vector"))
        .select(key_columns + feature_columns)
        .replace(float('nan'), 0)
    )


train_feature_vectors_integral_csv_rdd_df = _expand_feature_vector(train_feature_vectors_exported_df)

test_validation_feature_vector_gcs_folder_name = 'validation_feature_vectors_integral'

# ## Exporting integral feature vectors
test_validation_feature_vectors_exported_df = spark.read.parquet(
    OUTPUT_BUCKET_FOLDER + test_validation_feature_vector_gcs_folder_name)
# Repartition by display_id, then order by display_id.
test_validation_feature_vectors_exported_df = (
    test_validation_feature_vectors_exported_df.repartition(40, 'display_id').orderBy('display_id')
)
test_validation_feature_vectors_exported_df.take(3)

test_validation_feature_vectors_integral_csv_rdd_df = _expand_feature_vector(
    test_validation_feature_vectors_exported_df)
def make_spec(output_dir, batch_size=None):
    """Build the TFRecord feature spec and write it as metadata to ``output_dir``.

    Scalar features have shape ``[batch_size, 1]`` when ``batch_size`` is
    given and ``[]`` otherwise; multi-valued categorical features replace the
    last dimension by their number of member columns.
    """
    fixed_shape = [batch_size, 1] if batch_size is not None else []

    def _scalar(dtype):
        return tf.io.FixedLenFeature(shape=fixed_shape, dtype=dtype, default_value=None)

    spec = {
        LABEL_COLUMN: _scalar(tf.int64),
        DISPLAY_ID_COLUMN: _scalar(tf.int64),
    }
    for name in BOOL_COLUMNS:
        spec[name] = _scalar(tf.int64)

    raw_float_columns = (
        FLOAT_COLUMNS_LOG_BIN_TRANSFORM + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM + FLOAT_COLUMNS_NO_TRANSFORM
    )
    for name in raw_float_columns:
        spec[name] = _scalar(tf.float32)
    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
        spec[name + '_binned'] = _scalar(tf.int64)
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
        spec[name + '_log_01scaled'] = _scalar(tf.float32)
    for name in INT_COLUMNS:
        spec[name + '_log_01scaled'] = _scalar(tf.float32)
    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
        spec[name] = _scalar(tf.int64)

    for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
        member_count = len(DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category])
        shape = fixed_shape[:-1] + [member_count]
        spec[multi_category] = tf.io.FixedLenFeature(shape=shape, dtype=tf.int64)

    metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(spec))
    metadata_io.write_metadata(metadata, output_dir)


# write out tfrecords meta
make_spec(LOCAL_DATA_TFRECORDS_DIR + '/transformed_metadata', batch_size=batch_size)
def log2_1p(x):
    """Return the base-2 logarithm of ``1 + x``."""
    natural_log = np.log1p(x)
    return natural_log / np.log(2.0)
# calculate min and max stats for the given dataframes all in one go
def compute_min_max_logs(df):
    """Compute log2(1 + x) of the min and max of every log-scaled column of ``df``.

    A single aggregation collects min and max for all columns in
    ``FLOAT_COLUMNS_LOG_BIN_TRANSFORM`` and ``INT_COLUMNS``. Float columns are
    multiplied by 1000 before the logarithm; int columns are used as-is.

    Args:
        df: Spark DataFrame containing the columns listed above.

    Returns:
        ``(min_logs, max_logs)``: two dicts keyed by ``<column>_log_01scaled``.
    """
    print(str(datetime.datetime.now()) + '\tComputing min and max')

    agg_exprs = []
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + INT_COLUMNS:
        agg_exprs.append(F.min(name))
        agg_exprs.append(F.max(name))
    # Aggregate the frame that was passed in, not the module-level `all_df`.
    stats_row = df.agg(*agg_exprs).collect()[0]

    min_logs = {}
    max_logs = {}
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
        key = name + '_log_01scaled'
        min_logs[key] = log2_1p(stats_row["min(" + name + ")"] * 1000)
        max_logs[key] = log2_1p(stats_row["max(" + name + ")"] * 1000)
    for name in INT_COLUMNS:
        key = name + '_log_01scaled'
        min_logs[key] = log2_1p(stats_row["min(" + name + ")"])
        max_logs[key] = log2_1p(stats_row["max(" + name + ")"])

    return min_logs, max_logs


# Scaling statistics are computed over validation and train rows together.
all_df = test_validation_feature_vectors_integral_csv_rdd_df.union(train_feature_vectors_integral_csv_rdd_df)
min_logs, max_logs = compute_min_max_logs(all_df)

train_output_string = '/train'
eval_output_string = '/eval'

path = LOCAL_DATA_TFRECORDS_DIR
def create_tf_example_spark(df, min_logs, max_logs):
    """Pack all rows of ``df`` into a single ``tf.train.Example``.

    Every feature holds one value per row of ``df``. ``min_logs`` and
    ``max_logs`` supply, per ``<column>_log_01scaled`` key, the bounds used
    to rescale the log-transformed values.
    """

    def _int64_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    def _float_feature(values):
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    def _rescale(log_values, key):
        # (value - min) * 1 / (max - min), using the precomputed log bounds.
        return log_values.add(-min_logs[key]).multiply(1. / (max_logs[key] - min_logs[key])).to_list()

    result = {}
    result[LABEL_COLUMN] = _int64_feature(df[LABEL_COLUMN].to_list())
    result[DISPLAY_ID_COLUMN] = _int64_feature(df[DISPLAY_ID_COLUMN].to_list())

    # Raw float values.
    for name in FLOAT_COLUMNS:
        result[name] = _float_feature(df[name].to_list())

    # Simple binning: value * 10 cast to int64.
    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
        binned = df[name].multiply(10).astype('int64').to_list()
        result[name + '_binned'] = _int64_feature(binned)

    # Log binning of floats: log2(1 + 1000 * value), as int64 bin and rescaled float.
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
        log_values = df[name].multiply(1000).apply(np.log1p).multiply(1. / np.log(2.0))
        result[name + '_binned'] = _int64_feature(log_values.astype('int64').to_list())
        scaled_key = name + '_log_01scaled'
        result[scaled_key] = _float_feature(_rescale(log_values, scaled_key))

    # Integer columns: log2(1 + value), as int64 and rescaled float.
    for name in INT_COLUMNS:
        log_values = df[name].apply(np.log1p).multiply(1. / np.log(2.0))
        result[name + '_log_int'] = _int64_feature(log_values.astype('int64').to_list())
        scaled_key = name + '_log_01scaled'
        result[scaled_key] = _float_feature(_rescale(log_values, scaled_key))

    # Bool and categorical columns: missing values become 0.
    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
        result[name] = _int64_feature(df[name].fillna(0).astype('int64').to_list())

    for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
        member_arrays = [
            df[category].to_numpy()
            for category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category]
        ]
        # need to transpose the series so they will be parsed correctly by the FixedLenFeature
        # we can pass in a single series here; they'll be reshaped to [batch_size, num_values]
        # when parsed from the TFRecord
        flattened = np.stack(member_arrays, axis=1).flatten().tolist()
        result[multi_category] = _int64_feature(flattened)

    return tf.train.Example(features=tf.train.Features(feature=result))
def hash_bucket(num_buckets):
    """Return a function that maps ``x`` to ``x % num_buckets``."""

    def _bucket(x):
        return x % num_buckets

    return _bucket
def _transform_to_tfrecords(rdds):
    """Turn the rows of one partition into batched ``tf.train.Example`` objects.

    Rows are grouped into consecutive slices of ``batch_size`` rows (one row
    per slice when ``batch_size`` is ``None``); the final slice may be shorter.

    Args:
        rdds: Iterable of rows whose fields follow ``CSV_ORDERED_COLUMNS``.

    Returns:
        List of ``(example, row_count)`` tuples, where ``row_count`` is the
        number of rows actually packed into ``example``.
    """
    csv = pd.DataFrame(list(rdds), columns=CSV_ORDERED_COLUMNS)
    num_rows = len(csv.index)
    step = batch_size if batch_size is not None else 1

    examples = []
    for start_ind in range(0, num_rows, step):  # for each batch
        csv_slice = csv.iloc[start_ind:start_ind + step]
        if len(csv_slice) < step:
            # Only the trailing slice can be short; report its true size.
            print("last Example has: ", len(csv_slice))
        # Record the real slice length, so a short tail is never reported as a full batch.
        examples.append((create_tf_example_spark(csv_slice, min_logs, max_logs), len(csv_slice)))
    return examples
max_partition_num = 30


def _transform_to_slices(rdds):
    """Hash-bucket the rows of one partition and cut them into batch-sized slices.

    Every column named in ``HASH_BUCKET_SIZES`` that is present in the frame
    is reduced modulo its bucket count. The frame is then split into
    consecutive slices of ``batch_size`` rows (one row per slice when
    ``batch_size`` is ``None``); the final slice may be shorter.

    Args:
        rdds: Iterable of rows whose fields follow ``CSV_ORDERED_COLUMNS``.

    Returns:
        List of ``(slice_dataframe, row_count)`` tuples.
    """
    taskcontext = TaskContext.get()
    partitionid = taskcontext.partitionId()

    csv = pd.DataFrame(list(rdds), columns=CSV_ORDERED_COLUMNS)
    for name, size in HASH_BUCKET_SIZES.items():
        if name in csv.columns.values:
            csv[name] = csv[name].apply(hash_bucket(size))

    num_rows = len(csv.index)
    print("working with partition: ", partitionid, max_partition_num, num_rows)

    step = batch_size if batch_size is not None else 1
    examples = []
    for start_ind in range(0, num_rows, step):  # for each batch
        csv_slice = csv.iloc[start_ind:start_ind + step]
        if len(csv_slice) < step:
            # Only the trailing slice can be short.
            print("last Example has: ", len(csv_slice), partitionid)
        examples.append((csv_slice, len(csv_slice)))
    return examples
def _transform_to_tfrecords_from_slices(rdds):
    """Serialize every full-size slice of a partition.

    Each incoming item is indexed at position 0 to obtain the slice.  Slices
    whose length differs from ``batch_size`` are reported and skipped; the
    others are converted with ``create_tf_example_spark`` and serialized.

    Returns:
        A list of ``(bytearray, None)`` pairs, one per accepted slice.
    """
    serialized = []
    for entry in rdds:
        frame = entry[0]
        if len(frame) != batch_size:
            print("slice size is not correct, dropping: ", len(frame))
            continue
        example = create_tf_example_spark(frame, min_logs, max_logs)
        serialized.append((bytearray(example.SerializeToString()), None))
    return serialized
def _transform_to_tfrecords_from_reslice(rdds):
    """Re-batch leftover slices and serialize them.

    All incoming slices (position 0 of each item) are concatenated and cut
    again into consecutive batches of ``batch_size`` rows.  A trailing batch
    that is still too short is handled according to ``TEST_SET_MODE``:

    * falsy: the remainder is dropped;
    * truthy: the remainder is padded up to exactly ``batch_size`` rows by
      repeating its own rows, so no row is lost.

    Returns:
        A list of ``(bytearray, None)`` pairs, one per emitted batch.
    """
    step = batch_size if batch_size is not None else 1

    # Collect first and concatenate once: repeated DataFrame.append copies
    # the accumulated frame on every call and is gone in pandas 2.
    frames = [entry[0] for entry in rdds]
    if not frames:
        return []
    all_dataframes = pd.concat(frames)
    num_rows = len(all_dataframes.index)

    examples = []
    for start_ind in range(0, num_rows, step):
        # ``iloc`` clamps at the end, so only the last slice can be short.
        csv_slice = all_dataframes.iloc[start_ind:start_ind + step]
        if len(csv_slice) < step:
            if not TEST_SET_MODE:
                # drop the remainder
                print("dropping remainder: ", len(csv_slice))
                break
            remain_len = step - len(csv_slice)
            (m, n) = divmod(remain_len, len(csv_slice))
            print("remainder: ", len(csv_slice), remain_len, m, n)
            # The original rows, plus m whole copies, plus the first n rows:
            # len * (m + 1) + n == batch_size.  (Appending the frame to
            # itself m times doubled it each time and overshot for m >= 2.)
            csv_slice = pd.concat([csv_slice] * (m + 1) + [csv_slice.iloc[:n]])
            print("after fill remainder: ", len(csv_slice))
        examples.append(
            (bytearray((create_tf_example_spark(csv_slice, min_logs, max_logs)).SerializeToString()), None))
    return examples
# ---- Training set -----------------------------------------------------------
# With the flag off, _transform_to_tfrecords_from_reslice drops a trailing
# batch that cannot be filled.
# NOTE(review): the flag is a module-level global that is rebound further down
# before any save action runs, while the RDD pipelines are lazy. This relies on
# the value being captured when the closure is serialized -- confirm that the
# training pipeline really observes False.
TEST_SET_MODE = False
# Cut every partition into (dataframe_slice, row_count) pairs.
train_features = train_feature_vectors_integral_csv_rdd_df.coalesce(30).rdd.mapPartitions(_transform_to_slices)
# The slices are filtered twice below, hence the cache.
cached_train_features = train_features.cache()
train_full = cached_train_features.filter(lambda x: x[1] == batch_size)
# split out slices where we don't have a full batch so that we can reslice them so we only drop minimal rows
train_not_full = cached_train_features.filter(lambda x: x[1] < batch_size)
train_examples_full = train_full.mapPartitions(_transform_to_tfrecords_from_slices)
# Bring all partial slices into one partition so they can be re-batched together.
train_left = train_not_full.coalesce(1).mapPartitions(_transform_to_tfrecords_from_reslice)
all_train = train_examples_full.union(train_left)
# ---- Validation set ---------------------------------------------------------
# With the flag on, the reslice helper pads the last partial batch by repeating
# its rows instead of dropping it.
TEST_SET_MODE = True
valid_features = test_validation_feature_vectors_integral_csv_rdd_df.repartition(num_valid_partitions,
'display_id').rdd.mapPartitions(
_transform_to_slices)
cached_valid_features = valid_features.cache()
valid_full = cached_valid_features.filter(lambda x: x[1] == batch_size)
valid_not_full = cached_valid_features.filter(lambda x: x[1] < batch_size)
valid_examples_full = valid_full.mapPartitions(_transform_to_tfrecords_from_slices)
valid_left = valid_not_full.coalesce(1).mapPartitions(_transform_to_tfrecords_from_reslice)
all_valid = valid_examples_full.union(valid_left)
# ---- Output -----------------------------------------------------------------
# Each record is a (serialized example bytes, None) pair, written through the
# TFRecord Hadoop output format as BytesWritable key / NullWritable value.
all_train.saveAsNewAPIHadoopFile(LOCAL_DATA_TFRECORDS_DIR + train_output_string,
"org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
keyClass="org.apache.hadoop.io.BytesWritable",
valueClass="org.apache.hadoop.io.NullWritable")
all_valid.saveAsNewAPIHadoopFile(LOCAL_DATA_TFRECORDS_DIR + eval_output_string,
"org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
keyClass="org.apache.hadoop.io.BytesWritable",
valueClass="org.apache.hadoop.io.NullWritable")
spark.stop()

View File

@ -1,136 +0,0 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ---------------------------------------------------------------------------
# Single-column names
# ---------------------------------------------------------------------------
LABEL_COLUMN = "label"
DISPLAY_ID_COLUMN = "display_id"
IS_LEAK_COLUMN = "is_leak"
DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN = "display_ad_and_is_leak"

# ---------------------------------------------------------------------------
# Columns grouped by value type
# ---------------------------------------------------------------------------
CATEGORICAL_COLUMNS = [
    "ad_id",
    "campaign_id",
    "doc_id",
    "doc_event_id",
    "ad_advertiser",
    "doc_ad_source_id",
    "doc_ad_publisher_id",
    "doc_event_publisher_id",
    "doc_event_source_id",
    "event_country",
    "event_country_state",
    "event_geo_location",
    "event_platform",
]

# Intentionally empty mapping.
DOC_CATEGORICAL_MULTIVALUED_COLUMNS = {}

BOOL_COLUMNS = []

INT_COLUMNS = [
    "ad_views",
    "doc_views",
    "doc_event_days_since_published",
    "doc_ad_days_since_published",
]

# Float columns, split by the transform applied to them; only the
# "no transform" group is populated.
FLOAT_COLUMNS_LOG_BIN_TRANSFORM = []
FLOAT_COLUMNS_NO_TRANSFORM = [
    "pop_ad_id",
    "pop_document_id",
    "pop_publisher_id",
    "pop_advertiser_id",
    "pop_campain_id",
    "pop_source_id",
    "doc_event_doc_ad_sim_categories",
    "doc_event_doc_ad_sim_topics",
    "doc_event_doc_ad_sim_entities",
]
FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM = []
FLOAT_COLUMNS = (
    FLOAT_COLUMNS_LOG_BIN_TRANSFORM
    + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM
    + FLOAT_COLUMNS_NO_TRANSFORM
)

# ---------------------------------------------------------------------------
# Request-side columns
# ---------------------------------------------------------------------------
REQUEST_SINGLE_HOT_COLUMNS = [
    "doc_event_id",
    "doc_id",
    "doc_event_source_id",
    "event_geo_location",
    "event_country_state",
    "doc_event_publisher_id",
    "event_country",
    "event_hour",
    "event_platform",
    "traffic_source",
    "event_weekend",
    "user_has_already_viewed_doc",
]

REQUEST_MULTI_HOT_COLUMNS = [
    "doc_event_entity_id",
    "doc_event_topic_id",
    "doc_event_category_id",
]

REQUEST_NUMERIC_COLUMNS = [
    "pop_document_id_conf",
    "pop_publisher_id_conf",
    "pop_source_id_conf",
    "pop_entity_id_conf",
    "pop_topic_id_conf",
    "pop_category_id_conf",
    "pop_document_id",
    "pop_publisher_id",
    "pop_source_id",
    "pop_entity_id",
    "pop_topic_id",
    "pop_category_id",
    "user_views",
    "doc_views",
    "doc_event_days_since_published",
    "doc_event_hour",
]

# ---------------------------------------------------------------------------
# Item-side columns
# ---------------------------------------------------------------------------
ITEM_SINGLE_HOT_COLUMNS = [
    "ad_id",
    "campaign_id",
    "doc_ad_source_id",
    "ad_advertiser",
    "doc_ad_publisher_id",
]

ITEM_MULTI_HOT_COLUMNS = [
    "doc_ad_topic_id",
    "doc_ad_entity_id",
    "doc_ad_category_id",
]

ITEM_NUMERIC_COLUMNS = [
    "pop_ad_id_conf",
    "user_doc_ad_sim_categories_conf",
    "user_doc_ad_sim_topics_conf",
    "pop_advertiser_id_conf",
    "pop_ad_id",
    "pop_advertiser_id",
    "pop_campain_id",
    "user_doc_ad_sim_categories",
    "user_doc_ad_sim_topics",
    "user_doc_ad_sim_entities",
    "doc_event_doc_ad_sim_categories",
    "doc_event_doc_ad_sim_topics",
    "doc_event_doc_ad_sim_entities",
    "ad_views",
    "doc_ad_days_since_published",
]

# Request columns first, then item columns; within each side the order is
# single-hot, multi-hot, numeric.
NV_TRAINING_COLUMNS = [
    *REQUEST_SINGLE_HOT_COLUMNS,
    *REQUEST_MULTI_HOT_COLUMNS,
    *REQUEST_NUMERIC_COLUMNS,
    *ITEM_SINGLE_HOT_COLUMNS,
    *ITEM_MULTI_HOT_COLUMNS,
    *ITEM_NUMERIC_COLUMNS,
]

View File

@ -1,3 +1,5 @@
#!/bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@ -12,16 +14,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.12-tf2-py3
FROM ${FROM_IMAGE_NAME}
# Pick the local process index from the launcher environment (OpenMPI first,
# then SLURM, otherwise whatever LOCAL_RANK already holds) and use it as the
# only visible CUDA device. Nothing is touched when the caller has already
# set CUDA_VISIBLE_DEVICES. Kept POSIX-sh compatible.
if [ -z "${CUDA_VISIBLE_DEVICES:-}" ]; then
    LOCAL_RANK="${OMPI_COMM_WORLD_LOCAL_RANK:-${SLURM_LOCALID:-${LOCAL_RANK:-}}}"
    export CUDA_VISIBLE_DEVICES="${LOCAL_RANK}"
fi
USER root
RUN pip install --no-cache-dir --no-deps tensorflow-transform==0.24.1 tensorflow-metadata==0.14.0 pydot dill && \
pip install --no-cache-dir ipdb pynvml==8.0.4 && \
pip install --no-cache-dir -e git+https://github.com/NVIDIA/dllogger#egg=dllogger
WORKDIR /wd
COPY . .
exec "$@"

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 50 KiB

File diff suppressed because it is too large Load Diff

Before

Width:  |  Height:  |  Size: 62 KiB

File diff suppressed because it is too large Load Diff

Before

Width:  |  Height:  |  Size: 67 KiB

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 44 KiB

File diff suppressed because it is too large Load Diff

Before

Width:  |  Height:  |  Size: 53 KiB

File diff suppressed because it is too large Load Diff

Before

Width:  |  Height:  |  Size: 70 KiB

File diff suppressed because it is too large Load Diff

Before

Width:  |  Height:  |  Size: 51 KiB

File diff suppressed because it is too large Load Diff

Before

Width:  |  Height:  |  Size: 67 KiB

File diff suppressed because it is too large Load Diff

After

Width:  |  Height:  |  Size: 60 KiB

File diff suppressed because it is too large Load Diff

Before

Width:  |  Height:  |  Size: 74 KiB

File diff suppressed because it is too large Load Diff

Before

Width:  |  Height:  |  Size: 72 KiB

View File

@ -12,8 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
os.environ["TF_MEMORY_ALLOCATION"] = "0.6" # fraction of free memory
import nvtabular as nvt
from trainer.model.widedeep import wide_deep_model
from trainer.run import train, evaluate
from trainer.run import run
from trainer.utils.arguments import parse_args
from trainer.utils.setup import create_config
@ -21,13 +26,9 @@ from trainer.utils.setup import create_config
def main():
args = parse_args()
config = create_config(args)
model = wide_deep_model(args)
if args.evaluate:
evaluate(args, model, config)
else:
train(args, model, config)
model, _ = wide_deep_model(args)
run(args, model, config)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@ -80,13 +80,13 @@ if ! [ "$gpu" -ge 0 ] || [[ ! "$gpu" =~ ^(1|4|8)$ ]] 2>/dev/null; then
exit 1
fi
cmd="mpiexec --allow-run-as-root --bind-to socket -np ${gpu} \
cmd="horovodrun -np ${gpu} sh hvd_wrapper.sh \
python main.py \
--evaluate \
--benchmark \
--benchmark_warmup_steps 500 \
--benchmark_steps 1000 \
-eval_batch_size ${bs} \
--eval_batch_size ${bs} \
${amp} \
${xla}"

View File

@ -1,30 +0,0 @@
#!/bin/bash -e
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Print the current "MemAvailable" value (kB) from /proc/meminfo.
available_kb() {
    awk '/MemAvailable:/ { print $2; exit }' /proc/meminfo
}

# Baseline taken at start-up; consumption is reported relative to it.
mFOffset=$(available_kb)
minFreeMem=$mFOffset

# Create the report file up front so a reader never finds it missing (or
# holding a value from an earlier run) when available memory never drops
# below the baseline.
echo 0 > mem_consumption.txt

# Poll once per second and record the largest drop below the baseline seen
# so far. The loop never ends on its own; the caller must stop the script.
while true; do
    mF=$(available_kb)
    if [ "$minFreeMem" -gt "$mF" ]; then
        minFreeMem=$mF
        memConsumed=$((mFOffset - mF))
        echo "$memConsumed" > mem_consumption.txt
    fi
    sleep 1
done

View File

@ -17,7 +17,7 @@
set -e
function usage() {
echo "Usage: bash scripts/preproc.sh nvtabular/spark [tfrecords]"
echo "Usage: bash scripts/preproc.sh"
}
if [ ! -d "scripts" ] || [ ! "$(ls -A 'scripts')" ]; then
@ -26,35 +26,4 @@ if [ ! -d "scripts" ] || [ ! "$(ls -A 'scripts')" ]; then
exit 1
fi
if [ $# -ne 1 ] && [ $# -ne 2 ]; then
usage
exit 1
fi
tfrecords=${2:-40}
if ! [ "$tfrecords" -ge 0 ] 2>/dev/null; then
echo "Expected tfrecords (${tfrecords}) to be positive integer"
usage
exit 1
fi
case "$1" in
nvtabular)
time python -m data.outbrain.nvtabular.preproc --workers "${tfrecords}"
;;
spark)
echo "Starting preprocessing 1/3..."
time python data/outbrain/spark/preproc1.py
echo "Starting preprocessing 2/3..."
time python data/outbrain/spark/preproc2.py
echo "Starting preprocessing 3/3..."
time python data/outbrain/spark/preproc3.py --num_train_partitions "${tfrecords}" --num_valid_partitions "${tfrecords}"
;;
*)
usage
exit 1
;;
esac
time python -m data.outbrain.nvtabular.preproc

View File

@ -1,119 +0,0 @@
#!/bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e

# Directories wiped between benchmark iterations.
OUTBRAIN_DIR='/outbrain'
SPARK_DIR='/tmp/spark'

# Print the command-line help, one line per option.
usage() {
    printf '%s\n' \
        'Usage: bash scripts/preproc_benchmark.sh -m nvtabular/spark' \
        '-m | --mode (Required) Preprocessing to be executed from [nvtabular, spark].' \
        '-t | --tfrecords (Optional) Number of tfrecords to be created, default 40.' \
        '-i | --iteration (Optional) Number of benchmark iterations, default 10.'
}
# The script uses paths relative to the repository root, so refuse to run
# from anywhere the "scripts" directory is missing or empty.
if [ ! -d "scripts" ] || [ ! "$(ls -A 'scripts')" ]; then
    echo "You are probably calling this script from wrong directory"
    usage
    exit 1
fi

# Defaults; overridden by the options parsed below.
mode=
iter=10
tfrecords=40

while [ "$1" != "" ]; do
    case $1 in
    -m | --mode | -t | --tfrecords | -i | --iteration)
        # Every option takes a value. Without this check a trailing option
        # made the second "shift" fail and, under "set -e", the script
        # exited without any message.
        if [ $# -lt 2 ]; then
            echo "Missing value for option $1"
            usage
            exit 1
        fi
        case $1 in
        -m | --mode) mode="$2" ;;
        -t | --tfrecords) tfrecords="$2" ;;
        -i | --iteration) iter="$2" ;;
        esac
        shift
        ;;
    *)
        usage
        exit 1
        ;;
    esac
    shift
done

if [ -z "$mode" ]; then
    echo "Missing preprocessing mode"
    usage
    exit 1
fi

if [[ ! "$mode" =~ ^(spark|nvtabular)$ ]]; then
    echo "Expected mode (${mode}) to be equal spark or nvtabular"
    usage
    exit 1
fi

# The numeric test doubles as an "is an integer" check: a non-numeric value
# makes "[" fail (its error output is silenced). Zero is rejected as well,
# matching the "positive integer" message.
if ! [ "$tfrecords" -gt 0 ] 2>/dev/null; then
    echo "Expected tfrecords (${tfrecords}) to be positive integer"
    usage
    exit 1
fi

# Zero iterations would later divide by zero when averaging the run time.
if ! [ "$iter" -gt 0 ] 2>/dev/null; then
    echo "Expected iteration (${iter}) to be positive integer"
    usage
    exit 1
fi
# Remove the output directories for the given mode ("nvtabular" or "spark").
# Any other value is a no-op.
clean() {
    local target_mode="$1"
    if [ "$target_mode" = "nvtabular" ]; then
        rm -rf "$OUTBRAIN_DIR/data" "$OUTBRAIN_DIR/tfrecords"
    elif [ "$target_mode" = "spark" ]; then
        rm -rf "$SPARK_DIR" "$OUTBRAIN_DIR/tfrecords"
    fi
}
# PID of the memory monitor belonging to the iteration in progress.
monitor_pid=

# Stop the background memory monitor, if one is running. memscript.sh loops
# forever, so without this one extra monitor per iteration kept running
# (and kept rewriting mem_consumption.txt) after the benchmark finished.
stop_monitor() {
    if [ -n "$monitor_pid" ]; then
        kill "$monitor_pid" 2>/dev/null || true
        wait "$monitor_pid" 2>/dev/null || true
        monitor_pid=
    fi
}
# Also covers the "set -e" abort path when preprocessing fails.
trap stop_monitor EXIT

SECONDS=0

for i in $(seq 1 "$iter"); do
    echo "[BENCHMARK] Cleaning directories"
    clean "${mode}"
    echo "[BENCHMARK] Running iteration ${i}"
    bash scripts/memscript.sh &
    monitor_pid=$!
    bash scripts/preproc.sh "${mode}" "${tfrecords}"
    stop_monitor
    echo "[BENCHMARK] Memory consumption during iteration ${i} (kB): $(cat mem_consumption.txt)"
done

# Freeze the elapsed time so every figure below is derived from one value.
total_seconds=$SECONDS

echo -e "\n[BENCHMARK] Benchmark finished:\n"
echo "[BENCHMARK] Memory consumption (kB): $(cat mem_consumption.txt)"
rm -f mem_consumption.txt
echo "[BENCHMARK] Mode=${mode}"
echo "[BENCHMARK] Iteration=${iter}"
echo "[BENCHMARK] Tfrecords=${tfrecords}"

# Rounded average; guarded so a zero iteration count cannot divide by zero.
if [ "$iter" -gt 0 ]; then
    AVG_SECONDS=$(( (total_seconds + iter / 2) / iter ))
else
    AVG_SECONDS=0
fi
printf '[BENCHMARK] Total time elapsed: %dh:%dm:%ds\n' $((total_seconds / 3600)) $((total_seconds % 3600 / 60)) $((total_seconds % 60))
printf '[BENCHMARK] Average iteration time: %dh:%dm:%ds\n\n' $((AVG_SECONDS / 3600)) $((AVG_SECONDS % 3600 / 60)) $((AVG_SECONDS % 60))

View File

@ -68,7 +68,7 @@ if ! [ "$gpu" -ge 0 ] || [[ ! "$gpu" =~ ^(1|4|8)$ ]] 2>/dev/null; then
exit 1
fi
cmd="mpiexec --allow-run-as-root --bind-to socket -np ${gpu} \
cmd="horovodrun -np ${gpu} sh hvd_wrapper.sh \
python main.py \
--benchmark \
--benchmark_warmup_steps 500 \

View File

@ -68,7 +68,7 @@ if ! [ "$gpu" -ge 0 ] || [[ ! "$gpu" =~ ^(1|4|8)$ ]] 2>/dev/null; then
exit 1
fi
cmd="mpiexec --allow-run-as-root --bind-to socket -np ${gpu} \
cmd="horovodrun -np ${gpu} sh hvd_wrapper.sh \
python main.py \
${amp} \
${xla}"

View File

@ -1,167 +0,0 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import tensorflow as tf
from tensorflow.python.feature_column import feature_column_v2 as fc
def _sort_columns(feature_columns):
    """Return a new list of the given columns ordered by their ``name``."""

    def _name_of(column):
        return column.name

    ordered = list(feature_columns)
    ordered.sort(key=_name_of)
    return ordered
def _validate_numeric_column(feature_column):
    """Check that a numeric column is a scalar, i.e. has shape ``(1,)``.

    Returns:
        ``None`` when the column is acceptable, otherwise an error message
        naming the column ``key`` and its shape.
    """
    shape = feature_column.shape
    if len(shape) > 1:
        template = "Matrix numeric utils are not allowed, found feature {} with shape {}"
    elif shape[0] != 1:
        template = "Vector numeric utils are not allowed, found feature {} with shape {}"
    else:
        return None
    return template.format(feature_column.key, shape)
def _validate_categorical_column(feature_column):
    """Accept only identity categorical columns.

    Returns:
        ``None`` for an ``fc.IdentityCategoricalColumn``, otherwise an error
        message naming the column and its type.
    """
    if isinstance(feature_column, fc.IdentityCategoricalColumn):
        return None
    template = (
        "Only acceptable categorical columns for feeding "
        "embeddings are identity, found column {} of type {}. "
        "Consider using NVTabular online preprocessing to perform "
        "categorical transformations"
    )
    return template.format(feature_column.name, type(feature_column).__name__)
def _validate_dense_feature_columns(feature_columns):
    """Validate every column passed to ``ScalarDenseFeatures``.

    Bare categorical columns (bucketized ones included) are rejected.
    Embedding and indicator columns must wrap an identity categorical
    column, and numeric columns must be scalar.  Columns of any other type
    are not checked.

    Raises:
        ValueError: if at least one column is rejected.  The message lists
            every problem found, one per line.
    """
    errors = []
    for feature_column in feature_columns:
        if isinstance(feature_column, fc.CategoricalColumn):
            if isinstance(feature_column, fc.BucketizedColumn):
                errors.append(
                    "Found bucketized column {}. ScalarDenseFeatures layer "
                    "cannot apply bucketization preprocessing. Consider using "
                    "NVTabular to do preprocessing offline".format(feature_column.name)
                )
            else:
                errors.append(
                    "All feature columns must be dense, found categorical "
                    "column {} of type {}. Please wrap categorical columns "
                    "in embedding or indicator columns before passing".format(
                        feature_column.name, type(feature_column).__name__
                    )
                )
        elif isinstance(feature_column, (fc.EmbeddingColumn, fc.IndicatorColumn)):
            errors.append(_validate_categorical_column(feature_column.categorical_column))
        elif isinstance(feature_column, fc.NumericColumn):
            errors.append(_validate_numeric_column(feature_column))

    # The per-column validators return None for acceptable columns.
    errors = [error for error in errors if error is not None]
    if errors:
        # Raise the formatted message. Previously the message was built and
        # then discarded in favour of the raw list, and the first error was
        # glued directly onto the header.
        header = "Found issues with columns passed to ScalarDenseFeatures:"
        raise ValueError("\n\t".join([header] + errors))
def _validate_stack_dimensions(feature_columns):
    """Ensure all columns produce the same width, as ``stack`` requires.

    The width is ``dimension`` for embedding columns, the wrapped column's
    ``num_buckets`` for indicator columns and ``shape[0]`` for anything else.

    Raises:
        ValueError: if the widths are not all equal.
    """

    def _width_of(column):
        if isinstance(column, fc.EmbeddingColumn):
            return column.dimension
        if isinstance(column, fc.IndicatorColumn):
            return column.categorical_column.num_buckets
        return column.shape[0]

    dims = [_width_of(column) for column in feature_columns]
    reference = dims[0]
    for dim in dims[1:]:
        if dim != reference:
            listing = ", ".join(map(str, dims))
            raise ValueError(
                "'stack' aggregation requires all categorical "
                "embeddings and continuous utils to have same "
                "size. Found dimensions {}".format(listing)
            )
class ScalarDenseFeatures(tf.keras.layers.Layer):
    """Keras layer that turns a dict of scalar inputs into one dense tensor.

    Numeric columns are passed through unchanged.  Embedding and indicator
    columns are resolved by a row lookup in a per-feature table: a trainable
    table for embedding columns and a fixed identity matrix (one-hot rows)
    for indicator columns.  The per-feature results are then concatenated or
    stacked along axis 1.

    Args:
        feature_columns: columns to process; they are sorted by name and
            validated on construction.
        aggregation: ``"concat"`` or ``"stack"``.  ``"stack"`` additionally
            requires every column to produce the same width.
        name: optional layer name.
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, feature_columns, aggregation="concat", name=None, **kwargs):
        feature_columns = _sort_columns(feature_columns)
        _validate_dense_feature_columns(feature_columns)

        assert aggregation in ("concat", "stack")
        if aggregation == "stack":
            _validate_stack_dimensions(feature_columns)

        self.feature_columns = feature_columns
        self.aggregation = aggregation
        super(ScalarDenseFeatures, self).__init__(name=name, **kwargs)

    @staticmethod
    def _output_dim(feature_column):
        """Return the width one column contributes to the output."""
        if isinstance(feature_column, fc.EmbeddingColumn):
            return feature_column.dimension
        if isinstance(feature_column, fc.IndicatorColumn):
            return feature_column.categorical_column.num_buckets
        return feature_column.shape[0]

    def build(self, input_shapes):
        """Create one lookup table per embedding/indicator column."""
        # Every input must be a single scalar per example.
        assert all(shape[1] == 1 for shape in input_shapes.values())

        self.embedding_tables = {}
        for feature_column in self.feature_columns:
            if isinstance(feature_column, fc.NumericColumn):
                continue

            feature_name = feature_column.categorical_column.key
            num_buckets = feature_column.categorical_column.num_buckets
            if isinstance(feature_column, fc.EmbeddingColumn):
                self.embedding_tables[feature_name] = self.add_weight(
                    name="{}/embedding_weights".format(feature_name),
                    trainable=True,
                    initializer="glorot_normal",
                    shape=(num_buckets, feature_column.dimension),
                )
            else:
                # Frozen identity matrix: gathering row i yields the one-hot
                # encoding of category i.
                self.embedding_tables[feature_name] = self.add_weight(
                    name="{}/embedding_weights".format(feature_name),
                    trainable=False,
                    initializer=tf.constant_initializer(np.eye(num_buckets)),
                    shape=(num_buckets, num_buckets),
                )
        self.built = True

    def call(self, inputs):
        """Look up or pass through each feature and aggregate the results."""
        features = []
        for feature_column in self.feature_columns:
            if isinstance(feature_column, fc.NumericColumn):
                features.append(inputs[feature_column.name])
            else:
                # Use the same identifier the tables were registered under
                # in build().
                feature_name = feature_column.categorical_column.key
                table = self.embedding_tables[feature_name]
                embeddings = tf.gather(table, inputs[feature_name][:, 0])
                features.append(embeddings)

        if self.aggregation == "stack":
            return tf.stack(features, axis=1)
        return tf.concat(features, axis=1)

    def compute_output_shape(self, input_shapes):
        """Return the output shape for the given dict of input shapes.

        ``concat`` yields ``(batch, sum of column widths)``; ``stack`` yields
        ``(batch, number of columns, column width)``.
        """
        # Derived from feature_columns: the attributes read here before
        # (numeric_features / embedding_shapes) are never set on this class,
        # so the method always raised AttributeError.
        batch_dim = list(input_shapes.values())[0][0]
        dims = [self._output_dim(column) for column in self.feature_columns]
        if self.aggregation == "concat":
            return (batch_dim, sum(dims))
        return (batch_dim, len(dims), dims[0])

    def get_config(self):
        """Return the constructor arguments specific to this layer."""
        return {
            "feature_columns": self.feature_columns,
            "aggregation": self.aggregation,
        }

View File

@ -13,65 +13,79 @@
# limitations under the License.
import tensorflow as tf
from data.outbrain.features import get_feature_columns, NUMERIC_COLUMNS, EMBEDDING_TABLE_SHAPES
from trainer.model.layers import ScalarDenseFeatures
from data.outbrain.features import (
CATEGORICAL_COLUMNS,
NUMERIC_COLUMNS,
get_feature_columns,
)
from nvtabular.framework_utils.tensorflow import layers as nvtlayers
def wide_deep_model(args):
def get_inputs_columns():
wide_columns, deep_columns = get_feature_columns()
wide_weighted_outputs = []
numeric_dense_inputs = []
wide_columns_dict = {}
deep_columns_dict = {}
features = {}
for col in wide_columns:
features[col.key] = tf.keras.Input(shape=(1,),
batch_size=None,
name=col.key,
dtype=tf.float32 if col.key in NUMERIC_COLUMNS else tf.int32,
sparse=False)
features[col.key] = tf.keras.Input(
shape=(1,),
batch_size=None,
name=col.key,
dtype=tf.float32 if col.key in NUMERIC_COLUMNS else tf.int32,
sparse=False,
)
wide_columns_dict[col.key] = col
for col in deep_columns:
is_embedding_column = ('key' not in dir(col))
is_embedding_column = "key" not in dir(col)
key = col.categorical_column.key if is_embedding_column else col.key
if key not in features:
features[key] = tf.keras.Input(shape=(1,),
batch_size=None,
name=key,
dtype=tf.float32 if col.key in NUMERIC_COLUMNS else tf.int32,
sparse=False)
features[key] = tf.keras.Input(
shape=(1,),
batch_size=None,
name=key,
dtype=tf.float32 if col.key in NUMERIC_COLUMNS else tf.int32,
sparse=False,
)
deep_columns_dict[key] = col
for key in wide_columns_dict:
if key in EMBEDDING_TABLE_SHAPES:
wide_weighted_outputs.append(tf.keras.layers.Flatten()(tf.keras.layers.Embedding(
EMBEDDING_TABLE_SHAPES[key][0], 1, input_length=1)(features[key])))
else:
numeric_dense_inputs.append(features[key])
categorical_output_contrib = tf.keras.layers.add(wide_weighted_outputs,
name='categorical_output')
numeric_dense_tensor = tf.keras.layers.concatenate(
numeric_dense_inputs, name='numeric_dense')
deep_columns = list(deep_columns_dict.values())
wide_columns = list(wide_columns_dict.values())
dnn = ScalarDenseFeatures(deep_columns, name='deep_embedded')(features)
return deep_columns, wide_columns, features
def wide_deep_model(args):
deep_columns, wide_columns, features = get_inputs_columns()
wide = nvtlayers.LinearFeatures(wide_columns, name="wide_linear")(features)
dnn = nvtlayers.DenseFeatures(deep_columns, name="deep_embedded")(features)
for unit_size in args.deep_hidden_units:
dnn = tf.keras.layers.Dense(units=unit_size, activation='relu')(dnn)
dnn = tf.keras.layers.Dense(units=unit_size, activation="relu")(dnn)
dnn = tf.keras.layers.Dropout(rate=args.deep_dropout)(dnn)
dnn = tf.keras.layers.Dense(units=1)(dnn)
dnn_model = tf.keras.Model(inputs=features,
outputs=dnn)
linear_output = categorical_output_contrib + tf.keras.layers.Dense(1)(numeric_dense_tensor)
linear_model = tf.keras.Model(inputs=features,
outputs=linear_output)
dnn_model = tf.keras.Model(inputs=features, outputs=dnn)
linear_model = tf.keras.Model(inputs=features, outputs=wide)
model = tf.keras.experimental.WideDeepModel(
linear_model, dnn_model, activation='sigmoid')
linear_model, dnn_model, activation="sigmoid"
)
return model
return model, features
def get_dummy_inputs(batch_size):
    """Build an all-zero input dict with one ``(batch_size, 1)`` tensor per column.

    Columns in ``CATEGORICAL_COLUMNS`` get int32 tensors and columns in
    ``NUMERIC_COLUMNS`` get float32 tensors.
    """
    dummy_shape = (batch_size, 1)
    inputs = {
        name: tf.zeros(dummy_shape, dtype=tf.dtypes.int32)
        for name in CATEGORICAL_COLUMNS
    }
    inputs.update(
        {
            name: tf.zeros(dummy_shape, dtype=tf.dtypes.float32)
            for name in NUMERIC_COLUMNS
        }
    )
    return inputs

View File

@ -12,398 +12,70 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import time
import dllogger
import horovod.tensorflow as hvd
import numpy as np
import tensorflow as tf
from data.outbrain.features import DISPLAY_ID_COLUMN
from tensorflow.python.keras import backend as K
from trainer.utils.schedulers import get_schedule
from trainer.utils.benchmark import ThroughputCalculator
from trainer.utils.evaluator import Evaluator
from trainer.utils.schedulers import LearningRateScheduler
from trainer.utils.trainer import Trainer
def train(args, model, config):
logger = logging.getLogger('tensorflow')
def run(args, model, config):
train_dataset = config["train_dataset"]
eval_dataset = config["eval_dataset"]
steps_per_epoch = len(train_dataset)
train_dataset = config['train_dataset']
eval_dataset = config['eval_dataset']
steps = int(config['steps_per_epoch'])
schedule = get_schedule(
steps = int(steps_per_epoch * args.num_epochs)
deep_optimizer = tf.keras.optimizers.RMSprop(
learning_rate=args.deep_learning_rate, rho=0.5
)
wide_optimizer = tf.keras.optimizers.Ftrl(learning_rate=args.linear_learning_rate)
if not args.cpu:
deep_optimizer = hvd.DistributedOptimizer(deep_optimizer)
wide_optimizer = hvd.DistributedOptimizer(wide_optimizer)
if args.amp:
deep_optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
deep_optimizer, dynamic=True
)
wide_optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
wide_optimizer, dynamic=True
)
scheduler = LearningRateScheduler(
args=args, steps_per_epoch=steps_per_epoch, optimizer=deep_optimizer
)
throughput_calculator = ThroughputCalculator(args)
compiled_loss = tf.keras.losses.BinaryCrossentropy()
evaluator = Evaluator(
model=model,
throughput_calculator=throughput_calculator,
eval_dataset=eval_dataset,
compiled_loss=compiled_loss,
steps=steps,
args=args,
steps_per_epoch=steps
)
writer = tf.summary.create_file_writer(os.path.join(args.model_dir, 'event_files'))
deep_optimizer = tf.keras.optimizers.RMSprop(
learning_rate=args.deep_learning_rate,
rho=0.5
)
wide_optimizer = tf.keras.optimizers.Ftrl(
learning_rate=args.linear_learning_rate
)
compiled_loss = tf.keras.losses.BinaryCrossentropy()
eval_loss = tf.keras.metrics.Mean()
metrics = [
tf.keras.metrics.BinaryAccuracy(),
tf.keras.metrics.AUC()
]
current_step_var = tf.Variable(0, trainable=False, dtype=tf.int64)
display_id_counter = tf.Variable(0., trainable=False, dtype=tf.float64)
streaming_map = tf.Variable(0., name='STREAMING_MAP', trainable=False, dtype=tf.float64)
checkpoint = tf.train.Checkpoint(
trainer = Trainer(
model=model,
scheduler=scheduler,
deep_optimizer=deep_optimizer,
wide_optimizer=wide_optimizer,
model=model,
current_step=current_step_var
)
manager = tf.train.CheckpointManager(
checkpoint=checkpoint,
directory=os.path.join(args.model_dir, 'checkpoint'),
max_to_keep=1
throughput_calculator=throughput_calculator,
compiled_loss=compiled_loss,
steps=steps,
args=args,
train_dataset=train_dataset,
evaluator=evaluator,
)
if args.use_checkpoint:
checkpoint.restore(manager.latest_checkpoint)
if manager.latest_checkpoint:
logger.warning(f'Model restored from checkpoint {args.model_dir}')
if args.benchmark:
current_step_var.assign(0)
else:
logger.warning(f'Failed to restore model from checkpoint {args.model_dir}')
trainer.maybe_restore_checkpoint()
if args.amp:
deep_optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
deep_optimizer,
loss_scale='dynamic'
)
wide_optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
wide_optimizer,
loss_scale='dynamic'
)
@tf.function
def train_step(x, y, first_batch):
with tf.GradientTape(persistent=True) as tape:
y_pred = model(x, training=True)
loss = compiled_loss(y, y_pred)
linear_loss = wide_optimizer.get_scaled_loss(loss) if args.amp else loss
deep_loss = deep_optimizer.get_scaled_loss(loss) if args.amp else loss
if not args.cpu:
tape = hvd.DistributedGradientTape(tape)
for metric in metrics:
metric.update_state(y, y_pred)
linear_vars = model.linear_model.trainable_variables
dnn_vars = model.dnn_model.trainable_variables
linear_grads = tape.gradient(linear_loss, linear_vars)
dnn_grads = tape.gradient(deep_loss, dnn_vars)
if args.amp:
linear_grads = wide_optimizer.get_unscaled_gradients(linear_grads)
dnn_grads = deep_optimizer.get_unscaled_gradients(dnn_grads)
wide_optimizer.apply_gradients(zip(linear_grads, linear_vars))
deep_optimizer.apply_gradients(zip(dnn_grads, dnn_vars))
if first_batch and not args.cpu:
hvd.broadcast_variables(model.linear_model.variables, root_rank=0)
hvd.broadcast_variables(model.dnn_model.variables, root_rank=0)
hvd.broadcast_variables(wide_optimizer.variables(), root_rank=0)
hvd.broadcast_variables(deep_optimizer.variables(), root_rank=0)
return loss
@tf.function
def evaluation_step(x, y):
predictions = model(x, training=False)
loss = compiled_loss(y, predictions)
for metric in metrics:
metric.update_state(y, predictions)
predictions = tf.reshape(predictions, [-1])
predictions = tf.cast(predictions, tf.float64)
display_ids = x[DISPLAY_ID_COLUMN]
display_ids = tf.reshape(display_ids, [-1])
labels = tf.reshape(y, [-1])
sorted_ids = tf.argsort(display_ids)
display_ids = tf.gather(display_ids, indices=sorted_ids)
predictions = tf.gather(predictions, indices=sorted_ids)
labels = tf.gather(labels, indices=sorted_ids)
_, display_ids_idx, display_ids_ads_count = tf.unique_with_counts(display_ids, out_idx=tf.int64)
pad_length = 30 - tf.reduce_max(display_ids_ads_count)
preds = tf.RaggedTensor.from_value_rowids(predictions, display_ids_idx).to_tensor()
labels = tf.RaggedTensor.from_value_rowids(labels, display_ids_idx).to_tensor()
labels_mask = tf.math.reduce_max(labels, 1)
preds_masked = tf.boolean_mask(preds, labels_mask)
labels_masked = tf.boolean_mask(labels, labels_mask)
labels_masked = tf.argmax(labels_masked, axis=1, output_type=tf.int32)
labels_masked = tf.reshape(labels_masked, [-1, 1])
preds_masked = tf.pad(preds_masked, [(0, 0), (0, pad_length)])
_, predictions_idx = tf.math.top_k(preds_masked, 12)
indices = tf.math.equal(predictions_idx, labels_masked)
indices_mask = tf.math.reduce_any(indices, 1)
masked_indices = tf.boolean_mask(indices, indices_mask)
res = tf.argmax(masked_indices, axis=1)
ap_matrix = tf.divide(1, tf.add(res, 1))
ap_sum = tf.reduce_sum(ap_matrix)
shape = tf.cast(tf.shape(indices)[0], tf.float64)
display_id_counter.assign_add(shape)
streaming_map.assign_add(ap_sum)
return loss
t0 = None
t_batch = None
with writer.as_default():
for epoch in range(1, args.num_epochs + 1):
for step, (x, y) in enumerate(train_dataset):
current_step = np.asscalar(current_step_var.numpy())
schedule(optimizer=deep_optimizer, current_step=current_step)
for metric in metrics:
metric.reset_states()
loss = train_step(x, y, epoch == 1 and step == 0)
if args.cpu or hvd.rank() == 0:
for metric in metrics:
tf.summary.scalar(f'{metric.name}', metric.result(), step=current_step)
tf.summary.scalar('loss', loss, step=current_step)
tf.summary.scalar('schedule', K.get_value(deep_optimizer.lr), step=current_step)
writer.flush()
if args.benchmark:
boundary = max(args.benchmark_warmup_steps, 1)
if current_step == boundary:
t0 = time.time()
if current_step > boundary:
batch_time = time.time() - t_batch
samplesps = args.global_batch_size / batch_time
dllogger.log(data={'batch_samplesps': samplesps}, step=(1, current_step))
if args.benchmark_steps <= current_step:
train_time = time.time() - t0
epochs = args.benchmark_steps - max(args.benchmark_warmup_steps, 1)
train_throughput = (args.global_batch_size * epochs) / train_time
dllogger.log(
data={'train_throughput': train_throughput},
step=tuple()
)
return
else:
if current_step % 100 == 0:
train_data = {metric.name: f'{metric.result().numpy():.4f}' for metric in metrics}
train_data['loss'] = f'{loss.numpy():.4f}'
dllogger.log(data=train_data, step=(current_step, args.num_epochs * steps))
if step == steps:
break
current_step_var.assign_add(1)
t_batch = time.time()
if args.benchmark:
continue
for metric in metrics:
metric.reset_states()
eval_loss.reset_states()
for step, (x, y) in enumerate(eval_dataset):
loss = evaluation_step(x, y)
eval_loss.update_state(loss)
map_metric = tf.divide(streaming_map, display_id_counter) if args.cpu else \
hvd.allreduce(tf.divide(streaming_map, display_id_counter))
map_metric = map_metric.numpy()
eval_loss_reduced = eval_loss.result() if args.cpu else \
hvd.allreduce(eval_loss.result())
metrics_reduced = {
f'{metric.name}_val': metric.result() if args.cpu else
hvd.allreduce(metric.result()) for metric in metrics
}
for name, result in metrics_reduced.items():
tf.summary.scalar(f'{name}', result, step=steps * epoch)
tf.summary.scalar('loss_val', eval_loss_reduced, step=steps * epoch)
tf.summary.scalar('map_val', map_metric, step=steps * epoch)
writer.flush()
eval_data = {name: f'{result.numpy():.4f}' for name, result in metrics_reduced.items()}
eval_data.update({
'loss_val': f'{eval_loss_reduced.numpy():.4f}',
'streaming_map_val': f'{map_metric:.4f}'
})
dllogger.log(data=eval_data, step=(steps * epoch, args.num_epochs * steps))
if args.cpu or hvd.rank() == 0:
manager.save()
display_id_counter.assign(0)
streaming_map.assign(0)
if args.cpu or hvd.rank() == 0:
dllogger.log(data=eval_data, step=tuple())
def evaluate(args, model, config):
# Evaluate `model` on config['eval_dataset'] and report the results through dllogger.
#
# args:   parsed command-line namespace (reads amp, cpu, benchmark*, eval_batch_size,
#         use_checkpoint, model_dir and the two learning rates).
# model:  callable Keras model invoked as model(x, training=False).
# config: dict providing the 'eval_dataset' iterable of (x, y) batches.
#
# Returns None. In benchmark mode the function returns early once
# args.benchmark_steps is reached, after logging 'validation_throughput'.
logger = logging.getLogger('tensorflow')
# The optimizers are never used to apply gradients in this function; they are
# only attached to the tf.train.Checkpoint object created below.
deep_optimizer = tf.keras.optimizers.RMSprop(
learning_rate=args.deep_learning_rate,
rho=0.5
)
wide_optimizer = tf.keras.optimizers.Ftrl(
learning_rate=args.linear_learning_rate
)
compiled_loss = tf.keras.losses.BinaryCrossentropy()
eval_loss = tf.keras.metrics.Mean()
metrics = [
tf.keras.metrics.BinaryAccuracy(),
tf.keras.metrics.AUC()
]
# With AMP enabled both optimizers are wrapped in dynamic loss scaling.
if args.amp:
deep_optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
deep_optimizer,
loss_scale='dynamic'
)
wide_optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
wide_optimizer,
loss_scale='dynamic'
)
# Python-side batch counter used by the benchmark timing logic below.
current_step = 0
current_step_var = tf.Variable(0, trainable=False, dtype=tf.int64)
# Running accumulators updated by evaluation_step: number of scored displays
# and the sum of their per-display precision scores.
display_id_counter = tf.Variable(0., trainable=False, dtype=tf.float64)
streaming_map = tf.Variable(0., name='STREAMING_MAP', trainable=False, dtype=tf.float64)
checkpoint = tf.train.Checkpoint(
deep_optimizer=deep_optimizer,
wide_optimizer=wide_optimizer,
model=model,
current_step=current_step_var
)
manager = tf.train.CheckpointManager(
checkpoint=checkpoint,
directory=os.path.join(args.model_dir, 'checkpoint'),
max_to_keep=1
)
# restore() is called even when no checkpoint exists; the warning below
# reports which of the two cases happened.
if args.use_checkpoint:
checkpoint.restore(manager.latest_checkpoint).expect_partial()
if manager.latest_checkpoint:
logger.warning(f'Model restored from checkpoint {args.model_dir}')
else:
logger.warning(f'Failed to restore model from checkpoint {args.model_dir}')
# One evaluation batch: forward pass, loss, Keras metric updates and the
# streaming MAP accumulation. Returns the batch loss.
@tf.function
def evaluation_step(x, y):
predictions = model(x, training=False)
loss = compiled_loss(y, predictions)
for metric in metrics:
metric.update_state(y, predictions)
predictions = tf.reshape(predictions, [-1])
predictions = tf.cast(predictions, tf.float64)
display_ids = x[DISPLAY_ID_COLUMN]
display_ids = tf.reshape(display_ids, [-1])
labels = tf.reshape(y, [-1])
# Sort everything by display id so rows of the same display are contiguous.
sorted_ids = tf.argsort(display_ids)
display_ids = tf.gather(display_ids, indices=sorted_ids)
predictions = tf.gather(predictions, indices=sorted_ids)
labels = tf.gather(labels, indices=sorted_ids)
_, display_ids_idx, display_ids_ads_count = tf.unique_with_counts(display_ids, out_idx=tf.int64)
# NOTE(review): assumes no display has more than 30 rows; otherwise pad_length is negative - confirm.
pad_length = 30 - tf.reduce_max(display_ids_ads_count)
# One dense row per display id.
preds = tf.RaggedTensor.from_value_rowids(predictions, display_ids_idx).to_tensor()
labels = tf.RaggedTensor.from_value_rowids(labels, display_ids_idx).to_tensor()
# Keep only the displays selected by the row-wise maximum of the labels.
labels_mask = tf.math.reduce_max(labels, 1)
preds_masked = tf.boolean_mask(preds, labels_mask)
labels_masked = tf.boolean_mask(labels, labels_mask)
# Column index of the largest label in each remaining display.
labels_masked = tf.argmax(labels_masked, axis=1, output_type=tf.int32)
labels_masked = tf.reshape(labels_masked, [-1, 1])
preds_masked = tf.pad(preds_masked, [(0, 0), (0, pad_length)])
# Rank of the labelled column among the 12 highest predictions of its display.
_, predictions_idx = tf.math.top_k(preds_masked, 12)
indices = tf.math.equal(predictions_idx, labels_masked)
indices_mask = tf.math.reduce_any(indices, 1)
masked_indices = tf.boolean_mask(indices, indices_mask)
res = tf.argmax(masked_indices, axis=1)
# Score of a display is 1 / (rank + 1); displays whose label is outside the top 12 add nothing.
ap_matrix = tf.divide(1, tf.add(res, 1))
ap_sum = tf.reduce_sum(ap_matrix)
# The denominator counts every display kept by labels_mask, hit or miss.
shape = tf.cast(tf.shape(indices)[0], tf.float64)
display_id_counter.assign_add(shape)
streaming_map.assign_add(ap_sum)
return loss
eval_dataset = config['eval_dataset']
# t0: wall-clock start of the measured benchmark window; t_batch: end of the previous batch.
t0 = None
t_batch = None
for step, (x, y) in enumerate(eval_dataset):
loss = evaluation_step(x, y)
eval_loss.update_state(loss)
if args.benchmark:
# At least one batch is always treated as warmup.
boundary = max(args.benchmark_warmup_steps, 1)
if current_step == boundary:
t0 = time.time()
if current_step > boundary:
batch_time = time.time() - t_batch
samplesps = args.eval_batch_size / batch_time
if args.cpu or hvd.rank() == 0:
dllogger.log(data={'batch_samplesps': samplesps}, step=(1, current_step))
if args.benchmark_steps <= current_step:
valid_time = time.time() - t0
epochs = args.benchmark_steps - max(args.benchmark_warmup_steps, 1)
valid_throughput = (args.eval_batch_size * epochs) / valid_time
if args.cpu or hvd.rank() == 0:
dllogger.log(
data={'validation_throughput': valid_throughput},
step=tuple()
)
return
else:
# Regular evaluation: log the running metrics every 100 batches.
if step % 100 == 0:
valid_data = {metric.name: f'{metric.result().numpy():.4f}' for metric in metrics}
valid_data['loss'] = f'{loss.numpy():.4f}'
if args.cpu or hvd.rank() == 0:
dllogger.log(data=valid_data, step=(step,))
current_step += 1
t_batch = time.time()
# Final results: outside CPU mode each value is passed through hvd.allreduce
# with its default reduction op.
map_metric = tf.divide(streaming_map, display_id_counter) if args.cpu else \
hvd.allreduce(tf.divide(streaming_map, display_id_counter))
eval_loss_reduced = eval_loss.result() if args.cpu else \
hvd.allreduce(eval_loss.result())
metrics_reduced = {
f'{metric.name}_val': metric.result() if args.cpu else
hvd.allreduce(metric.result()) for metric in metrics
}
eval_data = {name: f'{result.numpy():.4f}' for name, result in metrics_reduced.items()}
eval_data.update({
'loss_val': f'{eval_loss_reduced.numpy():.4f}',
'streaming_map_val': f'{map_metric.numpy():.4f}'
})
# NOTE(review): `step` is the loop variable above; it is unbound if eval_dataset yields no batches.
dllogger.log(data=eval_data, step=(step,))
if args.evaluate:
evaluator.eval(trainer.current_step_var)
else:
trainer.run_loop()

View File

@ -14,101 +14,178 @@
import argparse
# Default train dataset size
TRAIN_DATASET_SIZE = 59761827
DEFAULT_DIR = "/outbrain"
def parse_args():
parser = argparse.ArgumentParser(
description='Tensorflow2 WideAndDeep Model',
description="Tensorflow2 WideAndDeep Model",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
add_help=True,
)
locations = parser.add_argument_group('location of datasets')
locations = parser.add_argument_group("location of datasets")
locations.add_argument('--train_data_pattern', type=str, default='/outbrain/tfrecords/train/part*', nargs='+',
help='Pattern of training file names. For example if training files are train_000.tfrecord, '
'train_001.tfrecord then --train_data_pattern is train_*')
# Pattern selecting the training parquet files (the help example previously
# listed the same file name twice).
locations.add_argument(
    "--train_data_pattern",
    type=str,
    default=f"{DEFAULT_DIR}/data/train/*.parquet",
    help="Pattern of training file names. For example if training files are part_0.parquet, "
    "part_1.parquet then --train_data_pattern is *.parquet",
)
locations.add_argument('--eval_data_pattern', type=str, default='/outbrain/tfrecords/eval/part*', nargs='+',
help='Pattern of eval file names. For example if eval files are eval_000.tfrecord, '
'eval_001.tfrecord then --eval_data_pattern is eval_*')
# Pattern selecting the evaluation parquet files (the help text previously
# talked about training files and repeated one file name).
locations.add_argument(
    "--eval_data_pattern",
    type=str,
    default=f"{DEFAULT_DIR}/data/valid/*.parquet",
    help="Pattern of eval file names. For example if eval files are part_0.parquet, "
    "part_1.parquet then --eval_data_pattern is *.parquet",
)
locations.add_argument('--transformed_metadata_path', type=str, default='/outbrain/tfrecords',
help='Path to transformed_metadata for feature specification reconstruction')
locations.add_argument(
"--use_checkpoint",
default=False,
action="store_true",
help="Use checkpoint stored in model_dir path",
)
locations.add_argument('--use_checkpoint', default=False, action='store_true',
help='Use checkpoint stored in model_dir path')
locations.add_argument(
"--model_dir",
type=str,
default=f"{DEFAULT_DIR}/checkpoints",
help="Destination where model checkpoint will be saved",
)
locations.add_argument('--model_dir', type=str, default='/outbrain/checkpoints',
help='Destination where model checkpoint will be saved')
locations.add_argument(
"--results_dir",
type=str,
default="/results",
help="Directory to store training results",
)
locations.add_argument('--results_dir', type=str, default='/results',
help='Directory to store training results')
# File name (inside --results_dir) of the JSON log written by dllogger.
locations.add_argument(
    "--log_filename",
    type=str,
    default="log.json",
    help="Name of the file to store dllogger output",
)
locations.add_argument('--log_filename', type=str, default='log.json',
help='Name of the file to store dlloger output')
training_params = parser.add_argument_group("training parameters")
training_params = parser.add_argument_group('training parameters')
training_params.add_argument(
"--global_batch_size",
type=int,
default=131072,
help="Total size of training batch",
)
training_params.add_argument('--training_set_size', type=int, default=TRAIN_DATASET_SIZE,
help='Number of samples in the training set')
training_params.add_argument(
"--eval_batch_size",
type=int,
default=131072,
help="Total size of evaluation batch",
)
training_params.add_argument('--global_batch_size', type=int, default=131072,
help='Total size of training batch')
training_params.add_argument(
"--num_epochs", type=int, default=20, help="Number of training epochs"
)
training_params.add_argument('--eval_batch_size', type=int, default=131072,
help='Total size of evaluation batch')
training_params.add_argument(
"--cpu", default=False, action="store_true", help="Run computations on the CPU"
)
training_params.add_argument('--num_epochs', type=int, default=20,
help='Number of training epochs')
training_params.add_argument(
"--amp",
default=False,
action="store_true",
help="Enable automatic mixed precision conversion",
)
training_params.add_argument('--cpu', default=False, action='store_true',
help='Run computations on the CPU')
training_params.add_argument(
"--xla", default=False, action="store_true", help="Enable XLA conversion"
)
training_params.add_argument('--amp', default=False, action='store_true',
help='Enable automatic mixed precision conversion')
training_params.add_argument(
"--linear_learning_rate",
type=float,
default=0.02,
help="Learning rate for linear model",
)
training_params.add_argument('--xla', default=False, action='store_true',
help='Enable XLA conversion')
training_params.add_argument(
"--deep_learning_rate",
type=float,
default=0.00012,
help="Learning rate for deep model",
)
training_params.add_argument('--linear_learning_rate', type=float, default=0.02,
help='Learning rate for linear model')
training_params.add_argument(
"--deep_warmup_epochs",
type=float,
default=6,
help="Number of learning rate warmup epochs for deep model",
)
training_params.add_argument('--deep_learning_rate', type=float, default=0.00012,
help='Learning rate for deep model')
model_construction = parser.add_argument_group("model construction")
training_params.add_argument('--deep_warmup_epochs', type=float, default=6,
help='Number of learning rate warmup epochs for deep model')
model_construction.add_argument(
"--deep_hidden_units",
type=int,
default=[1024, 1024, 1024, 1024, 1024],
nargs="+",
help="Hidden units per layer for deep model, separated by spaces",
)
model_construction = parser.add_argument_group('model construction')
model_construction.add_argument(
"--deep_dropout",
type=float,
default=0.1,
help="Dropout regularization for deep model",
)
model_construction.add_argument('--deep_hidden_units', type=int, default=[1024, 1024, 1024, 1024, 1024], nargs="+",
help='Hidden units per layer for deep model, separated by spaces')
run_params = parser.add_argument_group("run mode parameters")
model_construction.add_argument('--deep_dropout', type=float, default=0.1,
help='Dropout regularization for deep model')
run_params.add_argument(
"--evaluate",
default=False,
action="store_true",
help="Only perform an evaluation on the validation dataset, don't train",
)
run_params = parser.add_argument_group('run mode parameters')
run_params.add_argument(
"--benchmark",
action="store_true",
default=False,
help="Run training or evaluation benchmark to collect performance metrics",
)
run_params.add_argument('--evaluate', default=False, action='store_true',
help='Only perform an evaluation on the validation dataset, don\'t train')
run_params.add_argument(
"--benchmark_warmup_steps",
type=int,
default=500,
help="Number of warmup steps before start of the benchmark",
)
run_params.add_argument('--benchmark', action='store_true', default=False,
help='Run training or evaluation benchmark to collect performance metrics', )
run_params.add_argument(
"--benchmark_steps",
type=int,
default=1000,
help="Number of steps for performance benchmark",
)
run_params.add_argument('--benchmark_warmup_steps', type=int, default=500,
help='Number of warmup steps before start of the benchmark')
run_params.add_argument('--benchmark_steps', type=int, default=1000,
help='Number of steps for performance benchmark')
run_params.add_argument('--affinity', type=str, default='socket_unique_interleaved',
choices=['socket', 'single', 'single_unique',
'socket_unique_interleaved',
'socket_unique_continuous',
'disabled'],
help='Type of CPU affinity')
run_params.add_argument(
"--affinity",
type=str,
default="socket_unique_interleaved",
choices=[
"socket",
"single",
"single_unique",
"socket_unique_interleaved",
"socket_unique_continuous",
"disabled",
],
help="Type of CPU affinity",
)
return parser.parse_args()

View File

@ -0,0 +1,70 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import dllogger
import horovod.tensorflow as hvd
import tensorflow as tf
from horovod.tensorflow.mpi_ops import Sum
class ThroughputCalculator:
    """Track per-batch and overall throughput while ``args.benchmark`` is set.

    The instance is called once per processed batch. The first
    ``max(args.benchmark_warmup_steps, 1)`` calls are warmup; afterwards every
    call logs ``batch_samplesps`` and, once ``args.benchmark_steps`` is
    reached, the aggregated throughput is logged and the process exits.
    """

    def __init__(self, args):
        """Store ``args`` and create the sample counter on the CPU device."""
        self.args = args
        # At least one step is always treated as warmup.
        self.boundary = max(self.args.benchmark_warmup_steps, 1)
        self.step = 0
        self.t0 = None  # start of the measured window, set at the boundary step
        self.start_batch_time = None  # end time of the previous call
        with tf.device("/CPU:0"):
            self.samples = tf.Variable(0, trainable=False, dtype=tf.int64)

    def _init_benchmark(self):
        """Mark the start of the measured benchmark window."""
        self.t0 = time.time()

    def on_epoch_end_log(self, step, shape):
        """Account for one measured batch and log its throughput.

        Args:
            step: step number reported to dllogger.
            shape: number of samples this worker processed in the batch.
        """
        batch_time = time.time() - self.start_batch_time
        self.samples.assign_add(shape)
        # Every worker is assumed to process a batch of the same size.
        workers = hvd.size() if not self.args.cpu else 1
        samplesps = shape * workers / batch_time
        if self.args.cpu or hvd.rank() == 0:
            dllogger.log(data={"batch_samplesps": samplesps}, step=(1, step))

    def on_benchmark_end_log(self, eval_benchmark=False):
        """Log the throughput over the whole measured window.

        Args:
            eval_benchmark: when true the value is logged as
                ``validation_throughput`` instead of ``train_throughput``.
        """
        # Stop the clock before waiting for the other workers.
        train_time = time.time() - self.t0
        if self.args.cpu:
            # Horovod is not used in CPU mode, so no Horovod call may be
            # issued here (previously hvd.join() ran unconditionally).
            all_samples = self.samples
        else:
            hvd.join()
            all_samples = hvd.allreduce(self.samples, op=Sum)

        all_samples = all_samples.numpy()

        if self.args.cpu or hvd.rank() == 0:
            key = "train_throughput" if not eval_benchmark else "validation_throughput"
            throughput = all_samples / train_time
            dllogger.log(data={key: throughput}, step=tuple())

    def __call__(self, shape, eval_benchmark=False):
        """Register one processed batch of ``shape`` samples.

        Raises:
            SystemExit: with status 0 once the benchmark has finished.
        """
        if self.args.benchmark:
            if self.step == self.boundary:
                self._init_benchmark()
            if self.step > self.boundary:
                self.on_epoch_end_log(self.step, shape)
                if self.args.benchmark_steps <= self.step:
                    self.on_benchmark_end_log(eval_benchmark=eval_benchmark)
                    # Same effect as exit(0) without relying on the `site`
                    # module's interactive helper being installed.
                    raise SystemExit(0)

        self.step += 1
        self.start_batch_time = time.time()

View File

@ -0,0 +1,164 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import dllogger
import horovod.tensorflow as hvd
import tensorflow as tf
from data.outbrain.features import DISPLAY_ID_COLUMN
from horovod.tensorflow.mpi_ops import Sum, Average
class Evaluator:
    """Run validation and report the loss and streaming MAP through dllogger."""

    def __init__(
        self,
        model,
        throughput_calculator,
        eval_dataset,
        compiled_loss,
        steps,
        args,
    ):
        """Store collaborators and create the accumulator variables on the CPU.

        Args:
            model: callable invoked as ``model(x, training=False)``.
            throughput_calculator: callable invoked per batch in benchmark mode.
            eval_dataset: iterable of ``(x, y)`` batches.
            compiled_loss: callable invoked as ``compiled_loss(y, predictions)``.
            steps: total step count reported alongside the current step.
            args: parsed command-line namespace (reads ``cpu`` and ``benchmark``).
        """
        self.model = model
        self.steps = steps
        self.args = args
        self.throughput_calculator = throughput_calculator

        self.compiled_loss = compiled_loss
        self.eval_loss = tf.keras.metrics.Mean()
        self.metrics = []
        self.eval_dataset = eval_dataset

        with tf.device("/CPU:0"):
            self.current_step_var = tf.Variable(0, trainable=False, dtype=tf.int64)
            # Number of displays scored so far (MAP denominator).
            self.display_id_counter = tf.Variable(
                0.0, trainable=False, dtype=tf.float64
            )
            # Sum of per-display precision scores (MAP numerator).
            self.streaming_map = tf.Variable(
                0.0, name="STREAMING_MAP", trainable=False, dtype=tf.float64
            )

    def _reset_states(self):
        """Clear every accumulator before a new evaluation pass."""
        for metric in self.metrics:
            metric.reset_states()
        self.eval_loss.reset_states()
        # Both MAP accumulators restart from zero. Seeding them with 1 (as
        # before) reports (sum + W) / (count + W) over W workers instead of
        # the true mean.
        self.display_id_counter.assign(0.0)
        self.streaming_map.assign(0.0)
        # NOTE(review): the step variable is reset to 1, not 0; kept as-is
        # because its readers are not visible here - confirm intent.
        self.current_step_var.assign(1)

    @tf.function
    def _calculate_map(self, x, y, predictions):
        """Add one batch to the streaming MAP accumulators.

        Rows are grouped by ``x[DISPLAY_ID_COLUMN]``. A display selected by
        its labels contributes ``1 / (rank + 1)``, where ``rank`` is the
        position of its labelled row among its 12 highest predictions, or
        nothing when that row is not among them.
        """
        predictions = tf.reshape(predictions, [-1])
        predictions = tf.cast(predictions, tf.float64)
        display_ids = x[DISPLAY_ID_COLUMN]
        display_ids = tf.reshape(display_ids, [-1])
        labels = tf.reshape(y, [-1])

        # Sort by display id so rows of the same display are contiguous.
        sorted_ids = tf.argsort(display_ids)
        display_ids = tf.gather(display_ids, indices=sorted_ids)
        predictions = tf.gather(predictions, indices=sorted_ids)
        labels = tf.gather(labels, indices=sorted_ids)

        _, display_ids_idx, display_ids_ads_count = tf.unique_with_counts(
            display_ids, out_idx=tf.int64
        )
        # NOTE(review): assumes no display has more than 30 rows; otherwise
        # the pad length is negative - confirm against the dataset.
        pad_length = 30 - tf.reduce_max(display_ids_ads_count)

        # One dense row per display id.
        preds = tf.RaggedTensor.from_value_rowids(
            predictions, display_ids_idx
        ).to_tensor()
        labels = tf.RaggedTensor.from_value_rowids(labels, display_ids_idx).to_tensor()

        # Keep only displays selected by the row-wise maximum of the labels.
        labels_mask = tf.math.reduce_max(labels, 1)
        preds_masked = tf.boolean_mask(preds, labels_mask)
        labels_masked = tf.boolean_mask(labels, labels_mask)
        labels_masked = tf.argmax(labels_masked, axis=1, output_type=tf.int32)
        labels_masked = tf.reshape(labels_masked, [-1, 1])

        preds_masked = tf.pad(preds_masked, [(0, 0), (0, pad_length)])
        _, predictions_idx = tf.math.top_k(preds_masked, 12)
        indices = tf.math.equal(predictions_idx, labels_masked)
        indices_mask = tf.math.reduce_any(indices, 1)
        masked_indices = tf.boolean_mask(indices, indices_mask)

        res = tf.argmax(masked_indices, axis=1)
        ap_matrix = tf.divide(1, tf.add(res, 1))
        ap_sum = tf.reduce_sum(ap_matrix)
        # The denominator counts every display kept by labels_mask, hit or miss.
        shape = tf.cast(tf.shape(indices)[0], tf.float64)

        self.display_id_counter.assign_add(shape)
        self.streaming_map.assign_add(ap_sum)

    @tf.function
    def _execute_step_calculations(self, x, y):
        """Run the forward pass, update all accumulators and return the loss."""
        predictions = self.model(x, training=False)

        with tf.device("/CPU:0"):
            loss = self.compiled_loss(y, predictions)
            for metric in self.metrics:
                metric.update_state(y, predictions)
            self.eval_loss.update_state(loss)
            self._calculate_map(x, y, predictions)

        return loss

    @tf.function
    def _reduce_results(self):
        """Combine the per-worker accumulators.

        Returns:
            Tuple ``(map_metric, eval_loss)``. ``map_metric`` is 0 when no
            display was scored.
        """
        if not self.args.cpu:
            all_streaming_map = hvd.allreduce(self.streaming_map, op=Sum)
            all_display_id_counter = hvd.allreduce(self.display_id_counter, op=Sum)
            eval_loss = hvd.allreduce(self.eval_loss.result(), op=Average)
        else:
            all_streaming_map = self.streaming_map
            all_display_id_counter = self.display_id_counter
            eval_loss = self.eval_loss.result()

        # divide_no_nan keeps the result finite when nothing was evaluated.
        map_metric = tf.math.divide_no_nan(all_streaming_map, all_display_id_counter)
        return map_metric, eval_loss

    @staticmethod
    def log(eval_data, step, steps):
        """Emit ``eval_data`` to dllogger at position ``(step, steps)``."""
        dllogger.log(data=eval_data, step=(step, steps))

    def eval_step(self, x, y):
        """Evaluate one batch and, in benchmark mode, report its size."""
        self._execute_step_calculations(x, y)

        if self.args.benchmark:
            self.throughput_calculator(y.shape[0], eval_benchmark=True)

    def eval(self, step):
        """Evaluate the whole dataset and log the reduced results.

        Args:
            step: tensor-like object exposing ``numpy()``; the training step
                the results are reported for.

        Returns:
            Dict with ``loss_val`` and ``streaming_map_val`` strings in CPU
            mode or on Horovod rank 0, an empty dict on other ranks.
        """
        eval_data = {}
        self._reset_states()

        # Benchmark runs iterate the dataset repeatedly; the throughput
        # calculator terminates the process when the benchmark is done.
        range_val = 1 if not self.args.benchmark else 100

        # Graph mode part
        for _ in range(range_val):
            for x, y in self.eval_dataset:
                self.eval_step(x, y)
        map_metric, eval_loss = self._reduce_results()

        if self.args.cpu or hvd.rank() == 0:
            with tf.device("/CPU:0"):
                # Eager mode part
                current_step = int(step.numpy())

                eval_data = {
                    "loss_val": f"{eval_loss.numpy():.4f}",
                    "streaming_map_val": f"{map_metric.numpy():.4f}",
                }

                self.log(eval_data, current_step, self.steps)

        return eval_data

View File

@ -43,12 +43,12 @@ class device:
return pynvml.nvmlDeviceGetName(self.handle)
def getCpuAffinity(self):
affinity_string = ''
affinity_string = ""
for j in pynvml.nvmlDeviceGetCpuAffinity(
self.handle, device._nvml_affinity_elements
):
# assume nvml returns list of 64 bit ints
affinity_string = '{:064b}'.format(j) + affinity_string
affinity_string = "{:064b}".format(j) + affinity_string
affinity_list = [int(x) for x in affinity_string]
affinity_list.reverse() # so core 0 is in 0th element of list
@ -77,7 +77,9 @@ def set_single_unique_affinity(gpu_id, nproc_per_node):
# remove siblings
for idx, socket_affinity in enumerate(socket_affinities):
socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values()))
socket_affinities[idx] = list(
set(socket_affinity) - set(siblings_dict.values())
)
affinities = []
assigned = []
@ -100,7 +102,9 @@ def set_socket_unique_affinity(gpu_id, nproc_per_node, mode):
# remove siblings
for idx, socket_affinity in enumerate(socket_affinities):
socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values()))
socket_affinities[idx] = list(
set(socket_affinity) - set(siblings_dict.values())
)
socket_affinities_to_device_ids = collections.defaultdict(list)
@ -112,22 +116,26 @@ def set_socket_unique_affinity(gpu_id, nproc_per_node, mode):
cores_per_device = len(socket_affinity) // devices_per_group
for group_id, device_id in enumerate(device_ids):
if device_id == gpu_id:
if mode == 'interleaved':
if mode == "interleaved":
affinity = list(socket_affinity[group_id::devices_per_group])
elif mode == 'continuous':
affinity = list(socket_affinity[group_id * cores_per_device:(group_id + 1) * cores_per_device])
elif mode == "continuous":
affinity = list(
socket_affinity[group_id * cores_per_device:(group_id + 1) * cores_per_device]
)
else:
raise RuntimeError('Unknown set_socket_unique_affinity mode')
raise RuntimeError("Unknown set_socket_unique_affinity mode")
# reintroduce siblings
affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict]
affinity += [
siblings_dict[aff] for aff in affinity if aff in siblings_dict
]
os.sched_setaffinity(0, affinity)
def get_thread_siblings_list():
path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list'
path = "/sys/devices/system/cpu/cpu*/topology/thread_siblings_list"
thread_siblings_list = []
pattern = re.compile(r'(\d+)\D(\d+)')
pattern = re.compile(r"(\d+)\D(\d+)")
for fname in pathlib.Path(path[0]).glob(path[1:]):
with open(fname) as f:
content = f.read().strip()
@ -138,19 +146,19 @@ def get_thread_siblings_list():
return thread_siblings_list
def set_affinity(gpu_id, nproc_per_node, mode='socket'):
if mode == 'socket':
def set_affinity(gpu_id, nproc_per_node, mode="socket"):
if mode == "socket":
set_socket_affinity(gpu_id)
elif mode == 'single':
elif mode == "single":
set_single_affinity(gpu_id)
elif mode == 'single_unique':
elif mode == "single_unique":
set_single_unique_affinity(gpu_id, nproc_per_node)
elif mode == 'socket_unique_interleaved':
set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved')
elif mode == 'socket_unique_continuous':
set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous')
elif mode == "socket_unique_interleaved":
set_socket_unique_affinity(gpu_id, nproc_per_node, "interleaved")
elif mode == "socket_unique_continuous":
set_socket_unique_affinity(gpu_id, nproc_per_node, "continuous")
else:
raise RuntimeError('Unknown affinity mode')
raise RuntimeError("Unknown affinity mode")
affinity = os.sched_getaffinity(0)
return affinity

View File

@ -12,29 +12,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from tensorflow.python.keras import backend as K
import tensorflow as tf
def get_schedule(args, steps_per_epoch):
assert args.deep_warmup_epochs <= args.num_epochs, 'Number of warmup epochs cannot be higher than training epochs'
base_lr = args.deep_learning_rate
warmup_steps = args.deep_warmup_epochs * steps_per_epoch
bound_epoch = args.deep_warmup_epochs + (args.num_epochs - args.deep_warmup_epochs) / 2
boundaries = [bound_epoch * steps_per_epoch]
values = [base_lr / 4, base_lr / 8]
class LearningRateScheduler:
def __init__(self, args, steps_per_epoch, optimizer):
assert (
args.deep_warmup_epochs <= args.num_epochs
), "Number of warmup epochs cannot be higher than training epochs"
self.base_lr = args.deep_learning_rate
self.warmup_steps = args.deep_warmup_epochs * steps_per_epoch
bound_epoch = (
args.deep_warmup_epochs + (args.num_epochs - args.deep_warmup_epochs) / 2
)
self.boundaries = [bound_epoch * steps_per_epoch]
self.values = [self.base_lr / 4, self.base_lr / 8]
self.optimizer = optimizer
def schedule(optimizer, current_step):
current_step = max(1, current_step)
if current_step < warmup_steps:
warmup_lr = base_lr * current_step / warmup_steps
K.set_value(optimizer.lr, K.get_value(warmup_lr))
@tf.function
def __call__(self, step):
if step < self.warmup_steps:
warmup_lr = self.base_lr * step / self.warmup_steps
self.optimizer.lr.assign(warmup_lr)
else:
for index, bound in enumerate(boundaries):
if current_step <= bound:
K.set_value(optimizer.lr, K.get_value(values[index]))
return
K.set_value(optimizer.lr, K.get_value(values[-1]))
return
return schedule
index = tf.reduce_sum(tf.cast(step > self.boundaries, tf.int64))
value = tf.gather(self.values, index)
self.optimizer.lr.assign(value)

View File

@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import json
import logging
import os
@ -19,46 +20,33 @@ import os
import dllogger
import horovod.tensorflow.keras as hvd
import tensorflow as tf
import tensorflow_transform as tft
from data.outbrain.dataloader import train_input_fn, eval_input_fn
from data.outbrain.features import PREBATCH_SIZE
from trainer.utils.gpu_affinity import set_affinity
def init_cpu(args, logger):
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
init_logger(
full=True,
args=args,
logger=logger
)
init_logger(full=True, args=args, logger=logger)
logger.warning('--gpu flag not set, running computation on CPU')
logger.warning("--gpu flag not set, running computation on CPU")
raise RuntimeError("CPU not supported with nvTabular dataloader")
def init_gpu(args, logger):
hvd.init()
init_logger(
full=hvd.rank() == 0,
args=args,
logger=logger
)
if args.affinity != 'disabled':
init_logger(full=hvd.rank() == 0, args=args, logger=logger)
if args.affinity != "disabled":
gpu_id = hvd.local_rank()
affinity = set_affinity(
gpu_id=gpu_id,
nproc_per_node=hvd.size(),
mode=args.affinity
gpu_id=gpu_id, nproc_per_node=hvd.size(), mode=args.affinity
)
logger.warning(f'{gpu_id}: thread affinity: {affinity}')
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
logger.warning(f"{gpu_id}: thread affinity: {affinity}")
if args.amp:
policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16")
tf.keras.mixed_precision.experimental.set_policy(policy)
tf.keras.mixed_precision.set_global_policy("mixed_float16")
if args.xla:
tf.config.optimizer.set_jit(True)
@ -69,29 +57,36 @@ def init_logger(args, full, logger):
logger.setLevel(logging.INFO)
log_path = os.path.join(args.results_dir, args.log_filename)
os.makedirs(args.results_dir, exist_ok=True)
dllogger.init(backends=[
dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
filename=log_path),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)])
logger.warning('command line arguments: {}'.format(json.dumps(vars(args))))
dllogger.init(
backends=[
dllogger.JSONStreamBackend(
verbosity=dllogger.Verbosity.VERBOSE, filename=log_path
),
dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE),
]
)
logger.warning("command line arguments: {}".format(json.dumps(vars(args))))
if not os.path.exists(args.results_dir):
os.mkdir(args.results_dir)
with open('{}/args.json'.format(args.results_dir), 'w') as f:
with open("{}/args.json".format(args.results_dir), "w") as f:
json.dump(vars(args), f, indent=4)
else:
logger.setLevel(logging.ERROR)
dllogger.init(backends=[])
dllogger.log(data=vars(args), step='PARAMETER')
dllogger.log(data=vars(args), step="PARAMETER")
def create_config(args):
assert not (args.cpu and args.amp), \
'Automatic mixed precision conversion works only with GPU'
assert not args.benchmark or args.benchmark_warmup_steps < args.benchmark_steps, \
'Number of benchmark steps must be higher than warmup steps'
logger = logging.getLogger('tensorflow')
assert not (
args.cpu and args.amp
), "Automatic mixed precision conversion works only with GPU"
assert (
not args.benchmark or args.benchmark_warmup_steps < args.benchmark_steps
), "Number of benchmark steps must be higher than warmup steps"
logger = logging.getLogger("tensorflow")
if args.cpu:
init_cpu(args, logger)
@ -99,36 +94,24 @@ def create_config(args):
init_gpu(args, logger)
num_gpus = 1 if args.cpu else hvd.size()
gpu_id = 0 if args.cpu else hvd.rank()
train_batch_size = args.global_batch_size // num_gpus
eval_batch_size = args.eval_batch_size // num_gpus
steps_per_epoch = args.training_set_size / args.global_batch_size
feature_spec = tft.TFTransformOutput(
args.transformed_metadata_path
).transformed_feature_spec()
train_paths = sorted(glob.glob(args.train_data_pattern))
valid_paths = sorted(glob.glob(args.eval_data_pattern))
train_spec_input_fn = train_input_fn(
num_gpus=num_gpus,
id=gpu_id,
filepath_pattern=args.train_data_pattern,
feature_spec=feature_spec,
records_batch_size=train_batch_size // PREBATCH_SIZE,
train_paths=train_paths,
records_batch_size=train_batch_size,
)
eval_spec_input_fn = eval_input_fn(
num_gpus=num_gpus,
id=gpu_id,
repeat=None if args.benchmark else 1,
filepath_pattern=args.eval_data_pattern,
feature_spec=feature_spec,
records_batch_size=eval_batch_size // PREBATCH_SIZE
valid_paths=valid_paths, records_batch_size=eval_batch_size
)
config = {
'steps_per_epoch': steps_per_epoch,
'train_dataset': train_spec_input_fn,
'eval_dataset': eval_spec_input_fn
"train_dataset": train_spec_input_fn,
"eval_dataset": eval_spec_input_fn,
}
return config

View File

@ -0,0 +1,170 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import dllogger
import horovod.tensorflow as hvd
import tensorflow as tf
class Trainer:
def __init__(
    self,
    model,
    scheduler,
    deep_optimizer,
    wide_optimizer,
    throughput_calculator,
    compiled_loss,
    steps,
    args,
    train_dataset,
    evaluator,
):
    """Keep references to all collaborators and set up checkpointing.

    Every argument is stored unchanged on the instance under the same
    name. The step counter and display counter variables are created on
    the CPU device, after which the checkpoint manager is built.
    """
    # Run configuration and logging.
    self.args = args
    self.steps = steps
    self.logger = logging.getLogger("tensorflow")

    # Model, loss and optimisation.
    self.model = model
    self.compiled_loss = compiled_loss
    self.deep_optimizer = deep_optimizer
    self.wide_optimizer = wide_optimizer
    self.scheduler = scheduler

    # Data and helpers.
    self.train_dataset = train_dataset
    self.evaluator = evaluator
    self.throughput_calculator = throughput_calculator

    with tf.device("/CPU:0"):
        self.current_step_var = tf.Variable(0, trainable=False, dtype=tf.int64)
        self.display_id_counter = tf.Variable(
            0.0, trainable=False, dtype=tf.float64
        )

    # Needs the optimizers, model and step variable assigned above.
    self._init_checkpoint_manager()
def _init_checkpoint_manager(self):
    """Create the checkpoint object and a manager that keeps one checkpoint.

    The checkpoint tracks both optimizers, the model and the step counter;
    files are stored under ``<args.model_dir>/checkpoint``.
    """
    checkpoint_dir = os.path.join(self.args.model_dir, "checkpoint")
    tracked = tf.train.Checkpoint(
        deep_optimizer=self.deep_optimizer,
        wide_optimizer=self.wide_optimizer,
        model=self.model,
        current_step=self.current_step_var,
    )
    self.checkpoint = tracked
    self.manager = tf.train.CheckpointManager(
        checkpoint=tracked,
        directory=checkpoint_dir,
        max_to_keep=1,
    )
def maybe_restore_checkpoint(self):
    """Restore the latest checkpoint when ``args.use_checkpoint`` is set.

    Logs whether a checkpoint was found. In benchmark mode the restored
    step counter is set back to 0.
    """
    if not self.args.use_checkpoint:
        return

    # restore() is invoked even when no checkpoint exists, as before.
    self.checkpoint.restore(self.manager.latest_checkpoint).expect_partial()

    if not self.manager.latest_checkpoint:
        self.logger.warning(
            f"Failed to restore model from checkpoint {self.args.model_dir}"
        )
        return

    self.logger.warning(f"Model restored from checkpoint {self.args.model_dir}")
    if self.args.benchmark:
        self.current_step_var.assign(0)
@tf.function
def __call__(self, x, y):
    """Run one optimisation step on the batch ``(x, y)`` and return its loss.

    The wide (linear) and deep parts of the model are updated by their own
    optimizers from the same loss. With AMP the loss is scaled per
    optimizer and the gradients are unscaled before being applied.
    """
    with tf.GradientTape(persistent=True) as tape:
        y_pred = self.model(x, training=True)
        loss = self.compiled_loss(y, y_pred)
        linear_loss = (
            self.wide_optimizer.get_scaled_loss(loss) if self.args.amp else loss
        )
        deep_loss = (
            self.deep_optimizer.get_scaled_loss(loss) if self.args.amp else loss
        )

    if not self.args.cpu:
        tape = hvd.DistributedGradientTape(tape, sparse_as_dense=True)

    linear_vars = self.model.linear_model.trainable_variables
    dnn_vars = self.model.dnn_model.trainable_variables
    linear_grads = tape.gradient(linear_loss, linear_vars)
    dnn_grads = tape.gradient(deep_loss, dnn_vars)
    if self.args.amp:
        linear_grads = self.wide_optimizer.get_unscaled_gradients(linear_grads)
        dnn_grads = self.deep_optimizer.get_unscaled_gradients(dnn_grads)

    self.wide_optimizer.apply_gradients(zip(linear_grads, linear_vars))
    self.deep_optimizer.apply_gradients(zip(dnn_grads, dnn_vars))

    # Horovod is only used outside CPU mode, so the initial broadcast gets
    # the same guard as the tape wrapping above.
    if not self.args.cpu:
        # Broadcast after the first update so that the optimizer variables
        # exist; this makes every worker continue from rank 0's state.
        if self.current_step_var == 0:
            hvd.broadcast_variables(self.model.linear_model.variables, root_rank=0)
            hvd.broadcast_variables(self.model.dnn_model.variables, root_rank=0)
            hvd.broadcast_variables(self.wide_optimizer.variables(), root_rank=0)
            hvd.broadcast_variables(self.deep_optimizer.variables(), root_rank=0)
    return loss
@tf.function
def _execute_step_calculations(self, x, y):
loss = self(x, y)
with tf.device("/CPU:0"):
self.scheduler(tf.cast(self.current_step_var + 1, tf.float32))
self.current_step_var.assign_add(1)
return loss
def log(self, current_step, loss):
train_data = {"loss": f"{loss:.4f}"}
dllogger.log(data=train_data, step=(current_step, self.steps))
def train_step(self, x, y):
# Graph mode part
loss = self._execute_step_calculations(x, y)
# Eager mode part
current_step = int(self.current_step_var.numpy()) - 1
if self.args.benchmark:
self.throughput_calculator(y.shape[0])
elif (self.args.cpu or hvd.rank() == 0) and current_step % 100 == 0:
self.log(current_step, loss.numpy())
def join_and_broadcast(self):
hvd.join()
if not self.args.benchmark:
hvd.broadcast_variables(self.model.linear_model.variables, root_rank=0)
hvd.broadcast_variables(self.model.dnn_model.variables, root_rank=0)
hvd.broadcast_variables(self.wide_optimizer.variables(), root_rank=0)
hvd.broadcast_variables(self.deep_optimizer.variables(), root_rank=0)
def run_loop(self):
eval_data = {}
current_epoch = int(self.current_step_var.numpy()) // len(self.train_dataset) + 1
for _ in range(current_epoch, self.args.num_epochs + 1):
range_val = 1 if not self.args.benchmark else 100
# Graph mode part
for _ in range(range_val):
for x, y in self.train_dataset:
self.train_step(x, y)
self.join_and_broadcast()
eval_data = self.evaluator.eval(self.current_step_var)
if self.args.cpu or hvd.rank() == 0:
self.manager.save()
if self.args.cpu or hvd.rank() == 0:
dllogger.log(data=eval_data, step=tuple())