diff --git a/README.md b/README.md index 9c9a5c25..4880855b 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ The examples are organized first by framework, such as TensorFlow, PyTorch, etc. ### Recommender Systems - __NCF__ [[PyTorch](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Recommendation/NCF)] [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/NCF)] - __VAE-CF__ [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/VAE-CF)] +- __WideAndDeep__ [[TensorFlow](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/WideAndDeep)] ### Text to Speech @@ -80,6 +81,7 @@ The examples are organized first by framework, such as TensorFlow, PyTorch, etc. | [BioBert](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/LanguageModeling/BERT/biobert) | TensorFlow | N/A | Yes | Yes | - | - | - | - | - | | [Neural Collaborative Filtering](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/NCF) |TensorFlow | N/A | Yes | Yes | - | - | - | - | - | | [Variational Autoencoder Collaborative Filtering](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/VAE-CF) |TensorFlow | N/A | Yes | Yes | - | - | - | - | - | +| [WideAndDeep](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Recommendation/WideAndDeep) | TensorFlow | N/A | Yes | Yes | - | - | - | - | - | | [U-Net Industrial](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Segmentation/UNet_Industrial) |TensorFlow | N/A | Yes | Yes | - | Yes | - | - | Yes | | [U-Net Medical](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Segmentation/UNet_Medical) | TensorFlow | N/A | Yes | Yes | - | Yes |- | - | Yes | | [V-Net Medical](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow/Segmentation/VNet) | TensorFlow | N/A | Yes | Yes | - | Yes | Yes | - | Yes | diff --git a/TensorFlow/Recommendation/WideAndDeep/Dockerfile b/TensorFlow/Recommendation/WideAndDeep/Dockerfile new file mode 100644 index 00000000..de68662b --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/Dockerfile @@ -0,0 +1,48 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
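+
+# This image extends the NGC TensorFlow 20.02-tf1-py3 container with a local
+# Spark 2.3.1 (Hadoop 2.7) installation and the Python packages
+# (pyspark, tensorflow-transform, apache-beam, dllogger) used by the
+# preprocessing and training scripts.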
+ +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.02-tf1-py3 + +FROM ${FROM_IMAGE_NAME} + +USER root + +# Spark dependencies +ENV APACHE_SPARK_VERSION 2.3.1 +ENV HADOOP_VERSION 2.7 + +RUN apt-get -y update && \ + apt-get install --no-install-recommends -y openjdk-8-jre-headless ca-certificates-java && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN cd /tmp && \ + wget -q http://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && \ + echo "DC3A97F3D99791D363E4F70A622B84D6E313BD852F6FDBC777D31EAB44CBC112CEEAA20F7BF835492FB654F48AE57E9969F93D3B0E6EC92076D1C5E1B40B4696 *spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" | sha512sum -c - && \ + tar xzf spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz -C /usr/local --owner root --group root --no-same-owner && \ + rm spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz +RUN cd /usr/local && ln -s spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark + +# Spark config +ENV SPARK_HOME /usr/local/spark +ENV PYTHONPATH $SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.7-src.zip +ENV SPARK_OPTS --driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info + +RUN pip install pyspark==2.3.1 +RUN pip install --no-deps tensorflow-transform apache-beam==2.14 tensorflow-metadata==0.14.0 pydot dill +RUN pip install ipdb +RUN pip install -e git://github.com/NVIDIA/dllogger#egg=dllogger + +WORKDIR /wd +COPY . . diff --git a/TensorFlow/Recommendation/WideAndDeep/LICENSE b/TensorFlow/Recommendation/WideAndDeep/LICENSE new file mode 100644 index 00000000..faa07e60 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2019 NVIDIA Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/TensorFlow/Recommendation/WideAndDeep/NOTICE b/TensorFlow/Recommendation/WideAndDeep/NOTICE new file mode 100644 index 00000000..3a7e5096 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/NOTICE @@ -0,0 +1,5 @@ +Wide & Deep Tensorflow + +This repository includes software from +https://github.com/gabrielspmoreira/kaggle_outbrain_click_prediction_google_cloud_ml_engine +licensed under the Apache License, Version 2.0. \ No newline at end of file diff --git a/TensorFlow/Recommendation/WideAndDeep/README.md b/TensorFlow/Recommendation/WideAndDeep/README.md new file mode 100644 index 00000000..05595de7 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/README.md @@ -0,0 +1,416 @@ +# Wide & Deep Recommender Model Training For TensorFlow + +This repository provides a script and recipe to train the Wide and Deep Recommender model to achieve state-of-the-art accuracy and is tested and maintained by NVIDIA. 
+ +## Table Of Contents + +- [Model overview](#model-overview) + * [Model architecture](#model-architecture) + * [Applications and dataset](#applications-and-dataset) + * [Default configuration](#default-configuration) + * [Feature support matrix](#feature-support-matrix) + * [Features](#features) + * [Mixed precision](#mixed-precision) + * [Enabling mixed precision](#enabling-mixed-precision) + * [Impact of mixed precision on training accuracy](#impact-of-mixed-precision-on-training-accuracy) + * [Impact of mixed precision on inference accuracy](#impact-of-mixed-precision-on-inference-accuracy) + * [Glossary](#glossary) +- [Setup](#setup) + * [Requirements](#requirements) +- [Quick Start Guide](#quick-start-guide) +- [Advanced](#advanced) + * [Scripts and sample code](#scripts-and-sample-code) + * [Parameters](#parameters) + * [Command-line options](#command-line-options) + * [Getting the data](#getting-the-data) + * [Dataset guidelines](#dataset-guidelines) + * [Training process](#training-process) + * [Deploying the Wide & Deep model using Triton Inference Server](#deploying-the-wide-deep-model-using-triton-inference-server) +- [Performance](#performance) + * [Benchmarking](#benchmarking) + * [Training performance benchmark](#training-performance-benchmark) + * [Results](#results) + * [Training accuracy results](#training-accuracy-results) + * [Training accuracy: NVIDIA DGX-1 (8x V100 16G)](#training-accuracy-nvidia-dgx-1-8x-v100-16g) + * [Training accuracy plots](#training-accuracy-plots) + * [Training stability test](#training-stability-test) + * [Training performance results](#training-performance-results) + * [Training performance: NVIDIA DGX-1 (8x V100 16G)](#training-performance-nvidia-dgx-1-8x-v100-16g) +- [Release notes](#release-notes) + * [Changelog](#changelog) + * [Known issues](#known-issues) + + +## Model overview + +Recommendation systems drive engagement on many of the most popular online platforms. As the volume of data available to power these systems grows exponentially, data scientists are increasingly turning from more traditional machine learning methods to highly expressive deep learning models to improve the quality of their recommendations. Google's [Wide & Deep Learning for Recommender Systems](https://arxiv.org/abs/1606.07792) has emerged as a popular model for these problems both for its robustness to signal sparsity as well as its user-friendly implementation in [TensorFlow](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier). + +The differences between this Wide & Deep Recommender Model and the model from the paper is the size of the Deep part of the model. Originally, in Google's paper, the fully connected part was three layers of 1024, 512, and 256 neurons. Our model consists of 5 layers each of 1024 neurons. + +The model enables you to train a recommender model that combines the memorization of the Wide part and generalization of the Deep part of the network. + +This model is trained with mixed precision using Tensor Cores on NVIDIA Volta and Turing GPUs. Therefore, researchers can get results 1.32 times faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time. + +### Model architecture + +Wide & Deep refers to a class of networks that use the output of two parts working in parallel - wide model and deep model - to make predictions of recommenders. 
The wide model is a generalized linear model of the features together with their transforms. The deep model is a series of 5 hidden MLP layers of 1024 neurons each, beginning with a dense embedding of the features. The architecture is presented in Figure 1. + +
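+
+For orientation, the sketch below shows how a network with this shape can be assembled with TensorFlow's canned `tf.estimator.DNNLinearCombinedClassifier`. It is a hedged illustration, not the repository's code (the repository builds a custom estimator): the feature columns and learning rates are placeholders, while the 5x1024 hidden layers and the FTRL/Proximal Adagrad optimizer split match the configuration described elsewhere in this README.
+
+```python
+import tensorflow as tf
+
+# Illustrative columns only; the real model builds its columns from the
+# Outbrain features defined in trainer/features.py.
+ad_id = tf.feature_column.categorical_column_with_hash_bucket(
+    'ad_id', hash_bucket_size=250000, dtype=tf.int64)
+ad_views = tf.feature_column.numeric_column('ad_views_log_01scaled')
+
+wide_columns = [ad_id]                                      # memorization branch
+deep_columns = [tf.feature_column.embedding_column(ad_id, dimension=128),
+                ad_views]                                   # generalization branch
+
+estimator = tf.estimator.DNNLinearCombinedClassifier(
+    model_dir='/outbrain/checkpoints',
+    linear_feature_columns=wide_columns,
+    linear_optimizer=tf.train.FtrlOptimizer(learning_rate=0.2),           # wide part
+    dnn_feature_columns=deep_columns,
+    dnn_optimizer=tf.train.ProximalAdagradOptimizer(learning_rate=0.05),  # deep part
+    dnn_hidden_units=[1024, 1024, 1024, 1024, 1024])        # 5 layers of 1024 neurons
+```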

+
+Figure 1. The architecture of the Wide & Deep model.
+
+ +### Applications and dataset + +The basis of our API lies in the observation that in recommendation problems there are hierarchies of features: those which describe the person or object _to which_ we wish to make recommendations (*request* level features), and those which describe those objects which we are considering recommending (*item* level features). Additionally, these features often need to undergo some transformation from their raw representation in data stores to a representation digestible by neural networks. These transformations, defined by [TensorFlow `tf.feature_column`](https://www.tensorflow.org/api_docs/python/tf/feature_column), include nontrivial operations such as hashing, binning, vocabulary lookups, and embedding (indicator columns can be thought of as embeddings with the identity matrix as the embedding table). + +In most APIs, including those implemented in standard TensorFlow, these transformations need to be computed for request level features repeatedly for _every_ item on which we want to compute a recommendation score. Moreover, if the model is being hosted on a dedicated remote inference server, this requires us to send copies of the request level data for every item as well. + +To address this, we built a custom GPU op which computes _all_ these transformations in parallel, and only reads and computes request level features once before fanning them out to the rest of the batch. Besides saving on redundant compute and network I/O, this implementation leverages the exceptional parallel computing power of NVIDIA GPUs to provide massive inference time accelerations compared to native CPU based implementations. + +As a reference dataset, we used a subset of [the features engineered](https://github.com/gabrielspmoreira/kaggle_outbrain_click_prediction_google_cloud_ml_engine) by the 19th place finisher in the [Kaggle Outbrain Click Prediction Challenge](https://www.kaggle.com/c/outbrain-click-prediction/). This competition challenged competitors to predict the likelihood with which a particular ad on a website's display would be clicked on. Competitors were given information about the user, display, document, and ad in order to train their models. More information can be found [here](https://www.kaggle.com/c/outbrain-click-prediction/data). + + +### Default configuration + +For reference, and to give context to the acceleration numbers described below, some important properties of our features and model are as follows: + +- Features + - Request Level + - 16 scalar numeric features `(shape=(1,)`) + - 12 one-hot categorical features (all `int` dtype) + - 5 indicator embeddings + - sizes 2, 2, 3, 3, 6 + - 7 trainable embeddings + - all except two have an embedding size of 64 (remaining two have 128), though it's important to note for *all* categorical features that we *do not* leverage that information to short-circuit the lookups by treating them as a single multi-hot lookup. Our API is fully general to any combination of embedding sizes. 
+ - all use hash bucketing with `num_buckets=` 300k, 100k, 4k, 2.5k, 2k, 1k, and 300 respectively + - 3 multi-hot categorical features (all `int` dtype) + - all trainable embeddings + - all with embedding size 64 + - all use hash bucketing with `num_buckets=` 10k, 350, and 100 respectively + - Item Level + - 16 scalar numeric features + - 4 one hot categorical features (all `int` dtype) + - embedding sizes of 128, 64, 64, 64 respectively + - hash bucketing with `num_buckets=` 250k, 4k, 2.5k, and 1k respectively + - 3 multi-hot categorical features (all `int` dtype) + - all with embedding size 64 + - hash bucketing with `num_buckets=` 10k, 350, and 100 respectively + - All features are used in both wide *and* deep branches of the network + +- Model + - Total embedding dimension is 1328 + - 5 hidden layers each with size 1024 + - Output dimension is 1 (probability of click) + +### Feature support matrix + +The following features are supported by this model: + +| Feature | Wide & Deep +|-----------------------|-------------------------- +|Horovod Multi-GPU | Yes +|Automatic mixed precision (AMP) | Yes + +#### Features + +Horovod + +Horovod is a distributed training framework for TensorFlow, Keras, PyTorch and MXNet. The goal of Horovod is to make distributed deep learning fast and easy to use. For more information about how to get started with Horovod, see the [Horovod: Official repository](https://github.com/horovod/horovod). + +Multi-GPU training with Horovod + +Our model uses Horovod to implement efficient multi-GPU training with NCCL. For details, see example sources in this repository or see the [TensorFlow tutorial](https://github.com/horovod/horovod/#usage). + + +### Mixed precision + +Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps: +1. Porting the model to use the FP16 data type where appropriate. +2. Adding loss scaling to preserve small gradient values. + +The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK. + +For information about: +- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation. +- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog. +- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide. 
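+
+As additional context, the snippet below is a minimal, hedged sketch of how TF-AMP is commonly enabled in TF1 code through the graph-rewrite API. It is illustrative only and is not necessarily how the `--amp` flag of this repository is implemented; the optimizer and learning rate are placeholders.
+
+```python
+import tensorflow as tf
+
+optimizer = tf.train.ProximalAdagradOptimizer(learning_rate=0.05)  # placeholder
+# Wrap the optimizer so eligible ops run in FP16 and dynamic loss scaling is applied.
+optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
+```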
+ +#### Enabling mixed precision + +To enable Wide & Deep training to use mixed precision you don't need to perform input quantization, only an additional flag `--amp` to the training script is needed (see [Quick Start Guide](#quick-start-guide)). + +##### Impact of mixed precision on training accuracy +The accuracy of training, measured with MAP@12 metric was not impacted by enabling mixed precision. The obtained results were statistically similar (i.e. similar run-to-run variance was observed, with standard deviation of the level of `0.002`). + +##### Impact of mixed precision on inference accuracy +For our reference model, the average absolute error on the probability of interaction induced by reduced precision inference is `0.0002`, producing a near-perfect fit between predictions produced by full and mixed precision models. Moreover, this error is uncorrelated with the magnitude of the predicted value, which means for most predictions of interest (i.e. greater than `0.01` or `0.1` likelihood of interaction), the relative magnitude of the error is approaching the noise floor of the problem. + + + +### Glossary + +Request level features: Features that describe the person or object _to which_ we wish to make recommendations. + +Item level features: Features that describe those objects which we are considering recommending. + +## Setup + +The following section lists the requirements that you need to meet in order to start training the Wide & Deep model. + +### Requirements + +This repository contains Dockerfile which extends the TensorFlow NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: +- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) +- [20.02-tf1-py3](https://ngc.nvidia.com/catalog/containers/nvidia:tensorflow) NGC container +- [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) or [Turing](https://www.nvidia.com/en-us/geforce/turing/) based GPU + +For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: +- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) +- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry) +- [Running TensorFlow](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running) + +For those unable to use the TensorFlow NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html). + +## Quick Start Guide + +To train your model using mixed precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the Wide & Deep model on the Outbrain dataset. For the specifics concerning training and inference, see the [Advanced](#advanced) section. + +1. Clone the repository. + +``` +git clone https://github.com/NVIDIA/DeepLearningExamples +cd DeepLearningExamples/TensorFlow/Recommendation/WideDeep +``` + +2. Download the Outbrain dataset. + +The Outbrain dataset can be downloaded from [Kaggle](https://www.kaggle.com/c/outbrain-click-prediction/data) (requires Kaggle account). +Unzip the downloaded archive e.g. 
to `/raid/outbrain/orig` and set the `HOST_OUTBRAIN_PATH` variable to the parent directory: + +```bash +HOST_OUTBRAIN_PATH=/raid/outbrain +``` + +3. Build the Wide & Deep Tensorflow NGC container. + +```bash +docker build . -t wide_deep +``` + +4. Start an interactive session in the NGC container to run preprocessing/training/inference. + +```bash +docker run --runtime=nvidia --rm -ti -v ${HOST_OUTBRAIN_PATH}:/outbrain wide_deep /bin/bash +``` +5. Start preprocessing. + +```bash +bash scripts/preproc.sh 4096 +``` +The result of preprocessing scripts are prebatched TFRecords. The argument to the script is the prebatch +size (4096 is the default). + +6. Start training. + +Single GPU: +```bash +python -m trainer.task --gpu --amp --batch_size 131072 --num_epochs 100 +``` +8 GPU: +```bash +mpiexec --allow-run-as-root --bind-to socket -np 8 python -m trainer.task --gpu --amp --hvd --batch_size 16384 --num_epochs 20 +``` + +If you want to run validation or inference, you can either use the checkpoint obtained from the training +commands above, or download the pretrained checkpoint from NGC. + +In order to download the checkpoint from NGC, visit [ngc.nvidia.com](https://ngc.nvidia.com) website and +browse the available models. +Download the checkpoint files and unzip them to some path, e.g. to `/raid/outbrain/checkpoints/` +(which is the default path for storing the checkpoints during training). + + +7. Start validation/evaluation. + +In order to validate the checkpoint on the evaluation set, run the `task.py` script with `--evaluate` flag: + +```bash +python -m trainer.task --gpu --amp --evaluate --model_dir /outbrain/checkpoints +``` + +8. Start inference/predictions. + +In order to run inference and predict the results, run the `task.py` +script with `--predict` flag: + +```bash +python -m trainer.task --gpu --amp --predict --model_dir /outbrain/checkpoints +``` + + +## Advanced + +The following sections provide greater details of the dataset, running training, and the training results. + +### Scripts and sample code + +These are the important scripts in this repository: +* `trainer/task.py` - Python script for training the Wide & Deep recommender model +* `trainer/features.py` - Python file describing the request and item level features + +### Parameters + +These are the important parameters in the `trainer/task.py` script: + +``` +--model_dir: Path to model checkpoint directory +--deep_hidden_units: [DEEP_LAYER1 DEEP_LAYER2 ...] 
hidden units per layer, separated by spaces
+--prebatch_size: Number of samples in each pre-batch in tfrecords
+--batch_size: Training batch size (must be a multiple of prebatch_size)
+--eval_batch_size: Evaluation batch size (must be a multiple of prebatch_size)
+--num_epochs: Number of epochs to train
+--linear_learning_rate: Learning rate for the wide part of the model
+--linear_l1_regularization: L1 regularization for the wide part of the model
+--linear_l2_regularization: L2 regularization for the wide part of the model
+--deep_learning_rate: Learning rate for the deep part of the model
+--deep_l1_regularization: L1 regularization for the deep part of the model
+--deep_l2_regularization: L2 regularization for the deep part of the model
+--deep_dropout: Dropout probability for the deep part of the model
+--predict: Perform only prediction on the validation set, do not train
+--evaluate: Perform only evaluation on the validation set, do not train
+--gpu: Run computations on GPU
+--amp: Enable Automatic Mixed Precision
+--xla: Enable XLA
+--hvd: Use Horovod for multi-GPU training
+```
+
+### Command-line options
+
+To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option:
+```bash
+python -m trainer.task --help
+```
+
+### Getting the data
+
+The Outbrain dataset can be downloaded from [Kaggle](https://www.kaggle.com/c/outbrain-click-prediction/data) (requires a Kaggle account).
+
+#### Dataset guidelines
+
+The dataset contains a sample of users' page views and clicks, as observed on multiple publisher sites. Viewed pages and clicked recommendations have additional semantic attributes of the documents.
+
+The dataset contains sets of content recommendations served to a specific user in a specific context. Each context (i.e. a set of recommendations) is given a display_id. In each such set, the user has clicked on at least one recommendation. The page view logs originally have more than 2 billion rows (around 100 GB uncompressed).
+
+During preprocessing, the data is transformed into tabular data of 54 features; the training set contains 55 million rows.
+
+### Training process
+
+Training can be started by running the `trainer/task.py` script. By default, the script runs in train mode. Other training-related configuration options are also defined in `trainer/task.py` and can be listed with the command `python -m trainer.task --help`. Training runs for `--num_epochs` epochs with a custom estimator for the model. The model has a wide linear part and a deep feed-forward network, and the networks are built according to the default configuration.
+
+Two separate optimizers are used to optimize the wide and the deep part of the network:
+
+- FTRL (Follow The Regularized Leader) optimizer is used to optimize the wide part of the network.
+- Proximal Adagrad optimizer is used to optimize the deep part of the network.
+
+The training log will contain information about:
+
+- Loss value after every 100 steps.
+- Training throughput if the `--benchmark` option is selected.
+- Evaluation metrics after every evaluation cycle at the end of every epoch.
+
+Checkpoints are stored every `--save_checkpoints_steps` steps in the `--model_dir` location.
+
+### Deploying the Wide & Deep model using Triton Inference Server
+
+This repository does not contain code for deploying the model using Triton Inference Server.
The details of such deployment together with obtained performance numbers was discussed on the [blog post](https://devblogs.nvidia.com/accelerating-wide-deep-recommender-inference-on-gpus/). + +## Performance + +### Benchmarking + +The following section shows how to run benchmarks measuring the model performance in training mode. + +#### Training performance benchmark + +We provide 6 scripts to benchmark the performance of training: +```bash +bash scripts/benchmark_training_fp32_1gpu.sh +bash scripts/benchmark_training_fp16_1gpu.sh +bash scripts/benchmark_training_fp32_4gpu.sh +bash scripts/benchmark_training_fp16_4gpu.sh +bash scripts/benchmark_training_fp32_8gpu.sh +bash scripts/benchmark_training_fp16_8gpu.sh +``` + +### Results + +The following sections provide details on how we achieved our performance and +accuracy in training. + +#### Training accuracy results + +##### Training accuracy: NVIDIA DGX-1 (8x V100 16G) + +Our results were obtained by running the benchmark scripts from the `scripts` directory in the TensorFlow NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. + +|**GPUs**|**Batch Size / GPU**|**Accuracy - FP32 (MAP@12)**|**Accuracy - Mixed precision (MAP@12)**|**Time to Train - FP32 (minutes)**|**Time to Train - Mixed precision (minutes)**|**Time to Train Speedup (FP32 to Mixed precision)**| +|-------:|-------------------:|----------------------------:|---------------------------------------:|-----------------------------------------------:|----------------------:|---------------------------------:| +| 1 | 131,072 | 0.67689 | 0.67542 | 546 | 414 | 1.32 | +| 4 | 32,768 | 0.67677 | 0.67647 | 78 | 66 | 1.18 | +| 8 | 16,384 | 0.67669 | 0.67594 | 30 | 24 | 1.25 | + +To achieve the same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +##### Training accuracy plots + +![MAP12](img/map12_WnD.png) + +##### Training stability test + +The Wide and Deep model was trained for 72,951 training steps, starting +from 20 different initial random seeds. The training was performed in the 20.02-tf1-py3-stage NGC container on +NVIDIA DGX-1 with 8x V100 16G GPUs with mixed precision enabled. +After training, the models were evaluated on the test dataset. The following +table summarizes the final MAP@12 score on the test set. + +|**Average MAP@12**|**Standard deviation**|**Minimum**|**Maximum**| +|---------------------:|---------------------:|----------:|----------:| +| 0.67594 | 0.00204 | 0.66906 | 0.67785 | + + +#### Training performance results + + +##### Training performance: NVIDIA DGX-1 (8x V100 16G) + +Our results were obtained by running the `trainer/task.py` training script in the TensorFlow NGC container on NVIDIA DGX-1 with (8x V100 16G) GPUs. Performance numbers (in samples per second) were averaged over 50 training iterations. Improving model scaling for multi-GPU is planned, see [known issues](#known-issues). + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). 
+ +|**GPUs**|**Batch Size / GPU**|**Throughput - FP32 (samples/s)**|**Throughput - Mixed precision (samples/s)**|**Throughput speedup (FP32 to Mixed precision)**|**Weak Scaling - FP32**|**Weak Scaling - Mixed precision**| +|-------:|-------------------:|----------------------------:|---------------------------------------:|-----------------------------------------------:|----------------------:|---------------------------------:| +| 1 | 131,072 | 167,875 | 221,550 | 1.320 | 1.000 | 1.000 | +| 4 | 131,072 | 485,242 | 547,683 | 1.129 | 2.472 | 2.890 | +| 8 | 131,072 | 655,665 | 688,481 | 1.050 | 3.108 | 3.906 | + + + +## Release notes + +### Changelog + +This section needs to include the date of the release and the most important changes after the initial release. + +March 2020 +- Initial release + +### Known issues + +- Limited tf.feature_column support +- Limited scaling for multi-GPU because of inefficient handling of embedding operations (multiple memory transfers between CPU and GPU), work in progress to cover all the operations on GPU. diff --git a/TensorFlow/Recommendation/WideAndDeep/__init__.py b/TensorFlow/Recommendation/WideAndDeep/__init__.py new file mode 100644 index 00000000..067cf051 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/TensorFlow/Recommendation/WideAndDeep/dataflow_preprocess.py b/TensorFlow/Recommendation/WideAndDeep/dataflow_preprocess.py new file mode 100644 index 00000000..90b63618 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/dataflow_preprocess.py @@ -0,0 +1,155 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import datetime +import os +import random +import subprocess +import sys +from joblib import Parallel, delayed + +import outbrain_transform +import path_constants + +import tensorflow as tf +import glob + +import pandas as pd + +import trainer.features + + +def parse_arguments(argv): + """Parse command line arguments. + + Args: + argv: list of command line arguments including program name. + Returns: + The parsed arguments as returned by argparse.ArgumentParser. 
+ """ + parser = argparse.ArgumentParser( + description='Runs Transformation on the Outbrain Click Prediction model data.') + + parser.add_argument( + '--training_data', + default='', + help='Data to analyze and encode as training features.') + parser.add_argument( + '--eval_data', + default='', + help='Data to encode as evaluation features.') + parser.add_argument( + '--output_dir', + default=None, + required=True, + help=('Google Cloud Storage or Local directory in which ' + 'to place outputs.')) + parser.add_argument('--batch_size', default=None, type=int, help='Size of batches to create.') + parser.add_argument('--submission', default=False, action='store_true', help='Use real test set for submission') + + args, _ = parser.parse_known_args(args=argv[1:]) + + return args + +# a version of this method that prefers pandas methods +def local_transform_chunk(nr, csv, output_prefix, min_logs, max_logs, batch_size=None, remainder=None): + # put any remainder at the front of the line, with the new rows after + if remainder is not None: + csv = remainder.append(csv) + + # for each batch, slice into the datafrom to retrieve the corresponding data + print(str(datetime.datetime.now()) + '\tWriting rows...') + num_rows = len(csv.index) + with tf.python_io.TFRecordWriter(output_prefix + str(nr).zfill(3) + '.tfrecord') as writer: + for start_ind in range(0,num_rows,batch_size if batch_size is not None else 1): # for each batch + if start_ind + batch_size - 1 > num_rows: # if we'd run out of rows + return csv.iloc[start_ind:] # return remainder for use with the next file + # otherwise write this batch to TFRecord + csv_slice = csv.iloc[start_ind:start_ind+(batch_size if batch_size is not None else 1)] + example = outbrain_transform.create_tf_example(csv_slice, min_logs, max_logs) + writer.write(example.SerializeToString()) + +# calculate min and max stats for the given dataframes all in one go +def compute_min_max_logs(dataframes): + print(str(datetime.datetime.now()) + '\tComputing min and max') + min_logs = {} + max_logs = {} + df = pd.concat(dataframes) # concatenate all dataframes, to process at once + for name in trainer.features.FLOAT_COLUMNS_LOG_BIN_TRANSFORM: + feature_series = df[name] + min_logs[name + '_log_01scaled'] = outbrain_transform.log2_1p(feature_series.min(axis=0)*1000) + max_logs[name + '_log_01scaled'] = outbrain_transform.log2_1p(feature_series.max(axis=0)*1000) + for name in trainer.features.INT_COLUMNS: + feature_series = df[name] + min_logs[name + '_log_01scaled'] = outbrain_transform.log2_1p(feature_series.min(axis=0)) + max_logs[name + '_log_01scaled'] = outbrain_transform.log2_1p(feature_series.max(axis=0)) + return min_logs, max_logs + + +def main(argv=None): + args = parse_arguments(sys.argv if argv is None else argv) + + # Retrieve and sort training and eval data (to ensure consistent order) + # Order is important so that the right data will get sorted together for MAP + training_data = sorted(glob.glob(args.training_data)) + eval_data = sorted(glob.glob(args.eval_data)) + print('Training data:\n{}\nFound:\n{}'.format(args.training_data,training_data)) + print('Evaluation data:\n{}\nFound:\n{}'.format(args.eval_data,eval_data)) + + outbrain_transform.make_spec(args.output_dir + '/transformed_metadata', batch_size=args.batch_size) + + # read all dataframes + print('\n' + str(datetime.datetime.now()) + '\tReading input files') + eval_dataframes = [pd.read_csv(filename, header=None, names=outbrain_transform.CSV_ORDERED_COLUMNS) + for filename in eval_data] + 
train_dataframes = [pd.read_csv(filename, header=None, names=outbrain_transform.CSV_ORDERED_COLUMNS) + for filename in training_data] + + # calculate stats once over all records given + min_logs, max_logs = compute_min_max_logs(eval_dataframes + train_dataframes) + + if args.submission: + train_output_string = '/sub_train_' + eval_output_string = '/test_' + else: + train_output_string = '/train_' + eval_output_string = '/eval_' + + # process eval files + print('\n' + str(datetime.datetime.now()) + '\tWorking on evaluation data') + eval_remainder = None # remainder when a file's records don't divide evenly into batches + for i, df in enumerate(eval_dataframes): + print(eval_data[i]) + eval_remainder = local_transform_chunk(i, df, args.output_dir + eval_output_string, min_logs, max_logs, + batch_size=args.batch_size, remainder=eval_remainder) + if eval_remainder is not None: + print('Dropping {} records (eval) on the floor'.format(len(eval_remainder))) + + # process train files + print('\n' + str(datetime.datetime.now()) + '\tWorking on training data') + train_remainder = None + for i, df in enumerate(train_dataframes): + print(training_data[i]) + train_remainder = local_transform_chunk(i, df, args.output_dir + train_output_string, min_logs, max_logs, + batch_size=args.batch_size, remainder=train_remainder) + if train_remainder is not None: + print('Dropping {} records (train) on the floor'.format(len(train_remainder))) + +if __name__ == '__main__': + main() diff --git a/TensorFlow/Recommendation/WideAndDeep/img/.gitkeep b/TensorFlow/Recommendation/WideAndDeep/img/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/TensorFlow/Recommendation/WideAndDeep/img/map12_WnD.png b/TensorFlow/Recommendation/WideAndDeep/img/map12_WnD.png new file mode 100644 index 00000000..68d01cd7 Binary files /dev/null and b/TensorFlow/Recommendation/WideAndDeep/img/map12_WnD.png differ diff --git a/TensorFlow/Recommendation/WideAndDeep/outbrain_transform.py b/TensorFlow/Recommendation/WideAndDeep/outbrain_transform.py new file mode 100644 index 00000000..5dfd32a4 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/outbrain_transform.py @@ -0,0 +1,220 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
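+
+"""Helpers used by the preprocessing step to turn the engineered Outbrain CSVs
+into prebatched TFRecords: schema/metadata generation (make_spec,
+make_input_schema), log/min-max scaling statistics, and serialization of
+pandas DataFrame slices into tf.train.Example protos (create_tf_example)."""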
+ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow_transform.tf_metadata import dataset_schema +from tensorflow_transform.tf_metadata import dataset_metadata +from tensorflow_transform.tf_metadata import metadata_io +import numpy as np +import pandas as pd + +from trainer.features import LABEL_COLUMN, DISPLAY_ID_COLUMN, AD_ID_COLUMN, IS_LEAK_COLUMN, DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN, CATEGORICAL_COLUMNS, DOC_CATEGORICAL_MULTIVALUED_COLUMNS, BOOL_COLUMNS, INT_COLUMNS, FLOAT_COLUMNS, FLOAT_COLUMNS_LOG_BIN_TRANSFORM, FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM + +RENAME_COLUMNS = False + +CSV_ORDERED_COLUMNS = ['label','display_id','ad_id','doc_id','doc_event_id','is_leak','event_weekend', + 'user_has_already_viewed_doc','user_views','ad_views','doc_views', + 'doc_event_days_since_published','doc_event_hour','doc_ad_days_since_published', + 'pop_ad_id','pop_ad_id_conf', + 'pop_ad_id_conf_multipl','pop_document_id','pop_document_id_conf', + 'pop_document_id_conf_multipl','pop_publisher_id','pop_publisher_id_conf', + 'pop_publisher_id_conf_multipl','pop_advertiser_id','pop_advertiser_id_conf', + 'pop_advertiser_id_conf_multipl','pop_campain_id','pop_campain_id_conf', + 'pop_campain_id_conf_multipl','pop_doc_event_doc_ad','pop_doc_event_doc_ad_conf', + 'pop_doc_event_doc_ad_conf_multipl','pop_source_id','pop_source_id_conf', + 'pop_source_id_conf_multipl','pop_source_id_country','pop_source_id_country_conf', + 'pop_source_id_country_conf_multipl','pop_entity_id','pop_entity_id_conf', + 'pop_entity_id_conf_multipl','pop_entity_id_country','pop_entity_id_country_conf', + 'pop_entity_id_country_conf_multipl','pop_topic_id','pop_topic_id_conf', + 'pop_topic_id_conf_multipl','pop_topic_id_country','pop_topic_id_country_conf', + 'pop_topic_id_country_conf_multipl','pop_category_id','pop_category_id_conf', + 'pop_category_id_conf_multipl','pop_category_id_country','pop_category_id_country_conf', + 'pop_category_id_country_conf_multipl','user_doc_ad_sim_categories', + 'user_doc_ad_sim_categories_conf','user_doc_ad_sim_categories_conf_multipl', + 'user_doc_ad_sim_topics','user_doc_ad_sim_topics_conf','user_doc_ad_sim_topics_conf_multipl', + 'user_doc_ad_sim_entities','user_doc_ad_sim_entities_conf','user_doc_ad_sim_entities_conf_multipl', + 'doc_event_doc_ad_sim_categories','doc_event_doc_ad_sim_categories_conf', + 'doc_event_doc_ad_sim_categories_conf_multipl','doc_event_doc_ad_sim_topics', + 'doc_event_doc_ad_sim_topics_conf','doc_event_doc_ad_sim_topics_conf_multipl', + 'doc_event_doc_ad_sim_entities','doc_event_doc_ad_sim_entities_conf', + 'doc_event_doc_ad_sim_entities_conf_multipl','ad_advertiser','doc_ad_category_id_1', + 'doc_ad_category_id_2','doc_ad_category_id_3','doc_ad_topic_id_1','doc_ad_topic_id_2', + 'doc_ad_topic_id_3','doc_ad_entity_id_1','doc_ad_entity_id_2','doc_ad_entity_id_3', + 'doc_ad_entity_id_4','doc_ad_entity_id_5','doc_ad_entity_id_6','doc_ad_publisher_id', + 'doc_ad_source_id','doc_event_category_id_1','doc_event_category_id_2','doc_event_category_id_3', + 'doc_event_topic_id_1','doc_event_topic_id_2','doc_event_topic_id_3','doc_event_entity_id_1', + 'doc_event_entity_id_2','doc_event_entity_id_3','doc_event_entity_id_4','doc_event_entity_id_5', + 'doc_event_entity_id_6','doc_event_publisher_id','doc_event_source_id','event_country', + 'event_country_state','event_geo_location','event_hour','event_platform','traffic_source'] + +def make_spec(output_dir, batch_size=None): + 
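+    """Write tf.Transform DatasetMetadata describing the prebatched TFRecords.
+
+    Each feature is declared as a FixedLenFeature of shape [batch_size, 1]
+    (or [] when batch_size is None); multi-valued categorical columns instead
+    use shape [batch_size, num_values]. The metadata is written to output_dir.
+    """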
fixed_shape = [batch_size,1] if batch_size is not None else [] + spec = {} + spec[LABEL_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None) + spec[DISPLAY_ID_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None) + spec[IS_LEAK_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None) + spec[DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None) + + for name in BOOL_COLUMNS: + spec[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None) + for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM+FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM: + spec[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None) + for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM: + spec[name + '_binned'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None) + for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM: + spec[name + '_binned'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None) + spec[name + '_log_01scaled'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None) + for name in INT_COLUMNS: + spec[name + '_log_int'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None) + spec[name + '_log_01scaled'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None) + for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS: + spec[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None) + + for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS: + #spec[multi_category] = tf.VarLenFeature(dtype=tf.int64) + shape = fixed_shape[:-1]+[len(DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category])] + spec[multi_category] = tf.FixedLenFeature(shape=shape, dtype=tf.int64) + + metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(spec)) + + metadata_io.write_metadata(metadata, output_dir) + +def make_input_schema(mode=tf.contrib.learn.ModeKeys.TRAIN, batch_size=None): + """Input schema definition. + + Args: + mode: tf.contrib.learn.ModeKeys specifying if the schema is being used for + train/eval or prediction. + batch_size: None if not explicitly batched (for FixedLenFeature size of []), + otherwise the number of elements to assume will be grouped (size of [batch_size]) + Returns: + A `Schema` object. 
+ """ + fixed_shape = [batch_size] if batch_size is not None else [] + result = {} + result[LABEL_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64) + result[DISPLAY_ID_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32) + #result[AD_ID_COLUMN] = tf.VarLenFeature(dtype=tf.float32) + result[IS_LEAK_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64) + for name in BOOL_COLUMNS: + #result[name] = tf.VarLenFeature(dtype=tf.int64) + result[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=0.0) + #TODO: Create dummy features that indicates whether any of the numeric features is null + #(currently default 0 value might introduce noise) + for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM+FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM: + result[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=0.0) + for name in INT_COLUMNS: + result[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=0.0) + for name in CATEGORICAL_COLUMNS: + result[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=0.0) + #result[name] = tf.VarLenFeature(dtype=tf.float32) + for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS: + for category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category]: + result[category] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=0.0) + #result[category] = tf.VarLenFeature(dtype=tf.float32) + + return dataset_schema.from_feature_spec(result) + +def tf_log2_1p(x): + return tf.log1p(x) / tf.log(2.0) + +def log2_1p(x): + return np.log1p(x) / np.log(2.0) + +def compute_min_max_logs(rows): + min_logs = {} + max_logs = {} + + for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + INT_COLUMNS: + min_logs[name + '_log_01scaled'] = float("inf") + max_logs[name + '_log_01scaled'] = float("-inf") + + for row in rows: + names = CSV_ORDERED_COLUMNS + columns_dict = dict(zip(names, row)) + for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM: + nn = name + '_log_01scaled' + min_logs[nn] = min(min_logs[nn], log2_1p(columns_dict[name] * 1000)) + max_logs[nn] = max(max_logs[nn], log2_1p(columns_dict[name] * 1000)) + for name in INT_COLUMNS: + nn = name + '_log_01scaled' + min_logs[nn] = min(min_logs[nn], log2_1p(columns_dict[name])) + max_logs[nn] = max(max_logs[nn], log2_1p(columns_dict[name])) + + return min_logs, max_logs + +def scale_to_0_1(val, minv, maxv): + return (val - minv) / (maxv - minv) + +def create_tf_example(df, min_logs, max_logs): + names = CSV_ORDERED_COLUMNS + #columns_dict = dict(zip(names, row)) + + result = {} + result[LABEL_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[LABEL_COLUMN].to_list())) + result[DISPLAY_ID_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[DISPLAY_ID_COLUMN].to_list())) + result[IS_LEAK_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[IS_LEAK_COLUMN].to_list())) + #is_leak = df[IS_LEAK_COLUMN].to_list() + encoded_value = df[DISPLAY_ID_COLUMN].multiply(10).add(df[IS_LEAK_COLUMN].clip(lower=0)).to_list() + # * 10 + (0 if is_leak < 0 else is_leak) + result[DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=encoded_value)) + + for name in FLOAT_COLUMNS: + result[name] = tf.train.Feature(float_list=tf.train.FloatList(value=df[name].to_list())) + for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM: + #[int(columns_dict[name] * 10)] + value = df[name].multiply(10).astype('int64').to_list() + result[name + '_binned'] = 
tf.train.Feature(int64_list=tf.train.Int64List(value=value)) + for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM: + # [int(log2_1p(columns_dict[name] * 1000))] + value_prelim = df[name].multiply(1000).apply(np.log1p).multiply(1./np.log(2.0)) + value = value_prelim.astype('int64').to_list() + result[name + '_binned'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value)) + nn = name + '_log_01scaled' + #val = log2_1p(columns_dict[name] * 1000) + #val = scale_to_0_1(val, min_logs[nn], max_logs[nn]) + value = value_prelim.add(-min_logs[nn]).multiply(1./(max_logs[nn]-min_logs[nn])).to_list() + result[nn] = tf.train.Feature(float_list=tf.train.FloatList(value=value)) + for name in INT_COLUMNS: + #[int(log2_1p(columns_dict[name]))] + value_prelim = df[name].apply(np.log1p).multiply(1./np.log(2.0)) + value = value_prelim.astype('int64').to_list() + result[name + '_log_int'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value)) + nn = name + '_log_01scaled' + #val = log2_1p(columns_dict[name]) + #val = scale_to_0_1(val, min_logs[nn], max_logs[nn]) + value = value_prelim.add(-min_logs[nn]).multiply(1./(max_logs[nn]-min_logs[nn])).to_list() + result[nn] = tf.train.Feature(float_list=tf.train.FloatList(value=value)) + + for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS: + result[name] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[name].to_list())) + + for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS: + values = [] + for category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category]: + values = values + [df[category].to_numpy()] + # need to transpose the series so they will be parsed correctly by the FixedLenFeature + # we can pass in a single series here; they'll be reshaped to [batch_size, num_values] + # when parsed from the TFRecord + value = np.stack(values, axis=1).flatten().tolist() + result[multi_category] = tf.train.Feature(int64_list=tf.train.Int64List(value=value)) + + tf_example = tf.train.Example(features=tf.train.Features(feature=result)) + + return tf_example diff --git a/TensorFlow/Recommendation/WideAndDeep/path_constants.py b/TensorFlow/Recommendation/WideAndDeep/path_constants.py new file mode 100644 index 00000000..6e8d5f45 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/path_constants.py @@ -0,0 +1,47 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""File paths for the Criteo Classification pipeline. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +TEMP_DIR = 'tmp' +TRANSFORM_FN_DIR = 'transform_fn' +RAW_METADATA_DIR = 'raw_metadata' +TRANSFORMED_METADATA_DIR = 'transformed_metadata' +TRANSFORMED_TRAIN_DATA_FILE_PREFIX = 'features_train' +TRANSFORMED_EVAL_DATA_FILE_PREFIX = 'features_eval' +TRANSFORMED_PREDICT_DATA_FILE_PREFIX = 'features_predict' +TRAIN_RESULTS_FILE = 'train_results' +DEPLOY_SAVED_MODEL_DIR = 'saved_model' +MODEL_EVALUATIONS_FILE = 'model_evaluations' +BATCH_PREDICTION_RESULTS_FILE = 'batch_prediction_results' diff --git a/TensorFlow/Recommendation/WideAndDeep/preproc/csv_data_imputation.py b/TensorFlow/Recommendation/WideAndDeep/preproc/csv_data_imputation.py new file mode 100644 index 00000000..e18fe031 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/preproc/csv_data_imputation.py @@ -0,0 +1,88 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import pandas as pd +import os +import glob +import tqdm +import argparse +from joblib import Parallel, delayed + + +parser = argparse.ArgumentParser() +parser.add_argument('--train_files_pattern', default='train_feature_vectors_integral_eval.csv/part-*') +parser.add_argument('--valid_files_pattern', default='validation_feature_vectors_integral.csv/part-*') +parser.add_argument('--train_dst_dir', default='train_feature_vectors_integral_eval_imputed.csv') +parser.add_argument('--valid_dst_dir', default='validation_feature_vectors_integral_imputed.csv') +parser.add_argument('--header_path', default='train_feature_vectors_integral_eval.csv.header') +parser.add_argument('--num_workers', type=int, default=4) +args = parser.parse_args() + +header = pd.read_csv(args.header_path, header=None) +columns = header[0].to_list() + +train_files = glob.glob(args.train_files_pattern) +print('train files: ', train_files) + +def get_counts(f): + df = pd.read_csv(f, header=None, dtype=object, names=columns, na_values='None') + counts = {} + for c in df: + counts[c] = df[c].value_counts() + return counts + +all_counts = Parallel(n_jobs=args.num_workers)(delayed(get_counts)(f) for f in train_files) +cols = len(all_counts[0]) +imputation_dict = {} +for c in tqdm.tqdm(columns): + temp = None + for i in range(len(all_counts)): + if temp is None: + temp = pd.Series(all_counts[i][c]) + else: + temp += pd.Series(all_counts[i][c]) + if len(temp) == 0: + imputation_dict[c] = 0 + else: + imputation_dict[c] = temp.index[0] + +print('imputation_dict: ', imputation_dict) + +if not os.path.exists(args.train_dst_dir): + os.mkdir(args.train_dst_dir) + +def impute_part(src_path, dst_dir): + print('imputing: ', src_path, ' to: ', dst_dir) + filename = os.path.basename(src_path) + dst_path = os.path.join(dst_dir, filename) + + df = pd.read_csv(src_path, header=None, dtype=object, names=columns, na_values='None') + df2 = 
df.fillna(imputation_dict) + df2.to_csv(dst_path, header=None, index=None) + + +print('launching imputation for train CSVs') +Parallel(n_jobs=args.num_workers)(delayed(impute_part)(f, args.train_dst_dir) for f in train_files) + +valid_files = glob.glob(args.valid_files_pattern) + +if not os.path.exists(args.valid_dst_dir): + os.mkdir(args.valid_dst_dir) + +print('launching imputation for validation CSVs') +Parallel(n_jobs=args.num_workers)(delayed(impute_part)(f, args.valid_dst_dir) for f in valid_files) + +print('Done!') diff --git a/TensorFlow/Recommendation/WideAndDeep/preproc/data/ca_states_abbrev_bst.csv b/TensorFlow/Recommendation/WideAndDeep/preproc/data/ca_states_abbrev_bst.csv new file mode 100644 index 00000000..02f86ea2 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/preproc/data/ca_states_abbrev_bst.csv @@ -0,0 +1,13 @@ +state_abb,utc_dst_time_offset_cleaned +AB,-6.0 +BC,-7.0 +MB,-5.0 +NB,-3.0 +NL,-3.0 +NS,-3.0 +NU,-5.0 +ON,-4.0 +PE,-3.0 +QC,-4.0 +SK,-6.0 +YT,-7.0 diff --git a/TensorFlow/Recommendation/WideAndDeep/preproc/data/country_codes_utc_dst_tz_delta.csv b/TensorFlow/Recommendation/WideAndDeep/preproc/data/country_codes_utc_dst_tz_delta.csv new file mode 100644 index 00000000..7f02e795 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/preproc/data/country_codes_utc_dst_tz_delta.csv @@ -0,0 +1,247 @@ +country_code,utc_dst_time_offset_cleaned +AX,3.0 +AF,4.5 +AL,2.0 +DZ,1.0 +AD,2.0 +AO,1.0 +AI,-4.0 +AG,-4.0 +AR,-3.0 +AM,4.0 +AW,-4.0 +AU,10.0 +AT,2.0 +AZ,4.0 +BS,-4.0 +BH,3.0 +BD,6.0 +BB,-4.0 +BY,3.0 +BE,2.0 +BZ,-6.0 +BJ,1.0 +BM,-3.0 +BT,6.0 +BO,-4.0 +BA,2.0 +BW,2.0 +BR,-3.0 +IO,6.0 +BN,8.0 +BG,3.0 +BF,0.0 +BI,2.0 +KH,7.0 +CM,1.0 +CA,-5.0 +BQ,-5.0 +KY,-5.0 +CF,1.0 +TD,1.0 +CL,-3.0 +CN,8.0 +CX,7.0 +CC,6.5 +CO,-5.0 +KM,3.0 +CD,1.0 +CG,1.0 +CK,-10.0 +CR,-6.0 +CI,0.0 +HR,2.0 +CW,-4.0 +CY,3.0 +CZ,2.0 +DK,2.0 +DJ,3.0 +DM,-4.0 +DO,-4.0 +TL,9.0 +EC,-5.0 +EG,2.0 +SV,-6.0 +GQ,1.0 +ER,3.0 +EE,3.0 +ET,3.0 +FK,-3.0 +FO,1.0 +FJ,12.0 +FI,3.0 +FR,2.0 +GF,-3.0 +PF,-10.0 +GA,1.0 +GM,0.0 +GE,4.0 +DE,2.0 +GH,0.0 +GI,2.0 +GR,3.0 +GL,-2.0 +GD,-4.0 +GP,-4.0 +GU,10.0 +GT,-6.0 +GG,1.0 +GN,0.0 +GW,0.0 +GY,-4.0 +HT,-5.0 +HN,-6.0 +HK,8.0 +HU,2.0 +IS,0.0 +IN,5.5 +ID,8.0 +IR,4.5 +IQ,3.0 +IE,1.0 +IM,1.0 +IL,3.0 +IT,2.0 +JM,-5.0 +JP,9.0 +JE,1.0 +JO,3.0 +KZ,5.0 +KE,3.0 +KI,13.0 +KP,-4.0 +KR,-4.0 +KP,8.5 +KR,8.5 +KP,9.0 +KR,9.0 +KW,3.0 +KG,6.0 +LA,7.0 +LV,3.0 +LB,3.0 +LS,2.0 +LR,0.0 +LY,2.0 +LI,2.0 +LT,3.0 +LU,2.0 +MO,8.0 +MK,2.0 +MG,3.0 +MW,2.0 +MY,8.0 +MV,5.0 +ML,0.0 +MT,2.0 +MH,12.0 +MQ,-4.0 +MR,0.0 +MU,4.0 +YT,3.0 +MX,-5.0 +FM,10.0 +MD,3.0 +MC,2.0 +MN,9.0 +ME,2.0 +MS,-4.0 +MA,1.0 +MZ,2.0 +MM,6.5 +NA,1.0 +NR,12.0 +NP,5.0 +NL,2.0 +NC,11.0 +NZ,12.0 +NI,-6.0 +NE,1.0 +NG,1.0 +NU,-11.0 +NF,11.0 +MP,10.0 +NO,2.0 +OM,4.0 +PK,5.0 +PW,9.0 +PS,3.0 +PA,-5.0 +PG,10.0 +PY,-4.0 +PE,-5.0 +PH,8.0 +PN,-8.0 +PL,2.0 +PT,1.0 +PR,-4.0 +QA,3.0 +RE,4.0 +RO,3.0 +RU,7.0 +RW,2.0 +BL,-4.0 +AS,-11.0 +WS,-11.0 +AS,13.0 +WS,13.0 +SM,2.0 +ST,0.0 +SA,3.0 +SN,0.0 +RS,2.0 +SC,4.0 +SL,0.0 +SG,8.0 +SK,2.0 +SI,2.0 +SB,11.0 +SO,3.0 +ZA,2.0 +GS,-2.0 +SS,3.0 +ES,2.0 +LK,5.5 +SH,0.0 +KN,-4.0 +SX,-4.0 +MF,-4.0 +SD,3.0 +SR,-3.0 +SJ,2.0 +SZ,2.0 +SE,2.0 +CH,2.0 +SY,3.0 +TW,8.0 +TJ,5.0 +TZ,3.0 +TH,7.0 +TG,0.0 +TK,13.0 +TO,13.0 +TT,-4.0 +TN,1.0 +TR,3.0 +TM,5.0 +TC,-4.0 +TV,12.0 +UG,3.0 +UA,3.0 +AE,4.0 +GB,1.0 +US,-7.0 +UY,-3.0 +UZ,5.0 +VU,11.0 +VA,2.0 +VE,-4.0 +VN,7.0 +VG,-4.0 +VI,-4.0 +VG,-4.0 +VI,-4.0 +WF,12.0 +YE,3.0 +ZM,2.0 +ZW,2.0 diff --git a/TensorFlow/Recommendation/WideAndDeep/preproc/data/us_states_abbrev_bst.csv 
b/TensorFlow/Recommendation/WideAndDeep/preproc/data/us_states_abbrev_bst.csv new file mode 100644 index 00000000..fd881611 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/preproc/data/us_states_abbrev_bst.csv @@ -0,0 +1,52 @@ +state_abb,utc_dst_time_offset_cleaned +AL,-5.0 +AK,-8.0 +AZ,-7.0 +AR,-5.0 +CA,-7.0 +CO,-6.0 +CT,-4.0 +DE,-4.0 +DC,-4.0 +FL,-4.0 +GA,-4.0 +HI,-10.0 +ID,-6.0 +IL,-5.0 +IN,-4.0 +IA,-5.0 +KS,-5.0 +KY,-4.0 +LA,-5.0 +ME,-4.0 +MD,-4.0 +MA,-4.0 +MI,-4.0 +MN,-5.0 +MS,-5.0 +MO,-5.0 +MT,-6.0 +NE,-5.0 +NV,-7.0 +NH,-4.0 +NJ,-4.0 +NM,-6.0 +NY,-4.0 +NC,-4.0 +ND,-5.0 +OH,-4.0 +OK,-5.0 +OR,-7.0 +PA,-4.0 +RI,-4.0 +SC,-4.0 +SD,-5.0 +TN,-5.0 +TX,-5.0 +UT,-6.0 +VT,-4.0 +VA,-4.0 +WA,-7.0 +WV,-4.0 +WI,-5.0 +WY,-6.0 diff --git a/TensorFlow/Recommendation/WideAndDeep/preproc/preproc1.py b/TensorFlow/Recommendation/WideAndDeep/preproc/preproc1.py new file mode 100644 index 00000000..9f481af5 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/preproc/preproc1.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/" +DATA_BUCKET_FOLDER = "/outbrain/orig/" +SPARK_TEMP_FOLDER = "/outbrain/spark-temp/" + +from pyspark.sql.types import * +import pyspark.sql.functions as F + +from pyspark.context import SparkContext, SparkConf +from pyspark.sql.session import SparkSession + +conf = SparkConf().setMaster('local[*]').set('spark.executor.memory', '256g').set('spark.driver.memory', '126g').set("spark.local.dir", SPARK_TEMP_FOLDER) + +sc = SparkContext(conf=conf) +spark = SparkSession(sc) + +print('Loading data...') + +truncate_day_from_timestamp_udf = F.udf(lambda ts: int(ts / 1000 / 60 / 60 / 24), IntegerType()) + +events_schema = StructType( + [StructField("display_id", IntegerType(), True), + StructField("uuid_event", StringType(), True), + StructField("document_id_event", IntegerType(), True), + StructField("timestamp_event", IntegerType(), True), + StructField("platform_event", IntegerType(), True), + StructField("geo_location_event", StringType(), True)] + ) + +events_df = spark.read.schema(events_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER + "events.csv") \ + .withColumn('day_event', truncate_day_from_timestamp_udf('timestamp_event')) \ + .alias('events') + +events_df.count() + +print('Drop rows with empty "geo_location"...') +events_df = events_df.dropna(subset="geo_location_event") +events_df.count() + +print('Drop rows with empty "platform"...') +events_df = events_df.dropna(subset="platform_event") +events_df.count() + +promoted_content_schema = StructType( + [StructField("ad_id", IntegerType(), True), + StructField("document_id_promo", IntegerType(), True), + StructField("campaign_id", IntegerType(), True), + StructField("advertiser_id", IntegerType(), True)] + ) + +promoted_content_df = spark.read.schema(promoted_content_schema) \ + 
.options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"promoted_content.csv") \ + .alias('promoted_content') + +clicks_train_schema = StructType( + [StructField("display_id", IntegerType(), True), + StructField("ad_id", IntegerType(), True), + StructField("clicked", IntegerType(), True)] + ) + +clicks_train_df = spark.read.schema(clicks_train_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"clicks_train.csv") \ + .alias('clicks_train') + + +clicks_train_joined_df = clicks_train_df \ + .join(promoted_content_df, on='ad_id', how='left') \ + .join(events_df, on='display_id', how='left') +clicks_train_joined_df.createOrReplaceTempView('clicks_train_joined') + +validation_display_ids_df = clicks_train_joined_df.select('display_id','day_event') \ + .distinct() \ + .sampleBy("day_event", fractions={0: 0.2, 1: 0.2, 2: 0.2, 3: 0.2, 4: 0.2, \ + 5: 0.2, 6: 0.2, 7: 0.2, 8: 0.2, 9: 0.2, 10: 0.2, 11: 1.0, 12: 1.0}, seed=0) +validation_display_ids_df.createOrReplaceTempView("validation_display_ids") +validation_set_df = spark.sql('''SELECT display_id, ad_id, uuid_event, day_event, + timestamp_event, document_id_promo, platform_event, geo_location_event + FROM clicks_train_joined t + WHERE EXISTS (SELECT display_id FROM validation_display_ids + WHERE display_id = t.display_id)''') + +validation_set_gcs_output = "validation_set.parquet" +validation_set_df.write.parquet(OUTPUT_BUCKET_FOLDER+validation_set_gcs_output, mode='overwrite') + +print(validation_set_df.take(5)) + +spark.stop() diff --git a/TensorFlow/Recommendation/WideAndDeep/preproc/preproc2.py b/TensorFlow/Recommendation/WideAndDeep/preproc/preproc2.py new file mode 100644 index 00000000..0de2656c --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/preproc/preproc2.py @@ -0,0 +1,414 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
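+
+# preproc2.py builds the per-user interest profiles used by the later
+# preprocessing steps: it loads the document metadata, categories, topics and
+# entities together with the users' page views, restricts the page views to
+# users present in the validation set (or, when run with --submission, the
+# test set), and aggregates for every user the TF-IDF-weighted categories,
+# topics and entities of the documents they viewed. The result is written to
+# the user_profiles (or user_profiles_eval) parquet table read by preproc3.py.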
+
+OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
+DATA_BUCKET_FOLDER = "/outbrain/orig/"
+SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"
+
+from IPython.display import display
+
+from pyspark.sql.types import *
+import pyspark.sql.functions as F
+
+from pyspark.sql import DataFrameWriter
+
+import numpy as np
+
+import math
+import datetime
+import time
+
+import random
+random.seed(42)
+
+from pyspark.context import SparkContext, SparkConf
+from pyspark.sql.session import SparkSession
+
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    '--submission',
+    action='store_true',
+    default=False
+)
+args = parser.parse_args()
+
+evaluation = not args.submission
+
+conf = SparkConf().setMaster('local[*]').set('spark.executor.memory', '256g').set('spark.driver.memory', '126g').set("spark.local.dir", SPARK_TEMP_FOLDER)
+
+sc = SparkContext(conf=conf)
+spark = SparkSession(sc)
+
+start_time = time.time()
+
+print('Loading data...')
+
+truncate_day_from_timestamp_udf = F.udf(lambda ts: int(ts / 1000 / 60 / 60 / 24), IntegerType())
+
+extract_country_udf = F.udf(lambda geo: geo.strip()[:2] if geo != None else '', StringType())
+
+documents_meta_schema = StructType(
+    [StructField("document_id_doc", IntegerType(), True),
+     StructField("source_id", IntegerType(), True),
+     StructField("publisher_id", IntegerType(), True),
+     StructField("publish_time", TimestampType(), True)]
+    )
+
+documents_meta_df = spark.read.schema(documents_meta_schema) \
+    .options(header='true', inferschema='false', nullValue='\\N') \
+    .csv(DATA_BUCKET_FOLDER+"documents_meta.csv") \
+    .withColumn('dummyDocumentsMeta', F.lit(1)).alias('documents_meta')
+
+documents_meta_df.count()
+
+print('Drop rows with empty "source_id"...')
+documents_meta_df = documents_meta_df.dropna(subset="source_id")
+documents_meta_df.count()
+
+source_publishers_df = documents_meta_df.select(["source_id", "publisher_id"]).dropDuplicates()
+source_publishers_df.count()
+
+print('Get list of source_ids without publisher_id...')
+rows_no_pub = source_publishers_df.filter("publisher_id is NULL")
+source_ids_without_publisher = [row['source_id'] for row in rows_no_pub.collect()]
+len(source_ids_without_publisher)
+
+print('Maximum value of publisher_id used so far...')
+max_pub = max(source_publishers_df.select(["publisher_id"]).dropna().collect())['publisher_id']
+max_pub
+
+print('Rows filled with new publisher_ids')
+new_publishers = [(source, max_pub + 1 + nr) for nr, source in enumerate(source_ids_without_publisher)]
+new_publishers_df = spark.createDataFrame(new_publishers, ("source_id", "publisher_id"))
+new_publishers_df.take(10)
+
+# old and new publishers merged
+fixed_source_publishers_df = source_publishers_df.dropna().union(new_publishers_df)
+fixed_source_publishers_df.collect()[-30:]
+
+print('Update documents_meta with new publishers...')
+documents_meta_df = documents_meta_df.drop('publisher_id').join(fixed_source_publishers_df, on='source_id')
+documents_meta_df.count()
+
+documents_categories_schema = StructType(
+    [StructField("document_id_cat", IntegerType(), True),
+     StructField("category_id", IntegerType(), True),
+     StructField("confidence_level_cat", FloatType(), True)]
+    )
+
+documents_categories_df = spark.read.schema(documents_categories_schema) \
+    .options(header='true', inferschema='false', nullValue='\\N') \
+    .csv(DATA_BUCKET_FOLDER+"documents_categories.csv") \
+    .alias('documents_categories')
+
+documents_categories_grouped_df = documents_categories_df.groupBy('document_id_cat') \
.agg(F.collect_list('category_id').alias('category_id_list'), + F.collect_list('confidence_level_cat').alias('cat_confidence_level_list')) \ + .withColumn('dummyDocumentsCategory', F.lit(1)) \ + .alias('documents_categories_grouped') + +documents_topics_schema = StructType( + [StructField("document_id_top", IntegerType(), True), + StructField("topic_id", IntegerType(), True), + StructField("confidence_level_top", FloatType(), True)] + ) + +documents_topics_df = spark.read.schema(documents_topics_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"documents_topics.csv") \ + .alias('documents_topics') + +documents_topics_grouped_df = documents_topics_df.groupBy('document_id_top') \ + .agg(F.collect_list('topic_id').alias('topic_id_list'), + F.collect_list('confidence_level_top').alias('top_confidence_level_list')) \ + .withColumn('dummyDocumentsTopics', F.lit(1)) \ + .alias('documents_topics_grouped') + +documents_entities_schema = StructType( + [StructField("document_id_ent", IntegerType(), True), + StructField("entity_id", StringType(), True), + StructField("confidence_level_ent", FloatType(), True)] + ) + +documents_entities_df = spark.read.schema(documents_entities_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"documents_entities.csv") \ + .alias('documents_entities') + +documents_entities_grouped_df = documents_entities_df.groupBy('document_id_ent') \ + .agg(F.collect_list('entity_id').alias('entity_id_list'), + F.collect_list('confidence_level_ent').alias('ent_confidence_level_list')) \ + .withColumn('dummyDocumentsEntities', F.lit(1)) \ + .alias('documents_entities_grouped') + +documents_df = documents_meta_df.join( + documents_categories_grouped_df, + on=F.col("document_id_doc") == F.col("documents_categories_grouped.document_id_cat"), + how='left') \ + .join(documents_topics_grouped_df, + on=F.col("document_id_doc") == F.col("documents_topics_grouped.document_id_top"), + how='left') \ + .join(documents_entities_grouped_df, + on=F.col("document_id_doc") == F.col("documents_entities_grouped.document_id_ent"), + how='left') \ + .cache() + +documents_df.count() + +if evaluation: + validation_set_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+"validation_set.parquet") \ + .alias('validation_set') + + validation_set_df.select('uuid_event').distinct().createOrReplaceTempView('users_to_profile') + validation_set_df.select('uuid_event','document_id_promo').distinct() \ + .createOrReplaceTempView('validation_users_docs_to_ignore') + +else: + events_schema = StructType( + [StructField("display_id", IntegerType(), True), + StructField("uuid_event", StringType(), True), + StructField("document_id_event", IntegerType(), True), + StructField("timestamp_event", IntegerType(), True), + StructField("platform_event", IntegerType(), True), + StructField("geo_location_event", StringType(), True)] + ) + + events_df = spark.read.schema(events_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"events.csv") \ + .withColumn('dummyEvents', F.lit(1)) \ + .withColumn('day_event', truncate_day_from_timestamp_udf('timestamp_event')) \ + .withColumn('event_country', extract_country_udf('geo_location_event')) \ + .alias('events') + + # Drop rows with empty "geo_location" + events_df = events_df.dropna(subset="geo_location_event") + # Drop rows with empty "platform" + events_df = events_df.dropna(subset="platform_event") + + 
events_df.createOrReplaceTempView('events') + + + promoted_content_schema = StructType( + [StructField("ad_id", IntegerType(), True), + StructField("document_id_promo", IntegerType(), True), + StructField("campaign_id", IntegerType(), True), + StructField("advertiser_id", IntegerType(), True)] + ) + + promoted_content_df = spark.read.schema(promoted_content_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"promoted_content.csv") \ + .withColumn('dummyPromotedContent', F.lit(1)).alias('promoted_content') + + clicks_test_schema = StructType( + [StructField("display_id", IntegerType(), True), + StructField("ad_id", IntegerType(), True)] + ) + + clicks_test_df = spark.read.schema(clicks_test_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"clicks_test.csv") \ + .withColumn('dummyClicksTest', F.lit(1)).alias('clicks_test') + + test_set_df = clicks_test_df.join(promoted_content_df, on='ad_id', how='left') \ + .join(events_df, on='display_id', how='left') + + test_set_df.select('uuid_event').distinct().createOrReplaceTempView('users_to_profile') + test_set_df.select('uuid_event','document_id_promo', 'timestamp_event').distinct() \ + .createOrReplaceTempView('test_users_docs_timestamp_to_ignore') + +page_views_schema = StructType( + [StructField("uuid_pv", StringType(), True), + StructField("document_id_pv", IntegerType(), True), + StructField("timestamp_pv", IntegerType(), True), + StructField("platform_pv", IntegerType(), True), + StructField("geo_location_pv", StringType(), True), + StructField("traffic_source_pv", IntegerType(), True)] + ) +page_views_df = spark.read.schema(page_views_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"page_views.csv") \ + .alias('page_views') + +page_views_df.createOrReplaceTempView('page_views') + +additional_filter = '' + +if evaluation: + additional_filter = ''' + AND NOT EXISTS (SELECT uuid_event FROM validation_users_docs_to_ignore + WHERE uuid_event = p.uuid_pv + AND document_id_promo = p.document_id_pv) + ''' +else: + additional_filter = ''' + AND NOT EXISTS (SELECT uuid_event FROM test_users_docs_timestamp_to_ignore + WHERE uuid_event = p.uuid_pv + AND document_id_promo = p.document_id_pv + AND p.timestamp_pv >= timestamp_event) + ''' + +page_views_train_df = spark.sql(''' + SELECT * FROM page_views p + WHERE EXISTS (SELECT uuid_event FROM users_to_profile + WHERE uuid_event = p.uuid_pv) + ''' + additional_filter).alias('views') \ + .join(documents_df, on=F.col("document_id_pv") == F.col("document_id_doc"), how='left') \ + .filter('dummyDocumentsEntities is not null OR dummyDocumentsTopics is not null OR dummyDocumentsCategory is not null') + + +print('Processing document frequencies...') +import pickle + +documents_total = documents_meta_df.count() +documents_total + +categories_docs_counts = documents_categories_df.groupBy('category_id').count().rdd.collectAsMap() +len(categories_docs_counts) + +df_filenames_suffix = '' +if evaluation: + df_filenames_suffix = '_eval' + +with open(OUTPUT_BUCKET_FOLDER+'categories_docs_counts'+df_filenames_suffix+'.pickle', 'wb') as output: + pickle.dump(categories_docs_counts, output) + +topics_docs_counts = documents_topics_df.groupBy('topic_id').count().rdd.collectAsMap() +len(topics_docs_counts) + +with open(OUTPUT_BUCKET_FOLDER+'topics_docs_counts'+df_filenames_suffix+'.pickle', 'wb') as output: + pickle.dump(topics_docs_counts, output) + 
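+
+# The per-aspect document counts computed in this section serve as the document
+# frequencies of a TF-IDF weighting: in get_user_aspects() below, every
+# category/topic/entity seen in a user's page views is stored as
+#   [tf * idf, mean_confidence]   with   idf = ln(documents_total / aspect_docs_count)
+# and tf = the number of viewed documents carrying that aspect. As an
+# illustration, an aspect present in 1% of all documents and in 3 of a user's
+# viewed documents gets the weight 3 * ln(100) ~= 13.8.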
+entities_docs_counts = documents_entities_df.groupBy('entity_id').count().rdd.collectAsMap() +len(entities_docs_counts) + +with open(OUTPUT_BUCKET_FOLDER+'entities_docs_counts'+df_filenames_suffix+'.pickle', 'wb') as output: + pickle.dump(entities_docs_counts, output) + + +print('Processing user profiles...') + +int_null_to_minus_one_udf = F.udf(lambda x: x if x != None else -1, IntegerType()) +int_list_null_to_empty_list_udf = F.udf(lambda x: x if x != None else [], ArrayType(IntegerType())) +float_list_null_to_empty_list_udf = F.udf(lambda x: x if x != None else [], ArrayType(FloatType())) +str_list_null_to_empty_list_udf = F.udf(lambda x: x if x != None else [], ArrayType(StringType())) + +page_views_by_user_df = page_views_train_df \ + .select( + 'uuid_pv', + 'document_id_pv', + int_null_to_minus_one_udf('timestamp_pv').alias('timestamp_pv'), + int_list_null_to_empty_list_udf('category_id_list').alias('category_id_list'), + float_list_null_to_empty_list_udf('cat_confidence_level_list').alias('cat_confidence_level_list'), + int_list_null_to_empty_list_udf('topic_id_list').alias('topic_id_list'), + float_list_null_to_empty_list_udf('top_confidence_level_list').alias('top_confidence_level_list'), + str_list_null_to_empty_list_udf('entity_id_list').alias('entity_id_list'), + float_list_null_to_empty_list_udf('ent_confidence_level_list').alias('ent_confidence_level_list')) \ + .groupBy('uuid_pv') \ + .agg(F.collect_list('document_id_pv').alias('document_id_pv_list'), + F.collect_list('timestamp_pv').alias('timestamp_pv_list'), + F.collect_list('category_id_list').alias('category_id_lists'), + F.collect_list('cat_confidence_level_list').alias('cat_confidence_level_lists'), + F.collect_list('topic_id_list').alias('topic_id_lists'), + F.collect_list('top_confidence_level_list').alias('top_confidence_level_lists'), + F.collect_list('entity_id_list').alias('entity_id_lists'), + F.collect_list('ent_confidence_level_list').alias('ent_confidence_level_lists')) + +from collections import defaultdict + +def get_user_aspects(docs_aspects, aspect_docs_counts): + docs_aspects_merged_lists = defaultdict(list) + + for doc_aspects in docs_aspects: + for key in doc_aspects.keys(): + docs_aspects_merged_lists[key].append(doc_aspects[key]) + + docs_aspects_stats = {} + for key in docs_aspects_merged_lists.keys(): + aspect_list = docs_aspects_merged_lists[key] + tf = len(aspect_list) + idf = math.log(documents_total / float(aspect_docs_counts[key])) + + confid_mean = sum(aspect_list) / float(len(aspect_list)) + docs_aspects_stats[key] = [tf*idf, confid_mean] + + return docs_aspects_stats + + +def generate_user_profile(docs_aspects_list, docs_aspects_confidence_list, aspect_docs_counts): + docs_aspects = [] + for doc_aspects_list, doc_aspects_confidence_list in zip(docs_aspects_list, docs_aspects_confidence_list): + doc_aspects = dict(zip(doc_aspects_list, doc_aspects_confidence_list)) + docs_aspects.append(doc_aspects) + + user_aspects = get_user_aspects(docs_aspects, aspect_docs_counts) + return user_aspects + +get_list_len_udf = F.udf(lambda docs_list: len(docs_list), IntegerType()) + +generate_categories_user_profile_map_udf = F.udf( + lambda docs_aspects_list, docs_aspects_confidence_list: \ + generate_user_profile(docs_aspects_list, + docs_aspects_confidence_list, + categories_docs_counts), + MapType(IntegerType(), ArrayType(FloatType()), False)) + + +generate_topics_user_profile_map_udf = F.udf( + lambda docs_aspects_list, docs_aspects_confidence_list: \ + generate_user_profile(docs_aspects_list, + 
docs_aspects_confidence_list, + topics_docs_counts), + MapType(IntegerType(), ArrayType(FloatType()), False)) + + +generate_entities_user_profile_map_udf = F.udf( + lambda docs_aspects_list, docs_aspects_confidence_list: \ + generate_user_profile(docs_aspects_list, + docs_aspects_confidence_list, + entities_docs_counts), + MapType(StringType(), ArrayType(FloatType()), False)) + +users_profile_df = page_views_by_user_df \ + .withColumn('views', get_list_len_udf('document_id_pv_list')) \ + .withColumn('categories', generate_categories_user_profile_map_udf('category_id_lists', + 'cat_confidence_level_lists')) \ + .withColumn('topics', generate_topics_user_profile_map_udf('topic_id_lists', + 'top_confidence_level_lists')) \ + .withColumn('entities', generate_entities_user_profile_map_udf('entity_id_lists', + 'ent_confidence_level_lists')) \ + .select( + F.col('uuid_pv').alias('uuid'), + F.col('document_id_pv_list').alias('doc_ids'), + 'views', 'categories', 'topics', 'entities') + +if evaluation: + table_name = 'user_profiles_eval' +else: + table_name = 'user_profiles' + + +users_profile_df.write.parquet(OUTPUT_BUCKET_FOLDER+table_name, mode='overwrite') + +finish_time = time.time() +print("Elapsed min: ", (finish_time-start_time)/60) + +spark.stop() + diff --git a/TensorFlow/Recommendation/WideAndDeep/preproc/preproc3.py b/TensorFlow/Recommendation/WideAndDeep/preproc/preproc3.py new file mode 100644 index 00000000..cf49f69d --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/preproc/preproc3.py @@ -0,0 +1,2146 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
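+
+# preproc3.py assembles the final feature set: it joins the click data with
+# promoted content, document metadata, categories, topics, entities, events
+# and page views, loads the user profiles produced by preproc2.py, splits the
+# data into train and validation (or test) sets, and computes popularity
+# features such as the average CTR per ad, document, source, publisher,
+# advertiser, campaign and category, broadcast to the workers as
+# (ctr, views, distinct_ad_ids, ...) tuples.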
+ +evaluation_verbose = False + +OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/" +DATA_BUCKET_FOLDER = "/outbrain/orig/" +SPARK_TEMP_FOLDER = "/outbrain/spark-temp/" + + +from IPython.display import display + + +from pyspark.sql.types import * +import pyspark.sql.functions as F +from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT + +from pyspark.context import SparkContext, SparkConf +from pyspark.sql.session import SparkSession + +conf = SparkConf().setMaster('local[*]').set('spark.executor.memory', '256g').set('spark.driver.memory', '126g').set("spark.local.dir", SPARK_TEMP_FOLDER) + +sc = SparkContext(conf=conf) +spark = SparkSession(sc) + +import numpy as np +import scipy.sparse + +import math +import datetime +import time +import itertools + +import pickle + +import random +random.seed(42) + +import pandas as pd + +start_time = time.time() + +import hashlib +def hashstr(s, nr_bins): + return int(hashlib.md5(s.encode('utf8')).hexdigest(), 16)%(nr_bins-1)+1 + +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument( + '--submission', + action='store_true', + default=False +) +args = parser.parse_args() + +evaluation = not args.submission + + +# ## UDFs +def date_time_to_unix_epoch(date_time): + return int(time.mktime(date_time.timetuple())) + +def date_time_to_unix_epoch_treated(dt): + if dt != None: + try: + epoch = date_time_to_unix_epoch(dt) + return epoch + except Exception as e: + print("Error processing dt={}".format(dt), e) + return 0 + else: + return 0 + +timestamp_null_to_zero_int_udf = F.udf(lambda x: date_time_to_unix_epoch_treated(x), IntegerType()) + +INT_DEFAULT_NULL_VALUE = -1 +int_null_to_minus_one_udf = F.udf(lambda x: x if x != None else INT_DEFAULT_NULL_VALUE, IntegerType()) +int_list_null_to_empty_list_udf = F.udf(lambda x: x if x != None else [], ArrayType(IntegerType())) +float_list_null_to_empty_list_udf = F.udf(lambda x: x if x != None else [], ArrayType(FloatType())) +str_list_null_to_empty_list_udf = F.udf(lambda x: x if x != None else [], ArrayType(StringType())) + +def truncate_day_from_timestamp(ts): + return int(ts / 1000 / 60 / 60 / 24) + +truncate_day_from_timestamp_udf = F.udf(lambda ts: truncate_day_from_timestamp(ts), IntegerType()) + +extract_country_udf = F.udf(lambda geo: geo.strip()[:2] if geo != None else '', StringType()) + +extract_country_state_udf = F.udf(lambda geo: geo.strip()[:5] if geo != None else '', StringType()) + +list_len_udf = F.udf(lambda x: len(x) if x != None else 0, IntegerType()) + +def convert_odd_timestamp(timestamp_ms_relative): + TIMESTAMP_DELTA=1465876799998 + return datetime.datetime.fromtimestamp((int(timestamp_ms_relative)+TIMESTAMP_DELTA)//1000) + + +# # Loading Files + +# ## Loading UTC/BST for each country and US / CA states (local time) +country_utc_dst_df = pd.read_csv('preproc/data/country_codes_utc_dst_tz_delta.csv', keep_default_na=False) + +countries_utc_dst_dict = dict(zip(country_utc_dst_df['country_code'].tolist(), country_utc_dst_df['utc_dst_time_offset_cleaned'].tolist())) +countries_utc_dst_broad = sc.broadcast(countries_utc_dst_dict) + +us_states_utc_dst_df = pd.read_csv('preproc/data/us_states_abbrev_bst.csv', keep_default_na=False) + +us_states_utc_dst_dict = dict(zip(us_states_utc_dst_df['state_abb'].tolist(), us_states_utc_dst_df['utc_dst_time_offset_cleaned'].tolist())) +us_states_utc_dst_broad = sc.broadcast(us_states_utc_dst_dict) + +ca_states_utc_dst_df = pd.read_csv('preproc/data/ca_states_abbrev_bst.csv', keep_default_na=False) + +ca_countries_utc_dst_dict = 
dict(zip(ca_states_utc_dst_df['state_abb'].tolist(), ca_states_utc_dst_df['utc_dst_time_offset_cleaned'].tolist())) +ca_countries_utc_dst_broad = sc.broadcast(ca_countries_utc_dst_dict) + + +# ## Loading competition csvs +events_schema = StructType( + [StructField("display_id", IntegerType(), True), + StructField("uuid_event", StringType(), True), + StructField("document_id_event", IntegerType(), True), + StructField("timestamp_event", IntegerType(), True), + StructField("platform_event", IntegerType(), True), + StructField("geo_location_event", StringType(), True)] + ) + +events_df = spark.read.schema(events_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER + "events.csv") \ + .withColumn('dummyEvents', F.lit(1)) \ + .withColumn('day_event', truncate_day_from_timestamp_udf('timestamp_event')) \ + .withColumn('event_country', extract_country_udf('geo_location_event')) \ + .withColumn('event_country_state', extract_country_state_udf('geo_location_event')) \ + .alias('events') + +events_df.count() + +# Drop rows with empty "geo_location" +events_df = events_df.dropna(subset="geo_location_event") +events_df.count() + +# Drop rows with empty "platform" +events_df = events_df.dropna(subset="platform_event") +events_df.count() + +page_views_schema = StructType( + [StructField("uuid_pv", StringType(), True), + StructField("document_id_pv", IntegerType(), True), + StructField("timestamp_pv", IntegerType(), True), + StructField("platform_pv", IntegerType(), True), + StructField("geo_location_pv", StringType(), True), + StructField("traffic_source_pv", IntegerType(), True)] + ) + +page_views_df = spark.read.schema(page_views_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"page_views.csv") \ + .withColumn('day_pv', truncate_day_from_timestamp_udf('timestamp_pv')) \ + .alias('page_views') + +page_views_df.createOrReplaceTempView('page_views') + +page_views_users_df = spark.sql(''' + SELECT uuid_pv, document_id_pv, max(timestamp_pv) as max_timestamp_pv, 1 as dummyPageView + FROM page_views p + GROUP BY uuid_pv, document_id_pv + ''').alias('page_views_users') + +promoted_content_schema = StructType( + [StructField("ad_id", IntegerType(), True), + StructField("document_id_promo", IntegerType(), True), + StructField("campaign_id", IntegerType(), True), + StructField("advertiser_id", IntegerType(), True)] + ) + +promoted_content_df = spark.read.schema(promoted_content_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"promoted_content.csv") \ + .withColumn('dummyPromotedContent', F.lit(1)).alias('promoted_content').cache() + +documents_meta_schema = StructType( + [StructField("document_id_doc", IntegerType(), True), + StructField("source_id", IntegerType(), True), + StructField("publisher_id", IntegerType(), True), + StructField("publish_time", TimestampType(), True)] + ) + +documents_meta_df = spark.read.schema(documents_meta_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"documents_meta.csv") \ + .withColumn('dummyDocumentsMeta', F.lit(1)).alias('documents_meta').cache() + +documents_meta_df.count() + +# Drop rows with empty "source_id" +documents_meta_df = documents_meta_df.dropna(subset="source_id") +documents_meta_df.count() + +source_publishers_df = documents_meta_df.select(["source_id", "publisher_id"]).dropDuplicates() +source_publishers_df.count() + +# get list of source_ids without 
publisher_id +rows_no_pub = source_publishers_df.filter("publisher_id is NULL") +source_ids_without_publisher = [row['source_id'] for row in rows_no_pub.collect()] +len(source_ids_without_publisher) + +# maximum value of publisher_id used so far +max_pub = max(source_publishers_df.select(["publisher_id"]).dropna().collect())['publisher_id'] +max_pub + +# rows filled with new publisher_ids +new_publishers = [(source, max_pub + 1 + nr) for nr, source in enumerate(source_ids_without_publisher)] +new_publishers_df = spark.createDataFrame(new_publishers, ("source_id", "publisher_id")) +new_publishers_df.take(10) + +# old and new publishers merged +fixed_source_publishers_df = source_publishers_df.dropna().union(new_publishers_df) +fixed_source_publishers_df.collect()[-30:] + +# update documents_meta with bew publishers +documents_meta_df = documents_meta_df.drop('publisher_id').join(fixed_source_publishers_df, on='source_id') +documents_meta_df.count() + +#Joining with Page Views to get traffic_source_pv +events_joined_df = events_df.join(documents_meta_df \ + .withColumnRenamed('source_id', 'source_id_doc_event') \ + .withColumnRenamed('publisher_id', 'publisher_doc_event') \ + .withColumnRenamed('publish_time', 'publish_time_doc_event'), + on=F.col("document_id_event") == F.col("document_id_doc"), how='left') \ + .join(page_views_df, + on=[F.col('uuid_event') == F.col('uuid_pv'), + F.col('document_id_event') == F.col('document_id_pv'), + F.col('platform_event') == F.col('platform_pv'), + F.col('geo_location_event') == F.col('geo_location_pv'), + F.col('day_event') == F.col('day_pv')], + how='left') \ + .alias('events').cache() + +documents_categories_schema = StructType( + [StructField("document_id_cat", IntegerType(), True), + StructField("category_id", IntegerType(), True), + StructField("confidence_level_cat", FloatType(), True)] + ) + +documents_categories_df = spark.read.schema(documents_categories_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"documents_categories.csv") \ + .alias('documents_categories').cache() + +documents_categories_grouped_df = documents_categories_df.groupBy('document_id_cat') \ + .agg(F.collect_list('category_id').alias('category_id_list'), + F.collect_list('confidence_level_cat').alias('confidence_level_cat_list')) \ + .withColumn('dummyDocumentsCategory', F.lit(1)) \ + .alias('documents_categories_grouped') + +documents_topics_schema = StructType( + [StructField("document_id_top", IntegerType(), True), + StructField("topic_id", IntegerType(), True), + StructField("confidence_level_top", FloatType(), True)] + ) + +documents_topics_df = spark.read.schema(documents_topics_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"documents_topics.csv") \ + .alias('documents_topics').cache() + +documents_topics_grouped_df = documents_topics_df.groupBy('document_id_top') \ + .agg(F.collect_list('topic_id').alias('topic_id_list'), + F.collect_list('confidence_level_top').alias('confidence_level_top_list')) \ + .withColumn('dummyDocumentsTopics', F.lit(1)) \ + .alias('documents_topics_grouped') + +documents_entities_schema = StructType( + [StructField("document_id_ent", IntegerType(), True), + StructField("entity_id", StringType(), True), + StructField("confidence_level_ent", FloatType(), True)] + ) + +documents_entities_df = spark.read.schema(documents_entities_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + 
.csv(DATA_BUCKET_FOLDER+"documents_entities.csv") \ + .alias('documents_entities').cache() + +documents_entities_grouped_df = documents_entities_df.groupBy('document_id_ent') \ + .agg(F.collect_list('entity_id').alias('entity_id_list'), + F.collect_list('confidence_level_ent').alias('confidence_level_ent_list')) \ + .withColumn('dummyDocumentsEntities', F.lit(1)) \ + .alias('documents_entities_grouped') + +clicks_train_schema = StructType( + [StructField("display_id", IntegerType(), True), + StructField("ad_id", IntegerType(), True), + StructField("clicked", IntegerType(), True)] + ) + +clicks_train_df = spark.read.schema(clicks_train_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER+"clicks_train.csv") \ + .withColumn('dummyClicksTrain', F.lit(1)).alias('clicks_train') + +clicks_train_joined_df = clicks_train_df \ + .join(promoted_content_df, on='ad_id', how='left') \ + .join(documents_meta_df, + on=F.col("promoted_content.document_id_promo") == F.col("documents_meta.document_id_doc"), + how='left') \ + .join(events_joined_df, on='display_id', how='left') +clicks_train_joined_df.createOrReplaceTempView('clicks_train_joined') + +if evaluation: + table_name = 'user_profiles_eval' +else: + table_name = 'user_profiles' + +user_profiles_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+table_name) \ + .withColumn('dummyUserProfiles', F.lit(1)).alias('user_profiles') + + +# # Spliting Train/validation set | Test set +if evaluation: + validation_set_exported_df = spark.read.parquet( + OUTPUT_BUCKET_FOLDER+"validation_set.parquet") \ + .alias('validation_set') + + validation_set_exported_df.select('display_id').distinct() \ + .createOrReplaceTempView("validation_display_ids") + + + validation_set_df = spark.sql(''' + SELECT * FROM clicks_train_joined t + WHERE EXISTS (SELECT display_id FROM validation_display_ids + WHERE display_id = t.display_id)''').alias('clicks') \ + .join(documents_categories_grouped_df, + on=F.col("document_id_promo") == F.col("documents_categories_grouped.document_id_cat"), + how='left') \ + .join(documents_topics_grouped_df, + on=F.col("document_id_promo") == F.col("documents_topics_grouped.document_id_top"), + how='left') \ + .join(documents_entities_grouped_df, + on=F.col("document_id_promo") == F.col("documents_entities_grouped.document_id_ent"), + how='left') \ + .join(documents_categories_grouped_df \ + .withColumnRenamed('category_id_list', 'doc_event_category_id_list') \ + .withColumnRenamed('confidence_level_cat_list', 'doc_event_confidence_level_cat_list') \ + .alias('documents_event_categories_grouped'), + on=F.col("document_id_event") == F.col("documents_event_categories_grouped.document_id_cat"), + how='left') \ + .join(documents_topics_grouped_df \ + .withColumnRenamed('topic_id_list', 'doc_event_topic_id_list') + .withColumnRenamed('confidence_level_top_list', 'doc_event_confidence_level_top_list') \ + .alias('documents_event_topics_grouped'), + on=F.col("document_id_event") == F.col("documents_event_topics_grouped.document_id_top"), + how='left') \ + .join(documents_entities_grouped_df \ + .withColumnRenamed('entity_id_list', 'doc_event_entity_id_list') + .withColumnRenamed('confidence_level_ent_list', 'doc_event_confidence_level_ent_list') \ + .alias('documents_event_entities_grouped'), + on=F.col("document_id_event") == F.col("documents_event_entities_grouped.document_id_ent"), + how='left') \ + .join(page_views_users_df, + on=[F.col("clicks.uuid_event") == F.col("page_views_users.uuid_pv"), + 
F.col("clicks.document_id_promo") == F.col("page_views_users.document_id_pv")], + how='left') + + #print("validation_set_df.count() =", validation_set_df.count()) + + #Added to validation set information about the event and the user for statistics of the error (avg ctr) + validation_set_ground_truth_df = validation_set_df.filter('clicked = 1') \ + .join(user_profiles_df, + on=[F.col("user_profiles.uuid") == F.col("uuid_event")], + how='left') \ + .withColumn('user_categories_count', list_len_udf('category_id_list')) \ + .withColumn('user_topics_count', list_len_udf('topic_id_list')) \ + .withColumn('user_entities_count', list_len_udf('entity_id_list')) \ + .select('display_id','ad_id','platform_event', 'day_event', 'timestamp_event', + 'geo_location_event', 'event_country', 'event_country_state', 'views', + 'user_categories_count', 'user_topics_count', 'user_entities_count') \ + .withColumnRenamed('ad_id','ad_id_gt') \ + .withColumnRenamed('views','user_views_count') \ + .cache() + #print("validation_set_ground_truth_df.count() =", validation_set_ground_truth_df.count()) + + train_set_df = spark.sql(''' + SELECT * FROM clicks_train_joined t + WHERE NOT EXISTS (SELECT display_id FROM validation_display_ids + WHERE display_id = t.display_id)''').cache() + print("train_set_df.count() =", train_set_df.count()) + + #validation_display_ids_df.groupBy("day_event").count().show() + +else: + + clicks_test_schema = StructType( + [StructField("display_id", IntegerType(), True), + StructField("ad_id", IntegerType(), True)] + ) + + clicks_test_df = spark.read.schema(clicks_test_schema) \ + .options(header='true', inferschema='false', nullValue='\\N') \ + .csv(DATA_BUCKET_FOLDER + "clicks_test.csv") \ + .withColumn('dummyClicksTest', F.lit(1)) \ + .withColumn('clicked', F.lit(-999)) \ + .alias('clicks_test') + + + test_set_df = clicks_test_df \ + .join(promoted_content_df, on='ad_id', how='left') \ + .join(documents_meta_df, + on=F.col("promoted_content.document_id_promo") == F.col("documents_meta.document_id_doc"), + how='left') \ + .join(documents_categories_grouped_df, + on=F.col("document_id_promo") == F.col("documents_categories_grouped.document_id_cat"), + how='left') \ + .join(documents_topics_grouped_df, + on=F.col("document_id_promo") == F.col("documents_topics_grouped.document_id_top"), + how='left') \ + .join(documents_entities_grouped_df, + on=F.col("document_id_promo") == F.col("documents_entities_grouped.document_id_ent"), + how='left') \ + .join(events_joined_df, on='display_id', how='left') \ + .join(documents_categories_grouped_df \ + .withColumnRenamed('category_id_list', 'doc_event_category_id_list') + .withColumnRenamed('confidence_level_cat_list', 'doc_event_confidence_level_cat_list') \ + .alias('documents_event_categories_grouped'), + on=F.col("document_id_event") == F.col("documents_event_categories_grouped.document_id_cat"), + how='left') \ + .join(documents_topics_grouped_df \ + .withColumnRenamed('topic_id_list', 'doc_event_topic_id_list') + .withColumnRenamed('confidence_level_top_list', 'doc_event_confidence_level_top_list') \ + .alias('documents_event_topics_grouped'), + on=F.col("document_id_event") == F.col("documents_event_topics_grouped.document_id_top"), + how='left') \ + .join(documents_entities_grouped_df \ + .withColumnRenamed('entity_id_list', 'doc_event_entity_id_list') + .withColumnRenamed('confidence_level_ent_list', 'doc_event_confidence_level_ent_list') \ + .alias('documents_event_entities_grouped'), + on=F.col("document_id_event") == 
F.col("documents_event_entities_grouped.document_id_ent"), + how='left') \ + .join(page_views_users_df, + on=[F.col("events.uuid_event") == F.col("page_views_users.uuid_pv"), + F.col("promoted_content.document_id_promo") == F.col("page_views_users.document_id_pv")], + how='left') + + train_set_df = clicks_train_joined_df.cache() + print("train_set_df.count() =", train_set_df.count()) + + +# # Training models +def is_null(value): + return value == None or len(str(value).strip()) == 0 + +LESS_SPECIAL_CAT_VALUE = 'less' +def get_category_field_values_counts(field, df, min_threshold=10): + category_counts = dict(list(filter(lambda x: not is_null(x[0]) and x[1] >= min_threshold, df.select(field).groupBy(field).count().rdd.map(lambda x: (x[0], x[1])).collect()))) + #Adding a special value to create a feature for values in this category that are less than min_threshold + category_counts[LESS_SPECIAL_CAT_VALUE] = -1 + return category_counts + + +# ## Building category values counters and indexers +event_country_values_counts = get_category_field_values_counts('event_country', events_df, min_threshold=10) +len(event_country_values_counts) +#All non-null categories: 230 + +event_country_state_values_counts = get_category_field_values_counts('event_country_state', events_df, min_threshold=10) +len(event_country_state_values_counts) + +event_geo_location_values_counts = get_category_field_values_counts('geo_location_event', events_df, min_threshold=10) +len(event_geo_location_values_counts) +#All non-null categories: 2988 + +doc_entity_id_values_counts = get_category_field_values_counts('entity_id', documents_entities_df, min_threshold=10) +len(doc_entity_id_values_counts) +#All non-null categories: 1326009 + + +# ## Processing average CTR by categories +def get_percentiles(df, field, quantiles_levels=None, max_error_rate=0.0): + if quantiles_levels == None: + quantiles_levels = np.arange(0.0, 1.1, 0.1).tolist() + quantiles = df.approxQuantile(field, quantiles_levels, max_error_rate) + return dict(zip(quantiles_levels, quantiles)) + +#REG = 10 +REG = 0 +ctr_udf = F.udf(lambda clicks, views: clicks / float(views + REG), FloatType()) + + +# ### Average CTR by ad_id +ad_id_popularity_df = train_set_df.groupby('ad_id').agg(F.sum('clicked').alias('clicks'), + F.count('*').alias('views')) \ + .withColumn('ctr', ctr_udf('clicks','views')) + +#ad_id_popularity_df.count() + +#get_percentiles(ad_id_popularity_df, 'clicks') + +#get_percentiles(ad_id_popularity_df, 'views') + +ad_id_popularity = ad_id_popularity_df.filter('views > 5').select('ad_id', 'ctr', 'views') \ + .rdd.map(lambda x: (x['ad_id'], (x['ctr'], x['views'], 1, 1))).collectAsMap() + +ad_id_popularity_broad = sc.broadcast(ad_id_popularity) + +list(ad_id_popularity.values())[:3] + +len(ad_id_popularity) + +#get_ad_id_ctr_udf = F.udf(lambda ad_id: ad_id_popularity[ad_id] if ad_id in ad_id_popularity else -1, FloatType()) + +ad_id_avg_ctr = sum(map(lambda x: x[0], ad_id_popularity.values())) / float(len(ad_id_popularity)) +ad_id_avg_ctr + +ad_id_weighted_avg_ctr = sum(map(lambda x: x[0]*x[1], ad_id_popularity.values())) / float(sum(map(lambda x: x[1], ad_id_popularity.values()))) +ad_id_weighted_avg_ctr + +ad_id_views_median = np.median(np.array(list(map(lambda x: x[1], ad_id_popularity.values())))) +ad_id_views_median + +ad_id_views_mean = sum(map(lambda x: x[1], ad_id_popularity.values())) / float(len(ad_id_popularity)) +ad_id_views_mean + + +# ### Average CTR by document_id (promoted_content) +document_id_popularity_df = train_set_df \ + 
.groupby('document_id_promo') \ + .agg(F.sum('clicked').alias('clicks'), F.count('*').alias('views'), + F.countDistinct('ad_id').alias('distinct_ad_ids')) \ + .withColumn('ctr', ctr_udf('clicks','views')) + +document_id_popularity = document_id_popularity_df.filter('views > 5') \ + .select('document_id_promo', 'ctr', 'views', 'distinct_ad_ids') \ + .rdd.map(lambda x: (x['document_id_promo'], + (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap() +len(document_id_popularity) + +document_id_popularity_broad = sc.broadcast(document_id_popularity) + +#document_id_popularity_df.count() + +#get_percentiles(document_id_popularity_df, 'clicks') + +#get_percentiles(document_id_popularity_df, 'views') + +document_id_avg_ctr = sum(map(lambda x: x[0], document_id_popularity.values())) / float(len(document_id_popularity)) +document_id_avg_ctr + +document_id_weighted_avg_ctr = sum(list(map(lambda x: x[0]*x[1], document_id_popularity.values()))) / float(sum(list(map(lambda x: x[1], document_id_popularity.values())))) +document_id_weighted_avg_ctr + +document_id_views_median = np.median(np.array(list(map(lambda x: x[1], document_id_popularity.values())))) +document_id_views_median + +document_id_views_mean = sum(map(lambda x: x[1], document_id_popularity.values())) / float(len(document_id_popularity)) +document_id_views_mean + + +# ### Average CTR by (doc_event, doc_ad) +doc_event_doc_ad_avg_ctr_df = train_set_df.groupBy('document_id_event', 'document_id_promo') \ + .agg(F.sum('clicked').alias('clicks'), + F.count('*').alias('views'), F.countDistinct('ad_id').alias('distinct_ad_ids')) \ + .withColumn('ctr', ctr_udf('clicks','views')) + +doc_event_doc_ad_avg_ctr = doc_event_doc_ad_avg_ctr_df.filter('views > 5') \ + .select('document_id_event', 'document_id_promo','ctr', 'views', 'distinct_ad_ids') \ + .rdd.map(lambda x: ((x['document_id_event'], x['document_id_promo']), + (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap() + +len(doc_event_doc_ad_avg_ctr) + +doc_event_doc_ad_avg_ctr_broad = sc.broadcast(doc_event_doc_ad_avg_ctr) + + +# ### Average CTR by country, source_id +source_id_by_country_popularity_df = train_set_df \ + .select('clicked', 'source_id', 'event_country', 'ad_id') \ + .groupby('event_country', 'source_id') \ + .agg(F.sum('clicked').alias('clicks'), F.count('*').alias('views'), + F.countDistinct('ad_id').alias('distinct_ad_ids')) \ + .withColumn('ctr', ctr_udf('clicks','views')) + +#source_id_popularity = source_id_popularity_df.filter('views > 100 and source_id is not null').select('source_id', 'ctr').rdd.collectAsMap() +source_id_by_country_popularity = source_id_by_country_popularity_df.filter('views > 5 and source_id is not null and event_country <> ""').select('event_country', 'source_id', 'ctr', 'views', 'distinct_ad_ids') .rdd.map(lambda x: ((x['event_country'], x['source_id']), (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap() +len(source_id_by_country_popularity) + +source_id_by_country_popularity_broad = sc.broadcast(source_id_by_country_popularity) + +source_id_by_country_avg_ctr = sum(map(lambda x: x[0], source_id_by_country_popularity.values())) / float(len(source_id_by_country_popularity)) +source_id_by_country_avg_ctr + +source_id_by_country_weighted_avg_ctr = sum(map(lambda x: x[0]*x[1], source_id_by_country_popularity.values())) / float(sum(map(lambda x: x[1], source_id_by_country_popularity.values()))) +source_id_by_country_weighted_avg_ctr + +source_id_by_country_views_median = np.median(np.array(list(map(lambda x: x[1], 
source_id_by_country_popularity.values())))) +source_id_by_country_views_median + +source_id_by_country_views_mean = sum(map(lambda x: x[1], source_id_by_country_popularity.values())) / float(len(source_id_by_country_popularity)) +source_id_by_country_views_mean + + +# ### Average CTR by source_id +source_id_popularity_df = train_set_df.select('clicked', 'source_id', 'ad_id') \ + .groupby('source_id').agg(F.sum('clicked').alias('clicks'), F.count('*').alias('views'), + F.countDistinct('ad_id').alias('distinct_ad_ids')) \ + .withColumn('ctr', ctr_udf('clicks','views')) + +source_id_popularity = source_id_popularity_df \ + .filter('views > 10 and source_id is not null') \ + .select('source_id', 'ctr', 'views', 'distinct_ad_ids') \ + .rdd.map(lambda x: (x['source_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))) \ + .collectAsMap() +len(source_id_popularity) + +source_id_popularity_broad = sc.broadcast(source_id_popularity) + +#source_id_popularity_df.count() + +#get_percentiles(source_id_popularity_df, 'clicks') + +#get_percentiles(source_id_popularity_df, 'views') + +#source_id_popularity = source_id_popularity_df.filter('views > 100 and source_id is not null').select('source_id', 'ctr').rdd.collectAsMap() + + +# ### Average CTR by publisher_id +publisher_popularity_df = train_set_df.select('clicked', 'publisher_id', 'ad_id') \ + .groupby('publisher_id').agg(F.sum('clicked').alias('clicks'), F.count('*').alias('views'), + F.countDistinct('ad_id').alias('distinct_ad_ids')) \ + .withColumn('ctr', ctr_udf('clicks','views')) + +publisher_popularity = publisher_popularity_df \ + .filter('views > 10 and publisher_id is not null') \ + .select('publisher_id', 'ctr', 'views', 'distinct_ad_ids') \ + .rdd.map(lambda x: (x['publisher_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))) \ + .collectAsMap() +len(publisher_popularity) + +publisher_popularity_broad = sc.broadcast(publisher_popularity) + +#publisher_popularity_df.count() +##863 + +#get_percentiles(publisher_popularity_df, 'clicks') + +#get_percentiles(publisher_popularity_df, 'views') + +#publisher_id_popularity = publisher_popularity_df.filter('views > 100 and publisher_id is not null').select('publisher_id', 'ctr').rdd.collectAsMap() +#len(publisher_id_popularity) +##639 + + +# ### Average CTR by advertiser_id +advertiser_id_popularity_df = train_set_df.select('clicked', 'advertiser_id', 'ad_id') \ + .groupby('advertiser_id').agg(F.sum('clicked').alias('clicks'), F.count('*').alias('views'), + F.countDistinct('ad_id').alias('distinct_ad_ids')) \ + .withColumn('ctr', ctr_udf('clicks','views')) + +advertiser_id_popularity = advertiser_id_popularity_df \ + .filter('views > 10 and advertiser_id is not null') \ + .select('advertiser_id', 'ctr', 'views', 'distinct_ad_ids') \ + .rdd.map(lambda x: (x['advertiser_id'], + (x['ctr'], x['views'], x['distinct_ad_ids'], 1))).collectAsMap() +len(advertiser_id_popularity) + +advertiser_id_popularity_broad = sc.broadcast(advertiser_id_popularity) + +#advertiser_id_popularity_df.count() +##4063 + +#get_percentiles(advertiser_id_popularity_df, 'clicks') + +#get_percentiles(advertiser_id_popularity_df, 'views') + +#advertiser_id_popularity = advertiser_id_popularity_df.filter('views > 100 and advertiser_id is not null').select('advertiser_id', 'ctr').rdd.collectAsMap() +#len(advertiser_id_popularity) +##3129 + + +# ### Average CTR by campaign_id +campaign_id_popularity_df = train_set_df.select('clicked', 'campaign_id', 'ad_id') \ + .groupby('campaign_id').agg(F.sum('clicked').alias('clicks'), 
F.count('*').alias('views'),
+                                F.countDistinct('ad_id').alias('distinct_ad_ids')) \
+    .withColumn('ctr', ctr_udf('clicks','views'))
+
+campaign_id_popularity = campaign_id_popularity_df \
+    .filter('views > 10 and campaign_id is not null') \
+    .select('campaign_id', 'ctr', 'views', 'distinct_ad_ids') \
+    .rdd.map(lambda x: (x['campaign_id'], (x['ctr'], x['views'], x['distinct_ad_ids'], 1))) \
+    .collectAsMap()
+len(campaign_id_popularity)
+
+campaign_id_popularity_broad = sc.broadcast(campaign_id_popularity)
+
+#campaign_id_popularity_df.count()
+##31390
+
+#get_percentiles(campaign_id_popularity_df, 'clicks')
+
+#get_percentiles(campaign_id_popularity_df, 'views')
+
+#campaign_id_popularity = campaign_id_popularity_df.filter('views > 100 and campaign_id is not null').select('campaign_id', 'ctr').rdd.collectAsMap()
+#len(campaign_id_popularity)
+##16097
+
+
+# ### Average CTR by category
+
+category_id_popularity_df = train_set_df.join(
+    documents_categories_df.alias('cat_local'),
+    on=F.col("document_id_promo") == F.col("cat_local.document_id_cat"), how='inner') \
+    .select('clicked', 'category_id', 'confidence_level_cat', 'ad_id') \
+    .groupby('category_id').agg(F.sum('clicked').alias('clicks'),
+                                F.count('*').alias('views'),
+                                F.mean('confidence_level_cat').alias('avg_confidence_level_cat'),
+                                F.countDistinct('ad_id').alias('distinct_ad_ids')) \
+    .withColumn('ctr', ctr_udf('clicks','views'))
+
+category_id_popularity = category_id_popularity_df.filter('views > 10') \
+    .select('category_id', 'ctr', 'views', 'avg_confidence_level_cat', 'distinct_ad_ids') \
+    .rdd.map(lambda x: (x['category_id'],
+                        (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_cat']))).collectAsMap()
+len(category_id_popularity)
+
+category_id_popularity_broad = sc.broadcast(category_id_popularity)
+
+list(category_id_popularity.values())[:10]
+
+np.median(np.array(list(map(lambda x: x[1], category_id_popularity.values()))))
+
+sum(map(lambda x: x[1], category_id_popularity.values())) / float(len(category_id_popularity))
+
+#There seems to be a hierarchy in the categories, judging by the pattern of the category id codes...
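+
+# --- Illustrative sketch only (not part of the original pipeline) -------------
+# Each popularity dict built above maps an id to a tuple laid out as
+# (ctr, views, distinct_ad_ids, avg_confidence_level). The toy dict below uses
+# made-up numbers and hypothetical names (_example_*) just to show how the
+# views-weighted average CTR probes above (e.g. document_id_weighted_avg_ctr)
+# are derived from that layout.
+_example_category_popularity = {
+    1403: (0.21, 1200, 35, 0.92),  # hypothetical category_id -> (ctr, views, distinct_ad_ids, avg_conf)
+    1702: (0.08, 300, 12, 0.85),
+}
+_example_weighted_avg_ctr = \
+    sum(ctr * views for ctr, views, _, _ in _example_category_popularity.values()) \
+    / float(sum(views for _, views, _, _ in _example_category_popularity.values()))
+print('example views-weighted avg CTR:', _example_weighted_avg_ctr)  # (0.21*1200 + 0.08*300) / 1500 = 0.184
+# -------------------------------------------------------------------------------
+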
+#category_id_popularity + + +# ### Average CTR by (country, category) +category_id_by_country_popularity_df = train_set_df \ + .join(documents_categories_df.alias('cat_local'), + on=F.col("document_id_promo") == F.col("cat_local.document_id_cat"), how='inner') \ + .select('clicked', 'category_id', 'confidence_level_cat', 'event_country', 'ad_id') \ + .groupby('event_country','category_id').agg(F.sum('clicked').alias('clicks'), + F.count('*').alias('views'), + F.mean('confidence_level_cat').alias('avg_confidence_level_cat'), + F.countDistinct('ad_id').alias('distinct_ad_ids')) \ + .withColumn('ctr', ctr_udf('clicks','views')) + +category_id_by_country_popularity = category_id_by_country_popularity_df \ + .filter('views > 10 and event_country <> ""') \ + .select('event_country', 'category_id', 'ctr', 'views', 'avg_confidence_level_cat', + 'distinct_ad_ids') \ + .rdd.map(lambda x: ((x['event_country'], x['category_id']), + (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_cat']))).collectAsMap() +len(category_id_by_country_popularity) + +category_id_by_country_popularity_broad = sc.broadcast(category_id_by_country_popularity) + + +# ### Average CTR by Topic +topic_id_popularity_df = train_set_df.join( + documents_topics_df.alias('top_local'), + on=F.col("document_id_promo") == F.col("top_local.document_id_top"), how='inner') \ + .select('clicked', 'topic_id', 'confidence_level_top', 'ad_id') \ + .groupby('topic_id').agg(F.sum('clicked').alias('clicks'), F.count('*').alias('views'), + F.mean('confidence_level_top').alias('avg_confidence_level_top'), + F.countDistinct('ad_id').alias('distinct_ad_ids')) \ + .withColumn('ctr', ctr_udf('clicks','views')) +topic_id_popularity = topic_id_popularity_df.filter('views > 10') \ + .select('topic_id', 'ctr', 'views', 'avg_confidence_level_top', 'distinct_ad_ids') \ + .rdd.map(lambda x: (x['topic_id'], \ + (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_top']))).collectAsMap() +len(topic_id_popularity) + +topic_id_popularity_broad = sc.broadcast(topic_id_popularity) + +sum(map(lambda x: x[1], topic_id_popularity.values())) / float(len(topic_id_popularity)) + +sum(map(lambda x: x[2]*x[1], topic_id_popularity.values())) / float(len(topic_id_popularity)) + + +# ### Average CTR by (country, topic) +topic_id_by_country_popularity_df = train_set_df.join( + documents_topics_df.alias('top_local'), + on=F.col("document_id_promo") == F.col("top_local.document_id_top"), how='inner') \ + .select('clicked', 'topic_id', 'confidence_level_top','event_country', 'ad_id') \ + .groupby('event_country','topic_id').agg(F.sum('clicked').alias('clicks'), + F.count('*').alias('views'), + F.mean('confidence_level_top').alias('avg_confidence_level_top'), + F.countDistinct('ad_id').alias('distinct_ad_ids')) \ + .withColumn('ctr', ctr_udf('clicks','views')) + +topic_id_id_by_country_popularity = topic_id_by_country_popularity_df \ + .filter('views > 10 and event_country <> ""') \ + .select('event_country', 'topic_id', 'ctr', 'views', 'avg_confidence_level_top', + 'distinct_ad_ids') \ + .rdd.map(lambda x: ((x['event_country'], x['topic_id']), + (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_top']))).collectAsMap() +len(topic_id_id_by_country_popularity) + +topic_id_id_by_country_popularity_broad = sc.broadcast(topic_id_id_by_country_popularity) + + +# ### Average CTR by Entity +entity_id_popularity_df = train_set_df.join( + documents_entities_df.alias('ent_local'), + on=F.col("document_id_promo") == 
F.col("ent_local.document_id_ent"), how='inner') \ + .select('clicked', 'entity_id', 'confidence_level_ent', 'ad_id') \ + .groupby('entity_id').agg(F.sum('clicked').alias('clicks'), F.count('*').alias('views'), + F.mean('confidence_level_ent').alias('avg_confidence_level_ent'), + F.countDistinct('ad_id').alias('distinct_ad_ids')) \ + .withColumn('ctr', ctr_udf('clicks','views')) + +entity_id_popularity = entity_id_popularity_df.filter('views > 5') \ + .select('entity_id', 'ctr', 'views', 'avg_confidence_level_ent', 'distinct_ad_ids') \ + .rdd.map(lambda x: (x['entity_id'], + (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_ent']))).collectAsMap() +len(entity_id_popularity) + +entity_id_popularity_broad = sc.broadcast(entity_id_popularity) + +np.median(np.array(list(map(lambda x: x[1], entity_id_popularity.values())))) + +sum(map(lambda x: x[1], entity_id_popularity.values())) / float(len(entity_id_popularity)) + + +# ### Average CTR by (country, entity) +entity_id_by_country_popularity_df = train_set_df.join( + documents_entities_df.alias('ent_local'), + on=F.col("document_id_promo") == F.col("ent_local.document_id_ent"), how='inner') \ + .select('clicked', 'entity_id', 'event_country', 'confidence_level_ent','ad_id') \ + .groupby('event_country','entity_id').agg(F.sum('clicked').alias('clicks'), + F.count('*').alias('views'), + F.mean('confidence_level_ent').alias('avg_confidence_level_ent'), + F.countDistinct('ad_id').alias('distinct_ad_ids')) \ + .withColumn('ctr', ctr_udf('clicks','views')) + +entity_id_by_country_popularity = entity_id_by_country_popularity_df \ + .filter('views > 5 and event_country <> ""') \ + .select('event_country', 'entity_id', 'ctr', 'views', 'avg_confidence_level_ent', + 'distinct_ad_ids') \ + .rdd.map(lambda x: ((x['event_country'], x['entity_id']), + (x['ctr'], x['views'], x['distinct_ad_ids'], x['avg_confidence_level_ent']))).collectAsMap() +len(entity_id_by_country_popularity) + +entity_id_by_country_popularity_broad = sc.broadcast(entity_id_by_country_popularity) + + +# ### Loading # docs by categories, topics, entities +#import cPickle +import _pickle as cPickle + +df_filenames_suffix = '' +if evaluation: + df_filenames_suffix = '_eval' + +with open(OUTPUT_BUCKET_FOLDER+'categories_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file: + categories_docs_counts = cPickle.load(input_file) +len(categories_docs_counts) + +with open(OUTPUT_BUCKET_FOLDER+'topics_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file: + topics_docs_counts = cPickle.load(input_file) +len(topics_docs_counts) + +with open(OUTPUT_BUCKET_FOLDER+'entities_docs_counts'+df_filenames_suffix+'.pickle', 'rb') as input_file: + entities_docs_counts = cPickle.load(input_file) +len(entities_docs_counts) + +documents_total = documents_meta_df.count() +documents_total + + +# ## Exploring Publish Time +publish_times_df = train_set_df.filter('publish_time is not null').select('document_id_promo','publish_time').distinct().select(F.col('publish_time').cast(IntegerType())) +publish_time_percentiles = get_percentiles(publish_times_df, 'publish_time', quantiles_levels=[0.5], max_error_rate=0.001) +publish_time_percentiles + +publish_time_median = int(publish_time_percentiles[0.5]) +datetime.datetime.utcfromtimestamp(publish_time_median) + +def get_days_diff(newer_timestamp, older_timestamp): + sec_diff = newer_timestamp - older_timestamp + days_diff = sec_diff / 60 / 60 / 24 + return days_diff + +def get_time_decay_factor(timestamp, timestamp_ref=None, 
alpha=0.001): + if timestamp_ref == None: + timestamp_ref = time.time() + + days_diff = get_days_diff(timestamp_ref, timestamp) + denominator = math.pow(1+alpha, days_diff) + if denominator != 0: + return 1.0 / denominator + else: + return 0.0 + +def convert_odd_timestamp(timestamp_ms_relative): + TIMESTAMP_DELTA=1465876799998 + return datetime.datetime.fromtimestamp((int(timestamp_ms_relative)+TIMESTAMP_DELTA)//1000) + +TIME_DECAY_ALPHA = 0.0005 + +ref_dates = [ + 1476714880, # 7 days + 1474727680, # 30 days + 1469370880, # 90 days + 1461508480, # 180 days + 1445697280, # 1 year + 1414161280 # 2 years +] + +for d in ref_dates: + print(datetime.datetime.utcfromtimestamp(d), get_time_decay_factor(d, alpha=TIME_DECAY_ALPHA)) + + +# ### Get local time +DEFAULT_TZ_EST = -4.0 + +def get_local_utc_bst_tz(event_country, event_country_state): + local_tz = DEFAULT_TZ_EST + if len(event_country) > 0: + if event_country in countries_utc_dst_broad.value: + local_tz = countries_utc_dst_broad.value[event_country] + if len(event_country_state)>2: + state = event_country_state[3:5] + if event_country == 'US': + if state in us_states_utc_dst_broad.value: + local_tz = us_states_utc_dst_broad.value[state] + elif event_country == 'CA': + if state in ca_countries_utc_dst_broad.value: + local_tz = ca_countries_utc_dst_broad.value[state] + return float(local_tz) + +hour_bins_dict = {'EARLY_MORNING': 0, + 'MORNING': 1, + 'MIDDAY': 2, + 'AFTERNOON': 3, + 'EVENING': 4, + 'NIGHT': 5} + +hour_bins_values = sorted(hour_bins_dict.values()) + +def get_hour_bin(hour): + if hour >= 5 and hour < 8: + hour_bin = hour_bins_dict['EARLY_MORNING'] + elif hour >= 8 and hour < 11: + hour_bin = hour_bins_dict['MORNING'] + elif hour >= 11 and hour < 14: + hour_bin = hour_bins_dict['MIDDAY'] + elif hour >= 14 and hour < 19: + hour_bin = hour_bins_dict['AFTERNOON'] + elif hour >= 19 and hour < 22: + hour_bin = hour_bins_dict['EVENING'] + else: + hour_bin = hour_bins_dict['NIGHT'] + return hour_bin + +def get_local_datetime(dt, event_country, event_country_state): + local_tz = get_local_utc_bst_tz(event_country, event_country_state) + tz_delta = local_tz - DEFAULT_TZ_EST + local_time = dt + datetime.timedelta(hours=tz_delta) + return local_time + +get_local_datetime(datetime.datetime.now(), 'US', 'US>CA') + +def is_weekend(dt): + return dt.weekday() >= 5 + +is_weekend(datetime.datetime(2016, 6, 14)) + + +# ## Average CTR functions +timestamp_ref = date_time_to_unix_epoch(datetime.datetime(2016, 6, 29, 3, 59, 59)) +decay_factor_default = get_time_decay_factor(publish_time_median, timestamp_ref, alpha=TIME_DECAY_ALPHA) +print("decay_factor_default", decay_factor_default) + +def get_confidence_sample_size(sample, max_for_reference=100000): + #Avoiding overflow for large sample size + if sample >= max_for_reference: + return 1.0 + + ref_log = math.log(1+max_for_reference, 2) #Curiosly reference in log with base 2 gives a slightly higher score, so I will keep + + return math.log(1+sample) / float(ref_log) + +for i in [0,0.5,1,2,3,4,5,10,20,30,100,200,300,1000,2000,3000,10000,20000,30000, 50000, 90000, 100000, 500000, 900000, 1000000, 2171607]: + print(i, get_confidence_sample_size(i)) + +def get_popularity(an_id, a_dict): + return (a_dict[an_id][0], get_confidence_sample_size(a_dict[an_id][1] / float(a_dict[an_id][2])) * a_dict[an_id][3]) if an_id in a_dict else (None, None) + +def get_weighted_avg_popularity_from_list(ids_list, confidence_ids_list, pop_dict): + pops = list(filter(lambda x: x[0][0]!=None, [(get_popularity(an_id, pop_dict), 
confidence) for an_id, confidence in zip(ids_list, confidence_ids_list)])) + #print("pops",pops) + if len(pops) > 0: + weighted_avg = sum(map(lambda x: x[0][0]*x[0][1]*x[1], pops)) / float(sum(map(lambda x: x[0][1]*x[1], pops))) + confidence = max(map(lambda x: x[0][1]*x[1], pops)) + return weighted_avg, confidence + else: + return None, None + +def get_weighted_avg_country_popularity_from_list(event_country, ids_list, confidence_ids_list, pop_dict): + pops = list(filter(lambda x: x[0][0]!=None, [(get_popularity((event_country, an_id), pop_dict), confidence) for an_id, confidence in zip(ids_list, confidence_ids_list)])) + + if len(pops) > 0: + weighted_avg = sum(map(lambda x: x[0][0]*x[0][1]*x[1], pops)) / float(sum(map(lambda x: x[0][1]*x[1], pops))) + confidence = max(map(lambda x: x[0][1]*x[1], pops)) + return weighted_avg, confidence + else: + return None, None + +def get_popularity_score(event_country, ad_id, document_id, source_id, + publisher_id, advertiser_id, campaign_id, document_id_event, + category_ids_by_doc, cat_confidence_level_by_doc, + topic_ids_by_doc, top_confidence_level_by_doc, + entity_ids_by_doc, ent_confidence_level_by_doc, + output_detailed_list=False): + probs = [] + + avg_ctr, confidence = get_popularity(ad_id, ad_id_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_ad_id', avg_ctr, confidence)) + + avg_ctr, confidence = get_popularity(document_id, document_id_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_document_id', avg_ctr, confidence)) + + avg_ctr, confidence = get_popularity((document_id_event, document_id), doc_event_doc_ad_avg_ctr_broad.value) + if avg_ctr != None: + probs.append(('pop_doc_event_doc_ad', avg_ctr, confidence)) + + + if source_id != -1: + avg_ctr = None + if event_country != '': + avg_ctr, confidence = get_popularity((event_country, source_id), source_id_by_country_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_source_id_country', avg_ctr, confidence)) + + avg_ctr, confidence = get_popularity(source_id, source_id_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_source_id', avg_ctr, confidence)) + + + if publisher_id != None: + avg_ctr, confidence = get_popularity(publisher_id, publisher_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_publisher_id', avg_ctr, confidence)) + + if advertiser_id != None: + avg_ctr, confidence = get_popularity(advertiser_id, advertiser_id_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_advertiser_id', avg_ctr, confidence)) + + if campaign_id != None: + avg_ctr, confidence = get_popularity(campaign_id, campaign_id_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_campain_id', avg_ctr, confidence)) + + if len(entity_ids_by_doc) > 0: + avg_ctr = None + if event_country != '': + avg_ctr, confidence = get_weighted_avg_country_popularity_from_list( + event_country, entity_ids_by_doc, ent_confidence_level_by_doc, + entity_id_by_country_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_entity_id_country', avg_ctr, confidence)) + + avg_ctr, confidence = get_weighted_avg_popularity_from_list( + entity_ids_by_doc, ent_confidence_level_by_doc, + entity_id_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_entity_id', avg_ctr, confidence)) + + + + if len(topic_ids_by_doc) > 0: + avg_ctr = None + if event_country != '': + avg_ctr, confidence = get_weighted_avg_country_popularity_from_list( + event_country, topic_ids_by_doc, top_confidence_level_by_doc, + 
topic_id_id_by_country_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_topic_id_country', avg_ctr, confidence)) + + avg_ctr, confidence = get_weighted_avg_popularity_from_list( + topic_ids_by_doc, top_confidence_level_by_doc, + topic_id_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_topic_id', avg_ctr, confidence)) + + + if len(category_ids_by_doc) > 0: + avg_ctr = None + if event_country != '': + avg_ctr, confidence = get_weighted_avg_country_popularity_from_list( + event_country, category_ids_by_doc, cat_confidence_level_by_doc, + category_id_by_country_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_category_id_country', avg_ctr, confidence)) + + avg_ctr, confidence = get_weighted_avg_popularity_from_list( + category_ids_by_doc, cat_confidence_level_by_doc, + category_id_popularity_broad.value) + if avg_ctr != None: + probs.append(('pop_category_id', avg_ctr, confidence)) + + #print("[get_popularity_score] probs", probs) + if output_detailed_list: + return probs + + else: + if len(probs) > 0: + #weighted_avg_probs_by_confidence = sum(map(lambda x: x[1] * math.log(1+x[2],2), probs)) / float(sum(map(lambda x: math.log(1+x[2],2), probs))) + weighted_avg_probs_by_confidence = sum(map(lambda x: x[1] * x[2], probs)) / float(sum(map(lambda x: x[2], probs))) + confidence = max(map(lambda x: x[2], probs)) + return weighted_avg_probs_by_confidence, confidence + else: + return None, None + + +# ## Content-Based similarity functions +def cosine_similarity_dicts(dict1, dict2): + dict1_norm = math.sqrt(sum([v**2 for v in dict1.values()])) + dict2_norm = math.sqrt(sum([v**2 for v in dict2.values()])) + + sum_common_aspects = 0.0 + intersections = 0 + for key in dict1: + if key in dict2: + sum_common_aspects += dict1[key] * dict2[key] + intersections += 1 + + return sum_common_aspects / (dict1_norm * dict2_norm), intersections + +def cosine_similarity_user_docs_aspects(user_aspect_profile, doc_aspect_ids, doc_aspects_confidence, aspect_docs_counts): + if user_aspect_profile==None or len(user_aspect_profile) == 0 or doc_aspect_ids == None or len(doc_aspect_ids) == 0: + return None, None + + doc_aspects = dict(zip(doc_aspect_ids, doc_aspects_confidence)) + doc_aspects_tfidf_confid = {} + for key in doc_aspects: + tf = 1.0 + idf = math.log(math.log(documents_total / float(aspect_docs_counts[key]))) + confidence = doc_aspects[key] + doc_aspects_tfidf_confid[key] = tf*idf * confidence + + user_aspects_tfidf_confid = {} + for key in user_aspect_profile: + tfidf = user_aspect_profile[key][0] + confidence = user_aspect_profile[key][1] + user_aspects_tfidf_confid[key] = tfidf * confidence + + similarity, intersections = cosine_similarity_dicts(doc_aspects_tfidf_confid, user_aspects_tfidf_confid) + + if intersections > 0: + #P(A intersect B)_intersections = P(A)^intersections * P(B)^intersections + random_error = math.pow(len(doc_aspects) / float(len(aspect_docs_counts)), + intersections) * math.pow(len(user_aspect_profile) / float(len(aspect_docs_counts)), + intersections) + confidence = 1.0 - random_error + else: + #P(A not intersect B) = 1 - P(A intersect B) + random_error = 1 - ((len(doc_aspects) / float(len(aspect_docs_counts))) * + (len(user_aspect_profile) / float(len(aspect_docs_counts)))) + + confidence = 1.0 - random_error + + return similarity, confidence + +def cosine_similarity_doc_event_doc_ad_aspects(doc_event_aspect_ids, doc_event_aspects_confidence, + doc_ad_aspect_ids, doc_ad_aspects_confidence, + aspect_docs_counts): + if 
doc_event_aspect_ids == None or len(doc_event_aspect_ids) == 0 \ + or doc_ad_aspect_ids == None or len(doc_ad_aspect_ids) == 0: + return None, None + + doc_event_aspects = dict(zip(doc_event_aspect_ids, doc_event_aspects_confidence)) + doc_event_aspects_tfidf_confid = {} + for key in doc_event_aspect_ids: + tf = 1.0 + idf = math.log(math.log(documents_total / float(aspect_docs_counts[key]))) + confidence = doc_event_aspects[key] + doc_event_aspects_tfidf_confid[key] = tf*idf * confidence + + doc_ad_aspects = dict(zip(doc_ad_aspect_ids, doc_ad_aspects_confidence)) + doc_ad_aspects_tfidf_confid = {} + for key in doc_ad_aspect_ids: + tf = 1.0 + idf = math.log(math.log(documents_total / float(aspect_docs_counts[key]))) + confidence = doc_ad_aspects[key] + doc_ad_aspects_tfidf_confid[key] = tf*idf * confidence + + similarity, intersections = cosine_similarity_dicts(doc_event_aspects_tfidf_confid, doc_ad_aspects_tfidf_confid) + + if intersections > 0: + #P(A intersect B)_intersections = P(A)^intersections * P(B)^intersections + random_error = math.pow(len(doc_event_aspect_ids) / float(len(aspect_docs_counts)), + intersections) * math.pow(len(doc_ad_aspect_ids) / float(len(aspect_docs_counts)), + intersections) + confidence = 1.0 - random_error + else: + #P(A not intersect B) = 1 - P(A intersect B) + random_error = 1 - ((len(doc_event_aspect_ids) / float(len(aspect_docs_counts))) * + (len(doc_ad_aspect_ids) / float(len(aspect_docs_counts)))) + + confidence = 1.0 - random_error + + return similarity, confidence + +def get_user_cb_interest_score(user_views_count, user_categories, user_topics, user_entities, + timestamp_event, category_ids_by_doc, cat_confidence_level_by_doc, + topic_ids_by_doc, top_confidence_level_by_doc, + entity_ids_by_doc, ent_confidence_level_by_doc, + output_detailed_list=False): + + #Content-Based + + sims = [] + + categories_similarity, cat_sim_confidence = cosine_similarity_user_docs_aspects(user_categories, category_ids_by_doc, cat_confidence_level_by_doc, categories_docs_counts) + if categories_similarity != None: + sims.append(('user_doc_ad_sim_categories', categories_similarity, cat_sim_confidence)) + + topics_similarity, top_sim_confidence = cosine_similarity_user_docs_aspects(user_topics, topic_ids_by_doc, top_confidence_level_by_doc, topics_docs_counts) + if topics_similarity != None: + sims.append(('user_doc_ad_sim_topics', topics_similarity, top_sim_confidence)) + + entities_similarity, entity_sim_confid = cosine_similarity_user_docs_aspects(user_entities, entity_ids_by_doc, ent_confidence_level_by_doc, entities_docs_counts) + if entities_similarity != None: + sims.append(('user_doc_ad_sim_entities', entities_similarity, entity_sim_confid)) + + if output_detailed_list: + return sims + else: + if len(sims) > 0: + weighted_avg_sim_by_confidence = sum(map(lambda x: x[1]*x[2], sims)) / float(sum(map(lambda x: x[2], sims))) + confidence = sum(map(lambda x: x[2], sims)) / float(len(sims)) + + #print("[get_user_cb_interest_score] sims: {} | Avg: {} - Confid: {}".format(sims, weighted_avg_sim_by_confidence, confidence)) + return weighted_avg_sim_by_confidence, confidence + else: + return None, None + +def get_doc_event_doc_ad_cb_similarity_score(doc_event_category_ids, doc_event_cat_confidence_levels, + doc_event_topic_ids, doc_event_top_confidence_levels, + doc_event_entity_ids, doc_event_ent_confidence_levels, + doc_ad_category_ids, doc_ad_cat_confidence_levels, + doc_ad_topic_ids, doc_ad_top_confidence_levels, + doc_ad_entity_ids, doc_ad_ent_confidence_levels, + 
output_detailed_list=False): + + #Content-Based + sims = [] + + + + categories_similarity, cat_sim_confidence = cosine_similarity_doc_event_doc_ad_aspects( + doc_event_category_ids, doc_event_cat_confidence_levels, + doc_ad_category_ids, doc_ad_cat_confidence_levels, + categories_docs_counts) + if categories_similarity != None: + sims.append(('doc_event_doc_ad_sim_categories', categories_similarity, cat_sim_confidence)) + + topics_similarity, top_sim_confidence = cosine_similarity_doc_event_doc_ad_aspects( + doc_event_topic_ids, doc_event_top_confidence_levels, + doc_ad_topic_ids, doc_ad_top_confidence_levels, + topics_docs_counts) + + if topics_similarity != None: + sims.append(('doc_event_doc_ad_sim_topics', topics_similarity, top_sim_confidence)) + + entities_similarity, entity_sim_confid = cosine_similarity_doc_event_doc_ad_aspects( + doc_event_entity_ids, doc_event_ent_confidence_levels, + doc_ad_entity_ids, doc_ad_ent_confidence_levels, + entities_docs_counts) + + if entities_similarity != None: + sims.append(('doc_event_doc_ad_sim_entities', entities_similarity, entity_sim_confid)) + + if output_detailed_list: + return sims + else: + if len(sims) > 0: + weighted_avg_sim_by_confidence = sum(map(lambda x: x[1]*x[2], sims)) / float(sum(map(lambda x: x[2], sims))) + confidence = sum(map(lambda x: x[2], sims)) / float(len(sims)) + + #print("[get_user_cb_interest_score] sims: {} | Avg: {} - Confid: {}".format(sims, weighted_avg_sim_by_confidence, confidence)) + return weighted_avg_sim_by_confidence, confidence + else: + return None, None + + +# # Feature Vector export +bool_feature_names = ['event_weekend', + 'user_has_already_viewed_doc'] + +int_feature_names = ['user_views', + 'ad_views', + 'doc_views', + 'doc_event_days_since_published', + 'doc_event_hour', + 'doc_ad_days_since_published', + ] + +float_feature_names = [ + 'pop_ad_id', + 'pop_ad_id_conf', + 'pop_ad_id_conf_multipl', + 'pop_document_id', + 'pop_document_id_conf', + 'pop_document_id_conf_multipl', + 'pop_publisher_id', + 'pop_publisher_id_conf', + 'pop_publisher_id_conf_multipl', + 'pop_advertiser_id', + 'pop_advertiser_id_conf', + 'pop_advertiser_id_conf_multipl', + 'pop_campain_id', + 'pop_campain_id_conf', + 'pop_campain_id_conf_multipl', + 'pop_doc_event_doc_ad', + 'pop_doc_event_doc_ad_conf', + 'pop_doc_event_doc_ad_conf_multipl', + 'pop_source_id', + 'pop_source_id_conf', + 'pop_source_id_conf_multipl', + 'pop_source_id_country', + 'pop_source_id_country_conf', + 'pop_source_id_country_conf_multipl', + 'pop_entity_id', + 'pop_entity_id_conf', + 'pop_entity_id_conf_multipl', + 'pop_entity_id_country', + 'pop_entity_id_country_conf', + 'pop_entity_id_country_conf_multipl', + 'pop_topic_id', + 'pop_topic_id_conf', + 'pop_topic_id_conf_multipl', + 'pop_topic_id_country', + 'pop_topic_id_country_conf', + 'pop_topic_id_country_conf_multipl', + 'pop_category_id', + 'pop_category_id_conf', + 'pop_category_id_conf_multipl', + 'pop_category_id_country', + 'pop_category_id_country_conf', + 'pop_category_id_country_conf_multipl', + 'user_doc_ad_sim_categories', + 'user_doc_ad_sim_categories_conf', + 'user_doc_ad_sim_categories_conf_multipl', + 'user_doc_ad_sim_topics', + 'user_doc_ad_sim_topics_conf', + 'user_doc_ad_sim_topics_conf_multipl', + 'user_doc_ad_sim_entities', + 'user_doc_ad_sim_entities_conf', + 'user_doc_ad_sim_entities_conf_multipl', + 'doc_event_doc_ad_sim_categories', + 'doc_event_doc_ad_sim_categories_conf', + 'doc_event_doc_ad_sim_categories_conf_multipl', + 'doc_event_doc_ad_sim_topics', + 
'doc_event_doc_ad_sim_topics_conf', + 'doc_event_doc_ad_sim_topics_conf_multipl', + 'doc_event_doc_ad_sim_entities', + 'doc_event_doc_ad_sim_entities_conf', + 'doc_event_doc_ad_sim_entities_conf_multipl' + ] + +TRAFFIC_SOURCE_FV='traffic_source' +EVENT_HOUR_FV='event_hour' +EVENT_COUNTRY_FV = 'event_country' +EVENT_COUNTRY_STATE_FV = 'event_country_state' +EVENT_GEO_LOCATION_FV = 'event_geo_location' +EVENT_PLATFORM_FV = 'event_platform' +AD_ADVERTISER_FV = 'ad_advertiser' +DOC_AD_SOURCE_ID_FV='doc_ad_source_id' +DOC_AD_PUBLISHER_ID_FV='doc_ad_publisher_id' +DOC_EVENT_SOURCE_ID_FV='doc_event_source_id' +DOC_EVENT_PUBLISHER_ID_FV='doc_event_publisher_id' +DOC_AD_CATEGORY_ID_FV='doc_ad_category_id' +DOC_AD_TOPIC_ID_FV='doc_ad_topic_id' +DOC_AD_ENTITY_ID_FV='doc_ad_entity_id' +DOC_EVENT_CATEGORY_ID_FV='doc_event_category_id' +DOC_EVENT_TOPIC_ID_FV='doc_event_topic_id' +DOC_EVENT_ENTITY_ID_FV='doc_event_entity_id' + + +# ### Configuring feature vector +category_feature_names_integral = ['ad_advertiser', + 'doc_ad_category_id_1', + 'doc_ad_category_id_2', + 'doc_ad_category_id_3', + 'doc_ad_topic_id_1', + 'doc_ad_topic_id_2', + 'doc_ad_topic_id_3', + 'doc_ad_entity_id_1', + 'doc_ad_entity_id_2', + 'doc_ad_entity_id_3', + 'doc_ad_entity_id_4', + 'doc_ad_entity_id_5', + 'doc_ad_entity_id_6', + 'doc_ad_publisher_id', + 'doc_ad_source_id', + 'doc_event_category_id_1', + 'doc_event_category_id_2', + 'doc_event_category_id_3', + 'doc_event_topic_id_1', + 'doc_event_topic_id_2', + 'doc_event_topic_id_3', + 'doc_event_entity_id_1', + 'doc_event_entity_id_2', + 'doc_event_entity_id_3', + 'doc_event_entity_id_4', + 'doc_event_entity_id_5', + 'doc_event_entity_id_6', + 'doc_event_publisher_id', + 'doc_event_source_id', + 'event_country', + 'event_country_state', + 'event_geo_location', + 'event_hour', + 'event_platform', + 'traffic_source'] + + +feature_vector_labels_integral = bool_feature_names + int_feature_names + float_feature_names + category_feature_names_integral + +feature_vector_labels_integral_dict = dict([(key, idx) for idx, key in enumerate(feature_vector_labels_integral)]) + +with open(OUTPUT_BUCKET_FOLDER+'feature_vector_labels_integral.txt', 'w') as output: + output.writelines('\n'.join(feature_vector_labels_integral)) + +def set_feature_vector_cat_value(field_name, field_value, feature_vector): + if not is_null(field_value) and str(field_value) != '-1': + feature_name = get_ohe_feature_name(field_name, field_value) + if feature_name in feature_vector_labels_dict: + feature_idx = feature_vector_labels_dict[feature_name] + else: + #Unpopular category value + feature_idx = feature_vector_labels_dict[get_ohe_feature_name(field_name, LESS_SPECIAL_CAT_VALUE)] + + feature_vector[feature_idx] = float(1) + +def set_feature_vector_cat_values(field_name, field_values, feature_vector): + for field_value in field_values: + set_feature_vector_cat_value(field_name, field_value, feature_vector) + +def get_ad_feature_vector(user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, + event_country, event_country_state, + ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event, + geo_location_event, + doc_event_source_id, doc_event_publisher_id, doc_event_publish_time, + traffic_source_pv, advertiser_id, publisher_id, + campaign_id, document_id_event, + doc_ad_category_ids, doc_ad_cat_confidence_levels, + doc_ad_topic_ids, doc_ad_top_confidence_levels, + doc_ad_entity_ids, doc_ad_ent_confidence_levels, + doc_event_category_ids, 
doc_event_cat_confidence_levels, + doc_event_topic_ids, doc_event_top_confidence_levels, + doc_event_entity_ids, doc_event_ent_confidence_levels): + + try: + + feature_vector = {} + + if user_views_count != None: + feature_vector[feature_vector_labels_dict['user_views']] = float(user_views_count) + + if user_doc_ids_viewed != None: + feature_vector[feature_vector_labels_dict['user_has_already_viewed_doc']] = float(document_id in user_doc_ids_viewed) + + if ad_id in ad_id_popularity_broad.value: + feature_vector[feature_vector_labels_dict['ad_views']] = float(ad_id_popularity_broad.value[ad_id][1]) + + if document_id in document_id_popularity_broad.value: + feature_vector[feature_vector_labels_dict['doc_views']] = float(document_id_popularity_broad.value[document_id][1]) + + if timestamp_event > -1: + dt_timestamp_event = convert_odd_timestamp(timestamp_event) + if doc_ad_publish_time != None: + delta_days = (dt_timestamp_event - doc_ad_publish_time).days + if delta_days >= 0 and delta_days <= 365*10: #10 years + feature_vector[feature_vector_labels_dict['doc_ad_days_since_published']] = float(delta_days) + + if doc_event_publish_time != None: + delta_days = (dt_timestamp_event - doc_event_publish_time).days + if delta_days >= 0 and delta_days <= 365*10: #10 years + feature_vector[feature_vector_labels_dict['doc_event_days_since_published']] = float(delta_days) + + + #Local period of the day (hours) + dt_local_timestamp_event = get_local_datetime(dt_timestamp_event, event_country, event_country_state) + local_hour_bin = get_hour_bin(dt_local_timestamp_event.hour) + feature_vector[feature_vector_labels_dict['doc_event_hour']] = float(local_hour_bin) #Hour for Decision Trees + set_feature_vector_cat_value(EVENT_HOUR_FV, local_hour_bin, feature_vector) #Period of day for FFM + + #Weekend + weekend = int(is_weekend(dt_local_timestamp_event)) + feature_vector[feature_vector_labels_dict['event_weekend']] = float(weekend) + + conf_field_suffix = '_conf' + conf_multiplied_field_suffix = '_conf_multipl' + + #Setting Popularity fields + pop_scores = get_popularity_score(event_country, ad_id, document_id, source_id, + publisher_id, advertiser_id, campaign_id, document_id_event, + doc_ad_category_ids, doc_ad_cat_confidence_levels, + doc_ad_topic_ids, doc_ad_top_confidence_levels, + doc_ad_entity_ids, doc_ad_ent_confidence_levels, + output_detailed_list=True) + + + + for score in pop_scores: + feature_vector[feature_vector_labels_dict[score[0]]] = score[1] + feature_vector[feature_vector_labels_dict[score[0]+conf_field_suffix]] = score[2] + feature_vector[feature_vector_labels_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2] + + #Setting User-Doc_ad CB Similarity fields + user_doc_ad_cb_sim_scores = get_user_cb_interest_score(user_views_count, user_categories, user_topics, user_entities, + timestamp_event, + doc_ad_category_ids, doc_ad_cat_confidence_levels, + doc_ad_topic_ids, doc_ad_top_confidence_levels, + doc_ad_entity_ids, doc_ad_ent_confidence_levels, + output_detailed_list=True) + + for score in user_doc_ad_cb_sim_scores: + feature_vector[feature_vector_labels_dict[score[0]]] = score[1] + feature_vector[feature_vector_labels_dict[score[0]+conf_field_suffix]] = score[2] + feature_vector[feature_vector_labels_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2] + + #Setting Doc_event-doc_ad CB Similarity fields + doc_event_doc_ad_cb_sim_scores = get_doc_event_doc_ad_cb_similarity_score( + doc_event_category_ids, doc_event_cat_confidence_levels, + doc_event_topic_ids, 
doc_event_top_confidence_levels,
+            doc_event_entity_ids, doc_event_ent_confidence_levels,
+            doc_ad_category_ids, doc_ad_cat_confidence_levels,
+            doc_ad_topic_ids, doc_ad_top_confidence_levels,
+            doc_ad_entity_ids, doc_ad_ent_confidence_levels,
+            output_detailed_list=True)
+
+        for score in doc_event_doc_ad_cb_sim_scores:
+            feature_vector[feature_vector_labels_dict[score[0]]] = score[1]
+            feature_vector[feature_vector_labels_dict[score[0]+conf_field_suffix]] = score[2]
+            feature_vector[feature_vector_labels_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2]
+
+        # -1 to traffic_source and platform_event
+        if traffic_source_pv != None:
+            feature_vector[feature_vector_labels_dict[TRAFFIC_SOURCE_FV]] = int(traffic_source_pv - 1)
+        if platform_event != None:
+            feature_vector[feature_vector_labels_dict[EVENT_PLATFORM_FV]] = int(platform_event - 1)
+
+        # set_feature_vector_cat_value(TRAFFIC_SOURCE_FV, traffic_source_pv, feature_vector)
+        set_feature_vector_cat_value(EVENT_COUNTRY_FV, event_country, feature_vector)
+        set_feature_vector_cat_value(EVENT_COUNTRY_STATE_FV, event_country_state, feature_vector)
+        set_feature_vector_cat_value(EVENT_GEO_LOCATION_FV, geo_location_event, feature_vector)
+        # set_feature_vector_cat_value(EVENT_PLATFORM_FV, platform_event, feature_vector)
+        set_feature_vector_cat_value(AD_ADVERTISER_FV, advertiser_id, feature_vector)
+        set_feature_vector_cat_value(DOC_AD_SOURCE_ID_FV, source_id, feature_vector)
+        set_feature_vector_cat_value(DOC_AD_PUBLISHER_ID_FV, publisher_id, feature_vector)
+        set_feature_vector_cat_value(DOC_EVENT_SOURCE_ID_FV, doc_event_source_id, feature_vector)
+        set_feature_vector_cat_value(DOC_EVENT_PUBLISHER_ID_FV, doc_event_publisher_id, feature_vector)
+        set_feature_vector_cat_values(DOC_AD_CATEGORY_ID_FV, doc_ad_category_ids, feature_vector)
+        set_feature_vector_cat_values(DOC_AD_TOPIC_ID_FV, doc_ad_topic_ids, feature_vector)
+        set_feature_vector_cat_values(DOC_AD_ENTITY_ID_FV, doc_ad_entity_ids, feature_vector)
+        set_feature_vector_cat_values(DOC_EVENT_CATEGORY_ID_FV, doc_event_category_ids, feature_vector)
+        set_feature_vector_cat_values(DOC_EVENT_TOPIC_ID_FV, doc_event_topic_ids, feature_vector)
+        set_feature_vector_cat_values(DOC_EVENT_ENTITY_ID_FV, doc_event_entity_ids, feature_vector)
+
+        #Creating a dummy column as the last column because xgboost has a problem if the last column is undefined for all rows,
+        #saying that dimensions of data and feature_names do not match
+        #feature_vector[feature_vector_labels_dict[DUMMY_FEATURE_COLUMN]] = float(0)
+
+        #Ensuring that all elements are floats for compatibility with UDF output (ArrayType(FloatType()))
+        #feature_vector = list([float(x) for x in feature_vector])
+
+    except Exception as e:
+        raise Exception("[get_ad_feature_vector] ERROR PROCESSING FEATURE VECTOR! 
Params: {}" + .format([user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, + event_country, event_country_state, + ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event, + geo_location_event, + doc_event_source_id, doc_event_publisher_id, doc_event_publish_time, + traffic_source_pv, advertiser_id, publisher_id, + campaign_id, document_id_event, + doc_ad_category_ids, doc_ad_cat_confidence_levels, + doc_ad_topic_ids, doc_ad_top_confidence_levels, + doc_ad_entity_ids, doc_ad_ent_confidence_levels, + doc_event_category_ids, doc_event_cat_confidence_levels, + doc_event_topic_ids, doc_event_top_confidence_levels, + doc_event_entity_ids, doc_event_ent_confidence_levels]), + e) + + return SparseVector(len(feature_vector_labels_dict), feature_vector) + +get_ad_feature_vector_udf = F.udf( + lambda user_doc_ids_viewed, user_views_count, user_categories, user_topics, + user_entities, event_country, event_country_state, ad_id, document_id, + source_id, doc_ad_publish_time, timestamp_event, platform_event, + geo_location_event, + doc_event_source_id, doc_event_publisher_id, doc_event_publish_time, + traffic_source_pv, advertiser_id, publisher_id, + campaign_id, document_id_event, + category_ids_by_doc, cat_confidence_level_by_doc, + topic_ids_by_doc, top_confidence_level_by_doc, + entity_ids_by_doc, ent_confidence_level_by_doc, + doc_event_category_id_list, doc_event_confidence_level_cat_list, + doc_event_topic_id_list, doc_event_confidence_level_top, + doc_event_entity_id_list, doc_event_confidence_level_ent: \ + get_ad_feature_vector(user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, + event_country, event_country_state, + ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event, + geo_location_event, + doc_event_source_id, doc_event_publisher_id, doc_event_publish_time, + traffic_source_pv, advertiser_id, publisher_id, + campaign_id, document_id_event, + category_ids_by_doc, cat_confidence_level_by_doc, + topic_ids_by_doc, top_confidence_level_by_doc, + entity_ids_by_doc, ent_confidence_level_by_doc, + doc_event_category_id_list, doc_event_confidence_level_cat_list, + doc_event_topic_id_list, doc_event_confidence_level_top, + doc_event_entity_id_list, doc_event_confidence_level_ent), + VectorUDT()) + + +# ### Building feature vectors +def set_feature_vector_cat_value_integral(field_name, field_value, feature_vector): + if not is_null(field_value): #and str(field_value) != '-1': + feature_vector[feature_vector_labels_integral_dict[field_name]] = float(field_value) + +def set_feature_vector_cat_top_multi_values_integral( + field_name, values, confidences, feature_vector, top=5): + top_values = list(filter(lambda z: z != -1, + map(lambda y: y[0], sorted(zip(values, confidences), key=lambda x: -x[1]))))[:top] + for idx, field_value in list(enumerate(top_values)): + set_feature_vector_cat_value_integral( + '{}_{}'.format(field_name, idx+1), field_value, feature_vector) + +def get_ad_feature_vector_integral( + user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities, + event_country, event_country_state, + ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event, + geo_location_event, + doc_event_source_id, doc_event_publisher_id, doc_event_publish_time, + traffic_source_pv, advertiser_id, publisher_id, + campaign_id, document_id_event, + doc_ad_category_ids, doc_ad_cat_confidence_levels, + doc_ad_topic_ids, 
doc_ad_top_confidence_levels, + doc_ad_entity_ids, doc_ad_ent_confidence_levels, + doc_event_category_ids, doc_event_cat_confidence_levels, + doc_event_topic_ids, doc_event_top_confidence_levels, + doc_event_entity_ids, doc_event_ent_confidence_levels): + + try: + + feature_vector = {} + + if user_views_count != None: + feature_vector[feature_vector_labels_integral_dict['user_views']] = float(user_views_count) + + if user_doc_ids_viewed != None: + feature_vector[feature_vector_labels_integral_dict['user_has_already_viewed_doc']] = float(document_id in user_doc_ids_viewed) + + if ad_id in ad_id_popularity_broad.value: + feature_vector[feature_vector_labels_integral_dict['ad_views']] = float(ad_id_popularity_broad.value[ad_id][1]) + + if document_id in document_id_popularity_broad.value: + feature_vector[feature_vector_labels_integral_dict['doc_views']] = float(document_id_popularity_broad.value[document_id][1]) + + if timestamp_event > -1: + dt_timestamp_event = convert_odd_timestamp(timestamp_event) + if doc_ad_publish_time != None: + delta_days = (dt_timestamp_event - doc_ad_publish_time).days + if delta_days >= 0 and delta_days <= 365*10: #10 years + feature_vector[feature_vector_labels_integral_dict['doc_ad_days_since_published']] = float(delta_days) + + if doc_event_publish_time != None: + delta_days = (dt_timestamp_event - doc_event_publish_time).days + if delta_days >= 0 and delta_days <= 365*10: #10 years + feature_vector[feature_vector_labels_integral_dict['doc_event_days_since_published']] = float(delta_days) + + + #Local period of the day (hours) + dt_local_timestamp_event = get_local_datetime(dt_timestamp_event, event_country, event_country_state) + local_hour_bin = get_hour_bin(dt_local_timestamp_event.hour) + feature_vector[feature_vector_labels_integral_dict['doc_event_hour']] = float(local_hour_bin) #Hour for Decision Trees + set_feature_vector_cat_value_integral(EVENT_HOUR_FV, local_hour_bin, feature_vector) #Period of day for FFM + + #Weekend + weekend = int(is_weekend(dt_local_timestamp_event)) + feature_vector[feature_vector_labels_integral_dict['event_weekend']] = float(weekend) + + + conf_field_suffix = '_conf' + conf_multiplied_field_suffix = '_conf_multipl' + + #Setting Popularity fields + pop_scores = get_popularity_score(event_country, ad_id, document_id, source_id, + publisher_id, advertiser_id, campaign_id, document_id_event, + doc_ad_category_ids, doc_ad_cat_confidence_levels, + doc_ad_topic_ids, doc_ad_top_confidence_levels, + doc_ad_entity_ids, doc_ad_ent_confidence_levels, + output_detailed_list=True) + + + + for score in pop_scores: + feature_vector[feature_vector_labels_integral_dict[score[0]]] = score[1] + feature_vector[feature_vector_labels_integral_dict[score[0]+conf_field_suffix]] = score[2] + feature_vector[feature_vector_labels_integral_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2] + + #Setting User-Doc_ad CB Similarity fields + user_doc_ad_cb_sim_scores = get_user_cb_interest_score( + user_views_count, user_categories, user_topics, user_entities, + timestamp_event, + doc_ad_category_ids, doc_ad_cat_confidence_levels, + doc_ad_topic_ids, doc_ad_top_confidence_levels, + doc_ad_entity_ids, doc_ad_ent_confidence_levels, + output_detailed_list=True) + + for score in user_doc_ad_cb_sim_scores: + feature_vector[feature_vector_labels_integral_dict[score[0]]] = score[1] + feature_vector[feature_vector_labels_integral_dict[score[0]+conf_field_suffix]] = score[2] + 
feature_vector[feature_vector_labels_integral_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2] + + #Setting Doc_event-doc_ad CB Similarity fields + doc_event_doc_ad_cb_sim_scores = get_doc_event_doc_ad_cb_similarity_score( + doc_event_category_ids, doc_event_cat_confidence_levels, + doc_event_topic_ids, doc_event_top_confidence_levels, + doc_event_entity_ids, doc_event_ent_confidence_levels, + doc_ad_category_ids, doc_ad_cat_confidence_levels, + doc_ad_topic_ids, doc_ad_top_confidence_levels, + doc_ad_entity_ids, doc_ad_ent_confidence_levels, + output_detailed_list=True) + + for score in doc_event_doc_ad_cb_sim_scores: + feature_vector[feature_vector_labels_integral_dict[score[0]]] = score[1] + feature_vector[feature_vector_labels_integral_dict[score[0]+conf_field_suffix]] = score[2] + feature_vector[feature_vector_labels_integral_dict[score[0]+conf_multiplied_field_suffix]] = score[1] * score[2] + + #Process code for event_country + if event_country in event_country_values_counts: + event_country_code = event_country_values_counts[event_country] + else: + event_country_code = event_country_values_counts[LESS_SPECIAL_CAT_VALUE] + set_feature_vector_cat_value_integral(EVENT_COUNTRY_FV, event_country_code, feature_vector) + + #Process code for event_country_state + if event_country_state in event_country_state_values_counts: + event_country_state_code = event_country_state_values_counts[event_country_state] + else: + event_country_state_code = event_country_state_values_counts[LESS_SPECIAL_CAT_VALUE] + set_feature_vector_cat_value_integral(EVENT_COUNTRY_STATE_FV, event_country_state_code, feature_vector) + + #Process code for geo_location_event + if geo_location_event in event_geo_location_values_counts: + geo_location_event_code = event_geo_location_values_counts[geo_location_event] + else: + geo_location_event_code = event_geo_location_values_counts[LESS_SPECIAL_CAT_VALUE] + + # -1 to traffic_source and platform_event + if traffic_source_pv != None: + feature_vector[feature_vector_labels_integral_dict[TRAFFIC_SOURCE_FV]] = int(traffic_source_pv - 1) + if platform_event != None: + feature_vector[feature_vector_labels_integral_dict[EVENT_PLATFORM_FV]] = int(platform_event - 1) + + set_feature_vector_cat_value_integral(EVENT_GEO_LOCATION_FV, geo_location_event_code, feature_vector) + + # set_feature_vector_cat_value_integral(TRAFFIC_SOURCE_FV, traffic_source_pv - 1, feature_vector) + # set_feature_vector_cat_value_integral(EVENT_PLATFORM_FV, platform_event - 1, feature_vector) + set_feature_vector_cat_value_integral(AD_ADVERTISER_FV, advertiser_id, feature_vector) + set_feature_vector_cat_value_integral(DOC_AD_SOURCE_ID_FV, source_id, feature_vector) + set_feature_vector_cat_value_integral(DOC_AD_PUBLISHER_ID_FV, publisher_id, feature_vector) + set_feature_vector_cat_value_integral(DOC_EVENT_SOURCE_ID_FV, doc_event_source_id, feature_vector) + set_feature_vector_cat_value_integral(DOC_EVENT_PUBLISHER_ID_FV, doc_event_publisher_id, feature_vector) + + set_feature_vector_cat_top_multi_values_integral(DOC_AD_CATEGORY_ID_FV, doc_ad_category_ids, doc_ad_cat_confidence_levels, feature_vector, top=3) + set_feature_vector_cat_top_multi_values_integral(DOC_AD_TOPIC_ID_FV, doc_ad_topic_ids, doc_ad_top_confidence_levels, feature_vector, top=3) + + set_feature_vector_cat_top_multi_values_integral(DOC_EVENT_CATEGORY_ID_FV, doc_event_category_ids, doc_event_cat_confidence_levels, feature_vector, top=3) + set_feature_vector_cat_top_multi_values_integral(DOC_EVENT_TOPIC_ID_FV, 
doc_event_topic_ids, doc_event_top_confidence_levels, feature_vector, top=3)
+
+        #Process codes for doc_ad_entity_ids
+        doc_ad_entity_ids_codes = [doc_entity_id_values_counts[x]
+                                   if x in doc_entity_id_values_counts
+                                   else doc_entity_id_values_counts[LESS_SPECIAL_CAT_VALUE]
+                                   for x in doc_ad_entity_ids]
+        set_feature_vector_cat_top_multi_values_integral(DOC_AD_ENTITY_ID_FV, doc_ad_entity_ids_codes, doc_ad_ent_confidence_levels, feature_vector, top=6)
+
+
+        #Process codes for doc_event_entity_ids
+        doc_event_entity_ids_codes = [doc_entity_id_values_counts[x]
+                                      if x in doc_entity_id_values_counts
+                                      else doc_entity_id_values_counts[LESS_SPECIAL_CAT_VALUE]
+                                      for x in doc_event_entity_ids]
+        set_feature_vector_cat_top_multi_values_integral(DOC_EVENT_ENTITY_ID_FV, doc_event_entity_ids_codes, doc_event_ent_confidence_levels, feature_vector, top=6)
+
+        #Creating a dummy column as the last column because xgboost has a problem if the last column is undefined for all rows,
+        #saying that dimensions of data and feature_names do not match
+        #feature_vector[feature_vector_labels_dict[DUMMY_FEATURE_COLUMN]] = float(0)
+
+        #Ensuring that all elements are floats for compatibility with UDF output (ArrayType(FloatType()))
+        #feature_vector = list([float(x) for x in feature_vector])
+
+    except Exception as e:
+        raise Exception("[get_ad_feature_vector_integral] ERROR PROCESSING FEATURE VECTOR! Params: {}" \
+                        .format([user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities,
+                                 event_country, event_country_state,
+                                 ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
+                                 geo_location_event,
+                                 doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
+                                 traffic_source_pv, advertiser_id, publisher_id,
+                                 campaign_id, document_id_event,
+                                 doc_ad_category_ids, doc_ad_cat_confidence_levels,
+                                 doc_ad_topic_ids, doc_ad_top_confidence_levels,
+                                 doc_ad_entity_ids, doc_ad_ent_confidence_levels,
+                                 doc_event_category_ids, doc_event_cat_confidence_levels,
+                                 doc_event_topic_ids, doc_event_top_confidence_levels,
+                                 doc_event_entity_ids, doc_event_ent_confidence_levels]),
+                        e)
+
+    return SparseVector(len(feature_vector_labels_integral_dict), feature_vector)
+
+get_ad_feature_vector_integral_udf = F.udf(
+    lambda user_doc_ids_viewed, user_views_count, user_categories, user_topics,
+    user_entities, event_country, event_country_state, ad_id, document_id, source_id,
+    doc_ad_publish_time, timestamp_event, platform_event,
+    geo_location_event,
+    doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
+    traffic_source_pv, advertiser_id, publisher_id,
+    campaign_id, document_id_event,
+    category_ids_by_doc, cat_confidence_level_by_doc,
+    topic_ids_by_doc, top_confidence_level_by_doc,
+    entity_ids_by_doc, ent_confidence_level_by_doc,
+    doc_event_category_id_list, doc_event_confidence_level_cat_list,
+    doc_event_topic_id_list, doc_event_confidence_level_top,
+    doc_event_entity_id_list, doc_event_confidence_level_ent: \
+    get_ad_feature_vector_integral(user_doc_ids_viewed, user_views_count, user_categories, user_topics, user_entities,
+                                   event_country, event_country_state,
+                                   ad_id, document_id, source_id, doc_ad_publish_time, timestamp_event, platform_event,
+                                   geo_location_event,
+                                   doc_event_source_id, doc_event_publisher_id, doc_event_publish_time,
+                                   traffic_source_pv, advertiser_id, publisher_id,
+                                   campaign_id, document_id_event,
+                                   category_ids_by_doc, cat_confidence_level_by_doc,
+                                   topic_ids_by_doc, top_confidence_level_by_doc,
+                                   entity_ids_by_doc, 
ent_confidence_level_by_doc, + doc_event_category_id_list, doc_event_confidence_level_cat_list, + doc_event_topic_id_list, doc_event_confidence_level_top, + doc_event_entity_id_list, doc_event_confidence_level_ent), + VectorUDT()) + +# ## Export Train set feature vectors +train_set_enriched_df = train_set_df \ + .join(documents_categories_grouped_df, + on=F.col("document_id_promo") == F.col("documents_categories_grouped.document_id_cat"), + how='left') \ + .join(documents_topics_grouped_df, + on=F.col("document_id_promo") == F.col("documents_topics_grouped.document_id_top"), + how='left') \ + .join(documents_entities_grouped_df, + on=F.col("document_id_promo") == F.col("documents_entities_grouped.document_id_ent"), + how='left') \ + .join(documents_categories_grouped_df \ + .withColumnRenamed('category_id_list', 'doc_event_category_id_list') + .withColumnRenamed('confidence_level_cat_list', 'doc_event_confidence_level_cat_list') \ + .alias('documents_event_categories_grouped'), + on=F.col("document_id_event") == F.col("documents_event_categories_grouped.document_id_cat"), + how='left') \ + .join(documents_topics_grouped_df \ + .withColumnRenamed('topic_id_list', 'doc_event_topic_id_list') + .withColumnRenamed('confidence_level_top_list', 'doc_event_confidence_level_top_list') \ + .alias('documents_event_topics_grouped'), + on=F.col("document_id_event") == F.col("documents_event_topics_grouped.document_id_top"), + how='left') \ + .join(documents_entities_grouped_df \ + .withColumnRenamed('entity_id_list', 'doc_event_entity_id_list') + .withColumnRenamed('confidence_level_ent_list', 'doc_event_confidence_level_ent_list') \ + .alias('documents_event_entities_grouped'), + on=F.col("document_id_event") == F.col("documents_event_entities_grouped.document_id_ent"), + how='left') \ + .select('display_id','uuid_event','event_country','event_country_state','platform_event', + 'source_id_doc_event', 'publisher_doc_event','publish_time_doc_event', + 'publish_time', 'ad_id','document_id_promo','clicked', + 'geo_location_event', 'advertiser_id', 'publisher_id', + 'campaign_id', 'document_id_event', + 'traffic_source_pv', + int_list_null_to_empty_list_udf('doc_event_category_id_list') + .alias('doc_event_category_id_list'), + float_list_null_to_empty_list_udf('doc_event_confidence_level_cat_list') + .alias('doc_event_confidence_level_cat_list'), + int_list_null_to_empty_list_udf('doc_event_topic_id_list') + .alias('doc_event_topic_id_list'), + float_list_null_to_empty_list_udf('doc_event_confidence_level_top_list') + .alias('doc_event_confidence_level_top_list'), + str_list_null_to_empty_list_udf('doc_event_entity_id_list') + .alias('doc_event_entity_id_list'), + float_list_null_to_empty_list_udf('doc_event_confidence_level_ent_list') + .alias('doc_event_confidence_level_ent_list'), + int_null_to_minus_one_udf('source_id').alias('source_id'), + int_null_to_minus_one_udf('timestamp_event').alias('timestamp_event'), + int_list_null_to_empty_list_udf('category_id_list').alias('category_id_list'), + float_list_null_to_empty_list_udf('confidence_level_cat_list') + .alias('confidence_level_cat_list'), + int_list_null_to_empty_list_udf('topic_id_list').alias('topic_id_list'), + float_list_null_to_empty_list_udf('confidence_level_top_list') + .alias('confidence_level_top_list'), + str_list_null_to_empty_list_udf('entity_id_list').alias('entity_id_list'), + float_list_null_to_empty_list_udf('confidence_level_ent_list') + .alias('confidence_level_ent_list')) \ + .join(user_profiles_df, 
on=[F.col("user_profiles.uuid") == F.col("uuid_event")], how='left') \ + .withColumnRenamed('categories', 'user_categories') \ + .withColumnRenamed('topics', 'user_topics') \ + .withColumnRenamed('entities', 'user_entities') \ + .withColumnRenamed('doc_ids', 'user_doc_ids_viewed') \ + .withColumnRenamed('views', 'user_views_count') + +train_set_feature_vectors_df = train_set_enriched_df \ + .withColumn('feature_vector', + get_ad_feature_vector_integral_udf( + 'user_doc_ids_viewed', + 'user_views_count', + 'user_categories', + 'user_topics', + 'user_entities', + 'event_country', + 'event_country_state', + 'ad_id', + 'document_id_promo', + 'source_id', + 'publish_time', + 'timestamp_event', + 'platform_event', + 'geo_location_event', + 'source_id_doc_event', + 'publisher_doc_event', + 'publish_time_doc_event', + 'traffic_source_pv', + 'advertiser_id', + 'publisher_id', + 'campaign_id', + 'document_id_event', + 'category_id_list', + 'confidence_level_cat_list', + 'topic_id_list', + 'confidence_level_top_list', + 'entity_id_list', + 'confidence_level_ent_list', + 'doc_event_category_id_list', + 'doc_event_confidence_level_cat_list', + 'doc_event_topic_id_list', + 'doc_event_confidence_level_top_list', + 'doc_event_entity_id_list', + 'doc_event_confidence_level_ent_list')) \ + .select(F.col('uuid_event').alias('uuid'), 'display_id', 'ad_id', 'document_id_event', + F.col('document_id_promo').alias('document_id'), F.col('clicked').alias('label'), + 'feature_vector') + +if evaluation: + train_feature_vector_gcs_folder_name = 'train_feature_vectors_integral_eval' +else: + train_feature_vector_gcs_folder_name = 'train_feature_vectors_integral' + +train_set_feature_vectors_df.write.parquet(OUTPUT_BUCKET_FOLDER+train_feature_vector_gcs_folder_name, mode='overwrite') + + +# ## Exporting integral feature vectors to CSV +train_feature_vectors_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+train_feature_vector_gcs_folder_name) +train_feature_vectors_exported_df.take(3) + +if evaluation: + train_feature_vector_integral_csv_folder_name = 'train_feature_vectors_integral_eval.csv' +else: + train_feature_vector_integral_csv_folder_name = 'train_feature_vectors_integral.csv' + +integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral + +with open(OUTPUT_BUCKET_FOLDER+train_feature_vector_integral_csv_folder_name+".header", 'w') as output: + output.writelines('\n'.join(integral_headers)) + +def sparse_vector_to_csv_with_nulls_row(additional_column_values, vec, num_columns): + def format_number(x): + if int(x) == x: + return str(int(x)) + else: + return '{:.3}'.format(x) + + return ','.join([str(value) for value in additional_column_values] + + list([ format_number(vec[x]) if x in vec.indices else '' for x in range(vec.size) ])[:num_columns]) \ + .replace('.0,',',') + +train_feature_vectors_integral_csv_rdd = train_feature_vectors_exported_df.select( + 'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector') \ + .withColumn('is_leak', F.lit(-1)) \ + .rdd.map(lambda x: sparse_vector_to_csv_with_nulls_row([x['label'], x['display_id'], + x['ad_id'], x['document_id'], x['document_id_event'], x['is_leak']], + x['feature_vector'], len(integral_headers))) + +train_feature_vectors_integral_csv_rdd.saveAsTextFile(OUTPUT_BUCKET_FOLDER+train_feature_vector_integral_csv_folder_name) + + +# # Export Validation/Test set feature vectors +def is_leak(max_timestamp_pv_leak, timestamp_event): + return max_timestamp_pv_leak >= 0 and 
max_timestamp_pv_leak >= timestamp_event + +is_leak_udf = F.udf(lambda max_timestamp_pv_leak, timestamp_event: int(is_leak(max_timestamp_pv_leak, timestamp_event)), IntegerType()) + +if evaluation: + data_df = validation_set_df +else: + data_df = test_set_df + + +test_validation_set_enriched_df = data_df.select( + 'display_id','uuid_event','event_country','event_country_state','platform_event', + 'source_id_doc_event', 'publisher_doc_event','publish_time_doc_event', + 'publish_time', + 'ad_id','document_id_promo','clicked', + 'geo_location_event', 'advertiser_id', 'publisher_id', + 'campaign_id', 'document_id_event', + 'traffic_source_pv', + int_list_null_to_empty_list_udf('doc_event_category_id_list') + .alias('doc_event_category_id_list'), + float_list_null_to_empty_list_udf('doc_event_confidence_level_cat_list') + .alias('doc_event_confidence_level_cat_list'), + int_list_null_to_empty_list_udf('doc_event_topic_id_list') + .alias('doc_event_topic_id_list'), + float_list_null_to_empty_list_udf('doc_event_confidence_level_top_list') + .alias('doc_event_confidence_level_top_list'), + str_list_null_to_empty_list_udf('doc_event_entity_id_list') + .alias('doc_event_entity_id_list'), + float_list_null_to_empty_list_udf('doc_event_confidence_level_ent_list') + .alias('doc_event_confidence_level_ent_list'), + int_null_to_minus_one_udf('source_id') + .alias('source_id'), + int_null_to_minus_one_udf('timestamp_event').alias('timestamp_event'), + int_list_null_to_empty_list_udf('category_id_list').alias('category_id_list'), + float_list_null_to_empty_list_udf('confidence_level_cat_list') + .alias('confidence_level_cat_list'), + int_list_null_to_empty_list_udf('topic_id_list').alias('topic_id_list'), + float_list_null_to_empty_list_udf('confidence_level_top_list') + .alias('confidence_level_top_list'), + str_list_null_to_empty_list_udf('entity_id_list').alias('entity_id_list'), + float_list_null_to_empty_list_udf('confidence_level_ent_list') + .alias('confidence_level_ent_list'), + int_null_to_minus_one_udf('max_timestamp_pv').alias('max_timestamp_pv_leak')) \ + .join(user_profiles_df, on=[F.col("user_profiles.uuid") == F.col("uuid_event")], how='left') \ + .withColumnRenamed('categories', 'user_categories') \ + .withColumnRenamed('topics', 'user_topics') \ + .withColumnRenamed('entities', 'user_entities') \ + .withColumnRenamed('doc_ids', 'user_doc_ids_viewed') \ + .withColumnRenamed('views', 'user_views_count') + +test_validation_set_feature_vectors_df = test_validation_set_enriched_df \ + .withColumn('feature_vector', + get_ad_feature_vector_integral_udf( + 'user_doc_ids_viewed', + 'user_views_count', + 'user_categories', + 'user_topics', + 'user_entities', + 'event_country', + 'event_country_state', + 'ad_id', + 'document_id_promo', + 'source_id', + 'publish_time', + 'timestamp_event', + 'platform_event', + 'geo_location_event', + 'source_id_doc_event', + 'publisher_doc_event', + 'publish_time_doc_event', + 'traffic_source_pv', + 'advertiser_id', + 'publisher_id', + 'campaign_id', + 'document_id_event', + 'category_id_list', + 'confidence_level_cat_list', + 'topic_id_list', + 'confidence_level_top_list', + 'entity_id_list', + 'confidence_level_ent_list', + 'doc_event_category_id_list', + 'doc_event_confidence_level_cat_list', + 'doc_event_topic_id_list', + 'doc_event_confidence_level_top_list', + 'doc_event_entity_id_list', + 'doc_event_confidence_level_ent_list')) \ + .select(F.col('uuid').alias('uuid'), 'display_id', 'ad_id', 'document_id_event', + 
F.col('document_id_promo').alias('document_id'), F.col('clicked').alias('label'), + is_leak_udf('max_timestamp_pv_leak','timestamp_event').alias('is_leak'), + 'feature_vector') + +if evaluation: + test_validation_feature_vector_gcs_folder_name = 'validation_feature_vectors_integral' +else: + test_validation_feature_vector_gcs_folder_name = 'test_feature_vectors_integral' + +test_validation_set_feature_vectors_df.write.parquet(OUTPUT_BUCKET_FOLDER+test_validation_feature_vector_gcs_folder_name, mode='overwrite') + + +# ## Exporting integral feature vectors to CSV +test_validation_feature_vectors_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER+test_validation_feature_vector_gcs_folder_name) +test_validation_feature_vectors_exported_df.take(3) + +if evaluation: + test_validation_feature_vector_integral_csv_folder_name = \ + 'validation_feature_vectors_integral.csv' +else: + test_validation_feature_vector_integral_csv_folder_name = \ + 'test_feature_vectors_integral.csv' + +integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] \ + + feature_vector_labels_integral + +with open(OUTPUT_BUCKET_FOLDER + test_validation_feature_vector_integral_csv_folder_name \ + +".header", 'w') as output: + output.writelines('\n'.join(integral_headers)) + +test_validation_feature_vectors_integral_csv_rdd = \ + test_validation_feature_vectors_exported_df.select( + 'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', + 'is_leak', 'feature_vector') \ + .rdd.map(lambda x: sparse_vector_to_csv_with_nulls_row([x['label'], + x['display_id'], x['ad_id'], x['document_id'], x['document_id_event'], x['is_leak']], + x['feature_vector'], len(integral_headers))) + +test_validation_feature_vectors_integral_csv_rdd.saveAsTextFile( + OUTPUT_BUCKET_FOLDER+test_validation_feature_vector_integral_csv_folder_name) + +spark.stop() + diff --git a/TensorFlow/Recommendation/WideAndDeep/preproc/sort_csv.sh b/TensorFlow/Recommendation/WideAndDeep/preproc/sort_csv.sh new file mode 100755 index 00000000..8bd0297c --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/preproc/sort_csv.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +input=$1 +output=$2 +mkdir -p ${output} +for f in ${input}/* +do + filename=${f##*/} + sort -n -k2 -t ',' $f > ${output}/${filename} +done diff --git a/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp16_1gpu.sh b/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp16_1gpu.sh new file mode 100644 index 00000000..bb5f4415 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp16_1gpu.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +set -e + +python -m trainer.task --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark --amp diff --git a/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp16_4gpu.sh b/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp16_4gpu.sh new file mode 100644 index 00000000..f96ee4e9 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp16_4gpu.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +set -e + +mpiexec --allow-run-as-root --bind-to socket -np 4 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark --amp diff --git a/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp16_8gpu.sh b/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp16_8gpu.sh new file mode 100644 index 00000000..4eea8c55 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp16_8gpu.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +set -e + +mpiexec --allow-run-as-root --bind-to socket -np 8 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark --amp diff --git a/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp32_1gpu.sh b/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp32_1gpu.sh new file mode 100644 index 00000000..6ac2cc13 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp32_1gpu.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +set -e + +python -m trainer.task --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark diff --git a/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp32_4gpu.sh b/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp32_4gpu.sh new file mode 100644 index 00000000..43b653f5 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp32_4gpu.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +set -e + +mpiexec --allow-run-as-root --bind-to socket -np 4 python -m trainer.task --hvd --model_dir . 
--transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark diff --git a/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp32_8gpu.sh b/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp32_8gpu.sh new file mode 100644 index 00000000..b2ba6063 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/scripts/benchmark_training_fp32_8gpu.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +set -e + +mpiexec --allow-run-as-root --bind-to socket -np 8 python -m trainer.task --hvd --model_dir . --transformed_metadata_path "/outbrain/tfrecords" --eval_data_pattern "/outbrain/tfrecords/eval_*" --train_data_pattern "/outbrain/tfrecords/train_*" --save_checkpoints_secs 600 --linear_l1_regularization 0.0 --linear_l2_regularization 0.0 --linear_learning_rate 0.2 --deep_l1_regularization 0.0 --deep_l2_regularization 0.0 --deep_learning_rate 1.0 --deep_dropout 0.0 --deep_hidden_units 1024 1024 1024 1024 1024 --prebatch_size 4096 --batch_size 131072 --eval_batch_size 32768 --eval_steps 8 --num_epochs 15 --model_type wide_n_deep --gpu --benchmark diff --git a/TensorFlow/Recommendation/WideAndDeep/scripts/preproc.sh b/TensorFlow/Recommendation/WideAndDeep/scripts/preproc.sh new file mode 100755 index 00000000..43b50626 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/scripts/preproc.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
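+
+# Usage sketch (assumed invocation from the repository root inside the training
+# container; the exact entry point is not spelled out in this diff):
+#   bash scripts/preproc.sh           # keep the default prebatch size of 4096
+#   bash scripts/preproc.sh 8192      # pass a custom PREBATCH_SIZE as the first argument
+# The script runs the three Spark preprocessing steps, imputes and sorts the CSV
+# feature vectors under /outbrain/preprocessed, and finally converts them to
+# TFRecords under /outbrain/tfrecords.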
+ +if [ $# -ge 1 ] +then + PREBATCH_SIZE=$1 +else + PREBATCH_SIZE=4096 +fi + +python preproc/preproc1.py +python preproc/preproc2.py +python preproc/preproc3.py + +export CUDA_VISIBLE_DEVICES= +LOCAL_DATA_DIR=/outbrain/preprocessed +LOCAL_DATA_TFRECORDS_DIR=/outbrain/tfrecords + +TRAIN_DIR=train_feature_vectors_integral_eval.csv +VALID_DIR=validation_feature_vectors_integral.csv +TRAIN_IMPUTED_DIR=train_feature_vectors_integral_eval_imputed.csv +VALID_IMPUTED_DIR=validation_feature_vectors_integral_imputed.csv +HEADER_PATH=train_feature_vectors_integral_eval.csv.header + +cd ${LOCAL_DATA_DIR} +python /wd/preproc/csv_data_imputation.py --num_workers 40 \ + --train_files_pattern 'train_feature_vectors_integral_eval.csv/part-*' \ + --valid_files_pattern 'validation_feature_vectors_integral.csv/part-*' \ + --train_dst_dir ${TRAIN_IMPUTED_DIR} \ + --valid_dst_dir ${VALID_IMPUTED_DIR} \ + --header_path ${HEADER_PATH} +cd - + +time preproc/sort_csv.sh ${LOCAL_DATA_DIR}/${VALID_IMPUTED_DIR} ${LOCAL_DATA_DIR}/${VALID_IMPUTED_DIR}_sorted + +python dataflow_preprocess.py \ + --eval_data "${LOCAL_DATA_DIR}/${VALID_IMPUTED_DIR}_sorted/part-*" \ + --training_data "${LOCAL_DATA_DIR}/${TRAIN_IMPUTED_DIR}/part-*" \ + --output_dir ${LOCAL_DATA_TFRECORDS_DIR} \ + --batch_size ${PREBATCH_SIZE} + diff --git a/TensorFlow/Recommendation/WideAndDeep/setup.py b/TensorFlow/Recommendation/WideAndDeep/setup.py new file mode 100644 index 00000000..51072c20 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/setup.py @@ -0,0 +1,39 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import setuptools + +NAME = 'trainer' +VERSION = '1.0' +TENSORFLOW_TRANSFORM = 'tensorflow-transform==0.1.8' + + +if __name__ == '__main__': + setuptools.setup(name=NAME, version=VERSION, packages=['trainer'], + install_requires=[TENSORFLOW_TRANSFORM]) diff --git a/TensorFlow/Recommendation/WideAndDeep/trainer/__init__.py b/TensorFlow/Recommendation/WideAndDeep/trainer/__init__.py new file mode 100644 index 00000000..067cf051 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/trainer/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/TensorFlow/Recommendation/WideAndDeep/trainer/features.py b/TensorFlow/Recommendation/WideAndDeep/trainer/features.py new file mode 100644 index 00000000..7931b45f --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/trainer/features.py @@ -0,0 +1,293 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + + +AD_KEYWORDS = ['ad', 'advertiser', 'campain'] +AD_REs = ['(^|_){}_'.format(kw) for kw in AD_KEYWORDS] +AD_RE = re.compile('|'.join(AD_REs)) +def level_map(name): + return 0 if re.search(AD_RE, name) is None else 1 + + +LABEL_COLUMN = "label" + +DISPLAY_ID_COLUMN = 'display_id' + +AD_ID_COLUMN = 'ad_id' + +IS_LEAK_COLUMN = 'is_leak' + +DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN = 'display_ad_and_is_leak' + +CATEGORICAL_COLUMNS = [ + 'ad_id', + 'doc_id', + 'doc_event_id', + 'ad_advertiser', + 'doc_ad_source_id', + 'doc_ad_publisher_id', + 'doc_event_publisher_id', + 'doc_event_source_id', + 'event_country', + 'event_country_state', + 'event_geo_location', + 'event_hour', + 'event_platform', + 'traffic_source'] + + +DOC_CATEGORICAL_MULTIVALUED_COLUMNS = { + 'doc_ad_category_id': ['doc_ad_category_id_1', + 'doc_ad_category_id_2', + 'doc_ad_category_id_3'], + 'doc_ad_topic_id': ['doc_ad_topic_id_1', + 'doc_ad_topic_id_2', + 'doc_ad_topic_id_3'], + 'doc_ad_entity_id': ['doc_ad_entity_id_1', + 'doc_ad_entity_id_2', + 'doc_ad_entity_id_3', + 'doc_ad_entity_id_4', + 'doc_ad_entity_id_5', + 'doc_ad_entity_id_6'], + 'doc_event_category_id': ['doc_event_category_id_1', + 'doc_event_category_id_2', + 'doc_event_category_id_3'], + 'doc_event_topic_id': ['doc_event_topic_id_1', + 'doc_event_topic_id_2', + 'doc_event_topic_id_3'], + 'doc_event_entity_id': ['doc_event_entity_id_1', + 'doc_event_entity_id_2', + 'doc_event_entity_id_3', + 'doc_event_entity_id_4', + 'doc_event_entity_id_5', + 'doc_event_entity_id_6'] +} + +BOOL_COLUMNS = [ + 'event_weekend', + 'user_has_already_viewed_doc'] + +INT_COLUMNS = [ + 'user_views', + 'ad_views', + 'doc_views', + 'doc_event_days_since_published', + 'doc_event_hour', + 'doc_ad_days_since_published'] + +FLOAT_COLUMNS_LOG_BIN_TRANSFORM = [ + 'pop_ad_id', + 'pop_ad_id_conf_multipl', + 'pop_document_id', + 'pop_document_id_conf_multipl', + 'pop_publisher_id', + 'pop_publisher_id_conf_multipl', + 'pop_advertiser_id', + 'pop_advertiser_id_conf_multipl', + 'pop_campain_id', + 'pop_campain_id_conf_multipl', + 'pop_doc_event_doc_ad', + 'pop_doc_event_doc_ad_conf_multipl', + 'pop_source_id', + 'pop_source_id_conf_multipl', + 'pop_source_id_country', + 
'pop_source_id_country_conf_multipl', + 'pop_entity_id', + 'pop_entity_id_conf_multipl', + 'pop_entity_id_country', + 'pop_entity_id_country_conf_multipl', + 'pop_topic_id', + 'pop_topic_id_conf_multipl', + 'pop_topic_id_country', + 'pop_topic_id_country_conf_multipl', + 'pop_category_id', + 'pop_category_id_conf_multipl', + 'pop_category_id_country', + 'pop_category_id_country_conf_multipl', + 'user_doc_ad_sim_categories', + 'user_doc_ad_sim_categories_conf_multipl', + 'user_doc_ad_sim_topics', + 'user_doc_ad_sim_topics_conf_multipl', + 'user_doc_ad_sim_entities', + 'user_doc_ad_sim_entities_conf_multipl', + 'doc_event_doc_ad_sim_categories', + 'doc_event_doc_ad_sim_categories_conf_multipl', + 'doc_event_doc_ad_sim_topics', + 'doc_event_doc_ad_sim_topics_conf_multipl', + 'doc_event_doc_ad_sim_entities', + 'doc_event_doc_ad_sim_entities_conf_multipl'] + +FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM = [ + 'pop_ad_id_conf', + 'pop_document_id_conf', + 'pop_publisher_id_conf', + 'pop_advertiser_id_conf', + 'pop_campain_id_conf', + 'pop_doc_event_doc_ad_conf', + 'pop_source_id_conf', + 'pop_source_id_country_conf', + 'pop_entity_id_conf', + 'pop_entity_id_country_conf', + 'pop_topic_id_conf', + 'pop_topic_id_country_conf', + 'pop_category_id_conf', + 'pop_category_id_country_conf', + 'user_doc_ad_sim_categories_conf', + 'user_doc_ad_sim_topics_conf', + 'user_doc_ad_sim_entities_conf', + 'doc_event_doc_ad_sim_categories_conf', + 'doc_event_doc_ad_sim_topics_conf', + 'doc_event_doc_ad_sim_entities_conf'] + +FLOAT_COLUMNS = FLOAT_COLUMNS_LOG_BIN_TRANSFORM + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM + +# Let's define the columns we're actually going to use +# during training +REQUEST_SINGLE_HOT_COLUMNS = [ + "doc_event_id", + "doc_id", + "doc_event_source_id", + "event_geo_location", + "event_country_state", + "doc_event_publisher_id", + "event_country", + "event_hour", + "event_platform", + "traffic_source", + "event_weekend", + "user_has_already_viewed_doc"] + +REQUEST_MULTI_HOT_COLUMNS = [ + "doc_event_entity_id", + "doc_event_topic_id", + "doc_event_category_id"] + +REQUEST_NUMERIC_COLUMNS = [ + "pop_document_id_conf", + "pop_publisher_id_conf", + "pop_source_id_conf", + "pop_entity_id_conf", + "pop_topic_id_conf", + "pop_category_id_conf", + "pop_document_id", + "pop_publisher_id", + "pop_source_id", + "pop_entity_id", + "pop_topic_id", + "pop_category_id", + "user_views", + "doc_views", + "doc_event_days_since_published", + "doc_event_hour"] + +ITEM_SINGLE_HOT_COLUMNS = [ + "ad_id", + "doc_ad_source_id", + "ad_advertiser", + "doc_ad_publisher_id"] + +ITEM_MULTI_HOT_COLUMNS = [ + "doc_ad_topic_id", + "doc_ad_entity_id", + "doc_ad_category_id"] + +ITEM_NUMERIC_COLUMNS = [ + "pop_ad_id_conf", + "user_doc_ad_sim_categories_conf", + "user_doc_ad_sim_topics_conf", + "pop_advertiser_id_conf", + "pop_campain_id_conf_multipl", + "pop_ad_id", + "pop_advertiser_id", + "pop_campain_id", + "user_doc_ad_sim_categories", + "user_doc_ad_sim_topics", + "user_doc_ad_sim_entities", + "doc_event_doc_ad_sim_categories", + "doc_event_doc_ad_sim_topics", + "doc_event_doc_ad_sim_entities", + "ad_views", + "doc_ad_days_since_published"] + +num_hot_map = { + name: 1 for name in + (REQUEST_SINGLE_HOT_COLUMNS + ITEM_SINGLE_HOT_COLUMNS)} +num_hot_map.update({ + name: -1 for name in + (REQUEST_MULTI_HOT_COLUMNS + ITEM_MULTI_HOT_COLUMNS)}) + +NV_TRAINING_COLUMNS = ( + REQUEST_SINGLE_HOT_COLUMNS + + REQUEST_MULTI_HOT_COLUMNS + + REQUEST_NUMERIC_COLUMNS + + ITEM_SINGLE_HOT_COLUMNS + + ITEM_MULTI_HOT_COLUMNS + + ITEM_NUMERIC_COLUMNS) + 
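+
+# Illustrative helper, added for clarity and not referenced elsewhere in this
+# module: it shows how the training columns above relate to num_hot_map, which
+# marks single-hot columns with 1 and multi-hot (variable-length) columns with
+# -1, while numeric columns are simply absent from the map.
+def summarize_training_columns(columns=None):
+    columns = NV_TRAINING_COLUMNS if columns is None else columns
+    return {
+        'single_hot': [c for c in columns if num_hot_map.get(c) == 1],
+        'multi_hot': [c for c in columns if num_hot_map.get(c) == -1],
+        'numeric': [c for c in columns if c not in num_hot_map],
+    }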
+ALL_TRAINING_COLUMNS = ( + CATEGORICAL_COLUMNS + + BOOL_COLUMNS + + INT_COLUMNS + + FLOAT_COLUMNS +) +for v in DOC_CATEGORICAL_MULTIVALUED_COLUMNS.values(): + ALL_TRAINING_COLUMNS.extend(v) + + +HASH_BUCKET_SIZES = { + 'doc_event_id': 300000, + 'ad_id': 250000, + 'doc_id': 100000, + 'doc_ad_entity_id': 10000, + 'doc_event_entity_id': 10000, + 'doc_ad_source_id': 4000, + 'doc_event_source_id': 4000, + 'event_geo_location': 2500, + 'ad_advertiser': 2500, + 'event_country_state': 2000, + 'doc_ad_publisher_id': 1000, + 'doc_event_publisher_id': 1000, + 'doc_ad_topic_id': 350, + 'doc_event_topic_id': 350, + 'event_country': 300, + 'doc_ad_category_id': 100, + 'doc_event_category_id': 100} + +IDENTITY_NUM_BUCKETS = { + 'event_hour': 6, + 'event_platform': 3, + 'traffic_source': 3, + 'event_weekend': 2, + 'user_has_already_viewed_doc': 2} + +EMBEDDING_DIMENSIONS = { + 'doc_event_id': 128, + 'ad_id': 128, + 'doc_id': 128, + 'doc_ad_entity_id': 64, + 'doc_event_entity_id': 64, + 'doc_ad_source_id': 64, + 'doc_event_source_id': 64, + 'event_geo_location': 64, + 'ad_advertiser': 64, + 'event_country_state': 64, + 'doc_ad_publisher_id': 64, + 'doc_event_publisher_id': 64, + 'doc_ad_topic_id': 64, + 'doc_event_topic_id': 64, + 'event_country': 64, + 'doc_ad_category_id': 64, + 'doc_event_category_id': 64} diff --git a/TensorFlow/Recommendation/WideAndDeep/trainer/task.py b/TensorFlow/Recommendation/WideAndDeep/trainer/task.py new file mode 100644 index 00000000..4e032f32 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/trainer/task.py @@ -0,0 +1,800 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import argparse +import json +import os +import sys +import pickle +import tensorflow as tf +import tensorflow_transform as tft + +from trainer import features +from utils.hooks.benchmark_hooks import BenchmarkLoggingHook + +import horovod.tensorflow as hvd + +import dllogger + +MODEL_TYPES = ['wide', 'deep', 'wide_n_deep'] +WIDE, DEEP, WIDE_N_DEEP = MODEL_TYPES + +# Default train dataset size +TRAIN_DATASET_SIZE = 59761827 + +def create_parser(): + """Initialize command line parser using arparse. + + Returns: + An argparse.ArgumentParser. + """ + parser = argparse.ArgumentParser() + + parser.add_argument( + '--model_type', + help='Model type to train on', + choices=MODEL_TYPES, + default=WIDE_N_DEEP) + parser.add_argument( + '--canned_estimator', + help='Use canned estimator instead of the experimental custom estimator', + action='store_true', + default=False) + parser.add_argument( + '--train_data_pattern', + help='Pattern of training file names. 
For example if training files are train_000.tfrecord, \ + train_001.tfrecord then --train_data_pattern is train_*', + type=str, + default='/outbrain/tfrecords/train_*', + nargs='+' + ) + parser.add_argument( + '--eval_data_pattern', + help='Pattern of eval file names. For example if eval files are eval_000.tfrecord, \ + eval_001.tfrecord then --eval_data_pattern is eval_*', + type=str, + default='/outbrain/tfrecords/eval_*', + nargs='+' + ) + parser.add_argument( + '--model_dir', + help='Model Checkpoint will be saved here', + type=str, + default='/outbrain/checkpoints' + ) + parser.add_argument( + '--transformed_metadata_path', + help='Path to transformed_metadata.', + type=str, + default='/outbrain/tfrecords' + ) + parser.add_argument( + '--deep_hidden_units', + help='hidden units per layer, separated by spaces', + default=[1024, 1024, 1024, 1024, 1024], + type=int, + nargs="+") + parser.add_argument( + '--prebatch_size', + help='Size of the pre-batches in the tfrecords', + default=4096, + type=int) + parser.add_argument( + '--batch_size', + help='Training batch size', + default=131072, + type=int) + parser.add_argument( + '--eval_batch_size', + help='Evaluation batch size', + default=32768, + type=int) + parser.add_argument( + '--eval_steps', + help='Number of evaluation steps to perform', + default=8, + type=int) + parser.add_argument( + '--training_set_size', + help='Number of samples in the training set', + default=TRAIN_DATASET_SIZE, + type=int) + parser.add_argument( + '--num_epochs', + help='Number of epochs', + default=100, + type=int) + parser.add_argument( + '--save_checkpoints_secs', + help='Minimal number of seconds between evaluations', + default=600, + type=int) + parser.add_argument( + '--save_checkpoints_steps', + help='Training steps between saving checkpoints. 
If 0, then save_checkpoints_secs applies', + default=0, + type=int) + + parser.add_argument( + '--xla', + help='Enable XLA', + default=False, + action='store_true') + parser.add_argument( + '--gpu', + help='Run computations on the GPU', + default=False, + action='store_true') + parser.add_argument( + '--amp', + help='Attempt automatic mixed precision conversion', + default=False, + action='store_true') + parser.add_argument( + '--hvd', + help='Use Horovod', + action='store_true', + default=False) + + # hyperparameters for linear part + parser.add_argument( + '--linear_l1_regularization', + help='L1 regularization for linear model', + type=float, + default=0.0) + parser.add_argument( + '--linear_l2_regularization', + help='L2 regularization for linear model', + type=float, + default=0.0) + parser.add_argument( + '--linear_learning_rate', + help='Learning rate for linear model', + type=float, + default=0.2) + + # hyperparameters for deep part + parser.add_argument( + '--deep_l1_regularization', + help='L1 regularization for deep model', + type=float, + default=0.0) + parser.add_argument( + '--deep_l2_regularization', + help='L2 regularization for deep model', + type=float, + default=0.00) + parser.add_argument( + '--deep_learning_rate', + help='Learning rate for deep model', + type=float, + default=1.0) + parser.add_argument( + '--deep_dropout', + help='Dropout regularization for deep model', + type=float, + default=0.0) + + parser.add_argument( + '--log_device_placement', + help='Ask Tensorflow (via ConfigProto) to print device placement of nodes', + default=False, + action='store_true') + + parser.add_argument( + '--predict', + help='Only perform a prediction on the validation dataset, don\'t train', + default=False, + action='store_true') + parser.add_argument( + '--evaluate', + help='Only perform an evaluation on the validation dataset, don\'t train', + default=False, + action='store_true') + parser.add_argument( + '--results_dir', + type=str, + help='Directory to store training results', + default='/results') + parser.add_argument( + '--log_filename', + type=str, + help='Name of the file to store dlloger output', + default='log.json') + parser.add_argument( + '--use_all_columns', + help='Force using all features defined in the features.py file', + default=False, + action='store_true') + parser.add_argument( + '--shuffle_percentage', + type=float, + default=0.001, + help='Size of the shuffle buffer from 0 to 1. 
1 means that the shuffle buffer size will be equal to the size of the training dataset.') + parser.add_argument( + '--print_display_ids', + help='Print the display ids processed by the input pipeline', + default=False, + action='store_true') + parser.add_argument( + '--eval_throttle_secs', + help='Number of evaluation steps to perform.', + default=600, + type=int) + parser.add_argument( + '--reader_num_threads', + default=12, + type=int) + parser.add_argument( + '--parser_num_threads', + default=3, + type=int) + parser.add_argument( + '--prefetch_buffer_size', + default=1, + type=int) + parser.add_argument( + '--submission', + action='store_true', + default=False) + parser.add_argument( + '--benchmark', + help='Collect performance metrics during training', + action='store_true', + default=False) + parser.add_argument( + '--benchmark_warmup_steps', + help='Warmup before starg of benchmarking the training', + type=int, + default=50) + parser.add_argument( + '--benchmark_steps', + help='Number of steps for train performance benchmark', + type=int, + default=100) + + return parser + + +def get_feature_columns(use_all_columns=False, force_subset=None): + # adding the force_subset as a way to directly pass in column changes for testing/profiling + assert not use_all_columns or force_subset is None, \ + 'Cannot both use all columns and use only a subset; give only one argument' + deep_columns, wide_columns = [], [] + + if use_all_columns: + training_columns = features.ALL_TRAINING_COLUMNS + elif force_subset is not None: + training_columns = force_subset + else: + training_columns = features.NV_TRAINING_COLUMNS + + tf.compat.v1.logging.warn('number of features: {}'.format(len(training_columns))) + + for column_name in training_columns: + if column_name in features.HASH_BUCKET_SIZES: + categorical_column = tf.feature_column.categorical_column_with_hash_bucket( + column_name, + hash_bucket_size=features.HASH_BUCKET_SIZES[column_name], + dtype=tf.int32) + wide_columns.append(categorical_column) + + elif column_name in features.IDENTITY_NUM_BUCKETS: + categorical_column = tf.feature_column.categorical_column_with_identity( + column_name, num_buckets=features.IDENTITY_NUM_BUCKETS[column_name]) + wide_columns.append(categorical_column) + + else: + columns = [] + if column_name in features.FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM: + # add a categorical_column for column_name + "_binned" + # just add the regular float column for now + columns.append(tf.feature_column.numeric_column( + column_name, shape=(1,))) + elif column_name in features.FLOAT_COLUMNS_LOG_BIN_TRANSFORM: + # add a categorical_column for column_name + "_log_binned") + columns.append(tf.feature_column.numeric_column( + column_name + "_log_01scaled", shape=(1,))) + elif column_name in features.INT_COLUMNS: + # add a categorical_column for column_name + "_log_int" + columns.append(tf.feature_column.numeric_column( + column_name+"_log_01scaled", shape=(1,))) + + for column in columns: + wide_columns.append(column) + deep_columns.append(column) + continue + if column_name in features.EMBEDDING_DIMENSIONS: + column = tf.feature_column.embedding_column( + categorical_column, + dimension=features.EMBEDDING_DIMENSIONS[column_name], + combiner='mean') + else: + column = tf.feature_column.indicator_column(categorical_column) + deep_columns.append(column) + tf.compat.v1.logging.warn('deep columns: {}'.format(len(deep_columns))) + tf.compat.v1.logging.warn('wide columns: {}'.format(len(wide_columns))) + tf.compat.v1.logging.warn('wide&deep intersection: 
{}'.format(len(set(wide_columns).intersection(set(deep_columns))))) + return wide_columns, deep_columns + +def separate_input_fn( + tf_transform_output, + transformed_examples, + batch_size, + mode, + reader_num_threads=1, + parser_num_threads=2, + shuffle_buffer_size=10, + prefetch_buffer_size=1, + print_display_ids=False): + """ + A version of the training + eval input function that uses dataset operations. + (For more straightforward tweaking.) + """ + + tf.compat.v1.logging.warn('Shuffle buffer size: {}'.format(shuffle_buffer_size)) + + filenames_dataset = tf.data.Dataset.list_files(transformed_examples, shuffle=False) + + raw_dataset = tf.data.TFRecordDataset(filenames_dataset, + num_parallel_reads=reader_num_threads) + + raw_dataset = raw_dataset.shuffle(shuffle_buffer_size) \ + if (mode==tf.estimator.ModeKeys.TRAIN and shuffle_buffer_size > 1) \ + else raw_dataset + raw_dataset = raw_dataset.repeat() + raw_dataset = raw_dataset.batch(batch_size) + + # this function appears to require each element to be a vector + # batching should mean that this is always true + # one possible alternative for any problematic case is tf.io.parse_single_example + parsed_dataset = raw_dataset.apply(tf.data.experimental.parse_example_dataset( + tf_transform_output.transformed_feature_spec(), + num_parallel_calls=parser_num_threads)) + + # a function mapped over each dataset element + # will separate label, ensure that elements are two-dimensional (batch size, elements per record) + # adds print_display_ids injection + def consolidate_batch(elem): + label = elem.pop('label') + reshaped_label = tf.reshape(label, [-1, label.shape[-1]]) + reshaped_elem = {key: tf.reshape(elem[key], [-1, elem[key].shape[-1]]) for key in elem} + if print_display_ids: + elem['ad_id'] = tf.Print(input_=elem['ad_id'], + data=[tf.reshape(elem['display_id'], [-1])], + message='display_id', name='print_display_ids', summarize=FLAGS.eval_batch_size) + elem['ad_id'] = tf.Print(input_=elem['ad_id'], + data=[tf.reshape(elem['ad_id'], [-1])], + message='ad_id', name='print_ad_ids', summarize=FLAGS.eval_batch_size) + elem['ad_id'] = tf.Print(input_=elem['ad_id'], + data=[tf.reshape(elem['is_leak'], [-1])], + message='is_leak', name='print_is_leak', summarize=FLAGS.eval_batch_size) + + return reshaped_elem, reshaped_label + + if mode == tf.estimator.ModeKeys.EVAL: + parsed_dataset = parsed_dataset.map(consolidate_batch, num_parallel_calls=None) + else: + parsed_dataset = parsed_dataset.map(consolidate_batch, + num_parallel_calls=parser_num_threads) + parsed_dataset = parsed_dataset.prefetch(prefetch_buffer_size) + + return parsed_dataset + +# rough approximation for MAP metric for measuring ad quality +# roughness comes from batch sizes falling between groups of +# display ids +# hack because of name clashes. 
Probably makes sense to rename features +DISPLAY_ID_COLUMN = features.DISPLAY_ID_COLUMN +def map_custom_metric(features, labels, predictions): + display_ids = tf.reshape(features[DISPLAY_ID_COLUMN], [-1]) + predictions = predictions['probabilities'][:, 1] + labels = labels[:, 0] + + # Processing unique display_ids, indexes and counts + # Sorting needed in case the same display_id occurs in two different places + sorted_ids = tf.argsort(display_ids) + display_ids = tf.gather(display_ids, indices=sorted_ids) + predictions = tf.gather(predictions, indices=sorted_ids) + labels = tf.gather(labels, indices=sorted_ids) + + _, display_ids_idx, display_ids_ads_count = tf.unique_with_counts( + display_ids, out_idx=tf.int64) + pad_length = 30 - tf.reduce_max(display_ids_ads_count) + pad_fn = lambda x: tf.pad(x, [(0, 0), (0, pad_length)]) + + preds = tf.RaggedTensor.from_value_rowids( + predictions, display_ids_idx).to_tensor() + labels = tf.RaggedTensor.from_value_rowids( + labels, display_ids_idx).to_tensor() + + labels = tf.argmax(labels, axis=1) + + return { + 'map': tf.compat.v1.metrics.average_precision_at_k( + predictions=pad_fn(preds), + labels=labels, + k=12, + name="streaming_map")} + + +IS_LEAK_COLUMN = features.IS_LEAK_COLUMN +def map_custom_metric_with_leak(features, labels, predictions): + display_ids = features[DISPLAY_ID_COLUMN] + display_ids = tf.reshape(display_ids, [-1]) + is_leak_tf = features[IS_LEAK_COLUMN] + is_leak_tf = tf.reshape(is_leak_tf, [-1]) + + predictions = predictions['probabilities'][:, 1] + predictions = predictions + tf.cast(is_leak_tf, tf.float32) + labels = labels[:, 0] + + # Processing unique display_ids, indexes and counts + # Sorting needed in case the same display_id occurs in two different places + sorted_ids = tf.argsort(display_ids) + display_ids = tf.gather(display_ids, indices=sorted_ids) + predictions = tf.gather(predictions, indices=sorted_ids) + labels = tf.gather(labels, indices=sorted_ids) + + _, display_ids_idx, display_ids_ads_count = tf.unique_with_counts( + display_ids, out_idx=tf.int64) + pad_length = 30 - tf.reduce_max(display_ids_ads_count) + pad_fn = lambda x: tf.pad(x, [(0, 0), (0, pad_length)]) + + preds = tf.RaggedTensor.from_value_rowids(predictions, display_ids_idx).to_tensor() + labels = tf.RaggedTensor.from_value_rowids(labels, display_ids_idx).to_tensor() + labels = tf.argmax(labels, axis=1) + + return { + 'map_with_leak': tf.compat.v1.metrics.average_precision_at_k( + predictions=pad_fn(preds), + labels=labels, + k=12, + name="streaming_map_with_leak")} + +# A quick custom hook to access and log info about an Estimator graph +class InternalLoggerHook(tf.estimator.SessionRunHook): + def __init__(self, logfile='internal_log.txt'): + self.logfile = logfile + + def after_create_session(self, session, coord): # runs once, after graph is finalized + log_writer = open(self.logfile, 'w') + + # one pass through to record a dictionary with {input: output} pairs of nodes + # doesn't add much overhead that I can see, and makes it easier to trace dependencies + hold_outputs_dict = {} + for op in tf.get_default_graph().get_operations(): + for tensor in op.inputs: + base_name = ''.join(tensor.name.split(':')[:-1]) if ':' in tensor.name else tensor.name + if base_name not in hold_outputs_dict: + hold_outputs_dict[base_name] = [op.name] + else: + hold_outputs_dict[base_name].append(op.name) + + # record information for each op to file, this time, drawing on the above dictionary for outputs + for op in tf.get_default_graph().get_operations(): + 
op_repr = '' + op_repr = op_repr + repr(op.node_def) # protobuf-style representation + outputs = hold_outputs_dict.pop(op.name, []) + op_repr = op_repr + '\n'.join(['output: ' + repr(o) for o in outputs] + \ + ['colocation_group: ' + repr(cg) for cg in op.colocation_groups()] + \ + ['control_input: ' + repr(ci) for ci in op.control_inputs]) + op_repr = ' ' + '\n '.join(op_repr.split('\n')) # indented + op_repr = op.name + ' {\n' + op_repr + '\n}\n\n' + log_writer.write(op_repr) + + # leave a warning at the end of the file if any outputs are left over + log_writer.write('Unclaimed outputs:\n' + '\n'.join([key + ': ' + repr(hold_outputs_dict[key]) \ + for key in hold_outputs_dict])) + + log_writer.close() + +# function to create a wide & deep Estimator, with options to knock out parts (model_type) +def custom_estimator_model_fn(features, labels, mode, params, config): + + with tf.compat.v1.variable_scope('deep', values=features) as scope: + deep_absolute_scope = scope.name + if params['model_type'] in [DEEP, WIDE_N_DEEP]: + deep_features = features.copy() + deep_current = tf.compat.v1.feature_column.input_layer(deep_features, params['deep_columns']) + + if params['model_type'] in [DEEP, WIDE_N_DEEP]: + for layer_ind in range(len(params['layers'])): + num_neurons = params['layers'][layer_ind] + deep_current = tf.keras.layers.Dense(num_neurons, activation=tf.nn.relu)(deep_current) + deep_current = tf.keras.layers.Dropout(params['deep_dropout'])(deep_current) + + deep_logits = tf.keras.layers.Dense(1)(deep_current) + else: + deep_logits = None + + with tf.compat.v1.variable_scope('wide', values=features) as scope: + wide_absolute_scope = scope.name + wide_logits = tf.compat.v1.feature_column.linear_model(features, params['wide_columns'], units=1, sparse_combiner='sum') \ + if params['model_type'] in [WIDE, WIDE_N_DEEP] else None + + if deep_logits is None and wide_logits is None: # with only the input pipeline, just return input features + assert mode == tf.estimator.ModeKeys.PREDICT, \ + 'Only the input pipeline is used; eval and train don\'t have meaning' + return tf.estimator.EstimatorSpec(mode, predictions=features) + else: + logits = deep_logits if wide_logits is None else wide_logits if deep_logits is None \ + else (wide_logits + deep_logits) + + head = tf.contrib.estimator.binary_classification_head(loss_reduction=tf.compat.v1.losses.Reduction.SUM_OVER_BATCH_SIZE) + + def train_op_fn(loss): + global_step = tf.compat.v1.train.get_global_step() + deep_optimizer = params['deep_optimizer'] + wide_optimizer = params['wide_optimizer'] + + # enable mixed precision if desired + if params['amp']: + deep_optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(deep_optimizer) + wide_optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(wide_optimizer) + + deep_op = deep_optimizer.minimize(loss, var_list=tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=deep_absolute_scope)) if deep_logits is not None else None + wide_op = wide_optimizer.minimize(loss, var_list=tf.compat.v1.get_collection( + tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=wide_absolute_scope)) if wide_logits is not None else None + train_op = tf.group(deep_op, wide_op) if deep_logits is not None and wide_logits is not None \ + else deep_op if deep_logits is not None else wide_op + with tf.control_dependencies([train_op]): # increment global step once train op is done + return tf.compat.v1.assign_add(global_step, 1).op # this is how the canned estimator appears to do it 
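+
+    # Each tower is trained by its own optimizer, restricted via var_list to the
+    # variables created under its 'wide' or 'deep' variable scope; when both
+    # towers are present the two minimize ops run together in a single tf.group()
+    # and the global step is incremented exactly once after they finish. The
+    # binary_classification_head created above then turns logits, labels and
+    # train_op_fn into the final loss, metrics and EstimatorSpec.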
+ + return head.create_estimator_spec(features, mode, logits, labels=labels, train_op_fn=train_op_fn) + + +# a helper function to create an estimator of the specified type, either custom or canned +# custom estimators are created with the custom_estimator_model_fn, and have some options working +# that the canned ones do not (AMP, knocking out parts of the model, NVTX) +def construct_estimator(model_type, custom_estimator, run_config, + wide_columns, wide_optimizer, + deep_columns, deep_hidden_units, deep_dropout, deep_optimizer, + amp=False): + if custom_estimator: + estimator = tf.estimator.Estimator( + model_fn=custom_estimator_model_fn, + config=run_config, + params={ + 'wide_columns': wide_columns, + 'deep_columns': deep_columns, + 'deep_dropout': deep_dropout, + 'model_type': model_type, + 'layers': deep_hidden_units, + 'wide_optimizer': wide_optimizer, + 'deep_optimizer': deep_optimizer, + 'amp': amp, + }) + else: + assert model_type in [WIDE, DEEP, WIDE_N_DEEP], 'Canned estimator only supports basic wide, deep, wnd' + assert not amp, 'AMP not functional for canned estimator' # AMP does not optimize the separate graph + if model_type == WIDE: + estimator = tf.estimator.LinearClassifier( + feature_columns=wide_columns, + config=run_config, + optimizer=wide_optimizer) + + elif model_type == DEEP: + estimator = tf.estimator.DNNClassifier( + feature_columns=deep_columns, + hidden_units=deep_hidden_units, + dropout=deep_dropout, + config=run_config, + optimizer=deep_optimizer) + + elif model_type == WIDE_N_DEEP: + estimator = tf.estimator.DNNLinearCombinedClassifier( + config=run_config, + linear_feature_columns=wide_columns, + linear_optimizer=wide_optimizer, + dnn_feature_columns=deep_columns, + dnn_optimizer=deep_optimizer, + dnn_hidden_units=deep_hidden_units, + dnn_dropout=deep_dropout, + linear_sparse_combiner='sum', + loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE) + + return estimator + + + +def main(FLAGS): + if FLAGS.hvd: + hvd.init() + if hvd.local_rank() == 0: + tf.logging.set_verbosity(tf.logging.INFO) + log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename) + os.makedirs(FLAGS.results_dir, exist_ok=True) + dllogger.init(backends=[ + dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, + filename=log_path), + dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)]) + else: + tf.logging.set_verbosity(tf.logging.ERROR) + dllogger.init(backends=[]) + num_gpus = hvd.size() + else: + tf.logging.set_verbosity(tf.logging.INFO) + log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename) + os.makedirs(FLAGS.results_dir, exist_ok=True) + dllogger.init(backends=[ + dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, + filename=log_path), + dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)]) + num_gpus = 1 + + dllogger.log(data=vars(FLAGS), step='PARAMETER') + + create_batches = FLAGS.batch_size // FLAGS.prebatch_size + + wide_columns, deep_columns = get_feature_columns(use_all_columns=FLAGS.use_all_columns) + tf_transform_output = tft.TFTransformOutput(FLAGS.transformed_metadata_path) + + if not FLAGS.hvd or hvd.local_rank() == 0: + tf.compat.v1.logging.warn('command line arguments: {}'.format(json.dumps(vars(FLAGS)))) + if not os.path.exists(FLAGS.results_dir): + os.mkdir(FLAGS.results_dir) + + with open('{}/args.json'.format(FLAGS.results_dir), 'w') as f: + json.dump(vars(FLAGS), f, indent=4) + + if FLAGS.gpu: + session_config = tf.compat.v1.ConfigProto(log_device_placement=FLAGS.log_device_placement) + else: + 
session_config = tf.compat.v1.ConfigProto(device_count={'GPU': 0}, log_device_placement=FLAGS.log_device_placement) + + if FLAGS.hvd: + session_config.gpu_options.visible_device_list = str(hvd.local_rank()) + + if FLAGS.xla: + session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 + + if FLAGS.benchmark: + model_dir = None + else: + model_dir = FLAGS.model_dir + + if FLAGS.save_checkpoints_steps != 0: + run_config = tf.estimator.RunConfig(model_dir=model_dir).replace(session_config=session_config, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + keep_checkpoint_max=1) + else: + run_config = tf.estimator.RunConfig(model_dir=model_dir).replace(session_config=session_config, + save_checkpoints_secs=FLAGS.save_checkpoints_secs, + keep_checkpoint_max=1) + + wide_optimizer = tf.compat.v1.train.FtrlOptimizer( + learning_rate=FLAGS.linear_learning_rate, + l1_regularization_strength=FLAGS.linear_l1_regularization, + l2_regularization_strength=FLAGS.linear_l2_regularization) + + deep_optimizer = tf.compat.v1.train.ProximalAdagradOptimizer( + learning_rate=FLAGS.deep_learning_rate, + initial_accumulator_value=0.1, + l1_regularization_strength=FLAGS.deep_l1_regularization, + l2_regularization_strength=FLAGS.deep_l2_regularization, + use_locking=False) + + if FLAGS.hvd: + wide_optimizer = hvd.DistributedOptimizer(wide_optimizer) + deep_optimizer = hvd.DistributedOptimizer(deep_optimizer) + + stats_filename = os.path.join(FLAGS.transformed_metadata_path, 'stats.json') + embed_columns = None + + # input functions to read data from disk + train_input_fn = lambda : separate_input_fn( + tf_transform_output, + FLAGS.train_data_pattern, + create_batches, + tf.estimator.ModeKeys.TRAIN, + reader_num_threads=FLAGS.reader_num_threads, + parser_num_threads=FLAGS.parser_num_threads, + shuffle_buffer_size=int(FLAGS.shuffle_percentage*create_batches), + prefetch_buffer_size=FLAGS.prefetch_buffer_size, + print_display_ids=FLAGS.print_display_ids) + eval_input_fn = lambda : separate_input_fn( + tf_transform_output, + FLAGS.eval_data_pattern, + (FLAGS.eval_batch_size // FLAGS.prebatch_size), + tf.estimator.ModeKeys.EVAL, + reader_num_threads=1, + parser_num_threads=1, + shuffle_buffer_size=int(FLAGS.shuffle_percentage*create_batches), + prefetch_buffer_size=FLAGS.prefetch_buffer_size, + print_display_ids=FLAGS.print_display_ids) + + estimator = construct_estimator(FLAGS.model_type, not FLAGS.canned_estimator, run_config, + wide_columns, wide_optimizer, + deep_columns, FLAGS.deep_hidden_units, FLAGS.deep_dropout, deep_optimizer, + amp=FLAGS.amp) + + estimator = tf.estimator.add_metrics(estimator, map_custom_metric) + estimator = tf.estimator.add_metrics(estimator, map_custom_metric_with_leak) + + steps_per_epoch = FLAGS.training_set_size / FLAGS.batch_size + + print('Steps per epoch: {}'.format(steps_per_epoch)) + max_steps = int(FLAGS.num_epochs * steps_per_epoch) + + hooks = [] + if FLAGS.hvd: + hooks.append(hvd.BroadcastGlobalVariablesHook(0)) + + if FLAGS.predict or FLAGS.evaluate: # inference + if FLAGS.benchmark: + benchmark_hook = BenchmarkLoggingHook(global_batch_size=num_gpus * FLAGS.eval_batch_size, warmup_steps=FLAGS.benchmark_warmup_steps) + hooks.append(benchmark_hook) + eval_steps = FLAGS.benchmark_steps + else: + eval_steps = FLAGS.eval_steps + + predict_result_iter = estimator.predict(input_fn=eval_input_fn, hooks=hooks, yield_single_examples=False) + + results = [] + for i, r in enumerate(predict_result_iter): + print('predicting batch: ', i) + 
results.append(r) + # TODO: use eval_steps + if i >= eval_steps - 1: + break + + if FLAGS.benchmark: + infer_throughput = benchmark_hook.mean_throughput.value() + + if FLAGS.benchmark: + dllogger.log(data={'infer_throughput': infer_throughput}, step=tuple()) + elif FLAGS.evaluate: + print('evaluating using estimator.evaluate with eval_batch_size = ', + FLAGS.eval_batch_size, ' and eval_steps = ', FLAGS.eval_steps) + + result = estimator.evaluate(eval_input_fn, hooks=hooks, steps=FLAGS.eval_steps) + dllogger.log(step=(), data={'map_infer': float(result['map']), 'map_with_leak_infer': float(result['map_with_leak'])}) + elif FLAGS.predict: + scores = [r['probabilities'][:, 1] for r in results] + scores = np.hstack(scores) + scores_path = os.path.join(FLAGS.model_dir, 'scores.txt') + print('saving the numpy scores array to: ', scores_path) + np.savetxt(scores_path, scores, fmt="%f", delimiter='\n') + + else: # training + + if FLAGS.benchmark: + benchmark_hook = BenchmarkLoggingHook(global_batch_size=num_gpus * FLAGS.batch_size, + warmup_steps=FLAGS.benchmark_warmup_steps) + hooks.append(benchmark_hook) + estimator.train(train_input_fn, hooks=hooks, steps=FLAGS.benchmark_steps) + train_throughput = benchmark_hook.mean_throughput.value() + dllogger.log(data={'train_throughput': train_throughput}, step=tuple()) + else: + train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=max_steps, hooks=hooks) + eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, + throttle_secs=FLAGS.eval_throttle_secs, steps=FLAGS.eval_steps) + result = tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) + + if result: + dllogger.log(step=(), data={'map': float(result[0]['map']), + 'map_with_leak': float(result[0]['map_with_leak'])}) + + +if __name__ == '__main__': + FLAGS = create_parser().parse_args() + main(FLAGS) + diff --git a/TensorFlow/Recommendation/WideAndDeep/utils/hooks/benchmark_hooks.py b/TensorFlow/Recommendation/WideAndDeep/utils/hooks/benchmark_hooks.py new file mode 100644 index 00000000..f66f4151 --- /dev/null +++ b/TensorFlow/Recommendation/WideAndDeep/utils/hooks/benchmark_hooks.py @@ -0,0 +1,49 @@ +#! /usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
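+
+# BenchmarkLoggingHook measures per-step throughput: once the configured warmup
+# steps have passed, each step's throughput (global batch size divided by the
+# step's wall-clock time, in samples/s) is logged through dllogger and fed into
+# a MeanAccumulator so the training script can report the mean throughput.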
+
+import time
+import tensorflow as tf
+
+import dllogger
+
+from .training_hooks import MeanAccumulator
+
+
+__all__ = ['BenchmarkLoggingHook']
+
+
+class BenchmarkLoggingHook(tf.train.SessionRunHook):
+    # Session hook that times each step and reports throughput in samples/s.
+
+    def __init__(self, global_batch_size, warmup_steps=100):
+        self.warmup_steps = warmup_steps
+        self.global_batch_size = global_batch_size
+        self.current_step = 0
+        self.t0 = None
+        self.mean_throughput = MeanAccumulator()
+
+    def before_run(self, run_context):
+        # Record the wall-clock time at which the step starts.
+        self.t0 = time.time()
+
+    def after_run(self, run_context, run_values):
+        batch_time = time.time() - self.t0
+        samplesps = self.global_batch_size / batch_time
+        # Skip the warmup steps so startup costs do not skew the reported mean.
+        if self.current_step >= self.warmup_steps:
+            self.mean_throughput.consume(samplesps)
+            dllogger.log(data={"samplesps" : samplesps}, step=(0, self.current_step))
+
+        self.current_step += 1
+
diff --git a/TensorFlow/Recommendation/WideAndDeep/utils/hooks/training_hooks.py b/TensorFlow/Recommendation/WideAndDeep/utils/hooks/training_hooks.py
new file mode 100644
index 00000000..c8b34278
--- /dev/null
+++ b/TensorFlow/Recommendation/WideAndDeep/utils/hooks/training_hooks.py
@@ -0,0 +1,36 @@
+#! /usr/bin/python
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import tensorflow as tf
+
+import dllogger
+
+
+class MeanAccumulator:
+    # Running arithmetic mean of the values passed to consume();
+    # BenchmarkLoggingHook uses it to average per-step throughput.
+    def __init__(self):
+        self.sum = 0
+        self.count = 0
+
+    def consume(self, value):
+        self.sum += value
+        self.count += 1
+
+    def value(self):
+        return self.sum / self.count
+
+
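+
+# Example (illustrative only; the training script uses MeanAccumulator through
+# BenchmarkLoggingHook rather than instantiating it directly):
+#
+#   acc = MeanAccumulator()
+#   for samples_per_sec in (1000.0, 1200.0, 1100.0):
+#       acc.consume(samples_per_sec)
+#   acc.value()  # arithmetic mean of the consumed values, here 1100.0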