DeepLearningExamples/TensorFlow/Recommendation/WideAndDeep/preproc/preproc4.py

#!/usr/bin/env python
# coding: utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import datetime
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
import tensorflow as tf
import trainer
from pyspark import TaskContext
from pyspark.context import SparkContext, SparkConf
from pyspark.sql.functions import col, udf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import ArrayType, DoubleType
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import metadata_io
from trainer.features import LABEL_COLUMN, DISPLAY_ID_COLUMN, IS_LEAK_COLUMN, DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN, \
    CATEGORICAL_COLUMNS, DOC_CATEGORICAL_MULTIVALUED_COLUMNS, BOOL_COLUMNS, INT_COLUMNS, FLOAT_COLUMNS, \
    FLOAT_COLUMNS_LOG_BIN_TRANSFORM, FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM
evaluation = True
evaluation_verbose = False
OUTPUT_BUCKET_FOLDER = "/outbrain/preprocessed/"
DATA_BUCKET_FOLDER = "/outbrain/orig/"
SPARK_TEMP_FOLDER = "/outbrain/spark-temp/"
LOCAL_DATA_TFRECORDS_DIR = "/outbrain/tfrecords"
TEST_SET_MODE = False
TENSORFLOW_HADOOP = "preproc/data/tensorflow-hadoop-1.5.0.jar"
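# Spark runs in local mode here; the tensorflow-hadoop connector jar registered below provides
# TFRecordFileOutputFormat for the saveAsNewAPIHadoopFile calls at the end of the script.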
conf = SparkConf().setMaster('local[*]').set('spark.executor.memory', '40g').set('spark.driver.memory', '200g').set(
    "spark.local.dir", SPARK_TEMP_FOLDER)
conf.set("spark.jars", TENSORFLOW_HADOOP)
conf.set("spark.sql.files.maxPartitionBytes", 805306368)
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
parser = argparse.ArgumentParser()
parser.add_argument(
    '--prebatch_size',
    help='Prebatch size in created tfrecords',
    type=int,
    default=4096)
parser.add_argument(
    '--submission',
    action='store_true',
    default=False
)
args = parser.parse_args()
batch_size = args.prebatch_size
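# Each output TFRecord Example packs `prebatch_size` rows (one value per row in every feature),
# so the trainer can consume pre-batched records instead of batching single examples at input time.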
# # Feature Vector export
bool_feature_names = ['event_weekend',
'user_has_already_viewed_doc']
int_feature_names = ['user_views',
'ad_views',
'doc_views',
'doc_event_days_since_published',
'doc_event_hour',
'doc_ad_days_since_published',
]
float_feature_names = [
'pop_ad_id',
'pop_ad_id_conf',
'pop_ad_id_conf_multipl',
'pop_document_id',
'pop_document_id_conf',
'pop_document_id_conf_multipl',
'pop_publisher_id',
'pop_publisher_id_conf',
'pop_publisher_id_conf_multipl',
'pop_advertiser_id',
'pop_advertiser_id_conf',
'pop_advertiser_id_conf_multipl',
'pop_campain_id',
'pop_campain_id_conf',
'pop_campain_id_conf_multipl',
'pop_doc_event_doc_ad',
'pop_doc_event_doc_ad_conf',
'pop_doc_event_doc_ad_conf_multipl',
'pop_source_id',
'pop_source_id_conf',
'pop_source_id_conf_multipl',
'pop_source_id_country',
'pop_source_id_country_conf',
'pop_source_id_country_conf_multipl',
'pop_entity_id',
'pop_entity_id_conf',
'pop_entity_id_conf_multipl',
'pop_entity_id_country',
'pop_entity_id_country_conf',
'pop_entity_id_country_conf_multipl',
'pop_topic_id',
'pop_topic_id_conf',
'pop_topic_id_conf_multipl',
'pop_topic_id_country',
'pop_topic_id_country_conf',
'pop_topic_id_country_conf_multipl',
'pop_category_id',
'pop_category_id_conf',
'pop_category_id_conf_multipl',
'pop_category_id_country',
'pop_category_id_country_conf',
'pop_category_id_country_conf_multipl',
'user_doc_ad_sim_categories',
'user_doc_ad_sim_categories_conf',
'user_doc_ad_sim_categories_conf_multipl',
'user_doc_ad_sim_topics',
'user_doc_ad_sim_topics_conf',
'user_doc_ad_sim_topics_conf_multipl',
'user_doc_ad_sim_entities',
'user_doc_ad_sim_entities_conf',
'user_doc_ad_sim_entities_conf_multipl',
'doc_event_doc_ad_sim_categories',
'doc_event_doc_ad_sim_categories_conf',
'doc_event_doc_ad_sim_categories_conf_multipl',
'doc_event_doc_ad_sim_topics',
'doc_event_doc_ad_sim_topics_conf',
'doc_event_doc_ad_sim_topics_conf_multipl',
'doc_event_doc_ad_sim_entities',
'doc_event_doc_ad_sim_entities_conf',
'doc_event_doc_ad_sim_entities_conf_multipl'
]
# ### Configuring feature vector
category_feature_names_integral = ['ad_advertiser',
'doc_ad_category_id_1',
'doc_ad_category_id_2',
'doc_ad_category_id_3',
'doc_ad_topic_id_1',
'doc_ad_topic_id_2',
'doc_ad_topic_id_3',
'doc_ad_entity_id_1',
'doc_ad_entity_id_2',
'doc_ad_entity_id_3',
'doc_ad_entity_id_4',
'doc_ad_entity_id_5',
'doc_ad_entity_id_6',
'doc_ad_publisher_id',
'doc_ad_source_id',
'doc_event_category_id_1',
'doc_event_category_id_2',
'doc_event_category_id_3',
'doc_event_topic_id_1',
'doc_event_topic_id_2',
'doc_event_topic_id_3',
'doc_event_entity_id_1',
'doc_event_entity_id_2',
'doc_event_entity_id_3',
'doc_event_entity_id_4',
'doc_event_entity_id_5',
'doc_event_entity_id_6',
'doc_event_publisher_id',
'doc_event_source_id',
'event_country',
'event_country_state',
'event_geo_location',
'event_hour',
'event_platform',
'traffic_source']
feature_vector_labels_integral = bool_feature_names \
                                 + int_feature_names \
                                 + float_feature_names \
                                 + category_feature_names_integral
if args.submission:
    train_feature_vector_gcs_folder_name = 'train_feature_vectors_integral'
else:
    train_feature_vector_gcs_folder_name = 'train_feature_vectors_integral_eval'
# ## Exporting integral feature vectors to CSV
train_feature_vectors_exported_df = spark.read.parquet(OUTPUT_BUCKET_FOLDER + train_feature_vector_gcs_folder_name)
train_feature_vectors_exported_df.take(3)
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id',
'is_leak'] + feature_vector_labels_integral
CSV_ORDERED_COLUMNS = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak', 'event_weekend',
'user_has_already_viewed_doc', 'user_views', 'ad_views', 'doc_views',
'doc_event_days_since_published', 'doc_event_hour', 'doc_ad_days_since_published',
'pop_ad_id', 'pop_ad_id_conf',
'pop_ad_id_conf_multipl', 'pop_document_id', 'pop_document_id_conf',
'pop_document_id_conf_multipl', 'pop_publisher_id', 'pop_publisher_id_conf',
'pop_publisher_id_conf_multipl', 'pop_advertiser_id', 'pop_advertiser_id_conf',
'pop_advertiser_id_conf_multipl', 'pop_campain_id', 'pop_campain_id_conf',
'pop_campain_id_conf_multipl', 'pop_doc_event_doc_ad', 'pop_doc_event_doc_ad_conf',
'pop_doc_event_doc_ad_conf_multipl', 'pop_source_id', 'pop_source_id_conf',
'pop_source_id_conf_multipl', 'pop_source_id_country', 'pop_source_id_country_conf',
'pop_source_id_country_conf_multipl', 'pop_entity_id', 'pop_entity_id_conf',
'pop_entity_id_conf_multipl', 'pop_entity_id_country', 'pop_entity_id_country_conf',
'pop_entity_id_country_conf_multipl', 'pop_topic_id', 'pop_topic_id_conf',
'pop_topic_id_conf_multipl', 'pop_topic_id_country', 'pop_topic_id_country_conf',
'pop_topic_id_country_conf_multipl', 'pop_category_id', 'pop_category_id_conf',
'pop_category_id_conf_multipl', 'pop_category_id_country', 'pop_category_id_country_conf',
'pop_category_id_country_conf_multipl', 'user_doc_ad_sim_categories',
'user_doc_ad_sim_categories_conf', 'user_doc_ad_sim_categories_conf_multipl',
'user_doc_ad_sim_topics', 'user_doc_ad_sim_topics_conf', 'user_doc_ad_sim_topics_conf_multipl',
'user_doc_ad_sim_entities', 'user_doc_ad_sim_entities_conf',
'user_doc_ad_sim_entities_conf_multipl',
'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_categories_conf',
'doc_event_doc_ad_sim_categories_conf_multipl', 'doc_event_doc_ad_sim_topics',
'doc_event_doc_ad_sim_topics_conf', 'doc_event_doc_ad_sim_topics_conf_multipl',
'doc_event_doc_ad_sim_entities', 'doc_event_doc_ad_sim_entities_conf',
'doc_event_doc_ad_sim_entities_conf_multipl', 'ad_advertiser', 'doc_ad_category_id_1',
'doc_ad_category_id_2', 'doc_ad_category_id_3', 'doc_ad_topic_id_1', 'doc_ad_topic_id_2',
'doc_ad_topic_id_3', 'doc_ad_entity_id_1', 'doc_ad_entity_id_2', 'doc_ad_entity_id_3',
'doc_ad_entity_id_4', 'doc_ad_entity_id_5', 'doc_ad_entity_id_6', 'doc_ad_publisher_id',
'doc_ad_source_id', 'doc_event_category_id_1', 'doc_event_category_id_2',
'doc_event_category_id_3',
'doc_event_topic_id_1', 'doc_event_topic_id_2', 'doc_event_topic_id_3', 'doc_event_entity_id_1',
'doc_event_entity_id_2', 'doc_event_entity_id_3', 'doc_event_entity_id_4',
'doc_event_entity_id_5',
'doc_event_entity_id_6', 'doc_event_publisher_id', 'doc_event_source_id', 'event_country',
'event_country_state', 'event_geo_location', 'event_hour', 'event_platform', 'traffic_source']
FEAT_CSV_ORDERED_COLUMNS = ['event_weekend',
'user_has_already_viewed_doc', 'user_views', 'ad_views', 'doc_views',
'doc_event_days_since_published', 'doc_event_hour', 'doc_ad_days_since_published',
'pop_ad_id', 'pop_ad_id_conf',
'pop_ad_id_conf_multipl', 'pop_document_id', 'pop_document_id_conf',
'pop_document_id_conf_multipl', 'pop_publisher_id', 'pop_publisher_id_conf',
'pop_publisher_id_conf_multipl', 'pop_advertiser_id', 'pop_advertiser_id_conf',
'pop_advertiser_id_conf_multipl', 'pop_campain_id', 'pop_campain_id_conf',
'pop_campain_id_conf_multipl', 'pop_doc_event_doc_ad', 'pop_doc_event_doc_ad_conf',
'pop_doc_event_doc_ad_conf_multipl', 'pop_source_id', 'pop_source_id_conf',
'pop_source_id_conf_multipl', 'pop_source_id_country', 'pop_source_id_country_conf',
'pop_source_id_country_conf_multipl', 'pop_entity_id', 'pop_entity_id_conf',
'pop_entity_id_conf_multipl', 'pop_entity_id_country', 'pop_entity_id_country_conf',
'pop_entity_id_country_conf_multipl', 'pop_topic_id', 'pop_topic_id_conf',
'pop_topic_id_conf_multipl', 'pop_topic_id_country', 'pop_topic_id_country_conf',
'pop_topic_id_country_conf_multipl', 'pop_category_id', 'pop_category_id_conf',
'pop_category_id_conf_multipl', 'pop_category_id_country', 'pop_category_id_country_conf',
'pop_category_id_country_conf_multipl', 'user_doc_ad_sim_categories',
'user_doc_ad_sim_categories_conf', 'user_doc_ad_sim_categories_conf_multipl',
'user_doc_ad_sim_topics', 'user_doc_ad_sim_topics_conf',
'user_doc_ad_sim_topics_conf_multipl',
'user_doc_ad_sim_entities', 'user_doc_ad_sim_entities_conf',
'user_doc_ad_sim_entities_conf_multipl',
'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_categories_conf',
'doc_event_doc_ad_sim_categories_conf_multipl', 'doc_event_doc_ad_sim_topics',
'doc_event_doc_ad_sim_topics_conf', 'doc_event_doc_ad_sim_topics_conf_multipl',
'doc_event_doc_ad_sim_entities', 'doc_event_doc_ad_sim_entities_conf',
'doc_event_doc_ad_sim_entities_conf_multipl', 'ad_advertiser', 'doc_ad_category_id_1',
'doc_ad_category_id_2', 'doc_ad_category_id_3', 'doc_ad_topic_id_1', 'doc_ad_topic_id_2',
'doc_ad_topic_id_3', 'doc_ad_entity_id_1', 'doc_ad_entity_id_2', 'doc_ad_entity_id_3',
'doc_ad_entity_id_4', 'doc_ad_entity_id_5', 'doc_ad_entity_id_6', 'doc_ad_publisher_id',
'doc_ad_source_id', 'doc_event_category_id_1', 'doc_event_category_id_2',
'doc_event_category_id_3',
'doc_event_topic_id_1', 'doc_event_topic_id_2', 'doc_event_topic_id_3',
'doc_event_entity_id_1',
'doc_event_entity_id_2', 'doc_event_entity_id_3', 'doc_event_entity_id_4',
'doc_event_entity_id_5',
'doc_event_entity_id_6', 'doc_event_publisher_id', 'doc_event_source_id', 'event_country',
'event_country_state', 'event_geo_location', 'event_hour', 'event_platform',
'traffic_source']
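# The helpers below unpack the Spark ML `feature_vector` column (a vector of doubles) into a plain
# array so that individual features can be selected by position and cast to their CSV types.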
def to_array(col):
    def to_array_(v):
        return v.toArray().tolist()

    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed i.e.
    # return udf(to_array_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance
    return udf(to_array_, ArrayType(DoubleType())).asNondeterministic()(col)
CONVERT_TO_INT = ['doc_ad_category_id_1',
'doc_ad_category_id_2', 'doc_ad_category_id_3', 'doc_ad_topic_id_1', 'doc_ad_topic_id_2',
'doc_ad_topic_id_3', 'doc_ad_entity_id_1', 'doc_ad_entity_id_2', 'doc_ad_entity_id_3',
'doc_ad_entity_id_4', 'doc_ad_entity_id_5', 'doc_ad_entity_id_6',
'doc_ad_source_id', 'doc_event_category_id_1', 'doc_event_category_id_2', 'doc_event_category_id_3',
'doc_event_topic_id_1', 'doc_event_topic_id_2', 'doc_event_topic_id_3', 'doc_event_entity_id_1',
'doc_event_entity_id_2', 'doc_event_entity_id_3', 'doc_event_entity_id_4', 'doc_event_entity_id_5',
'doc_event_entity_id_6']
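# format_number casts boolean, categorical and the ID-like columns listed above to int; all other
# (float) features keep their original representation.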
def format_number(element, name):
    if name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
        return element.cast("int")
    elif name in CONVERT_TO_INT:
        return element.cast("int")
    else:
        return element
def to_array_with_none(col):
    def to_array_with_none_(v):
        tmp = np.full((v.size,), fill_value=None, dtype=np.float64)
        tmp[v.indices] = v.values
        return tmp.tolist()

    # Important: asNondeterministic requires Spark 2.3 or later
    # It can be safely removed i.e.
    # return udf(to_array_with_none_, ArrayType(DoubleType()))(col)
    # but at the cost of decreased performance
    return udf(to_array_with_none_, ArrayType(DoubleType())).asNondeterministic()(col)
@udf
def count_value(x):
    from collections import Counter
    tmp = Counter(x).most_common(2)
    if not tmp or np.isnan(tmp[0][0]):
        return 0
    return float(tmp[0][0])


def replace_with_most_frequent(most_value):
    return udf(lambda x: most_value if not x or np.isnan(x) else x)
train_feature_vectors_integral_csv_rdd_df = train_feature_vectors_exported_df.select(
    'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector').withColumn(
    'is_leak', F.lit(-1)).withColumn("featvec", to_array("feature_vector")).select(
    ['label'] + ['display_id'] + ['ad_id'] + ['document_id'] + ['document_id_event'] + ['is_leak'] + [
        format_number(element, FEAT_CSV_ORDERED_COLUMNS[index]).alias(FEAT_CSV_ORDERED_COLUMNS[index]) for
        index, element in enumerate([col("featvec")[i] for i in range(len(feature_vector_labels_integral))])]).replace(
    float('nan'), 0)
if args.submission:
    test_validation_feature_vector_gcs_folder_name = 'test_feature_vectors_integral'
else:
    test_validation_feature_vector_gcs_folder_name = 'validation_feature_vectors_integral'
# ## Exporting integral feature vectors
test_validation_feature_vectors_exported_df = spark.read.parquet(
    OUTPUT_BUCKET_FOLDER + test_validation_feature_vector_gcs_folder_name)
test_validation_feature_vectors_exported_df.take(3)
test_validation_feature_vectors_integral_csv_rdd_df = test_validation_feature_vectors_exported_df.select(
    'label', 'display_id', 'ad_id', 'document_id', 'document_id_event',
    'is_leak', 'feature_vector').withColumn("featvec", to_array("feature_vector")).select(
    ['label'] + ['display_id'] + ['ad_id'] + ['document_id'] + ['document_id_event'] + ['is_leak'] + [
        format_number(element, FEAT_CSV_ORDERED_COLUMNS[index]).alias(FEAT_CSV_ORDERED_COLUMNS[index]) for
        index, element in enumerate([col("featvec")[i] for i in range(len(feature_vector_labels_integral))])]).replace(
    float('nan'), 0)
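# Build the tf.transform feature spec that describes the prebatched TFRecords (shape
# [prebatch_size, 1] per scalar feature) and write it as `transformed_metadata` for the trainer.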
def make_spec(output_dir, batch_size=None):
    fixed_shape = [batch_size, 1] if batch_size is not None else []
    spec = {}
    spec[LABEL_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
    spec[DISPLAY_ID_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
    spec[IS_LEAK_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
    spec[DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64,
                                                                     default_value=None)
    for name in BOOL_COLUMNS:
        spec[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM + FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
        spec[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)
    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
        spec[name + '_binned'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
        spec[name + '_binned'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
        spec[name + '_log_01scaled'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)
    for name in INT_COLUMNS:
        spec[name + '_log_int'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
        spec[name + '_log_01scaled'] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.float32, default_value=None)
    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
        spec[name] = tf.FixedLenFeature(shape=fixed_shape, dtype=tf.int64, default_value=None)
    for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
        shape = fixed_shape[:-1] + [len(DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category])]
        spec[multi_category] = tf.FixedLenFeature(shape=shape, dtype=tf.int64)
    metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(spec))
    metadata_io.write_metadata(metadata, output_dir)


# write out tfrecords meta
make_spec(LOCAL_DATA_TFRECORDS_DIR + '/transformed_metadata', batch_size=batch_size)
def log2_1p(x):
    return np.log1p(x) / np.log(2.0)
# calculate min and max stats for the given dataframe all in one go
def compute_min_max_logs(df):
    print(str(datetime.datetime.now()) + '\tComputing min and max')
    min_logs = {}
    max_logs = {}
    float_expr = []
    for name in trainer.features.FLOAT_COLUMNS_LOG_BIN_TRANSFORM + trainer.features.INT_COLUMNS:
        float_expr.append(F.min(name))
        float_expr.append(F.max(name))
    floatDf = df.agg(*float_expr).collect()
    for name in trainer.features.FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
        minAgg = floatDf[0]["min(" + name + ")"]
        maxAgg = floatDf[0]["max(" + name + ")"]
        min_logs[name + '_log_01scaled'] = log2_1p(minAgg * 1000)
        max_logs[name + '_log_01scaled'] = log2_1p(maxAgg * 1000)
    for name in trainer.features.INT_COLUMNS:
        minAgg = floatDf[0]["min(" + name + ")"]
        maxAgg = floatDf[0]["max(" + name + ")"]
        min_logs[name + '_log_01scaled'] = log2_1p(minAgg)
        max_logs[name + '_log_01scaled'] = log2_1p(maxAgg)
    return min_logs, max_logs
all_df = test_validation_feature_vectors_integral_csv_rdd_df.union(train_feature_vectors_integral_csv_rdd_df)
min_logs, max_logs = compute_min_max_logs(all_df)
if args.submission:
    train_output_string = '/sub_train'
    eval_output_string = '/test'
else:
    train_output_string = '/train'
    eval_output_string = '/eval'
path = LOCAL_DATA_TFRECORDS_DIR
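# Convert one pandas slice (a full prebatch of rows) into a single tf.train.Example; log-binned
# and 0-1 scaled variants are computed here using the global min/max log statistics.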
def create_tf_example_spark(df, min_logs, max_logs):
    result = {}
    result[LABEL_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[LABEL_COLUMN].to_list()))
    result[DISPLAY_ID_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[DISPLAY_ID_COLUMN].to_list()))
    result[IS_LEAK_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=df[IS_LEAK_COLUMN].to_list()))
    encoded_value = df[DISPLAY_ID_COLUMN].multiply(10).add(df[IS_LEAK_COLUMN].clip(lower=0)).to_list()
    result[DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN] = tf.train.Feature(int64_list=tf.train.Int64List(value=encoded_value))
    for name in FLOAT_COLUMNS:
        value = df[name].to_list()
        result[name] = tf.train.Feature(float_list=tf.train.FloatList(value=value))
    for name in FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM:
        value = df[name].multiply(10).astype('int64').to_list()
        result[name + '_binned'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
    for name in FLOAT_COLUMNS_LOG_BIN_TRANSFORM:
        value_prelim = df[name].multiply(1000).apply(np.log1p).multiply(1. / np.log(2.0))
        value = value_prelim.astype('int64').to_list()
        result[name + '_binned'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
        nn = name + '_log_01scaled'
        value = value_prelim.add(-min_logs[nn]).multiply(1. / (max_logs[nn] - min_logs[nn])).to_list()
        result[nn] = tf.train.Feature(float_list=tf.train.FloatList(value=value))
    for name in INT_COLUMNS:
        value_prelim = df[name].apply(np.log1p).multiply(1. / np.log(2.0))
        value = value_prelim.astype('int64').to_list()
        result[name + '_log_int'] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
        nn = name + '_log_01scaled'
        value = value_prelim.add(-min_logs[nn]).multiply(1. / (max_logs[nn] - min_logs[nn])).to_list()
        result[nn] = tf.train.Feature(float_list=tf.train.FloatList(value=value))
    for name in BOOL_COLUMNS + CATEGORICAL_COLUMNS:
        value = df[name].fillna(0).astype('int64').to_list()
        result[name] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
    for multi_category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS:
        values = []
        for category in DOC_CATEGORICAL_MULTIVALUED_COLUMNS[multi_category]:
            values = values + [df[category].to_numpy()]
        # need to transpose the series so they will be parsed correctly by the FixedLenFeature
        # we can pass in a single series here; they'll be reshaped to [batch_size, num_values]
        # when parsed from the TFRecord
        value = np.stack(values, axis=1).flatten().tolist()
        result[multi_category] = tf.train.Feature(int64_list=tf.train.Int64List(value=value))
    tf_example = tf.train.Example(features=tf.train.Features(feature=result))
    return tf_example
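# All-in-one variant: slice a partition's rows into prebatches and build Examples directly.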
def _transform_to_tfrecords(rdds):
    csv = pd.DataFrame(list(rdds), columns=CSV_ORDERED_COLUMNS)
    num_rows = len(csv.index)
    examples = []
    for start_ind in range(0, num_rows, batch_size if batch_size is not None else 1):  # for each batch
        if start_ind + batch_size - 1 > num_rows:  # if we'd run out of rows
            csv_slice = csv.iloc[start_ind:]
            # drop the remainder
            print("last Example has: ", len(csv_slice))
            examples.append((create_tf_example_spark(csv_slice, min_logs, max_logs), len(csv_slice)))
            return examples
        else:
            csv_slice = csv.iloc[start_ind:start_ind + (batch_size if batch_size is not None else 1)]
            examples.append((create_tf_example_spark(csv_slice, min_logs, max_logs), batch_size))
    return examples
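# The pipeline below instead splits the work in two passes: cut rows into pandas slices first,
# then serialize full slices and re-slice the leftovers so that as few rows as possible are dropped.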
max_partition_num = 30
def _transform_to_slices(rdds):
    taskcontext = TaskContext.get()
    partitionid = taskcontext.partitionId()
    csv = pd.DataFrame(list(rdds), columns=CSV_ORDERED_COLUMNS)
    num_rows = len(csv.index)
    print("working with partition: ", partitionid, max_partition_num, num_rows)
    examples = []
    for start_ind in range(0, num_rows, batch_size if batch_size is not None else 1):  # for each batch
        if start_ind + batch_size - 1 > num_rows:  # if we'd run out of rows
            csv_slice = csv.iloc[start_ind:]
            print("last Example has: ", len(csv_slice), partitionid)
            examples.append((csv_slice, len(csv_slice)))
            return examples
        else:
            csv_slice = csv.iloc[start_ind:start_ind + (batch_size if batch_size is not None else 1)]
            examples.append((csv_slice, len(csv_slice)))
    return examples
def _transform_to_tfrecords_from_slices(rdds):
    examples = []
    for slice in rdds:
        if len(slice[0]) != batch_size:
            print("slice size is not correct, dropping: ", len(slice[0]))
        else:
            examples.append(
                (bytearray((create_tf_example_spark(slice[0], min_logs, max_logs)).SerializeToString()), None))
    return examples
def _transform_to_tfrecords_from_reslice(rdds):
    examples = []
    all_dataframes = pd.DataFrame([])
    for slice in rdds:
        all_dataframes = all_dataframes.append(slice[0])
    num_rows = len(all_dataframes.index)
    examples = []
    for start_ind in range(0, num_rows, batch_size if batch_size is not None else 1):  # for each batch
        if start_ind + batch_size - 1 > num_rows:  # if we'd run out of rows
            csv_slice = all_dataframes.iloc[start_ind:]
            if TEST_SET_MODE:
                remain_len = batch_size - len(csv_slice)
                (m, n) = divmod(remain_len, len(csv_slice))
                print("remainder: ", len(csv_slice), remain_len, m, n)
                if m:
                    for i in range(m):
                        csv_slice = csv_slice.append(csv_slice)
                csv_slice = csv_slice.append(csv_slice.iloc[:n])
                print("after fill remainder: ", len(csv_slice))
                examples.append(
                    (bytearray((create_tf_example_spark(csv_slice, min_logs, max_logs)).SerializeToString()), None))
                return examples
            # drop the remainder
            print("dropping remainder: ", len(csv_slice))
            return examples
        else:
            csv_slice = all_dataframes.iloc[start_ind:start_ind + (batch_size if batch_size is not None else 1)]
            examples.append(
                (bytearray((create_tf_example_spark(csv_slice, min_logs, max_logs)).SerializeToString()), None))
    return examples
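# Driver section: slice train/validation data into prebatches, serialize the full slices,
# funnel the partial slices into a single partition and reslice them, then union the results.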
TEST_SET_MODE = False
train_features = train_feature_vectors_integral_csv_rdd_df.coalesce(30).rdd.mapPartitions(_transform_to_slices)
cached_train_features = train_features.cache()
cached_train_features.count()
train_full = cached_train_features.filter(lambda x: x[1] == batch_size)
# split out slices where we don't have a full batch so that we can reslice them and only drop minimal rows
train_not_full = cached_train_features.filter(lambda x: x[1] < batch_size)
train_examples_full = train_full.mapPartitions(_transform_to_tfrecords_from_slices)
train_left = train_not_full.coalesce(1).mapPartitions(_transform_to_tfrecords_from_reslice)
all_train = train_examples_full.union(train_left)
TEST_SET_MODE = True
valid_features = test_validation_feature_vectors_integral_csv_rdd_df.coalesce(30).rdd.mapPartitions(
    _transform_to_slices)
cached_valid_features = valid_features.cache()
cached_valid_features.count()
valid_full = cached_valid_features.filter(lambda x: x[1] == batch_size)
valid_not_full = cached_valid_features.filter(lambda x: x[1] < batch_size)
valid_examples_full = valid_full.mapPartitions(_transform_to_tfrecords_from_slices)
valid_left = valid_not_full.coalesce(1).mapPartitions(_transform_to_tfrecords_from_reslice)
all_valid = valid_examples_full.union(valid_left)
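# Write the serialized Examples as TFRecord files (one part file per output partition).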
all_train.saveAsNewAPIHadoopFile(LOCAL_DATA_TFRECORDS_DIR + train_output_string,
                                 "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
                                 keyClass="org.apache.hadoop.io.BytesWritable",
                                 valueClass="org.apache.hadoop.io.NullWritable")
all_valid.saveAsNewAPIHadoopFile(LOCAL_DATA_TFRECORDS_DIR + eval_output_string,
                                 "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
                                 keyClass="org.apache.hadoop.io.BytesWritable",
                                 valueClass="org.apache.hadoop.io.NullWritable")
spark.stop()