DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/converter.py

# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
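"""Convert an NVTabular-preprocessed Outbrain dataset (parquet output) into
prebatched TFRecord shards plus matching tf.Transform metadata, writing one
output shard per worker process."""
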
import logging
import os
from multiprocessing import Process

import pandas as pd
import tensorflow as tf
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import metadata_io

from data.outbrain.features import PREBATCH_SIZE
from data.outbrain.nvtabular.utils.feature_description import (
    transform_nvt_to_spark,
    CATEGORICAL_COLUMNS,
    DISPLAY_ID_COLUMN,
    EXCLUDE_COLUMNS,
)


def create_metadata(df, prebatch_size, output_path):
    """Build a tf.Transform feature spec for the dataframe columns and write the
    dataset metadata to output_path."""
    fixed_shape = [prebatch_size, 1]
    spec = {}
    for column in df:
        # Categorical features (and the display id) are stored as int64,
        # everything else as float32, each with a fixed [prebatch_size, 1] shape.
        if column in CATEGORICAL_COLUMNS + [DISPLAY_ID_COLUMN]:
            spec[transform_nvt_to_spark(column)] = tf.io.FixedLenFeature(
                shape=fixed_shape, dtype=tf.int64, default_value=None)
        else:
            spec[transform_nvt_to_spark(column)] = tf.io.FixedLenFeature(
                shape=fixed_shape, dtype=tf.float32, default_value=None)
    metadata = dataset_metadata.DatasetMetadata(dataset_schema.from_feature_spec(spec))
    metadata_io.write_metadata(metadata, output_path)


def create_tf_example(df, start_index, offset):
    """Serialize `offset` consecutive rows starting at `start_index` into a single
    prebatched tf.train.Example."""
    parsed_features = {}
    records = df.loc[start_index:start_index + offset - 1]
    for column in records:
        if column in CATEGORICAL_COLUMNS + [DISPLAY_ID_COLUMN]:
            feature = tf.train.Feature(int64_list=tf.train.Int64List(value=records[column].to_numpy()))
        else:
            feature = tf.train.Feature(float_list=tf.train.FloatList(value=records[column].to_numpy()))
        parsed_features[transform_nvt_to_spark(column)] = feature
    features = tf.train.Features(feature=parsed_features)
    return tf.train.Example(features=features)


def create_tf_records(df, prebatch_size, output_path):
    """Write the dataframe to a single TFRecord file, one Example per full prebatch;
    a trailing partial batch is dropped."""
    with tf.io.TFRecordWriter(output_path) as out_file:
        start_index = df.index[0]
        for index in range(start_index, df.shape[0] + start_index - prebatch_size + 1, prebatch_size):
            example = create_tf_example(df, index, prebatch_size)
            out_file.write(example.SerializeToString())


def convert(path_to_nvt_dataset, output_path, prebatch_size, exclude_columns, workers=6):
    """Convert the NVTabular train/valid parquet outputs into prebatched TFRecords
    and tf.Transform metadata, sharding the work across `workers` processes."""
    train_path = os.path.join(path_to_nvt_dataset, 'train')
    valid_path = os.path.join(path_to_nvt_dataset, 'valid')
    output_metadata_path = os.path.join(output_path, 'transformed_metadata')
    output_train_path = os.path.join(output_path, 'train')
    output_valid_path = os.path.join(output_path, 'eval')

    for directory in [output_metadata_path, output_train_path, output_valid_path]:
        os.makedirs(directory, exist_ok=True)

    # One output shard per worker, named part-r-00000, part-r-00001, ...
    train_workers, valid_workers = [], []
    output_train_paths, output_valid_paths = [], []
    for worker in range(workers):
        part_number = str(worker).rjust(5, '0')
        record_train_path = os.path.join(output_train_path, f'part-r-{part_number}')
        record_valid_path = os.path.join(output_valid_path, f'part-r-{part_number}')
        output_train_paths.append(record_train_path)
        output_valid_paths.append(record_valid_path)

    logging.warning(f'Prebatch size set to {prebatch_size}')
    logging.warning(f'Number of TFRecords set to {workers}')

    logging.warning(f'Reading training parquets from {train_path}')
    df_train = pd.read_parquet(train_path, engine='pyarrow')
    logging.warning('Done')

    logging.warning(f'Removing training columns {exclude_columns}')
    df_train = df_train.drop(columns=exclude_columns)
    logging.warning('Done')

    # The metadata is derived from the training dataframe in a separate process.
    logging.warning(f'Creating metadata in {output_metadata_path}')
    metadata_worker = Process(target=create_metadata, args=(df_train, prebatch_size, output_metadata_path))
    metadata_worker.start()

    logging.warning(f'Creating training TFRecords in {output_train_paths}')
    # Split the dataframe into per-worker slices rounded up to a multiple of prebatch_size.
    shape = df_train.shape[0] // workers
    shape = shape + (prebatch_size - shape % prebatch_size)
    for worker_index in range(workers):
        df_subset = df_train.loc[worker_index * shape:(worker_index + 1) * shape - 1]
        worker = Process(target=create_tf_records, args=(df_subset, prebatch_size, output_train_paths[worker_index]))
        train_workers.append(worker)
    for worker in train_workers:
        worker.start()

    logging.warning(f'Reading validation parquets from {valid_path}')
    df_valid = pd.read_parquet(valid_path, engine='pyarrow')
    logging.warning('Done')

    logging.warning(f'Removing validation columns {exclude_columns}')
    df_valid = df_valid.drop(columns=exclude_columns)
    logging.warning('Done')

    logging.warning(f'Creating validation TFRecords in {output_valid_paths}')
    shape = df_valid.shape[0] // workers
    shape = shape + (prebatch_size - shape % prebatch_size)
    for worker_index in range(workers):
        df_subset = df_valid.loc[worker_index * shape:(worker_index + 1) * shape - 1]
        worker = Process(target=create_tf_records, args=(df_subset, prebatch_size, output_valid_paths[worker_index]))
        valid_workers.append(worker)
    for worker in valid_workers:
        worker.start()

    # Wait for the metadata writer and every shard writer to finish.
    metadata_worker.join()
    for worker_index in range(workers):
        train_workers[worker_index].join()
        valid_workers[worker_index].join()
    logging.warning('Done')

    del df_train
    del df_valid
    return output_path


def nvt_to_tfrecords(config):
    """Entry point: read dataset paths and the worker count from the config dict
    and run the conversion with the predefined prebatch size and excluded columns."""
    path_to_nvt_dataset = config['output_bucket_folder']
    output_path = config['tfrecords_path']
    workers = config['workers']
    convert(
        path_to_nvt_dataset=path_to_nvt_dataset,
        output_path=output_path,
        prebatch_size=PREBATCH_SIZE,
        exclude_columns=EXCLUDE_COLUMNS,
        workers=workers
    )
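

# Illustrative usage sketch (not part of the original module): the config keys below
# are the ones read by nvt_to_tfrecords; the paths are placeholders, not real defaults.
#
#     from data.outbrain.nvtabular.utils.converter import nvt_to_tfrecords
#
#     config = {
#         'output_bucket_folder': '/outbrain/preprocessed',  # NVTabular parquet output (train/ and valid/)
#         'tfrecords_path': '/outbrain/tfrecords',           # destination for train/, eval/, transformed_metadata/
#         'workers': 6,                                      # number of writer processes / output shards
#     }
#     nvt_to_tfrecords(config)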