DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/workflow.py

# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import shutil

import cudf
import cupy
import nvtabular as nvt
import rmm
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

from data.outbrain.nvtabular.utils.feature_description import CATEGORICAL_COLUMNS, CONTINUOUS_COLUMNS, \
    DISPLAY_ID_COLUMN, groupby_columns, ctr_columns
from nvtabular.io import Shuffle
from nvtabular.ops import Normalize, FillMedian, FillMissing, LogOp, LambdaOp, JoinGroupby, HashBucket
from nvtabular.ops.column_similarity import ColumnSimilarity
from nvtabular.utils import device_mem_size, get_rmm_size
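
# Offset (in milliseconds) added to the dataset's relative event timestamps to turn
# them into absolute epoch timestamps before comparing against document publish dates.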
TIMESTAMP_DELTA = 1465876799998
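

# Resolve the GPU ids to use: honor CUDA_VISIBLE_DEVICES when it is set, otherwise
# fall back to every device reported by NVML.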
def get_devices():
    try:
        devices = [int(device) for device in os.environ["CUDA_VISIBLE_DEVICES"].split(",")]
    except KeyError:
        from pynvml import nvmlInit, nvmlDeviceGetCount
        nvmlInit()
        devices = list(range(nvmlDeviceGetCount()))
    return devices
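

# Days elapsed between the (shifted) event timestamp and a document's publish time.
# Empty publish times become nulls, and deltas outside the 0-10 year range are zeroed.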
def _calculate_delta(col, gdf):
    col.loc[col == ''] = None
    col = col.astype('datetime64[ns]')
    timestamp = (gdf['timestamp'] + TIMESTAMP_DELTA).astype('datetime64[ms]')
    delta = (timestamp - col).dt.days
    delta = delta * (delta >= 0) * (delta <= 10 * 365)
    return delta
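

# Build a sparse document-by-feature COO matrix (e.g. document x category) weighted by
# the confidence_level column; these matrices feed the ColumnSimilarity ops below.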
def _df_to_coo(df, row='document_id', col=None, data='confidence_level'):
    return cupy.sparse.coo_matrix((df[data].values, (df[row].values, df[col].values)))
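

# Initialize an RMM pool allocator of the requested size on every Dask worker.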
def setup_rmm_pool(client, pool_size):
    pool_size = get_rmm_size(pool_size)
    client.run(rmm.reinitialize, pool_allocator=True, initial_pool_size=pool_size)
    return None
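

# With more than one GPU, start a LocalCUDACluster (capped at 80% of device memory)
# and attach a Dask client with RMM pools; with a single GPU, return None so the
# workflow runs without a distributed client.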
def create_client(devices, local_directory):
    client = None

    if len(devices) > 1:
        device_size = device_mem_size(kind="total")
        device_limit = int(0.8 * device_size)
        device_pool_size = int(0.8 * device_size)
        cluster = LocalCUDACluster(
            n_workers=len(devices),
            CUDA_VISIBLE_DEVICES=",".join(str(x) for x in devices),
            device_memory_limit=device_limit,
            local_directory=local_directory
        )
        client = Client(cluster)
        setup_rmm_pool(client, device_pool_size)

    return client
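

# Assemble the NVTabular workflow: geo-location country/state splits, days-since-published
# features, per-id CTR features derived from clicked sums and counts (thresholded by
# ctr_thresh), fill/log/normalize transforms, TF-IDF document-similarity features against
# the category/topic/entity matrices, and hash bucketing of the categorical columns.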
def create_workflow(data_bucket_folder, output_bucket_folder, hash_spec, devices, local_directory):
    rmm.reinitialize(managed_memory=False)
    documents_categories_path = os.path.join(data_bucket_folder, 'documents_categories.csv')
    documents_topics_path = os.path.join(data_bucket_folder, 'documents_topics.csv')
    documents_entities_path = os.path.join(data_bucket_folder, 'documents_entities.csv')

    documents_categories_cudf = cudf.read_csv(documents_categories_path)
    documents_topics_cudf = cudf.read_csv(documents_topics_path)
    documents_entities_cudf = cudf.read_csv(documents_entities_path)
    documents_entities_cudf['entity_id'] = documents_entities_cudf['entity_id'].astype('category').cat.codes

    categories = _df_to_coo(documents_categories_cudf, col='category_id')
    topics = _df_to_coo(documents_topics_cudf, col='topic_id')
    entities = _df_to_coo(documents_entities_cudf, col='entity_id')

    del documents_categories_cudf, documents_topics_cudf, documents_entities_cudf

    ctr_thresh = {
        'ad_id': 5,
        'source_id_promo': 10,
        'publisher_id_promo': 10,
        'advertiser_id': 10,
        'campaign_id': 10,
        'document_id_promo': 5,
    }

    client = create_client(
        devices=devices,
        local_directory=local_directory
    )

    workflow = nvt.Workflow(
        cat_names=CATEGORICAL_COLUMNS,
        cont_names=CONTINUOUS_COLUMNS,
        label_name=['clicked'],
        client=client
    )

    workflow.add_feature([
        LambdaOp(
            op_name='country',
            f=lambda col, gdf: col.str.slice(0, 2),
            columns=['geo_location'], replace=False),
        LambdaOp(
            op_name='state',
            f=lambda col, gdf: col.str.slice(0, 5),
            columns=['geo_location'], replace=False),
        LambdaOp(
            op_name='days_since_published',
            f=_calculate_delta,
            columns=['publish_time', 'publish_time_promo'], replace=False),
        FillMedian(columns=['publish_time_days_since_published', 'publish_time_promo_days_since_published']),
        JoinGroupby(columns=['ad_id', 'source_id_promo', 'document_id_promo', 'publisher_id_promo', 'advertiser_id',
                             'campaign_id'],
                    cont_names=['clicked'], out_path=output_bucket_folder, stats=['sum', 'count']),
        LambdaOp(
            op_name='ctr',
            f=lambda col, gdf: ((col) / (gdf[col.name.replace('_clicked_sum', '_count')])).where(
                gdf[col.name.replace('_clicked_sum', '_count')] >= ctr_thresh[col.name.replace('_clicked_sum', '')], 0),
            columns=['ad_id_clicked_sum', 'source_id_promo_clicked_sum', 'document_id_promo_clicked_sum',
                     'publisher_id_promo_clicked_sum',
                     'advertiser_id_clicked_sum', 'campaign_id_clicked_sum'], replace=False),
        FillMissing(columns=groupby_columns + ctr_columns),
        LogOp(
            columns=groupby_columns + ['publish_time_days_since_published', 'publish_time_promo_days_since_published']),
        Normalize(columns=groupby_columns),
        ColumnSimilarity('doc_event_doc_ad_sim_categories', 'document_id', categories, 'document_id_promo',
                         metric='tfidf', on_device=False),
        ColumnSimilarity('doc_event_doc_ad_sim_topics', 'document_id', topics, 'document_id_promo', metric='tfidf',
                         on_device=False),
        ColumnSimilarity('doc_event_doc_ad_sim_entities', 'document_id', entities, 'document_id_promo', metric='tfidf',
                         on_device=False)
    ])

    workflow.add_cat_preprocess([
        HashBucket(hash_spec)
    ])

    workflow.finalize()

    return workflow
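

# Merge the raw Outbrain CSVs (clicks, events, promoted content, document metadata)
# into a single frame and split it by display_id into train/validation parquet files:
# roughly 80% of the events from the earliest days (day_event <= 10) go to train, the
# remaining 20% plus all later days go to validation.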
def create_parquets(data_bucket_folder, train_path, valid_path):
    cupy.random.seed(seed=0)
    rmm.reinitialize(managed_memory=True)

    documents_meta_path = os.path.join(data_bucket_folder, 'documents_meta.csv')
    clicks_train_path = os.path.join(data_bucket_folder, 'clicks_train.csv')
    events_path = os.path.join(data_bucket_folder, 'events.csv')
    promoted_content_path = os.path.join(data_bucket_folder, 'promoted_content.csv')

    documents_meta = cudf.read_csv(documents_meta_path, na_values=['\\N', ''])
    documents_meta = documents_meta.dropna(subset='source_id')
    documents_meta['publisher_id'].fillna(
        documents_meta['publisher_id'].isnull().cumsum() + documents_meta['publisher_id'].max() + 1, inplace=True)

    merged = (cudf.read_csv(clicks_train_path, na_values=['\\N', ''])
              .merge(cudf.read_csv(events_path, na_values=['\\N', '']), on=DISPLAY_ID_COLUMN, how='left',
                     suffixes=('', '_event'))
              .merge(cudf.read_csv(promoted_content_path, na_values=['\\N', '']), on='ad_id',
                     how='left',
                     suffixes=('', '_promo'))
              .merge(documents_meta, on='document_id', how='left')
              .merge(documents_meta, left_on='document_id_promo', right_on='document_id', how='left',
                     suffixes=('', '_promo')))

    merged['day_event'] = (merged['timestamp'] / 1000 / 60 / 60 / 24).astype(int)
    merged['platform'] = merged['platform'].fillna(1)
    merged['platform'] = merged['platform'] - 1

    display_event = merged[[DISPLAY_ID_COLUMN, 'day_event']].drop_duplicates().reset_index()
    random_state = cudf.Series(cupy.random.uniform(size=len(display_event)))

    valid_ids, train_ids = display_event.scatter_by_map(
        ((display_event.day_event <= 10) & (random_state > 0.2)).astype(int))
    valid_ids = valid_ids[DISPLAY_ID_COLUMN].drop_duplicates()
    train_ids = train_ids[DISPLAY_ID_COLUMN].drop_duplicates()

    valid_set = merged[merged[DISPLAY_ID_COLUMN].isin(valid_ids)]
    train_set = merged[merged[DISPLAY_ID_COLUMN].isin(train_ids)]
    valid_set = valid_set.sort_values(DISPLAY_ID_COLUMN)

    train_set.to_parquet(train_path, compression=None)
    valid_set.to_parquet(valid_path, compression=None)

    del merged, train_set, valid_set
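

# Apply the workflow to the train set (recording statistics, shuffled on write) and to
# the validation set, write the transformed parquet outputs, and persist the collected
# statistics to stats_file.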
def save_stats(data_bucket_folder, output_bucket_folder,
               output_train_folder, train_path, output_valid_folder,
               valid_path, stats_file, hash_spec, local_directory):
    devices = get_devices()
    shuffle = Shuffle.PER_PARTITION if len(devices) > 1 else True

    workflow = create_workflow(data_bucket_folder=data_bucket_folder,
                               output_bucket_folder=output_bucket_folder,
                               hash_spec=hash_spec,
                               devices=devices,
                               local_directory=local_directory)

    train_dataset = nvt.Dataset(train_path, part_mem_fraction=0.12)
    valid_dataset = nvt.Dataset(valid_path, part_mem_fraction=0.12)

    workflow.apply(train_dataset, record_stats=True, output_path=output_train_folder, shuffle=shuffle,
                   out_files_per_proc=5)
    workflow.apply(valid_dataset, record_stats=False, output_path=output_valid_folder, shuffle=None,
                   out_files_per_proc=None)

    workflow.save_stats(stats_file)

    return workflow


def clean(path):
    shutil.rmtree(path)
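

# End-to-end preprocessing entry point: create the intermediate train/validation
# parquets, fit and apply the NVTabular workflow, then remove the temporary folder.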
def execute_pipeline(config):
    required_folders = [config['temporary_folder'], config['output_train_folder'], config['output_valid_folder']]
    for folder in required_folders:
        os.makedirs(folder, exist_ok=True)

    create_parquets(
        data_bucket_folder=config['data_bucket_folder'],
        train_path=config['train_path'],
        valid_path=config['valid_path']
    )
    save_stats(
        data_bucket_folder=config['data_bucket_folder'],
        output_bucket_folder=config['output_bucket_folder'],
        output_train_folder=config['output_train_folder'],
        train_path=config['train_path'],
        output_valid_folder=config['output_valid_folder'],
        valid_path=config['valid_path'],
        stats_file=config['stats_file'],
        hash_spec=config['hash_spec'],
        local_directory=config['temporary_folder']
    )
    clean(config['temporary_folder'])
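

# Illustrative usage sketch only -- the paths and hash sizes below are placeholders,
# not values shipped with this repository; the config keys match what execute_pipeline
# and save_stats read above:
#
#     config = {
#         'data_bucket_folder': '/outbrain/orig',                  # raw Kaggle CSVs
#         'output_bucket_folder': '/outbrain/data',                # JoinGroupby statistics
#         'temporary_folder': '/tmp/preprocessed',                 # intermediate parquets, removed at the end
#         'train_path': '/tmp/preprocessed/train_gdf.parquet',
#         'valid_path': '/tmp/preprocessed/valid_gdf.parquet',
#         'output_train_folder': '/outbrain/train',
#         'output_valid_folder': '/outbrain/valid',
#         'stats_file': '/outbrain/data/stats_wnd_workflow',
#         'hash_spec': {'ad_id': 250000, 'document_id': 300000},   # column -> number of hash buckets
#     }
#     execute_pipeline(config)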