DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/nvtabular/utils/workflow.py

# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import shutil
import cudf
import cupy
import nvtabular as nvt
import rmm
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from data.outbrain.nvtabular.utils.feature_description import (
    CATEGORICAL_COLUMNS,
    DISPLAY_ID_COLUMN,
    CTR_INPUTS,
)
from nvtabular import ColumnGroup
from nvtabular.io import Shuffle
from nvtabular.ops import (
    FillMedian,
    LogOp,
    Rename,
    JoinGroupby,
    LambdaOp,
    FillMissing,
    HashBucket,
    Normalize,
)
from nvtabular.ops import Operator
from nvtabular.ops.column_similarity import ColumnSimilarity
from nvtabular.utils import device_mem_size, get_rmm_size
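
# The raw Outbrain event timestamps are stored as millisecond offsets; adding this
# delta converts them back to absolute epoch milliseconds before they are compared
# against the documents' publish times in DaysSincePublished.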
TIMESTAMP_DELTA = 1465876799998
def get_devices():
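    """Return the GPU ids to use: CUDA_VISIBLE_DEVICES if set, otherwise all GPUs."""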
    try:
        devices = [
            int(device) for device in os.environ["CUDA_VISIBLE_DEVICES"].split(",")
        ]
    except KeyError:
        from pynvml import nvmlInit, nvmlDeviceGetCount

        nvmlInit()
        devices = list(range(nvmlDeviceGetCount()))
    return devices


class DaysSincePublished(Operator):
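    """Compute, per publish-time column, the days between the click event and the
    document's publish date; values outside [0, 10 years] are zeroed out.
    """
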
    def transform(self, columns, gdf):
        for column in columns:
            col = gdf[column]
            col.loc[col == ""] = None
            col = col.astype("datetime64[ns]")
            timestamp = (gdf["timestamp"] + TIMESTAMP_DELTA).astype("datetime64[ms]")
            delta = (timestamp - col).dt.days
            gdf[column + "_days_since_published"] = (
                delta * (delta >= 0) * (delta <= 10 * 365)
            )
        return gdf

    def output_column_names(self, columns):
        return [column + "_days_since_published" for column in columns]

    def dependencies(self):
        return ["timestamp"]


def _df_to_coo(df, row="document_id", col=None, data="confidence_level"):
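    """Build a sparse document_id x `col` COO matrix weighted by the `data` column."""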
    return cupy.sparse.coo_matrix((df[data].values, (df[row].values, df[col].values)))


def setup_rmm_pool(client, pool_size):
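    """Initialize an RMM memory pool of `pool_size` bytes on every Dask worker."""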
    pool_size = get_rmm_size(pool_size)
    client.run(rmm.reinitialize, pool_allocator=True, initial_pool_size=pool_size)
    return None


def create_client(devices, local_directory):
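    """Start a LocalCUDACluster and Dask client when more than one GPU is requested;
    return None for single-GPU runs, which execute the workflow locally.
    """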
    client = None

    if len(devices) > 1:
        device_size = device_mem_size(kind="total")
        device_limit = int(0.8 * device_size)
        device_pool_size = int(0.8 * device_size)
        cluster = LocalCUDACluster(
            n_workers=len(devices),
            CUDA_VISIBLE_DEVICES=",".join(str(x) for x in devices),
            device_memory_limit=device_limit,
            local_directory=local_directory,
        )
        client = Client(cluster)
        setup_rmm_pool(client, device_pool_size)

    return client


def create_workflow(data_bucket_folder, hash_spec, devices, local_directory, dask):
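    """Build the NVTabular preprocessing workflow for the Outbrain data.

    The workflow produces geo-location prefixes (country/state), days-since-published
    features, thresholded CTR statistics per id column, hashed categorical features,
    and TF-IDF similarities between the viewed and promoted documents' categories,
    topics and entities.
    """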
    rmm.reinitialize(managed_memory=False)

    documents_categories_path = os.path.join(
        data_bucket_folder, "documents_categories.csv"
    )
    documents_topics_path = os.path.join(data_bucket_folder, "documents_topics.csv")
    documents_entities_path = os.path.join(data_bucket_folder, "documents_entities.csv")

    documents_categories_cudf = cudf.read_csv(documents_categories_path)
    documents_topics_cudf = cudf.read_csv(documents_topics_path)
    documents_entities_cudf = cudf.read_csv(documents_entities_path)
    documents_entities_cudf["entity_id"] = (
        documents_entities_cudf["entity_id"].astype("category").cat.codes
    )

    # Sparse document-feature matrices consumed by the ColumnSimilarity ops below.
    categories = _df_to_coo(documents_categories_cudf, col="category_id")
    topics = _df_to_coo(documents_topics_cudf, col="topic_id")
    entities = _df_to_coo(documents_entities_cudf, col="entity_id")
    del documents_categories_cudf, documents_topics_cudf, documents_entities_cudf

    # Minimum number of impressions an id must have before its empirical CTR is used;
    # below the threshold the CTR feature is set to 0.
    ctr_thresh = {
        "ad_id": 5,
        "source_id_promo": 10,
        "publisher_id_promo": 10,
        "advertiser_id": 10,
        "campaign_id": 10,
        "document_id_promo": 5,
    }

    ctr_inputs = ColumnGroup(CTR_INPUTS)
    cat_cols = ColumnGroup(CATEGORICAL_COLUMNS)

    geo_location = ColumnGroup(["geo_location"])
    country = (
        geo_location >> (lambda col: col.str.slice(0, 2)) >> Rename(postfix="_country")
    )
    state = (
        geo_location >> (lambda col: col.str.slice(0, 5)) >> Rename(postfix="_state")
    )
    geo_features = geo_location + country + state

    dates = ["publish_time", "publish_time_promo"]
    date_features = dates >> DaysSincePublished() >> FillMedian() >> LogOp

    stat_cols = ctr_inputs >> JoinGroupby(cont_cols=["clicked"], stats=["sum", "count"])
    # Empirical CTR per id column: clicked_sum / count, zeroed when count is below
    # the per-column threshold, then renamed *_ctr.
    ctr_cols = (
        stat_cols
        - [column + "_count" for column in ctr_inputs.flattened_columns]
        >> LambdaOp(
            f=lambda col, gdf: (
                (col) / (gdf[col.name.replace("_clicked_sum", "_count")])
            ).where(
                gdf[col.name.replace("_clicked_sum", "_count")]
                >= ctr_thresh[col.name.replace("_clicked_sum", "")],
                0,
            ),
            dependency=stat_cols
            - [column + "_clicked_sum" for column in ctr_inputs.flattened_columns],
        )
        >> Rename(f=lambda x: x.replace("_clicked_sum", "_ctr"))
    )
    stat_cols = stat_cols >> FillMissing() >> LogOp() >> Normalize()
    ctr_cols = ctr_cols >> FillMissing()

    cat_cols = cat_cols + geo_features >> HashBucket(hash_spec)

    features = (
        date_features + ctr_cols + stat_cols + cat_cols + ["clicked", "display_id"]
    )
    sim_features_categ = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(categories, metric="tfidf", on_device=False)
        >> Rename(postfix="_categories")
    )
    sim_features_topics = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(topics, metric="tfidf", on_device=False)
        >> Rename(postfix="_topics")
    )
    sim_features_entities = (
        [["document_id", "document_id_promo"]]
        >> ColumnSimilarity(entities, metric="tfidf", on_device=False)
        >> Rename(postfix="_entities")
    )
    sim_features = sim_features_categ + sim_features_topics + sim_features_entities

    client = (
        create_client(devices=devices, local_directory=local_directory)
        if dask
        else None
    )

    workflow = nvt.Workflow(column_group=features + sim_features, client=client)

    return workflow


def create_parquets(data_bucket_folder, train_path, valid_path):
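    """Merge the raw Outbrain CSVs into a single table and write train/validation
    parquet files, split by display_id with a fixed random seed.
    """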
    cupy.random.seed(seed=0)
    rmm.reinitialize(managed_memory=True)

    documents_meta_path = os.path.join(data_bucket_folder, "documents_meta.csv")
    clicks_train_path = os.path.join(data_bucket_folder, "clicks_train.csv")
    events_path = os.path.join(data_bucket_folder, "events.csv")
    promoted_content_path = os.path.join(data_bucket_folder, "promoted_content.csv")

    documents_meta = cudf.read_csv(documents_meta_path, na_values=["\\N", ""])
    documents_meta = documents_meta.dropna(subset="source_id")
    # Give documents with an unknown publisher a unique publisher_id instead of NaN.
    documents_meta["publisher_id"].fillna(
        documents_meta["publisher_id"].isnull().cumsum()
        + documents_meta["publisher_id"].max()
        + 1,
        inplace=True,
    )

    # Join the clicks with their display events, the promoted-content metadata, and
    # the document metadata of both the viewed and the promoted document.
    merged = (
        cudf.read_csv(clicks_train_path, na_values=["\\N", ""])
        .merge(
            cudf.read_csv(events_path, na_values=["\\N", ""]),
            on=DISPLAY_ID_COLUMN,
            how="left",
            suffixes=("", "_event"),
        )
        .merge(
            cudf.read_csv(promoted_content_path, na_values=["\\N", ""]),
            on="ad_id",
            how="left",
            suffixes=("", "_promo"),
        )
        .merge(documents_meta, on="document_id", how="left")
        .merge(
            documents_meta,
            left_on="document_id_promo",
            right_on="document_id",
            how="left",
            suffixes=("", "_promo"),
        )
    )
    # Day index relative to the start of the dataset (timestamps are in milliseconds).
    merged["day_event"] = (merged["timestamp"] / 1000 / 60 / 60 / 24).astype(int)
    # Fill missing platform with 1, then shift it to a 0-based index.
    merged["platform"] = merged["platform"].fillna(1)
    merged["platform"] = merged["platform"] - 1

    # Split by display_id: an ~80% random sample of the first 11 days becomes the
    # training set; all remaining displays form the validation set.
    display_event = (
        merged[[DISPLAY_ID_COLUMN, "day_event"]].drop_duplicates().reset_index()
    )
    random_state = cudf.Series(cupy.random.uniform(size=len(display_event)))
    valid_ids, train_ids = display_event.scatter_by_map(
        ((display_event.day_event <= 10) & (random_state > 0.2)).astype(int)
    )
    valid_ids = valid_ids[DISPLAY_ID_COLUMN].drop_duplicates()
    train_ids = train_ids[DISPLAY_ID_COLUMN].drop_duplicates()

    valid_set = merged[merged[DISPLAY_ID_COLUMN].isin(valid_ids)]
    train_set = merged[merged[DISPLAY_ID_COLUMN].isin(train_ids)]
    valid_set = valid_set.sort_values(DISPLAY_ID_COLUMN)

    train_set.to_parquet(train_path, compression=None)
    valid_set.to_parquet(valid_path, compression=None)
    del merged, train_set, valid_set


def save_stats(
    data_bucket_folder,
    output_train_folder,
    train_path,
    output_valid_folder,
    valid_path,
    stats_file,
    hash_spec,
    local_directory,
    dask,
):
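    """Fit the preprocessing workflow on the training parquet, write the transformed
    train/validation sets, and save the fitted workflow to `stats_file`.
    """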
    devices = get_devices()
    shuffle = Shuffle.PER_PARTITION if len(devices) > 1 else True
    workflow = create_workflow(
        data_bucket_folder=data_bucket_folder,
        hash_spec=hash_spec,
        devices=devices,
        local_directory=local_directory,
        dask=dask,
    )

    train_dataset = nvt.Dataset(train_path, part_size="1GB")
    valid_dataset = nvt.Dataset(valid_path, part_size="150MB")

    # Statistics (medians, group-by sums/counts, normalization moments) are fitted on
    # the training split only and then applied to both splits.
    workflow.fit(train_dataset)
    workflow.transform(train_dataset).to_parquet(
        output_path=output_train_folder, shuffle=shuffle, out_files_per_proc=8
    )
    workflow.transform(valid_dataset).to_parquet(
        output_path=output_valid_folder, shuffle=None, output_files=8
    )
    workflow.save(stats_file)

    return workflow


def clean(path):
    shutil.rmtree(path)


def execute_pipeline(config):
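    """Run the end-to-end preprocessing pipeline described by `config`, then remove
    temporary artifacts.
    """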
    required_folders = [
        config["temporary_folder"],
        config["output_train_folder"],
        config["output_valid_folder"],
    ]
    for folder in required_folders:
        os.makedirs(folder, exist_ok=True)

    create_parquets(
        data_bucket_folder=config["data_bucket_folder"],
        train_path=config["train_path"],
        valid_path=config["valid_path"],
    )
    save_stats(
        data_bucket_folder=config["data_bucket_folder"],
        output_train_folder=config["output_train_folder"],
        train_path=config["train_path"],
        output_valid_folder=config["output_valid_folder"],
        valid_path=config["valid_path"],
        stats_file=config["stats_file"],
        hash_spec=config["hash_spec"],
        local_directory=config["temporary_folder"],
        dask=config["dask"],
    )
    clean(config["temporary_folder"])
    clean("./categories")