# Source: DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/features.py
# (133 lines, 3.7 KiB, Python)

# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import tensorflow as tf
# Key of the display identifier; get_features_keys() lists it after the
# categorical and numeric feature names.
DISPLAY_ID_COLUMN = "display_id"

# Features that get_feature_columns() turns into float32 numeric columns.
NUMERIC_COLUMNS = [
    "document_id_document_id_promo_sim_categories",
    "document_id_document_id_promo_sim_topics",
    "document_id_document_id_promo_sim_entities",
    "document_id_promo_ctr",
    "publisher_id_promo_ctr",
    "source_id_promo_ctr",
    "document_id_promo_count",
    "publish_time_days_since_published",
    "ad_id_ctr",
    "advertiser_id_ctr",
    "campaign_id_ctr",
    "ad_id_count",
    "publish_time_promo_days_since_published",
]

# Features that get_feature_columns() turns into identity categorical columns
# and their embedding-column wrappers.
CATEGORICAL_COLUMNS = [
    "ad_id",
    "document_id",
    "platform",
    "document_id_promo",
    "campaign_id",
    "advertiser_id",
    "source_id",
    "geo_location",
    "geo_location_country",
    "geo_location_state",
    "publisher_id",
    "source_id_promo",
    "publisher_id_promo",
]

# Bucket count per categorical column (first element of each table shape;
# passed as `num_buckets` to the identity column).
HASH_BUCKET_SIZES = {
    "document_id": 300000,
    "ad_id": 250000,
    "document_id_promo": 100000,
    "source_id_promo": 4000,
    "source_id": 4000,
    "geo_location": 2500,
    "advertiser_id": 2500,
    "geo_location_state": 2000,
    "publisher_id_promo": 1000,
    "publisher_id": 1000,
    "geo_location_country": 300,
    "platform": 4,
    "campaign_id": 5000,
}

# Embedding width per categorical column (second element of each table shape;
# passed as `dimension` to the embedding column).
EMBEDDING_DIMENSIONS = {
    "document_id": 128,
    "ad_id": 128,
    "document_id_promo": 128,
    "source_id_promo": 64,
    "source_id": 64,
    "geo_location": 64,
    "advertiser_id": 64,
    "geo_location_state": 64,
    "publisher_id_promo": 64,
    "publisher_id": 64,
    "geo_location_country": 64,
    "platform": 19,
    "campaign_id": 128,
}

# (bucket count, embedding dimension) for every categorical column, keyed in
# CATEGORICAL_COLUMNS order.
EMBEDDING_TABLE_SHAPES = {
    name: (HASH_BUCKET_SIZES[name], EMBEDDING_DIMENSIONS[name])
    for name in CATEGORICAL_COLUMNS
}
def get_features_keys():
    """Return all input keys: categorical names, numeric names, then the display id.

    A fresh list is built on every call, so the module-level column lists are
    never handed out directly.
    """
    return [*CATEGORICAL_COLUMNS, *NUMERIC_COLUMNS, DISPLAY_ID_COLUMN]
def get_feature_columns():
    """Build the feature-column lists for the wide and the deep inputs.

    Each name in CATEGORICAL_COLUMNS produces an identity categorical column
    (appended to the wide list) and a mean-combined embedding column wrapping
    it (appended to the deep list). Each name in NUMERIC_COLUMNS produces one
    float32 numeric column of shape (1,); the same column objects are added to
    both lists. The list sizes and the size of their intersection are logged
    at WARNING level on the "tensorflow" logger.

    Returns:
        A ``(wide_columns, deep_columns)`` tuple of lists.

    Raises:
        ValueError: If a categorical column has no entry in
            EMBEDDING_TABLE_SHAPES.
    """
    tf_logger = logging.getLogger("tensorflow")
    wide_columns = []
    deep_columns = []

    for column_name in CATEGORICAL_COLUMNS:
        # Guard first: every categorical feature needs a known table shape.
        if column_name not in EMBEDDING_TABLE_SHAPES:
            raise ValueError(f"Unexpected categorical column found {column_name}")

        bucket_count, embedding_dim = EMBEDDING_TABLE_SHAPES[column_name]
        identity_column = tf.feature_column.categorical_column_with_identity(
            column_name, num_buckets=bucket_count
        )
        embedded_column = tf.feature_column.embedding_column(
            identity_column,
            dimension=embedding_dim,
            combiner="mean",
        )
        wide_columns.append(identity_column)
        deep_columns.append(embedded_column)

    # Numeric features are shared verbatim between the wide and deep lists.
    shared_numeric_columns = []
    for column_name in NUMERIC_COLUMNS:
        if column_name == DISPLAY_ID_COLUMN:
            continue
        shared_numeric_columns.append(
            tf.feature_column.numeric_column(
                column_name, shape=(1,), dtype=tf.float32
            )
        )
    wide_columns.extend(shared_numeric_columns)
    deep_columns.extend(shared_numeric_columns)

    tf_logger.warning("deep columns: {}".format(len(deep_columns)))
    tf_logger.warning("wide columns: {}".format(len(wide_columns)))
    overlap = set(wide_columns) & set(deep_columns)
    tf_logger.warning("wide&deep intersection: {}".format(len(overlap)))

    return wide_columns, deep_columns