DeepLearningExamples/TensorFlow2/Recommendation/WideAndDeep/data/outbrain/spark/utils/feature_description.py
2021-03-04 14:25:59 +01:00

137 lines
3.4 KiB
Python

# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Names of the special (non-feature) columns: the label, the display id,
# the leak flag, and a column that encodes display id and leak flag together.
LABEL_COLUMN = 'label'
DISPLAY_ID_COLUMN = "display_id"
IS_LEAK_COLUMN = "is_leak"
# NOTE(review): the value says "display_ad" while the constant name says
# "DISPLAY_ID" -- kept as-is because the string is the actual column name.
DISPLAY_ID_AND_IS_LEAK_ENCODED_COLUMN = "display_ad_and_is_leak"
# Single-valued categorical feature columns.
CATEGORICAL_COLUMNS = [
    "ad_id",
    "campaign_id",
    "doc_id",
    "doc_event_id",
    "ad_advertiser",
    "doc_ad_source_id",
    "doc_ad_publisher_id",
    "doc_event_publisher_id",
    "doc_event_source_id",
    "event_country",
    "event_country_state",
    "event_geo_location",
    "event_platform",
]

# Multi-valued categorical document columns; an empty dict in this file.
DOC_CATEGORICAL_MULTIVALUED_COLUMNS = {}

# Boolean feature columns; none are declared in this file.
BOOL_COLUMNS = []
# Integer-valued feature columns.
INT_COLUMNS = [
    "ad_views",
    "doc_views",
    "doc_event_days_since_published",
    "doc_ad_days_since_published",
]
# Float feature columns, grouped by the transform named in each constant.
# Only the "no transform" group has entries in this file.
FLOAT_COLUMNS_LOG_BIN_TRANSFORM = []
FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM = []
FLOAT_COLUMNS_NO_TRANSFORM = [
    "pop_ad_id",
    "pop_document_id",
    "pop_publisher_id",
    "pop_advertiser_id",
    # NOTE(review): "campain" spelling is also used in ITEM_NUMERIC_COLUMNS;
    # it is a column name, so do not "fix" it here in isolation.
    "pop_campain_id",
    "pop_source_id",
    "doc_event_doc_ad_sim_categories",
    "doc_event_doc_ad_sim_topics",
    "doc_event_doc_ad_sim_entities",
]

# All float columns as a new list, in the order:
# log-bin, simple-bin, no-transform.
FLOAT_COLUMNS = [
    *FLOAT_COLUMNS_LOG_BIN_TRANSFORM,
    *FLOAT_COLUMNS_SIMPLE_BIN_TRANSFORM,
    *FLOAT_COLUMNS_NO_TRANSFORM,
]
# Request-side feature columns, split by the kind named in each constant:
# single-hot, multi-hot and numeric.
REQUEST_SINGLE_HOT_COLUMNS = [
    'doc_event_id',
    'doc_id',
    'doc_event_source_id',
    'event_geo_location',
    'event_country_state',
    'doc_event_publisher_id',
    'event_country',
    'event_hour',
    'event_platform',
    'traffic_source',
    'event_weekend',
    'user_has_already_viewed_doc',
]

REQUEST_MULTI_HOT_COLUMNS = [
    'doc_event_entity_id',
    'doc_event_topic_id',
    'doc_event_category_id',
]

REQUEST_NUMERIC_COLUMNS = [
    # "*_conf" columns
    'pop_document_id_conf',
    'pop_publisher_id_conf',
    'pop_source_id_conf',
    'pop_entity_id_conf',
    'pop_topic_id_conf',
    'pop_category_id_conf',
    # "pop_*" columns without the "_conf" suffix
    'pop_document_id',
    'pop_publisher_id',
    'pop_source_id',
    'pop_entity_id',
    'pop_topic_id',
    'pop_category_id',
    # remaining numeric columns
    'user_views',
    'doc_views',
    'doc_event_days_since_published',
    'doc_event_hour',
]
# Item-side feature columns, split by the kind named in each constant:
# single-hot, multi-hot and numeric.
ITEM_SINGLE_HOT_COLUMNS = [
    'ad_id',
    'campaign_id',
    'doc_ad_source_id',
    'ad_advertiser',
    'doc_ad_publisher_id',
]

ITEM_MULTI_HOT_COLUMNS = [
    'doc_ad_topic_id',
    'doc_ad_entity_id',
    'doc_ad_category_id',
]

ITEM_NUMERIC_COLUMNS = [
    # "*_conf" columns
    'pop_ad_id_conf',
    'user_doc_ad_sim_categories_conf',
    'user_doc_ad_sim_topics_conf',
    'pop_advertiser_id_conf',
    # "pop_*" columns without the "_conf" suffix
    'pop_ad_id',
    'pop_advertiser_id',
    # NOTE(review): "campain" spelling matches FLOAT_COLUMNS_NO_TRANSFORM;
    # it is a column name, so do not "fix" it here in isolation.
    'pop_campain_id',
    # "*_sim_*" columns
    'user_doc_ad_sim_categories',
    'user_doc_ad_sim_topics',
    'user_doc_ad_sim_entities',
    'doc_event_doc_ad_sim_categories',
    'doc_event_doc_ad_sim_topics',
    'doc_event_doc_ad_sim_entities',
    # remaining numeric columns
    'ad_views',
    'doc_ad_days_since_published',
]
# Full training column list as a new list: the three request groups followed
# by the three item groups, each in single-hot, multi-hot, numeric order.
NV_TRAINING_COLUMNS = [
    *REQUEST_SINGLE_HOT_COLUMNS,
    *REQUEST_MULTI_HOT_COLUMNS,
    *REQUEST_NUMERIC_COLUMNS,
    *ITEM_SINGLE_HOT_COLUMNS,
    *ITEM_MULTI_HOT_COLUMNS,
    *ITEM_NUMERIC_COLUMNS,
]