# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Column-name constants and the "nvt" <-> "spark" column-name mapping.

The module defines lists of column names grouped by role (continuous,
numeric, categorical, CTR inputs, excluded) and a pair of dictionaries,
``nvt_to_spark`` and ``spark_to_nvt``, that translate a column name from one
naming scheme to the other.
"""

DISPLAY_ID_COLUMN = "display_id"

BASE_CONT_COLUMNS = [
    "publish_time",
    "publish_time_promo",
    "timestamp",
    "document_id_promo_clicked_sum_ctr",
    "publisher_id_promo_clicked_sum_ctr",
    "source_id_promo_clicked_sum_ctr",
    "document_id_promo_count",
    "publish_time_days_since_published",
    "ad_id_clicked_sum_ctr",
    "advertiser_id_clicked_sum_ctr",
    "campaign_id_clicked_sum_ctr",
    "ad_id_count",
    "publish_time_promo_days_since_published",
]

SIM_COLUMNS = [
    "doc_event_doc_ad_sim_categories",
    "doc_event_doc_ad_sim_topics",
    "doc_event_doc_ad_sim_entities",
]

# Base columns first, then the similarity columns, then the display id.
CONTINUOUS_COLUMNS = BASE_CONT_COLUMNS + SIM_COLUMNS + [DISPLAY_ID_COLUMN]

# NUMERIC_COLUMNS is CONTINUOUS_COLUMNS minus these entries, order preserved.
exclude_conts = ["publish_time", "publish_time_promo", "timestamp"]
NUMERIC_COLUMNS = [col for col in CONTINUOUS_COLUMNS if col not in exclude_conts]

CATEGORICAL_COLUMNS = [
    "ad_id",
    "document_id",
    "platform",
    "document_id_promo",
    "campaign_id",
    "advertiser_id",
    "source_id",
    "publisher_id",
    "source_id_promo",
    "publisher_id_promo",
]

CTR_INPUTS = [
    "ad_id",
    "source_id_promo",
    "document_id_promo",
    "publisher_id_promo",
    "advertiser_id",
    "campaign_id",
]

EXCLUDE_COLUMNS = [
    "publish_time",
    "publish_time_promo",
    "timestamp",
    "ad_id_clicked_sum",
    "source_id_promo_count",
    "source_id_promo_clicked_sum",
    "document_id_promo_clicked_sum",
    "publisher_id_promo_count",
    "publisher_id_promo_clicked_sum",
    "advertiser_id_count",
    "advertiser_id_clicked_sum",
    "campaign_id_count",
    "campaign_id_clicked_sum",
    "uuid",
    "day_event",
]

# Maps each "nvt" column name (key) to its "spark" column name (value).
nvt_to_spark = {
    "ad_id": "ad_id",
    "clicked": "label",
    "display_id": "display_id",
    "document_id": "doc_event_id",
    "platform": "event_platform",
    "document_id_promo": "doc_id",
    "campaign_id": "campaign_id",
    "advertiser_id": "ad_advertiser",
    "source_id": "doc_event_source_id",
    "publisher_id": "doc_event_publisher_id",
    "source_id_promo": "doc_ad_source_id",
    "publisher_id_promo": "doc_ad_publisher_id",
    "geo_location": "event_geo_location",
    "geo_location_country": "event_country",
    "geo_location_state": "event_country_state",
    "document_id_promo_ctr": "pop_document_id",
    "publisher_id_promo_ctr": "pop_publisher_id",
    "source_id_promo_ctr": "pop_source_id",
    "document_id_promo_count": "doc_views_log_01scaled",
    "publish_time_days_since_published": "doc_event_days_since_published_log_01scaled",
    "ad_id_ctr": "pop_ad_id",
    "advertiser_id_ctr": "pop_advertiser_id",
    # NOTE(review): "pop_campain_id" spelling is kept byte-for-byte;
    # presumably it matches a column name produced elsewhere - confirm
    # before changing it.
    "campaign_id_ctr": "pop_campain_id",
    "ad_id_count": "ad_views_log_01scaled",
    "publish_time_promo_days_since_published": "doc_ad_days_since_published_log_01scaled",
    "document_id_document_id_promo_sim_categories": "doc_event_doc_ad_sim_categories",
    "document_id_document_id_promo_sim_topics": "doc_event_doc_ad_sim_topics",
    "document_id_document_id_promo_sim_entities": "doc_event_doc_ad_sim_entities",
}

# Inverse of nvt_to_spark: "spark" column name -> "nvt" column name.
spark_to_nvt = {spark_name: nvt_name for nvt_name, spark_name in nvt_to_spark.items()}


def transform_nvt_to_spark(column: str) -> str:
    """Return the "spark" column name registered for an "nvt" column name.

    Args:
        column: A key of ``nvt_to_spark``.

    Returns:
        The value stored under ``column`` in ``nvt_to_spark``.

    Raises:
        KeyError: If ``column`` is not a key of ``nvt_to_spark``.
    """
    return nvt_to_spark[column]


def transform_spark_to_nvt(column: str) -> str:
    """Return the "nvt" column name registered for a "spark" column name.

    Args:
        column: A key of ``spark_to_nvt``.

    Returns:
        The value stored under ``column`` in ``spark_to_nvt``.

    Raises:
        KeyError: If ``column`` is not a key of ``spark_to_nvt``.
    """
    return spark_to_nvt[column]