# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import math
import pickle
import enum
import datetime

from collections import namedtuple, OrderedDict

import sklearn.preprocessing
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
from bisect import bisect

import torch
from torch.utils.data import Dataset, IterableDataset, DataLoader


class DataTypes(enum.IntEnum):
    """Defines numerical types of each column."""
    CONTINUOUS = 0
    CATEGORICAL = 1
    DATE = 2
    STR = 3


class InputTypes(enum.IntEnum):
    """Defines input types of each column."""
    TARGET = 0
    OBSERVED = 1
    KNOWN = 2
    STATIC = 3
    ID = 4  # Single column used as an entity identifier
    TIME = 5  # Single column exclusively used as a time index


FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type'])
DTYPE_MAP = {
    DataTypes.CONTINUOUS: np.float32,
    DataTypes.CATEGORICAL: np.int64,
    DataTypes.DATE: 'datetime64[ns]',
    DataTypes.STR: str
}

FEAT_ORDER = [
    (InputTypes.STATIC, DataTypes.CATEGORICAL),
    (InputTypes.STATIC, DataTypes.CONTINUOUS),
    (InputTypes.KNOWN, DataTypes.CATEGORICAL),
    (InputTypes.KNOWN, DataTypes.CONTINUOUS),
    (InputTypes.OBSERVED, DataTypes.CATEGORICAL),
    (InputTypes.OBSERVED, DataTypes.CONTINUOUS),
    (InputTypes.TARGET, DataTypes.CONTINUOUS),
    (InputTypes.ID, DataTypes.CATEGORICAL)
]

FEAT_NAMES = ['s_cat', 's_cont', 'k_cat', 'k_cont', 'o_cat', 'o_cont', 'target', 'id']
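# FEAT_NAMES[i] labels the tensor assembled for FEAT_ORDER[i]; the datasets
# below return an OrderedDict keyed by these names, so the two lists must
# stay aligned.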
DEFAULT_ID_COL = 'id'


class TFTBinaryDataset(Dataset):
    def __init__(self, path, config):
        super().__init__()
        self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE]
        self.example_length = config.example_length
        self.stride = config.dataset_stride

        self.grouped = pickle.load(open(path, 'rb'))
        self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length]
        self._cum_examples_in_group = np.cumsum(
            [(g.shape[0] - self.example_length + 1) // self.stride for g in self.grouped])
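        # Each group contributes (len - example_length + 1) // stride sliding
        # windows, e.g. a group with 10 rows, example_length=8 and stride=1
        # yields (10 - 8 + 1) // 1 == 3 examples.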

        self.feature_type_col_map = [
            [i for i, f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x]
            for x in FEAT_ORDER
        ]

        # The list comprehension below rearranges the data of each group into
        # FEAT_ORDER while simultaneously casting to the proper types.
        # It could probably be written more neatly.
        self.grouped = [
            [
                arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]])
                for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map)
            ]
            for arr in self.grouped
        ]
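
        # Note on the .view above: preprocess() stores each group as a single
        # float32 matrix whose bytes are reinterpreted as int32. Viewing the
        # selected columns as float32 here undoes that reinterpretation
        # bit-for-bit, and .astype then casts to the final dtype (e.g. int64
        # for categoricals, whose small integer codes are exactly
        # representable in float32).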

    def __len__(self):
        return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0

    def __getitem__(self, idx):
        g_idx = bisect(self._cum_examples_in_group, idx)
        e_idx = idx - self._cum_examples_in_group[g_idx - 1] if g_idx else idx

        group = self.grouped[g_idx]

        tensors = [
            torch.from_numpy(feat[e_idx * self.stride:e_idx * self.stride + self.example_length])
            if feat.size else torch.empty(0)
            for feat in group
        ]

        return OrderedDict(zip(FEAT_NAMES, tensors))


class TFTDataset(Dataset):
    def __init__(self, path, config):
        super().__init__()
        self.features = config.features
        self.data = pd.read_csv(path, index_col=0)
        self.example_length = config.example_length
        self.stride = config.dataset_stride

        # The name field is a column name. There can be multiple entries with
        # the same name because one column can be interpreted in many ways.
        time_col_name = next(x.name for x in self.features if x.feature_type == InputTypes.TIME)
        id_col_name = next(x.name for x in self.features if x.feature_type == InputTypes.ID)
        if id_col_name not in self.data.columns:
            id_col_name = DEFAULT_ID_COL
            self.features = [x for x in self.features if x.feature_type != InputTypes.ID]
            self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL))
        col_dtypes = {v.name: DTYPE_MAP[v.feature_embed_type] for v in self.features}

        self.data.sort_values(time_col_name, inplace=True)
        self.data = self.data[list(set(x.name for x in self.features))]  # leave only relevant columns
        self.data = self.data.astype(col_dtypes)
        self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length)
        self.grouped = list(self.data.groupby(id_col_name))

        self._cum_examples_in_group = np.cumsum(
            [(len(g[1]) - self.example_length + 1) // self.stride for g in self.grouped])

    def __len__(self):
        return self._cum_examples_in_group[-1]

    def __getitem__(self, idx):
        g_idx = len([x for x in self._cum_examples_in_group if x <= idx])
        e_idx = idx - self._cum_examples_in_group[g_idx - 1] if g_idx else idx

        group = self.grouped[g_idx][1]
        sliced = group.iloc[e_idx * self.stride:e_idx * self.stride + self.example_length]

        # We need to be sure that tensors are returned in the correct order
        tensors = tuple([] for _ in range(8))
        for v in self.features:
            if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL:
                tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy()))
            elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS:
                tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy()))
            elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL:
                tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy()))
            elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS:
                tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy()))
            elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL:
                tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy()))
            elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS:
                tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy()))
            elif v.feature_type == InputTypes.TARGET:
                tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy()))
            elif v.feature_type == InputTypes.ID:
                tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy()))

        tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors]

        return OrderedDict(zip(FEAT_NAMES, tensors))


def get_dataset_splits(df, config):
    if hasattr(config, 'relative_split') and config.relative_split:
        forecast_len = config.example_length - config.encoder_length
        # The valid split is shifted into the future by the number of forecast
        # steps relative to the train split; the test split is shifted by the
        # same amount relative to the valid split.
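        # For example, with encoder_length=168 and example_length=192 (so
        # forecast_len=24), a group whose train part has length L yields
        # valid rows [L-168, L+24) and test rows [L-144, L+48).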
        train = []
        valid = []
        test = []

        for _, group in df.groupby(DEFAULT_ID_COL):
            index = group[config.time_ids]
            _train = group.loc[index < config.valid_boundary]
            _valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)]
            _test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2 * forecast_len)]
            train.append(_train)
            valid.append(_valid)
            test.append(_test)

        train = pd.concat(train, axis=0)
        valid = pd.concat(valid, axis=0)
        test = pd.concat(test, axis=0)
    else:
        index = df[config.time_ids]
        train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])]
        valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])]
        test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])]

    return train, valid, test


def flatten_ids(df, config):
    if config.missing_id_strategy == 'drop':
        if hasattr(config, 'combine_ids') and config.combine_ids:
            index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids])
        else:
            id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID)
            index = df[id_col].isna()
        index = df.index[index]  # Extract the indices of rows with missing ids
        df.drop(index, inplace=True)

    if not (hasattr(config, 'combine_ids') and config.combine_ids):
        id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID)
        ids = df[id_col].apply(str)
        df.drop(id_col, axis=1, inplace=True)
        encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values)
        df[DEFAULT_ID_COL] = encoder.transform(ids)
        encoders = OrderedDict({id_col: encoder})

    else:
        encoders = {c: sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids}
        encoders = OrderedDict(encoders)
        lens = [len(v.classes_) for v in encoders.values()]
        clens = np.roll(np.cumprod(lens), 1)
        clens[0] = 1
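        # clens now holds mixed-radix place values: with e.g. lens == [3, 5],
        # clens == [1, 3], so a pair of codes (a, b) flattens to a*1 + b*3,
        # which is unique for a in [0, 3) and b in [0, 5).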

        # XXX this takes a very long time; it would probably be faster to
        # vectorize this with temporary columns instead of a row-wise apply.
        df[DEFAULT_ID_COL] = df.apply(
            lambda row: sum(encoders[c].transform([row[c]])[0] * clens[i] for i, c in enumerate(encoders.keys())),
            axis=1)
        df.drop(config.combine_ids, axis=1, inplace=True)

    return DEFAULT_ID_COL, encoders


def impute(df, config):
    # XXX This ensures that our scaling will have the same mean. We still need
    # to check the variance.
    if not hasattr(config, 'missing_data_label'):
        return df, None
    else:
        imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean')
        mask = df.applymap(lambda x: x == config.missing_data_label)
        data = df.values
        col_mask = (data == config.missing_data_label).all(axis=0)
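        # SimpleImputer.fit_transform drops columns that consist entirely of
        # missing values, so its output is written back only into the columns
        # that survive (~col_mask) to keep the shapes aligned.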
        data[:, ~col_mask] = imp.fit_transform(data)
        return data, mask


def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL):
    tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET]
    real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols)))
    real_scalers = {}
    tgt_scalers = {}

    def apply_scalers(df, name=None):
        if name is None:
            name = df.name
        mask = df.applymap(lambda x: x == config.missing_data_label) if hasattr(config, 'missing_data_label') else None
        df[real_cols] = real_scalers[name].transform(df[real_cols])
        if mask is not None and mask.values.any():
            df[real_cols] = df[real_cols].mask(mask, 10**9)
        df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols])
        return df

    if config.scale_per_id:
        for identifier, sliced in train.groupby(id_col):
            data = sliced[real_cols]
            data, _ = impute(data, config)
            real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data)
            # XXX We should probably remove examples that contain NaN as a target
            target = sliced[tgt_cols]
            tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target)

        train = train.groupby(id_col).apply(apply_scalers)
        # For valid and test, keep only timeseries previously present in the train subset.
        # XXX for proper data science we should consider encoding unseen timeseries
        # as a special case, not throwing them away
        valid = valid.loc[valid[id_col].isin(real_scalers.keys())]
        valid = valid.groupby(id_col).apply(apply_scalers)
        test = test.loc[test[id_col].isin(real_scalers.keys())]
        test = test.groupby(id_col).apply(apply_scalers)

    else:
        data, _ = impute(train[real_cols], config)
        real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data)
        tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols])

        train = apply_scalers(train, name='')
        valid = apply_scalers(valid, name='')
        test = apply_scalers(test, name='')

    return train, valid, test, real_scalers, tgt_scalers


def encode_categoricals(train, valid, test, config):
    cat_encodings = {}
    cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID))
    num_classes = []  # XXX Maybe we should modify config based on this value? Or send a warning?
    # For TC performance reasons we might want num_classes[i] to be divisible by 8

    # Train categorical encoders
    for c in cat_cols:
        if config.missing_cat_data_strategy == 'special_token':
            # XXX this will probably require some data augmentation
            unique = train[c].unique()
            valid[c].loc[~valid[c].isin(unique)] = '<UNK>'
            test[c].loc[~test[c].isin(unique)] = '<UNK>'

        if config.missing_cat_data_strategy == 'encode_all' or \
           config.missing_cat_data_strategy == 'special_token':
            srs = pd.concat([train[c], valid[c], test[c]]).apply(str)
            cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
        elif config.missing_cat_data_strategy == 'drop':
            # TODO: implement this. In addition to dropping rows, this has to split
            # the affected time series into chunks to prevent temporal gaps.
            pass
        num_classes.append(srs.nunique())
    print('Categorical variables encodings lens: ', num_classes)

    for split in [train, valid, test]:
        for c in cat_cols:
            srs = split[c].apply(str)
            split[c] = srs
            split.loc[:, c] = cat_encodings[c].transform(srs)

    return cat_encodings


def preprocess(src_path, dst_path, config):
    df = pd.read_csv(src_path, index_col=0)

    for c in config.features:
        if c.feature_embed_type == DataTypes.DATE:
            df[c.name] = pd.to_datetime(df[c.name])

    # Leave only columns relevant to preprocessing
    relevant_columns = list(set([f.name for f in config.features] + [config.time_ids]))
    df = df[relevant_columns]

    id_col, id_encoders = flatten_ids(df, config)
    df = df.reindex(sorted(df.columns), axis=1)

    train, valid, test = get_dataset_splits(df, config)

    # Length-filter the data (all timeseries shorter than example_length will be dropped)
    train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length])
    valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length])
    test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length])

    train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col)

    cat_encodings = encode_categoricals(train, valid, test, config)

    os.makedirs(dst_path, exist_ok=True)

    train.to_csv(os.path.join(dst_path, 'train.csv'))
    valid.to_csv(os.path.join(dst_path, 'valid.csv'))
    test.to_csv(os.path.join(dst_path, 'test.csv'))

    # Save relevant columns in binary form for faster dataloading.
    # IMPORTANT: We always expect id to be a single column indicating the complete timeseries.
    # We also expect a copy of id in the form of a static categorical input!
    col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID]
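    # The .view(dtype=np.int32) below reinterprets the float32 bytes as int32
    # without changing them; TFTBinaryDataset reverses this with
    # .view(dtype=np.float32) on load, so the round trip is lossless.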
    grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)]
    grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)]
    grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)]

    pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb'))
    pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb'))
    pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb'))

    with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f:
        pickle.dump(real_scalers, f)
    with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f:
        pickle.dump(tgt_scalers, f)
    with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f:
        pickle.dump(cat_encodings, f)
    with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f:
        pickle.dump(id_encoders, f)


def sample_data(dataset, num_samples):
    if num_samples < 0:
        return dataset
    else:
        return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False))


def standarize_electricity(path):
    """Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py"""
    df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',')
    df.index = pd.to_datetime(df.index)
    df.sort_index(inplace=True)

    # Used to determine the start and end dates of a series
    output = df.resample('1h').mean().replace(0., np.nan)

    earliest_time = output.index.min()

    df_list = []
    for label in output:
        print('Processing {}'.format(label))
        srs = output[label]

        start_date = min(srs.fillna(method='ffill').dropna().index)
        end_date = max(srs.fillna(method='bfill').dropna().index)

        active_range = (srs.index >= start_date) & (srs.index <= end_date)
        srs = srs[active_range].fillna(0.)

        tmp = pd.DataFrame({'power_usage': srs})
        date = tmp.index
        tmp['t'] = (date - earliest_time).seconds / 60 / 60 + (date - earliest_time).days * 24
        tmp['days_from_start'] = (date - earliest_time).days
        tmp['categorical_id'] = label
        tmp['date'] = date
        tmp['id'] = label
        tmp['hour'] = date.hour
        tmp['day'] = date.day
        tmp['day_of_week'] = date.dayofweek
        tmp['month'] = date.month

        df_list.append(tmp)

    output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True)

    output['categorical_id'] = output['id'].copy()
    output['hours_from_start'] = output['t']
    output['categorical_day_of_week'] = output['day_of_week'].copy()
    output['categorical_hour'] = output['hour'].copy()

    output.to_csv(os.path.join(path, 'standarized.csv'))


def standarize_volatility(path):
    df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0)  # no explicit index

    # Adds additional date/day fields
    idx = [str(s).split('+')[0] for s in df.index]  # ignore timezones, we don't need them
    dates = pd.to_datetime(idx)
    df['date'] = dates
    df['days_from_start'] = (dates - datetime.datetime(2000, 1, 3)).days
    df['day_of_week'] = dates.dayofweek
    df['day_of_month'] = dates.day
    df['week_of_year'] = dates.weekofyear
    df['month'] = dates.month
    df['year'] = dates.year
    df['categorical_id'] = df['Symbol'].copy()

    # Processes log volatility
    vol = df['rv5_ss'].copy()
    vol.loc[vol == 0.] = np.nan
    df['log_vol'] = np.log(vol)

    # Adds static information
    symbol_region_mapping = {
        '.AEX': 'EMEA',
        '.AORD': 'APAC',
        '.BFX': 'EMEA',
        '.BSESN': 'APAC',
        '.BVLG': 'EMEA',
        '.BVSP': 'AMER',
        '.DJI': 'AMER',
        '.FCHI': 'EMEA',
        '.FTMIB': 'EMEA',
        '.FTSE': 'EMEA',
        '.GDAXI': 'EMEA',
        '.GSPTSE': 'AMER',
        '.HSI': 'APAC',
        '.IBEX': 'EMEA',
        '.IXIC': 'AMER',
        '.KS11': 'APAC',
        '.KSE': 'APAC',
        '.MXX': 'AMER',
        '.N225': 'APAC',
        '.NSEI': 'APAC',
        '.OMXC20': 'EMEA',
        '.OMXHPI': 'EMEA',
        '.OMXSPI': 'EMEA',
        '.OSEAX': 'EMEA',
        '.RUT': 'EMEA',
        '.SMSI': 'EMEA',
        '.SPX': 'AMER',
        '.SSEC': 'APAC',
        '.SSMI': 'EMEA',
        '.STI': 'APAC',
        '.STOXX50E': 'EMEA'
    }

    df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k])

    # Performs final processing
    output_df_list = []
    for grp in df.groupby('Symbol'):
        sliced = grp[1].copy()
        sliced.sort_values('days_from_start', inplace=True)
        # Impute log volatility values
        sliced['log_vol'].fillna(method='ffill', inplace=True)
        sliced = sliced.dropna()
        output_df_list.append(sliced)

    df = pd.concat(output_df_list, axis=0)

    df.to_csv(os.path.join(path, 'standarized.csv'))


def standarize_traffic(path):
    def process_list(s, variable_type=int, delimiter=None):
        """Parses a line in the PEMS format to a list."""
        if delimiter is None:
            l = [variable_type(i) for i in s.replace('[', '').replace(']', '').split()]
        else:
            l = [variable_type(i) for i in s.replace('[', '').replace(']', '').split(delimiter)]

        return l
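
    # For example, process_list('[1 2 3]') returns [1, 2, 3], and
    # process_list('1.0;2.0', variable_type=float, delimiter=';') returns [1.0, 2.0].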

    def read_single_list(filename):
        """Returns a single list from a file in the PEMS-custom format."""
        with open(os.path.join(path, filename), 'r') as dat:
            l = process_list(dat.readlines()[0])
        return l

    def read_matrix(filename):
        """Returns a matrix from a file in the PEMS-custom format."""
        array_list = []
        with open(os.path.join(path, filename), 'r') as dat:
            lines = dat.readlines()
            for i, line in enumerate(lines):
                if (i + 1) % 50 == 0:
                    print('Completed {} of {} rows for {}'.format(i + 1, len(lines), filename))
                array = [
                    process_list(row_split, variable_type=float, delimiter=None)
                    for row_split in process_list(line, variable_type=str, delimiter=';')
                ]
                array_list.append(array)

        return array_list

    shuffle_order = np.array(read_single_list('randperm')) - 1  # index from 0
    train_dayofweek = read_single_list('PEMS_trainlabels')
    train_tensor = read_matrix('PEMS_train')
    test_dayofweek = read_single_list('PEMS_testlabels')
    test_tensor = read_matrix('PEMS_test')

    # Invert the shuffle order
    print('Shuffling')
    inverse_mapping = {
        new_location: previous_location
        for previous_location, new_location in enumerate(shuffle_order)
    }
    reverse_shuffle_order = np.array([
        inverse_mapping[new_location]
        for new_location, _ in enumerate(shuffle_order)
    ])
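    # reverse_shuffle_order is the inverse permutation of shuffle_order;
    # equivalently, reverse_shuffle_order = np.argsort(shuffle_order).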

    # Group and reorder based on the permutation matrix
    print('Reordering')
    day_of_week = np.array(train_dayofweek + test_dayofweek)
    combined_tensor = np.array(train_tensor + test_tensor)

    day_of_week = day_of_week[reverse_shuffle_order]
    combined_tensor = combined_tensor[reverse_shuffle_order]

    # Put everything back into a dataframe
    print('Parsing as dataframe')
    labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')]

    hourly_list = []
    for day, day_matrix in enumerate(combined_tensor):
        # Hourly data
        hourly = pd.DataFrame(day_matrix.T, columns=labels)
        hourly['hour_on_day'] = [int(i / 6) for i in hourly.index]  # sampled at 10 min intervals
        if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0:
            raise ValueError('Invalid hour! {}-{}'.format(
                hourly['hour_on_day'].min(), hourly['hour_on_day'].max()))

        hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels]
        hourly['sensor_day'] = day
        hourly['time_on_day'] = hourly.index
        hourly['day_of_week'] = day_of_week[day]

        hourly_list.append(hourly)

    hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False)

    # Flatten such that each entity uses one row in the dataframe
    store_columns = [c for c in hourly_frame.columns if 'traj' in c]
    other_columns = [c for c in hourly_frame.columns if 'traj' not in c]
    flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] + other_columns + ['id'])

    for store in store_columns:
        print('Processing {}'.format(store))

        sliced = hourly_frame[[store] + other_columns].copy()
        sliced.columns = ['values'] + other_columns
        sliced['id'] = int(store.replace('traj_', ''))

        # Sort by Sensor-date-time
        key = sliced['id'].apply(str) \
            + sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \
            + sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x))
        sliced = sliced.set_index(key).sort_index()

        sliced['values'] = sliced['values'].fillna(method='ffill')
        sliced['prev_values'] = sliced['values'].shift(1)
        sliced['next_values'] = sliced['values'].shift(-1)

        flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False)

    # Filter to match the range used by other academic papers
    index = flat_df['sensor_day']
    flat_df = flat_df[index < 173].copy()

    # Creating columns for categorical inputs
    flat_df['categorical_id'] = flat_df['id'].copy()
    flat_df['hours_from_start'] = flat_df['time_on_day'] + flat_df['sensor_day'] * 24.
    flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy()
    flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy()

    flat_df.to_csv(os.path.join(path, 'standarized.csv'))


# XXX needs rework
def standarize_favorita(data_folder):
    import gc
    # Extract only a subset of data to save/process for efficiency
    start_date = datetime.datetime(2015, 1, 1)
    end_date = datetime.datetime(2016, 6, 1)

    print('Regenerating data...')

    # Load temporal data
    temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0)

    store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0)
    oil = pd.read_csv(os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0]
    holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv'))
    items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0)
    transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv'))

    temporal['date'] = pd.to_datetime(temporal['date'])

    # Filter dates to reduce storage space requirements
    if start_date is not None:
        temporal = temporal[(temporal['date'] >= start_date)]
    if end_date is not None:
        temporal = temporal[(temporal['date'] < end_date)]

    dates = temporal['date'].unique()

    # Add trajectory identifier
    temporal['traj_id'] = temporal['store_nbr'].apply(str) + '_' + temporal['item_nbr'].apply(str)
    temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply(str)

    # Remove all IDs with negative returns
    print('Removing returns data')
    min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min()
    valid_ids = set(min_returns[min_returns >= 0].index)
    selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids)
    new_temporal = temporal[selector].copy()
    del temporal
    gc.collect()
    temporal = new_temporal
    temporal['open'] = 1

    # Resampling
    print('Resampling to regular grid')
    resampled_dfs = []
    for traj_id, raw_sub_df in temporal.groupby('traj_id'):
        print('Resampling', traj_id)
        sub_df = raw_sub_df.set_index('date', drop=True).copy()
        sub_df = sub_df.resample('1d').last()
        sub_df['date'] = sub_df.index
        sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \
            = sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill')
        sub_df['open'] = sub_df['open'].fillna(0)  # flag where sales data is unknown
        sub_df['log_sales'] = np.log(sub_df['unit_sales'])

        resampled_dfs.append(sub_df.reset_index(drop=True))

    new_temporal = pd.concat(resampled_dfs, axis=0)
    del temporal
    gc.collect()
    temporal = new_temporal

    print('Adding oil')
    oil.name = 'oil'
    oil.index = pd.to_datetime(oil.index)
    # XXX The lines below match the oil price on a given date with the rest of the
    # timeseries. Missing values in the oil series are copied from the preceding
    # index, then the oil series is joined with temporal. Dates present in temporal
    # but absent from oil get an oil value of -1. WHY?!
    # TODO: check how many NaNs there are after the first step. Previously the oil
    # series was extended with NaN values for the dates present in the `dates`
    # variable, which were then forward filled. This behavior is no longer supported
    # by pandas, so we changed to the DataFrame.isin method. This leaves us with more
    # NaNs after the first step than previously. To achieve the previous behavior we
    # have to join the series before filling NaNs.
    temporal = temporal.join(
        # oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left')
        oil.loc[oil.index.isin(dates)], on='date', how='left')
    temporal['oil'] = temporal['oil'].fillna(method='ffill')
    temporal['oil'] = temporal['oil'].fillna(-1)

    print('Adding store info')
    temporal = temporal.join(store_info, on='store_nbr', how='left')

    print('Adding item info')
    temporal = temporal.join(items, on='item_nbr', how='left')

    transactions['date'] = pd.to_datetime(transactions['date'])
    temporal = temporal.merge(
        transactions,
        left_on=['date', 'store_nbr'],
        right_on=['date', 'store_nbr'],
        how='left')
    temporal['transactions'] = temporal['transactions'].fillna(-1)

    # Additional date info
    temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek
    temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day
    temporal['month'] = pd.to_datetime(temporal['date'].values).month

    # Add holiday info
    print('Adding holidays')
    holiday_subset = holidays[holidays['transferred'].apply(lambda x: not x)].copy()
    holiday_subset.columns = [
        s if s != 'type' else 'holiday_type' for s in holiday_subset.columns
    ]
    holiday_subset['date'] = pd.to_datetime(holiday_subset['date'])
    local_holidays = holiday_subset[holiday_subset['locale'] == 'Local']
    regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional']
    national_holidays = holiday_subset[holiday_subset['locale'] == 'National']

    temporal['national_hol'] = temporal.merge(
        national_holidays, left_on=['date'], right_on=['date'],
        how='left')['description'].fillna('')
    temporal['regional_hol'] = temporal.merge(
        regional_holidays,
        left_on=['state', 'date'],
        right_on=['locale_name', 'date'],
        how='left')['description'].fillna('')
    temporal['local_hol'] = temporal.merge(
        local_holidays,
        left_on=['city', 'date'],
        right_on=['locale_name', 'date'],
        how='left')['description'].fillna('')

    temporal.sort_values('unique_id', inplace=True)

    # Transform date to integer index
    start_date = pd.to_datetime(min(temporal['date']))
    dates = temporal['date'].apply(pd.to_datetime)
    temporal['days_from_start'] = (dates - start_date).dt.days
    temporal['categorical_id'] = temporal['traj_id'].copy()

    print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv')))
    temporal.to_csv(os.path.join(data_folder, 'standarized.csv'))
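

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original pipeline). The
# config consumed above is assumed to be an object exposing the attributes
# referenced in this file; the feature list, window sizes, split boundaries
# and paths below are hypothetical placeholders for the electricity dataset.
if __name__ == '__main__':
    from types import SimpleNamespace

    config = SimpleNamespace(
        # The ID feature is listed first so that the column order of the
        # binary files ([id] + remaining features) matches self.features in
        # TFTBinaryDataset.
        features=[
            FeatureSpec('id', InputTypes.ID, DataTypes.CATEGORICAL),
            FeatureSpec('power_usage', InputTypes.TARGET, DataTypes.CONTINUOUS),
            FeatureSpec('hour', InputTypes.KNOWN, DataTypes.CONTINUOUS),
            FeatureSpec('categorical_id', InputTypes.STATIC, DataTypes.CATEGORICAL),
        ],
        example_length=192,  # encoder + decoder window
        encoder_length=168,
        dataset_stride=1,
        time_ids='days_from_start',
        train_range=(0, 1100),  # placeholder split boundaries
        valid_range=(1090, 1200),
        test_range=(1190, 1340),
        scale_per_id=True,
        missing_id_strategy=None,
        missing_cat_data_strategy='encode_all',
    )

    standarize_electricity('data/electricity')
    preprocess('data/electricity/standarized.csv', 'data/electricity/processed', config)

    dataset = TFTBinaryDataset(os.path.join('data/electricity/processed', 'train.bin'), config)
    batch = dataset[0]  # OrderedDict keyed by FEAT_NAMES
    for name, tensor in batch.items():
        print(name, tuple(tensor.shape))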