DeepLearningExamples/PyTorch/Forecasting/TFT/tft_pyt/data_utils.py
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################
# Copyright 2021 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import math
import pickle
import enum
import datetime
from collections import namedtuple, OrderedDict
import sklearn.preprocessing
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
from bisect import bisect
import torch
from torch.utils.data import Dataset,IterableDataset,DataLoader
class DataTypes(enum.IntEnum):
"""Defines numerical types of each column."""
CONTINUOUS = 0
CATEGORICAL = 1
DATE = 2
STR = 3
class InputTypes(enum.IntEnum):
"""Defines input types of each column."""
TARGET = 0
OBSERVED = 1
KNOWN = 2
STATIC = 3
ID = 4 # Single column used as an entity identifier
TIME = 5 # Single column exclusively used as a time index
FeatureSpec = namedtuple('FeatureSpec', ['name', 'feature_type', 'feature_embed_type'])
DTYPE_MAP = {
DataTypes.CONTINUOUS : np.float32,
DataTypes.CATEGORICAL : np.int64,
DataTypes.DATE:'datetime64[ns]',
DataTypes.STR: str
}
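# Order in which feature groups are arranged and returned by the datasets below;
# FEAT_NAMES lists the corresponding dictionary keys.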
FEAT_ORDER = [
(InputTypes.STATIC, DataTypes.CATEGORICAL),
(InputTypes.STATIC, DataTypes.CONTINUOUS),
(InputTypes.KNOWN, DataTypes.CATEGORICAL),
(InputTypes.KNOWN, DataTypes.CONTINUOUS),
(InputTypes.OBSERVED, DataTypes.CATEGORICAL),
(InputTypes.OBSERVED, DataTypes.CONTINUOUS),
(InputTypes.TARGET, DataTypes.CONTINUOUS),
(InputTypes.ID, DataTypes.CATEGORICAL)
]
FEAT_NAMES = ['s_cat' , 's_cont' , 'k_cat' , 'k_cont' , 'o_cat' , 'o_cont' , 'target', 'id']
DEFAULT_ID_COL = 'id'
class TFTBinaryDataset(Dataset):
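"""Dataset over the pickled per-entity arrays produced by preprocess().
Serves sliding windows of length config.example_length (advanced by config.dataset_stride)
as an OrderedDict of tensors keyed by FEAT_NAMES, with features grouped according to FEAT_ORDER."""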
def __init__(self, path, config):
super().__init__()
self.features = [x for x in config.features if x.feature_embed_type != DataTypes.DATE]
self.example_length = config.example_length
self.stride = config.dataset_stride
self.grouped = pickle.load(open(path, 'rb'))
self.grouped = [x for x in self.grouped if x.shape[0] >= self.example_length]
self._cum_examples_in_group = np.cumsum([(g.shape[0] - self.example_length + 1)//self.stride for g in self.grouped])
self.feature_type_col_map = [[i for i,f in enumerate(self.features) if (f.feature_type, f.feature_embed_type) == x] for x in FEAT_ORDER]
# The list comprehension below rearranges the data into the order defined by FEAT_ORDER
# and casts each feature group to its proper dtype. It could probably be written more cleanly.
self.grouped = [
[
arr[:, idxs].view(dtype=np.float32).astype(DTYPE_MAP[t[1]])
for t, idxs in zip(FEAT_ORDER, self.feature_type_col_map)
]
for arr in self.grouped
]
def __len__(self):
return self._cum_examples_in_group[-1] if len(self._cum_examples_in_group) else 0
def __getitem__(self, idx):
g_idx = bisect(self._cum_examples_in_group, idx)
e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx
group = self.grouped[g_idx]
tensors = [
torch.from_numpy(feat[e_idx * self.stride:e_idx*self.stride + self.example_length])
if feat.size else torch.empty(0)
for feat in group
]
return OrderedDict(zip(FEAT_NAMES, tensors))
class TFTDataset(Dataset):
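"""CSV-backed counterpart of TFTBinaryDataset. Reads a preprocessed CSV, groups rows by the id
column and serves sliding-window examples as an OrderedDict of tensors keyed by FEAT_NAMES."""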
def __init__(self, path, config):
super().__init__()
self.features = config.features
self.data = pd.read_csv(path, index_col=0)
self.example_length = config.example_length
self.stride = config.dataset_stride
# The name field is a column name. There can be multiple entries with the same name
# because one column can be interpreted in several ways.
time_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.TIME)
id_col_name = next(x.name for x in self.features if x.feature_type==InputTypes.ID)
if id_col_name not in self.data.columns:
id_col_name = DEFAULT_ID_COL
self.features = [x for x in self.features if x.feature_type!=InputTypes.ID]
self.features.append(FeatureSpec(DEFAULT_ID_COL, InputTypes.ID, DataTypes.CATEGORICAL))
col_dtypes = {v.name:DTYPE_MAP[v.feature_embed_type] for v in self.features}
self.data.sort_values(time_col_name,inplace=True)
self.data = self.data[list(set(x.name for x in self.features))] # keep only relevant columns
self.data = self.data.astype(col_dtypes)
self.data = self.data.groupby(id_col_name).filter(lambda group: len(group) >= self.example_length)
self.grouped = list(self.data.groupby(id_col_name))
self._cum_examples_in_group = np.cumsum([(len(g[1]) - self.example_length + 1)//self.stride for g in self.grouped])
def __len__(self):
return self._cum_examples_in_group[-1]
def __getitem__(self, idx):
g_idx = len([x for x in self._cum_examples_in_group if x <= idx])
e_idx = idx - self._cum_examples_in_group[g_idx-1] if g_idx else idx
group = self.grouped[g_idx][1]
sliced = group.iloc[e_idx * self.stride:e_idx*self.stride + self.example_length]
# We need to be sure that tensors are returned in the correct order
tensors = tuple([] for _ in range(8))
for v in self.features:
if v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CATEGORICAL:
tensors[0].append(torch.from_numpy(sliced[v.name].to_numpy()))
elif v.feature_type == InputTypes.STATIC and v.feature_embed_type == DataTypes.CONTINUOUS:
tensors[1].append(torch.from_numpy(sliced[v.name].to_numpy()))
elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CATEGORICAL:
tensors[2].append(torch.from_numpy(sliced[v.name].to_numpy()))
elif v.feature_type == InputTypes.KNOWN and v.feature_embed_type == DataTypes.CONTINUOUS:
tensors[3].append(torch.from_numpy(sliced[v.name].to_numpy()))
elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CATEGORICAL:
tensors[4].append(torch.from_numpy(sliced[v.name].to_numpy()))
elif v.feature_type == InputTypes.OBSERVED and v.feature_embed_type == DataTypes.CONTINUOUS:
tensors[5].append(torch.from_numpy(sliced[v.name].to_numpy()))
elif v.feature_type == InputTypes.TARGET:
tensors[6].append(torch.from_numpy(sliced[v.name].to_numpy()))
elif v.feature_type == InputTypes.ID:
tensors[7].append(torch.from_numpy(sliced[v.name].to_numpy()))
tensors = [torch.stack(x, dim=-1) if x else torch.empty(0) for x in tensors]
return OrderedDict(zip(FEAT_NAMES, tensors))
def get_dataset_splits(df, config):
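"""Splits the dataframe into train/valid/test, either per entity relative to config.valid_boundary
(when config.relative_split is set) or by the absolute ranges defined on config.time_ids."""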
if hasattr(config, 'relative_split') and config.relative_split:
forecast_len = config.example_length - config.encoder_length
# The valid split is shifted into the future from the train split by the number of forecast steps.
# The test split is shifted by the number of forecast steps from the valid split.
train = []
valid = []
test = []
for _, group in df.groupby(DEFAULT_ID_COL):
index = group[config.time_ids]
_train = group.loc[index < config.valid_boundary]
_valid = group.iloc[(len(_train) - config.encoder_length):(len(_train) + forecast_len)]
_test = group.iloc[(len(_train) - config.encoder_length + forecast_len):(len(_train) + 2*forecast_len)]
train.append(_train)
valid.append(_valid)
test.append(_test)
train = pd.concat(train, axis=0)
valid = pd.concat(valid, axis=0)
test = pd.concat(test, axis=0)
else:
index = df[config.time_ids]
train = df.loc[(index >= config.train_range[0]) & (index < config.train_range[1])]
valid = df.loc[(index >= config.valid_range[0]) & (index < config.valid_range[1])]
test = df.loc[(index >= config.test_range[0]) & (index < config.test_range[1])]
return train, valid, test
def flatten_ids(df, config):
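"""Replaces the ID column(s) with a single integer column named DEFAULT_ID_COL, optionally
combining the columns listed in config.combine_ids into one composite identifier and dropping
rows with missing ids when config.missing_id_strategy == 'drop'.
Returns the new id column name and the fitted label encoders."""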
if config.missing_id_strategy == 'drop':
if hasattr(config, 'combine_ids') and config.combine_ids:
index = np.logical_or.reduce([df[c].isna() for c in config.combine_ids])
else:
id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID)
index = df[id_col].isna()
index = index[index].index # Extract indices of NaN rows
df.drop(index, inplace=True)
if not (hasattr(config, 'combine_ids') and config.combine_ids):
id_col = next(x.name for x in config.features if x.feature_type == InputTypes.ID)
ids = df[id_col].apply(str)
df.drop(id_col, axis=1, inplace=True)
encoder = sklearn.preprocessing.LabelEncoder().fit(ids.values)
df[DEFAULT_ID_COL] = encoder.transform(ids)
encoders = OrderedDict({id_col: encoder})
else:
encoders = {c:sklearn.preprocessing.LabelEncoder().fit(df[c].values) for c in config.combine_ids}
encoders = OrderedDict(encoders)
lens = [len(v.classes_) for v in encoders.values()]
clens = np.roll(np.cumprod(lens), 1)
clens[0] = 1
# This is very slow. It would probably be better to create 2 dummy columns instead.
df[DEFAULT_ID_COL] = df.apply(lambda row: sum([encoders[c].transform([row[c]])[0]*clens[i] for i,c in enumerate(encoders.keys())]), axis=1)
df.drop(config.combine_ids, axis=1, inplace=True)
return DEFAULT_ID_COL, encoders
def impute(df, config):
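"""Mean-imputes entries equal to config.missing_data_label. Returns the imputed values and a boolean
mask of the originally missing entries; if the config defines no missing_data_label, returns the
dataframe unchanged and None."""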
#XXX This ensures that our scaling will have the same mean. We still need to check the variance
if not hasattr(config, 'missing_data_label'):
return df, None
else:
imp = SimpleImputer(missing_values=config.missing_data_label, strategy='mean')
mask = df.applymap(lambda x: x == config.missing_data_label)
data = df.values
col_mask = (data == config.missing_data_label).all(axis=0)
data[:,~col_mask] = imp.fit_transform(data)
return data, mask
def normalize_reals(train, valid, test, config, id_col=DEFAULT_ID_COL):
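"""Standard-scales continuous features and targets, either with one scaler per entity
(config.scale_per_id) or with a single global scaler. Returns the transformed splits together with
the fitted feature and target scalers."""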
tgt_cols = [x.name for x in config.features if x.feature_type == InputTypes.TARGET]
real_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CONTINUOUS).difference(set(tgt_cols)))
real_scalers = {}
tgt_scalers = {}
def apply_scalers(df, name=None):
if name is None:
name = df.name
mask = df.applymap(lambda x: x == config.missing_data_label) if hasattr(config, 'missing_data_label') else None
df[real_cols] = real_scalers[name].transform(df[real_cols])
if mask is not None and mask[real_cols].values.any():
# Mark originally missing entries with a large sentinel value after scaling.
# The result has to be assigned back; a bare .mask() call has no effect.
df[real_cols] = df[real_cols].mask(mask[real_cols], 10**9)
df[tgt_cols] = tgt_scalers[name].transform(df[tgt_cols])
return df
if config.scale_per_id:
for identifier, sliced in train.groupby(id_col):
data = sliced[real_cols]
data, _ = impute(data, config)
real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(data)
# XXX We should probably remove examples that contain NaN as a target
target = sliced[tgt_cols]
tgt_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(target)
train = train.groupby(id_col).apply(apply_scalers)
# For the valid and test splits keep only time series already present in the train subset
# XXX for proper data science we should consider encoding unseen timeseries as a special case, not throwing them away
valid = valid.loc[valid[id_col].isin(real_scalers.keys())]
valid = valid.groupby(id_col).apply(apply_scalers)
test = test.loc[test[id_col].isin(real_scalers.keys())]
test = test.groupby(id_col).apply(apply_scalers)
else:
data, _ = impute(train[real_cols], config)
real_scalers[''] = sklearn.preprocessing.StandardScaler().fit(data)
tgt_scalers[''] = sklearn.preprocessing.StandardScaler().fit(train[tgt_cols])
train = apply_scalers(train, name='')
valid = apply_scalers(valid, name='')
test = apply_scalers(test, name='')
return train, valid, test, real_scalers, tgt_scalers
def encode_categoricals(train, valid, test, config):
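"""Label-encodes all non-ID categorical columns of the three splits in place, following
config.missing_cat_data_strategy, and returns the fitted encoders."""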
cat_encodings = {}
cat_cols = list(set(v.name for v in config.features if v.feature_embed_type == DataTypes.CATEGORICAL and v.feature_type != InputTypes.ID))
num_classes = [] #XXX Maybe we should modify config based on this value? Or emit a warning?
# For Tensor Core performance reasons we might want num_classes[i] to be divisible by 8
# Train categorical encoders
for c in cat_cols:
if config.missing_cat_data_strategy == 'special_token':
#XXX this will probably require some data augmentation
unique = train[c].unique()
# Map categories not seen during training to a special token
valid[c].loc[~valid[c].isin(unique)] = '<UNK>'
test[c].loc[~test[c].isin(unique)] = '<UNK>'
if config.missing_cat_data_strategy == 'encode_all' or \
config.missing_cat_data_strategy == 'special_token':
srs = pd.concat([train[c], valid[c], test[c]]).apply(str)
cat_encodings[c] = sklearn.preprocessing.LabelEncoder().fit(srs.values)
elif config.missing_cat_data_strategy == 'drop':
# TODO: implement this. In addition to dropping rows, this has to split the affected time series
# into chunks to prevent the data from having temporal gaps
pass
num_classes.append(srs.nunique())
print('Categorical variable encoding lengths: ', num_classes)
for split in [train, valid, test]:
for c in cat_cols:
srs = split[c].apply(str)
split[c] = srs
split.loc[:,c] = cat_encodings[c].transform(srs)
return cat_encodings
def preprocess(src_path, dst_path, config):
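"""End-to-end preprocessing: reads the standardized CSV, splits it, scales and encodes it, then
writes per-split CSVs, per-entity binary arrays and the fitted scalers/encoders to dst_path."""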
df = pd.read_csv(src_path, index_col=0)
for c in config.features:
if c.feature_embed_type == DataTypes.DATE:
df[c.name] = pd.to_datetime(df[c.name])
# Leave only columns relevant to preprocessing
relevant_columns = list(set([f.name for f in config.features] + [config.time_ids]))
df = df[relevant_columns]
id_col, id_encoders = flatten_ids(df, config)
df = df.reindex(sorted(df.columns), axis=1)
train, valid, test = get_dataset_splits(df, config)
# Filter by length (all time series shorter than example_length will be dropped)
#for df in [train, valid, test]:
# df.groupby(id_col).filter(lambda x: len(x) >= config.example_length)
train = pd.concat([x[1] for x in train.groupby(id_col) if len(x[1]) >= config.example_length])
valid = pd.concat([x[1] for x in valid.groupby(id_col) if len(x[1]) >= config.example_length])
test = pd.concat([x[1] for x in test.groupby(id_col) if len(x[1]) >= config.example_length])
train, valid, test, real_scalers, tgt_scalers = normalize_reals(train, valid, test, config, id_col)
cat_encodings = encode_categoricals(train, valid, test, config)
os.makedirs(dst_path, exist_ok=True)
train.to_csv(os.path.join(dst_path, 'train.csv'))
valid.to_csv(os.path.join(dst_path, 'valid.csv'))
test.to_csv(os.path.join(dst_path, 'test.csv'))
# Save relevant columns in binary form for faster dataloading
# IMPORTANT: We always expect id to be a single column indicating the complete timeseries
# We also expect a copy of id in the form of a static categorical input!!!
col_names = [id_col] + [x.name for x in config.features if x.feature_embed_type != DataTypes.DATE and x.feature_type != InputTypes.ID]
grouped_train = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in train.groupby(id_col)]
grouped_valid = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in valid.groupby(id_col)]
grouped_test = [x[1][col_names].values.astype(np.float32).view(dtype=np.int32) for x in test.groupby(id_col)]
pickle.dump(grouped_train, open(os.path.join(dst_path, 'train.bin'), 'wb'))
pickle.dump(grouped_valid, open(os.path.join(dst_path, 'valid.bin'), 'wb'))
pickle.dump(grouped_test, open(os.path.join(dst_path, 'test.bin'), 'wb'))
with open(os.path.join(dst_path, 'real_scalers.bin'), 'wb') as f:
pickle.dump(real_scalers, f)
with open(os.path.join(dst_path, 'tgt_scalers.bin'), 'wb') as f:
pickle.dump(tgt_scalers, f)
with open(os.path.join(dst_path, 'cat_encodings.bin'), 'wb') as f:
pickle.dump(cat_encodings, f)
with open(os.path.join(dst_path, 'id_encoders.bin'), 'wb') as f:
pickle.dump(id_encoders, f)
def sample_data(dataset, num_samples):
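"""Returns the full dataset when num_samples is negative, otherwise a random Subset of that size."""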
if num_samples < 0:
return dataset
else:
return torch.utils.data.Subset(dataset, np.random.choice(np.arange(len(dataset)), size=num_samples, replace=False))
def standarize_electricity(path):
"""Code taken from https://github.com/google-research/google-research/blob/master/tft/script_download_data.py"""
df = pd.read_csv(os.path.join(path, 'LD2011_2014.txt'), index_col=0, sep=';', decimal=',')
df.index = pd.to_datetime(df.index)
df.sort_index(inplace=True)
# Used to determine the start and end dates of a series
output = df.resample('1h').mean().replace(0., np.nan)
earliest_time = output.index.min()
df_list = []
for label in output:
print('Processing {}'.format(label))
srs = output[label]
start_date = min(srs.fillna(method='ffill').dropna().index)
end_date = max(srs.fillna(method='bfill').dropna().index)
active_range = (srs.index >= start_date) & (srs.index <= end_date)
srs = srs[active_range].fillna(0.)
tmp = pd.DataFrame({'power_usage': srs})
date = tmp.index
tmp['t'] = (date - earliest_time).seconds / 60 / 60 + (
date - earliest_time).days * 24
tmp['days_from_start'] = (date - earliest_time).days
tmp['categorical_id'] = label
tmp['date'] = date
tmp['id'] = label
tmp['hour'] = date.hour
tmp['day'] = date.day
tmp['day_of_week'] = date.dayofweek
tmp['month'] = date.month
df_list.append(tmp)
output = pd.concat(df_list, axis=0, join='outer').reset_index(drop=True)
output['categorical_id'] = output['id'].copy()
output['hours_from_start'] = output['t']
output['categorical_day_of_week'] = output['day_of_week'].copy()
output['categorical_hour'] = output['hour'].copy()
output.to_csv(os.path.join(path, 'standarized.csv'))
def standarize_volatility(path):
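"""Converts the Oxford-Man realized volatility CSV into the standardized format (log volatility
target plus calendar and region features) and writes it to standarized.csv."""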
df = pd.read_csv(os.path.join(path, 'oxfordmanrealizedvolatilityindices.csv'), index_col=0) # no explicit index
# Adds additional date/day fields
idx = [str(s).split('+')[0] for s in df.index] # ignore timezones, we don't need them
dates = pd.to_datetime(idx)
df['date'] = dates
df['days_from_start'] = (dates - pd.datetime(2000, 1, 3)).days
df['day_of_week'] = dates.dayofweek
df['day_of_month'] = dates.day
df['week_of_year'] = dates.weekofyear
df['month'] = dates.month
df['year'] = dates.year
df['categorical_id'] = df['Symbol'].copy()
# Processes log volatility
vol = df['rv5_ss'].copy()
vol.loc[vol == 0.] = np.nan
df['log_vol'] = np.log(vol)
# Adds static information
symbol_region_mapping = {
'.AEX': 'EMEA',
'.AORD': 'APAC',
'.BFX': 'EMEA',
'.BSESN': 'APAC',
'.BVLG': 'EMEA',
'.BVSP': 'AMER',
'.DJI': 'AMER',
'.FCHI': 'EMEA',
'.FTMIB': 'EMEA',
'.FTSE': 'EMEA',
'.GDAXI': 'EMEA',
'.GSPTSE': 'AMER',
'.HSI': 'APAC',
'.IBEX': 'EMEA',
'.IXIC': 'AMER',
'.KS11': 'APAC',
'.KSE': 'APAC',
'.MXX': 'AMER',
'.N225': 'APAC',
'.NSEI': 'APAC',
'.OMXC20': 'EMEA',
'.OMXHPI': 'EMEA',
'.OMXSPI': 'EMEA',
'.OSEAX': 'EMEA',
'.RUT': 'EMEA',
'.SMSI': 'EMEA',
'.SPX': 'AMER',
'.SSEC': 'APAC',
'.SSMI': 'EMEA',
'.STI': 'APAC',
'.STOXX50E': 'EMEA'
}
df['Region'] = df['Symbol'].apply(lambda k: symbol_region_mapping[k])
# Performs final processing
output_df_list = []
for grp in df.groupby('Symbol'):
sliced = grp[1].copy()
sliced.sort_values('days_from_start', inplace=True)
# Impute log volatility values
sliced['log_vol'].fillna(method='ffill', inplace=True)
sliced = sliced.dropna() # assign the result; a bare dropna() call has no effect
output_df_list.append(sliced)
df = pd.concat(output_df_list, axis=0)
df.to_csv(os.path.join(path, 'standarized.csv'))
def standarize_traffic(path):
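"""Converts the PEMS-SF traffic dataset into the standardized format (hourly averages of the raw
sensor readings, one row per sensor and hour) and writes it to standarized.csv."""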
def process_list(s, variable_type=int, delimiter=None):
"""Parses a line in the PEMS format to a list."""
if delimiter is None:
l = [
variable_type(i) for i in s.replace('[', '').replace(']', '').split()
]
else:
l = [
variable_type(i)
for i in s.replace('[', '').replace(']', '').split(delimiter)
]
return l
def read_single_list(filename):
"""Returns single list from a file in the PEMS-custom format."""
with open(os.path.join(path, filename), 'r') as dat:
l = process_list(dat.readlines()[0])
return l
def read_matrix(filename):
"""Returns a matrix from a file in the PEMS-custom format."""
array_list = []
with open(os.path.join(path, filename), 'r') as dat:
lines = dat.readlines()
for i, line in enumerate(lines):
if (i + 1) % 50 == 0:
print('Completed {} of {} rows for {}'.format(i + 1, len(lines),
filename))
array = [
process_list(row_split, variable_type=float, delimiter=None)
for row_split in process_list(
line, variable_type=str, delimiter=';')
]
array_list.append(array)
return array_list
shuffle_order = np.array(read_single_list('randperm')) - 1 # index from 0
train_dayofweek = read_single_list('PEMS_trainlabels')
train_tensor = read_matrix('PEMS_train')
test_dayofweek = read_single_list('PEMS_testlabels')
test_tensor = read_matrix('PEMS_test')
# Invert the shuffle permutation
print('Shuffling')
inverse_mapping = {
new_location: previous_location
for previous_location, new_location in enumerate(shuffle_order)
}
reverse_shuffle_order = np.array([
inverse_mapping[new_location]
for new_location, _ in enumerate(shuffle_order)
])
# Group and reorder based on the permutation
print('Reordering')
day_of_week = np.array(train_dayofweek + test_dayofweek)
combined_tensor = np.array(train_tensor + test_tensor)
day_of_week = day_of_week[reverse_shuffle_order]
combined_tensor = combined_tensor[reverse_shuffle_order]
# Put everything back into a dataframe
print('Parsing as dataframe')
labels = ['traj_{}'.format(i) for i in read_single_list('stations_list')]
hourly_list = []
for day, day_matrix in enumerate(combined_tensor):
# Hourly data
hourly = pd.DataFrame(day_matrix.T, columns=labels)
hourly['hour_on_day'] = [int(i / 6) for i in hourly.index] # sampled at 10 min intervals
if hourly['hour_on_day'].max() > 23 or hourly['hour_on_day'].min() < 0:
raise ValueError('Invalid hour! {}-{}'.format(
hourly['hour_on_day'].min(), hourly['hour_on_day'].max()))
hourly = hourly.groupby('hour_on_day', as_index=True).mean()[labels]
hourly['sensor_day'] = day
hourly['time_on_day'] = hourly.index
hourly['day_of_week'] = day_of_week[day]
hourly_list.append(hourly)
hourly_frame = pd.concat(hourly_list, axis=0, ignore_index=True, sort=False)
# Flatten such that each entity uses one row in the dataframe
store_columns = [c for c in hourly_frame.columns if 'traj' in c]
other_columns = [c for c in hourly_frame.columns if 'traj' not in c]
flat_df = pd.DataFrame(columns=['values', 'prev_values', 'next_values'] +
other_columns + ['id'])
for store in store_columns:
print('Processing {}'.format(store))
sliced = hourly_frame[[store] + other_columns].copy()
sliced.columns = ['values'] + other_columns
sliced['id'] = int(store.replace('traj_', ''))
# Sort by Sensor-date-time
key = sliced['id'].apply(str) \
+ sliced['sensor_day'].apply(lambda x: '_{:03d}'.format(x)) \
+ sliced['time_on_day'].apply(lambda x: '_{:03d}'.format(x))
sliced = sliced.set_index(key).sort_index()
sliced['values'] = sliced['values'].fillna(method='ffill')
sliced['prev_values'] = sliced['values'].shift(1)
sliced['next_values'] = sliced['values'].shift(-1)
flat_df = flat_df.append(sliced.dropna(), ignore_index=True, sort=False)
# Filter to match range used by other academic papers
index = flat_df['sensor_day']
flat_df = flat_df[index < 173].copy()
# Create columns for categorical inputs
flat_df['categorical_id'] = flat_df['id'].copy()
flat_df['hours_from_start'] = flat_df['time_on_day'] \
+ flat_df['sensor_day']*24.
flat_df['categorical_day_of_week'] = flat_df['day_of_week'].copy()
flat_df['categorical_time_on_day'] = flat_df['time_on_day'].copy()
flat_df.to_csv(os.path.join(path, 'standarized.csv'))
# XXX needs rework
def standarize_favorita(data_folder):
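"""Converts the Kaggle Favorita Grocery Sales data into the standardized format: filters a date
range, resamples each store-item trajectory to a daily grid, joins oil, store, item, transaction
and holiday information, and writes the result to standarized.csv."""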
import gc
# Extract only a subset of data to save/process for efficiency
start_date = pd.datetime(2015, 1, 1)
end_date = pd.datetime(2016, 6, 1)
print('Regenerating data...')
# load temporal data
temporal = pd.read_csv(os.path.join(data_folder, 'train.csv'), index_col=0)
store_info = pd.read_csv(os.path.join(data_folder, 'stores.csv'), index_col=0)
oil = pd.read_csv(
os.path.join(data_folder, 'oil.csv'), index_col=0).iloc[:, 0]
holidays = pd.read_csv(os.path.join(data_folder, 'holidays_events.csv'))
items = pd.read_csv(os.path.join(data_folder, 'items.csv'), index_col=0)
transactions = pd.read_csv(os.path.join(data_folder, 'transactions.csv'))
# Take first 6 months of data
temporal['date'] = pd.to_datetime(temporal['date'])
# Filter dates to reduce storage space requirements
if start_date is not None:
temporal = temporal[(temporal['date'] >= start_date)]
if end_date is not None:
temporal = temporal[(temporal['date'] < end_date)]
dates = temporal['date'].unique()
# Add trajectory identifier
temporal['traj_id'] = temporal['store_nbr'].apply(
str) + '_' + temporal['item_nbr'].apply(str)
temporal['unique_id'] = temporal['traj_id'] + '_' + temporal['date'].apply(
str)
# Remove all IDs with negative returns
print('Removing returns data')
min_returns = temporal['unit_sales'].groupby(temporal['traj_id']).min()
valid_ids = set(min_returns[min_returns >= 0].index)
selector = temporal['traj_id'].apply(lambda traj_id: traj_id in valid_ids)
new_temporal = temporal[selector].copy()
del temporal
gc.collect()
temporal = new_temporal
temporal['open'] = 1
# Resampling
print('Resampling to regular grid')
resampled_dfs = []
for traj_id, raw_sub_df in temporal.groupby('traj_id'):
print('Resampling', traj_id)
sub_df = raw_sub_df.set_index('date', drop=True).copy()
sub_df = sub_df.resample('1d').last()
sub_df['date'] = sub_df.index
sub_df[['store_nbr', 'item_nbr', 'onpromotion']] \
= sub_df[['store_nbr', 'item_nbr', 'onpromotion']].fillna(method='ffill')
sub_df['open'] = sub_df['open'].fillna(0) # flag where sales data is unknown
sub_df['log_sales'] = np.log(sub_df['unit_sales'])
resampled_dfs.append(sub_df.reset_index(drop=True))
new_temporal = pd.concat(resampled_dfs, axis=0)
del temporal
gc.collect()
temporal = new_temporal
print('Adding oil')
oil.name = 'oil'
oil.index = pd.to_datetime(oil.index)
#XXX The lines below match the oil value on a given date with the rest of the time series.
# Missing values in the oil series are copied from the preceding index. Then the oil series is
# joined with temporal. There are some dates present in temporal which aren't present in oil,
# for which the oil value is substituted with -1. WHY?!
#TODO: check how many NaNs there are after the first step. Previously the oil series was extended
# by the dates present in the dates variable with NaN values, which were then forward filled.
# This behavior is no longer supported by pandas, so we changed to the DataFrame.isin method.
# This leaves us with more NaNs after the first step than previously. To achieve the previous
# behavior we would have to join the series before filling NaNs.
temporal = temporal.join(
#oil.loc[oil.index.isin(dates)].fillna(method='ffill'), on='date', how='left')
oil.loc[oil.index.isin(dates)], on='date', how='left')
temporal['oil'] = temporal['oil'].fillna(method='ffill')
temporal['oil'] = temporal['oil'].fillna(-1)
print('Adding store info')
temporal = temporal.join(store_info, on='store_nbr', how='left')
print('Adding item info')
temporal = temporal.join(items, on='item_nbr', how='left')
transactions['date'] = pd.to_datetime(transactions['date'])
temporal = temporal.merge(
transactions,
left_on=['date', 'store_nbr'],
right_on=['date', 'store_nbr'],
how='left')
temporal['transactions'] = temporal['transactions'].fillna(-1)
# Additional date info
temporal['day_of_week'] = pd.to_datetime(temporal['date'].values).dayofweek
temporal['day_of_month'] = pd.to_datetime(temporal['date'].values).day
temporal['month'] = pd.to_datetime(temporal['date'].values).month
# Add holiday info
print('Adding holidays')
holiday_subset = holidays[holidays['transferred'].apply(
lambda x: not x)].copy()
holiday_subset.columns = [
s if s != 'type' else 'holiday_type' for s in holiday_subset.columns
]
holiday_subset['date'] = pd.to_datetime(holiday_subset['date'])
local_holidays = holiday_subset[holiday_subset['locale'] == 'Local']
regional_holidays = holiday_subset[holiday_subset['locale'] == 'Regional']
national_holidays = holiday_subset[holiday_subset['locale'] == 'National']
temporal['national_hol'] = temporal.merge(
national_holidays, left_on=['date'], right_on=['date'],
how='left')['description'].fillna('')
temporal['regional_hol'] = temporal.merge(
regional_holidays,
left_on=['state', 'date'],
right_on=['locale_name', 'date'],
how='left')['description'].fillna('')
temporal['local_hol'] = temporal.merge(
local_holidays,
left_on=['city', 'date'],
right_on=['locale_name', 'date'],
how='left')['description'].fillna('')
temporal.sort_values('unique_id', inplace=True)
# Transform date to integer index
start_date = pd.to_datetime(min(temporal['date']))
dates = temporal['date'].apply(pd.to_datetime)
temporal['days_from_start'] = (dates - start_date).dt.days
temporal['categorical_id'] = temporal['traj_id'].copy()
print('Saving processed file to {}'.format(os.path.join(data_folder, 'standarized.csv')))
temporal.to_csv(os.path.join(data_folder, 'standarized.csv'))