187 lines
7.8 KiB
Python
187 lines
7.8 KiB
Python
# -----------------------------------------------------------------------
|
|
#
|
|
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
|
|
import numpy as np
|
|
import cupy as cp
|
|
|
|
def generate_negatives(neg_users, true_mat, item_range, sort=False, use_trick=False):
    """
    Generate negative samples for data augmentation

    One random item id in ``[0, item_range)`` is drawn for every entry of
    ``neg_users``.

    Args:
        neg_users: array of user ids, one negative item is produced per entry
            (assumed to be a 1-D CuPy integer array -- TODO confirm with callers).
        true_mat: matrix indexed as ``true_mat[user, item]``; a sampled pair is
            kept only where the looked-up entry is True, otherwise the user is
            resampled. Must be a boolean array because the lookup result is
            used as a mask. Not read at all when ``use_trick`` is set.
        item_range: exclusive upper bound of the sampled item ids.
        sort: if True, return the pairs ordered by ascending user id.
            Ignored when ``use_trick`` is set.
        use_trick: if True, skip the ``true_mat`` check and return the first
            random draw as is.

    Returns:
        Tuple ``(users, items)`` of equally long arrays. With ``use_trick`` the
        users array is ``neg_users`` itself. Otherwise, and with ``sort=False``,
        pairs are grouped by the sampling round in which they were accepted,
        not by their position in ``neg_users``.

    Note:
        Without ``use_trick`` the loop only ends once every user has been
        given an accepted item, so it never terminates for a user whose row in
        ``true_mat`` has no True entry inside ``item_range``.
    """
    # If using the shortcut, generate negative items without checking if the associated
    # user has interacted with it. Speeds up training significantly with very low impact
    # on accuracy.
    if use_trick:
        neg_items = cp.random.randint(0, high=item_range, size=neg_users.shape[0])
        return neg_users, neg_items

    # Otherwise, generate negative items, check if associated user has interacted with it,
    # then generate a new one if true
    accepted_users = []
    accepted_items = []
    pending = neg_users
    while len(pending) > 0:
        candidates = cp.random.randint(0, high=item_range, size=pending.shape[0])
        is_valid = true_mat[pending, candidates]
        accepted_users.append(pending[is_valid])
        accepted_items.append(candidates[is_valid])

        # Only the rejected users go through another sampling round
        pending = pending[cp.logical_not(is_valid)]

    if not accepted_users:
        # Empty input: the loop never ran and concatenating an empty list
        # raises, so hand back empty arrays directly ('l' is the default
        # dtype of randint).
        return neg_users, cp.empty(0, dtype='l')

    users_out = cp.concatenate(accepted_users)
    items_out = cp.concatenate(accepted_items)

    if not sort:
        return users_out, items_out

    # A single argsort orders both arrays; no separate sort of the users needed
    order = cp.argsort(users_out)
    return users_out[order], items_out[order]
|
|
|
|
|
|
class DataGenerator:
    """
    Class to handle data augmentation

    Keeps the training triples on the GPU and the evaluation inputs on the
    host. ``prepare_eval_data`` and ``prepare_train_data`` fill the public
    ``eval_*`` / ``dup_mask`` and ``train_*_batches`` attributes with lists of
    batches; all of them are None until the matching method has been called.
    """
    def __init__(self,
                 seed,
                 hvd_rank,
                 num_users,                  # type: int
                 num_items,                  # type: int
                 neg_mat,                    # type: np.ndarray
                 train_users,                # type: np.ndarray
                 train_items,                # type: np.ndarray
                 train_labels,               # type: np.ndarray
                 train_batch_size,           # type: int
                 train_negative_samples,     # type: int
                 pos_eval_users,             # type: np.ndarray
                 pos_eval_items,             # type: np.ndarray
                 eval_users_per_batch,       # type: int
                 eval_negative_samples,      # type: int
                 ):
        """
        Validate the inputs, pick the GPU, seed the generators and store the data

        Args:
            seed: seed applied to both the NumPy and the CuPy global generator.
            hvd_rank: index of the CUDA device this process works on.
            num_users: number of users (stored, not used inside this class).
            num_items: exclusive upper bound for sampled negative item ids.
            neg_mat: matrix indexed ``[user, item]`` that is True where the
                item may be used as a negative for the user.
            train_users: user ids of the training samples.
            train_items: item ids of the training samples, same shape as
                ``train_users``.
            train_labels: labels of the training samples; falsy entries mark
                the slots that are refilled with fresh negatives.
            train_batch_size: number of samples per training batch.
            train_negative_samples: stored, not used inside this class.
            pos_eval_users: user ids of the positive evaluation samples.
            pos_eval_items: item ids of the positive evaluation samples, same
                shape as ``pos_eval_users``.
            eval_users_per_batch: number of users per evaluation batch.
            eval_negative_samples: negatives generated per evaluation user.

        Raises:
            ValueError: if the user and item arrays of the training or of the
                evaluation data differ in shape.
        """
        # Check input data
        if train_users.shape != train_items.shape:
            raise ValueError(
                "Train shapes mismatch! {} Users vs {} Items!".format(
                    train_users.shape, train_items.shape))
        if pos_eval_users.shape != pos_eval_items.shape:
            raise ValueError(
                "Eval shapes mismatch! {} Users vs {} Items!".format(
                    pos_eval_users.shape, pos_eval_items.shape))

        # Use GPU assigned to the horovod rank. This has to happen before
        # seeding: cupy.random.seed only resets the generator of the device
        # that is current at the time of the call.
        self.hvd_rank = hvd_rank
        cp.cuda.Device(self.hvd_rank).use()

        np.random.seed(seed)
        cp.random.seed(seed)

        self.num_users = num_users
        self.num_items = num_items
        self._neg_mat = neg_mat
        # Training data lives on the GPU, eval inputs stay on the host
        self._train_users = cp.array(train_users)
        self._train_items = cp.array(train_items)
        self._train_labels = cp.array(train_labels)
        self.train_batch_size = train_batch_size
        self._train_negative_samples = train_negative_samples
        self._pos_eval_users = pos_eval_users
        self._pos_eval_items = pos_eval_items
        self.eval_users_per_batch = eval_users_per_batch
        self._eval_negative_samples = eval_negative_samples

        # Eval data
        self.eval_users = None
        self.eval_items = None
        self.dup_mask = None

        # Training data
        self.train_users_batches = None
        self.train_items_batches = None
        self.train_labels_batches = None

    # Augment test data with negative samples
    def prepare_eval_data(self):
        """
        Build the evaluation batches

        For every positive evaluation sample ``eval_negative_samples``
        verified negatives are generated on the GPU. Each row of the resulting
        matrices holds the negatives of one user followed by the positive
        sample in the last column. The rows are flattened and split into
        chunks of ``eval_users_per_batch`` users and stored as lists of NumPy
        arrays in ``eval_users``, ``eval_items`` and ``dup_mask``.

        ``dup_mask`` is 1.0 for an item that occurs again further right in the
        same row and 0.0 otherwise, so the positive sample is never masked.

        NOTE(review): the negatives come back sorted by user id and are
        reshaped row by row, so the rows only line up with the positive
        samples if ``pos_eval_users`` holds each user once in ascending
        order -- confirm against the caller.
        """
        num_neg = self._eval_negative_samples

        gpu_neg_mat = cp.array(self._neg_mat)
        gpu_neg_users = cp.repeat(cp.array(self._pos_eval_users), num_neg)

        # Generate negative samples
        gpu_u_neg, gpu_i_neg = generate_negatives(neg_users=gpu_neg_users, true_mat=gpu_neg_mat,
                                                  item_range=self.num_items, sort=True, use_trick=False)

        test_u_neg = gpu_u_neg.reshape((-1, num_neg)).get()
        test_i_neg = gpu_i_neg.reshape((-1, num_neg)).get()

        # Everything needed is on the host now. Drop the GPU references so
        # that the memory pool can actually release their blocks below.
        del gpu_neg_mat, gpu_neg_users, gpu_u_neg, gpu_i_neg

        # Combine positive and negative samples
        test_users = np.concatenate((test_u_neg, self._pos_eval_users.reshape((-1, 1))), axis=1)
        test_items = np.concatenate((test_i_neg, self._pos_eval_items.reshape((-1, 1))), axis=1)
        num_rows, num_cols = test_users.shape

        # Generate duplicate mask
        ## Stable sort keeps equal items in column order, so the right-most
        ## occurrence (the positive sample, if duplicated) ends its group
        sorted_indices = np.argsort(test_items, axis=1, kind="stable")
        ## Rank of every original column inside its sorted row
        sorted_order = np.argsort(sorted_indices, axis=1)
        sorted_items = np.sort(test_items, axis=1)
        ## Mark every sorted entry that equals its right neighbour; the last
        ## entry of a row has no neighbour and is never a duplicate
        dup_mask = np.equal(sorted_items[:, 0:-1], sorted_items[:, 1:])
        dup_mask = np.concatenate((dup_mask, np.zeros((num_rows, 1))), axis=1)
        ## Bring the mask back into the original column order
        r_indices = np.arange(num_rows).reshape((-1, 1))
        dup_mask = dup_mask[r_indices, sorted_order].astype(np.float32)

        # Reshape all to (-1) and split into chunks
        batch_size = self.eval_users_per_batch * num_cols
        split_indices = np.arange(batch_size, num_rows * num_cols, batch_size)
        self.eval_users = np.split(test_users.reshape(-1), split_indices)
        self.eval_items = np.split(test_items.reshape(-1), split_indices)
        self.dup_mask = np.split(dup_mask.reshape(-1), split_indices)

        # Free GPU memory to make space for Tensorflow
        cp.get_default_memory_pool().free_all_blocks()

    # Augment training data with negative samples
    def prepare_train_data(self):
        """
        Resample the negatives, shuffle and batch the training data

        Every sample whose label is falsy gets a new random item id in
        ``[0, num_items)``; the draw is not checked against the negatives
        matrix. Users, items and labels are then shuffled with one common
        permutation and split into chunks of ``train_batch_size`` (the last
        chunk may be shorter), stored in ``train_users_batches``,
        ``train_items_batches`` and ``train_labels_batches``.
        """
        batch_size = self.train_batch_size

        is_neg = cp.logical_not(self._train_labels)

        # If there are no negative samples in the local portion of the training data, do nothing
        if cp.any(is_neg):
            # The shortcut never consults a verification matrix, so None is passed
            self._train_users[is_neg], self._train_items[is_neg] = generate_negatives(
                self._train_users[is_neg], None, self.num_items, use_trick=True
            )

        # One permutation for all three arrays keeps the triples aligned
        shuffled_order = cp.random.permutation(self._train_users.shape[0])
        self._train_users = self._train_users[shuffled_order]
        self._train_items = self._train_items[shuffled_order]
        self._train_labels = self._train_labels[shuffled_order]

        # Manually create batches
        split_indices = np.arange(batch_size, self._train_users.shape[0], batch_size)
        self.train_users_batches = np.split(self._train_users, split_indices)
        self.train_items_batches = np.split(self._train_items, split_indices)
        self.train_labels_batches = np.split(self._train_labels, split_indices)
|