'''
Created on Oct 10, 2018
Tensorflow Implementation of Neural Graph Collaborative Filtering (NGCF) model in:
Wang Xiang et al. Neural Graph Collaborative Filtering. In SIGIR 2019.
@author: Xiang Wang (xiangwang@u.nus.edu)
'''
import numpy as np
import random as rd
import scipy.sparse as sp
from time import time
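
# Expected data layout (inferred from the parsing code below, not specified elsewhere in
# this file): the dataset directory passed as `path` contains train.txt and test.txt,
# one line per user, each line holding the user id followed by that user's item ids,
# with the item list possibly wrapped in double quotes (CSV-style export).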
class Data(object):
    def __init__(self, path, batch_size):
        self.path = path
        self.batch_size = batch_size

        train_file = path + '/train.txt'
        test_file = path + '/test.txt'

        self.n_users, self.n_items = 0, 0
        self.n_train, self.n_test = 0, 0
        self.neg_pools = {}
        self.exist_users = []

        with open(train_file) as f:
            for l in f.readlines():
                if len(l) > 0:
                    l = l.strip('\n').split(' ')

                    # parse the item ids, stripping the double quotes that may
                    # wrap the first and last item tokens of a line
                    items = list()
                    for idx, i in enumerate(l[1:]):
                        if idx == 0:
                            val = i.strip('\"')
                            print("begin val: " + val)
                            items.append(int(val))
                        elif idx == (len(l[1:]) - 1):
                            val = i.strip('\"')
                            print("end val: " + val)
                            items.append(int(val))
                        else:
                            items.append(int(i))

                    uid = int(l[0])
                    self.exist_users.append(uid)
                    self.n_items = max(self.n_items, max(items))
                    self.n_users = max(self.n_users, uid)
                    self.n_train += len(items)

        with open(test_file) as f:
            for l in f.readlines():
                if len(l) > 0:
                    try:
                        l = l.strip('\n').split(' ')

                        # same quote-stripping parse as for the training file
                        items = list()
                        for idx, i in enumerate(l[1:]):
                            if idx == 0:
                                val = i.strip('\"')
                                print("begin val: " + val)
                                items.append(int(val))
                            elif idx == (len(l[1:]) - 1):
                                val = i.strip('\"')
                                print("end val: " + val)
                                items.append(int(val))
                            else:
                                items.append(int(i))
                    except Exception:
                        continue
                    self.n_items = max(self.n_items, max(items))
                    self.n_test += len(items)

        self.n_items += 1
        self.n_users += 1

        self.print_statistics()

        count_size = max(self.n_items, self.n_users)
        self.R = sp.dok_matrix((count_size, count_size), dtype=np.float32)

        self.train_items, self.test_set = {}, {}
        with open(train_file) as f_train:
            with open(test_file) as f_test:
                for l in f_train.readlines():
                    if len(l) == 0:
                        break
                    l = l.strip('\n')
                    items = [int(i.strip('\"')) for i in l.split(' ')]
                    uid, train_items = items[0], items[1:]

                    for i in train_items:
                        self.R[uid, i] = 1.

                    self.train_items[uid] = train_items

                for l in f_test.readlines():
                    if len(l) == 0:
                        break
                    l = l.strip('\n')
                    try:
                        items = [int(i.strip('\"')) for i in l.split(' ')]
                    except Exception:
                        continue

                    uid, test_items = items[0], items[1:]
                    self.test_set[uid] = test_items
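
    # Example of the line format handled above (hypothetical values, for illustration):
    #     0 "10 25 37"   ->   uid = 0, items = [10, 25, 37]
    # i.e. the whole item list may be wrapped in one pair of double quotes, so only the
    # first and last item tokens need the quote stripped after splitting on spaces.
    # Note that self.R is built as a square (count_size x count_size) matrix with
    # count_size = max(n_users, n_items), so user and item ids index the same axis range.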

    def get_adj_mat(self):
        try:
            t1 = time()
            adj_mat = sp.load_npz(self.path + '/s_adj_mat.npz')
            norm_adj_mat = sp.load_npz(self.path + '/s_norm_adj_mat.npz')
            mean_adj_mat = sp.load_npz(self.path + '/s_mean_adj_mat.npz')
            print('already loaded adjacency matrix', adj_mat.shape, time() - t1)

        except Exception:
            adj_mat, norm_adj_mat, mean_adj_mat = self.create_adj_mat()
            sp.save_npz(self.path + '/s_adj_mat.npz', adj_mat)
            sp.save_npz(self.path + '/s_norm_adj_mat.npz', norm_adj_mat)
            sp.save_npz(self.path + '/s_mean_adj_mat.npz', mean_adj_mat)

        try:
            pre_adj_mat = sp.load_npz(self.path + '/s_pre_adj_mat.npz')
        except Exception:
            # symmetrically normalize the adjacency matrix: D^-1/2 * A * D^-1/2
            rowsum = np.array(adj_mat.sum(1))
            d_inv = np.power(rowsum, -0.5).flatten()
            d_inv[np.isinf(d_inv)] = 0.
            d_mat_inv = sp.diags(d_inv)

            norm_adj = d_mat_inv.dot(adj_mat)
            norm_adj = norm_adj.dot(d_mat_inv)
            print('generate pre adjacency matrix.')
            pre_adj_mat = norm_adj.tocsr()
            sp.save_npz(self.path + '/s_pre_adj_mat.npz', norm_adj)

        return adj_mat, norm_adj_mat, mean_adj_mat, pre_adj_mat
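
    # The pre-normalized matrix computed above is the symmetric Laplacian used by
    # NGCF-style propagation:
    #     A_hat = D^{-1/2} A D^{-1/2}, with D_ii = sum_j A_ij
    # Rows with zero degree get d_inv = 0, so isolated nodes simply propagate nothing.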

    def create_adj_mat(self):
        t1 = time()
        adj_mat = sp.dok_matrix((self.n_users + self.n_items, self.n_users + self.n_items), dtype=np.float32)
        adj_mat = adj_mat.tolil()
        R = self.R.tolil()

        # fill the user-item blocks in 5 slices to prevent memory from overflowing
        for i in range(5):
            adj_mat[int(self.n_users * i / 5.0):int(self.n_users * (i + 1.0) / 5), self.n_users:] = \
                R[int(self.n_users * i / 5.0):int(self.n_users * (i + 1.0) / 5)]
            adj_mat[self.n_users:, int(self.n_users * i / 5.0):int(self.n_users * (i + 1.0) / 5)] = \
                R[int(self.n_users * i / 5.0):int(self.n_users * (i + 1.0) / 5)].T
        adj_mat = adj_mat.todok()
        print('already created adjacency matrix', adj_mat.shape, time() - t1)

        t2 = time()

        def normalized_adj_single(adj):
            # row-normalize: D^-1 * A
            rowsum = np.array(adj.sum(1))
            d_inv = np.power(rowsum, -1).flatten()
            d_inv[np.isinf(d_inv)] = 0.
            d_mat_inv = sp.diags(d_inv)

            norm_adj = d_mat_inv.dot(adj)
            print('generate single-normalized adjacency matrix.')
            return norm_adj.tocoo()

        def check_adj_if_equal(adj):
            dense_A = np.array(adj.todense())
            degree = np.sum(dense_A, axis=1, keepdims=False)
            temp = np.dot(np.diag(np.power(degree, -1)), dense_A)
            print('check whether the normalized adjacency matrix equals this Laplacian matrix.')
            return temp

        norm_adj_mat = normalized_adj_single(adj_mat + sp.eye(adj_mat.shape[0]))
        mean_adj_mat = normalized_adj_single(adj_mat)

        print('already normalized adjacency matrix', time() - t2)
        return adj_mat.tocsr(), norm_adj_mat.tocsr(), mean_adj_mat.tocsr()
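
    # The full adjacency matrix assembled above has the bipartite block structure
    #     A = [[0,   R],
    #          [R^T, 0]]
    # over the (n_users + n_items) joint node set, and normalized_adj_single applies the
    # random-walk normalization D^{-1} A. norm_adj_mat adds self-loops via sp.eye before
    # normalizing, while mean_adj_mat normalizes A itself.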

    def negative_pool(self):
        t1 = time()
        for u in self.train_items.keys():
            neg_items = list(set(range(self.n_items)) - set(self.train_items[u]))
            pools = [rd.choice(neg_items) for _ in range(100)]
            self.neg_pools[u] = pools
        print('refresh negative pools', time() - t1)
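
    # negative_pool pre-draws 100 candidate negative items per user; it is only needed
    # if sampling goes through sample_neg_items_for_u_from_pools below.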

    def sample(self):
        if self.batch_size <= self.n_users:
            users = rd.sample(self.exist_users, self.batch_size)
        else:
            users = [rd.choice(self.exist_users) for _ in range(self.batch_size)]

        def sample_pos_items_for_u(u, num):
            pos_items = self.train_items[u]
            n_pos_items = len(pos_items)
            pos_batch = []
            while True:
                if len(pos_batch) == num:
                    break
                pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0]
                pos_i_id = pos_items[pos_id]

                if pos_i_id not in pos_batch:
                    pos_batch.append(pos_i_id)
            return pos_batch

        def sample_neg_items_for_u(u, num):
            neg_items = []
            while True:
                if len(neg_items) == num:
                    break
                neg_id = np.random.randint(low=0, high=self.n_items, size=1)[0]
                if neg_id not in self.train_items[u] and neg_id not in neg_items:
                    neg_items.append(neg_id)
            return neg_items

        def sample_neg_items_for_u_from_pools(u, num):
            neg_items = list(set(self.neg_pools[u]) - set(self.train_items[u]))
            return rd.sample(neg_items, num)

        pos_items, neg_items = [], []
        for u in users:
            pos_items += sample_pos_items_for_u(u, 1)
            neg_items += sample_neg_items_for_u(u, 1)

        return users, pos_items, neg_items
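
    # sample() draws BPR-style training triplets: for each of the batch_size users it
    # returns one observed (positive) item and one unobserved (negative) item, as three
    # parallel lists, e.g. users, pos_items, neg_items = data.sample().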

    def sample_test(self):
        if self.batch_size <= self.n_users:
            # rd.sample needs a sequence, so materialize the dict keys as a list
            users = rd.sample(list(self.test_set.keys()), self.batch_size)
        else:
            users = [rd.choice(self.exist_users) for _ in range(self.batch_size)]

        def sample_pos_items_for_u(u, num):
            pos_items = self.test_set[u]
            n_pos_items = len(pos_items)
            pos_batch = []
            while True:
                if len(pos_batch) == num:
                    break
                pos_id = np.random.randint(low=0, high=n_pos_items, size=1)[0]
                pos_i_id = pos_items[pos_id]

                if pos_i_id not in pos_batch:
                    pos_batch.append(pos_i_id)
            return pos_batch

        def sample_neg_items_for_u(u, num):
            neg_items = []
            while True:
                if len(neg_items) == num:
                    break
                neg_id = np.random.randint(low=0, high=self.n_items, size=1)[0]
                if neg_id not in (self.test_set[u] + self.train_items[u]) and neg_id not in neg_items:
                    neg_items.append(neg_id)
            return neg_items

        def sample_neg_items_for_u_from_pools(u, num):
            neg_items = list(set(self.neg_pools[u]) - set(self.train_items[u]))
            return rd.sample(neg_items, num)

        pos_items, neg_items = [], []
        for u in users:
            pos_items += sample_pos_items_for_u(u, 1)
            neg_items += sample_neg_items_for_u(u, 1)

        return users, pos_items, neg_items

    def get_num_users_items(self):
        return self.n_users, self.n_items

    def print_statistics(self):
        print('n_users=%d, n_items=%d' % (self.n_users, self.n_items))
        print('n_interactions=%d' % (self.n_train + self.n_test))
        print('n_train=%d, n_test=%d, sparsity=%.5f' % (self.n_train, self.n_test, (self.n_train + self.n_test) / (self.n_users * self.n_items)))

    def get_sparsity_split(self):
        try:
            split_uids, split_state = [], []
            with open(self.path + '/sparsity.split', 'r') as f:
                lines = f.readlines()

            for idx, line in enumerate(lines):
                if idx % 2 == 0:
                    split_state.append(line.strip())
                    print(line.strip())
                else:
                    split_uids.append([int(uid) for uid in line.strip().split(' ')])
            print('get sparsity split.')

        except Exception:
            split_uids, split_state = self.create_sparsity_split()
            with open(self.path + '/sparsity.split', 'w') as f:
                for idx in range(len(split_state)):
                    f.write(split_state[idx] + '\n')
                    f.write(' '.join([str(uid) for uid in split_uids[idx]]) + '\n')
            print('create sparsity split.')

        return split_uids, split_state
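
    # The sparsity.split cache alternates lines: an even-indexed line holds the
    # human-readable split description, and the following odd-indexed line holds the
    # space-separated user ids of that split (layout inferred from the read/write code above).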

    def create_sparsity_split(self):
        all_users_to_test = list(self.test_set.keys())
        user_n_iid = dict()

        # generate a dictionary to store (key=n_iids, value=a list of uid).
        for uid in all_users_to_test:
            train_iids = self.train_items[uid]
            test_iids = self.test_set[uid]

            n_iids = len(train_iids) + len(test_iids)

            if n_iids not in user_n_iid.keys():
                user_n_iid[n_iids] = [uid]
            else:
                user_n_iid[n_iids].append(uid)
        split_uids = list()

        # split the whole user set into four subsets.
        temp = []
        count = 1
        fold = 4
        n_count = (self.n_train + self.n_test)
        n_rates = 0

        split_state = []
        for idx, n_iids in enumerate(sorted(user_n_iid)):
            temp += user_n_iid[n_iids]
            n_rates += n_iids * len(user_n_iid[n_iids])
            n_count -= n_iids * len(user_n_iid[n_iids])

            if n_rates >= count * 0.25 * (self.n_train + self.n_test):
                split_uids.append(temp)

                state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' % (n_iids, len(temp), n_rates)
                split_state.append(state)
                print(state)

                temp = []
                n_rates = 0
                fold -= 1

            if idx == len(user_n_iid.keys()) - 1 or n_count == 0:
                split_uids.append(temp)
                state = '#inter per user<=[%d], #users=[%d], #all rates=[%d]' % (n_iids, len(temp), n_rates)
                split_state.append(state)
                print(state)

        return split_uids, split_state
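

# A minimal usage sketch, assuming a dataset folder that contains train.txt and test.txt
# in the format parsed above; the directory name below is hypothetical.
if __name__ == '__main__':
    data = Data(path='./Data/gowalla', batch_size=1024)    # hypothetical dataset path
    adj, norm_adj, mean_adj, pre_adj = data.get_adj_mat()  # builds or loads the cached .npz matrices
    users, pos_items, neg_items = data.sample()            # one BPR triplet per sampled user
    print(len(users), len(pos_items), len(neg_items))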