preprocessing.py
import numpy as np
import scipy.sparse as sp
import tensorflow as tf
import os
import psutil
flags = tf.app.flags
FLAGS = flags.FLAGS
"""
Disclaimer: functions defined from lines 18 to 54 in this file come from
the tkipf/gae original repository on Graph Autoencoders. Moreover, the
mask_test_edges function is borrowed from philipjackson's mask_test_edges
pull request on this same repository.
"""
def sparse_to_tuple(sparse_mx):
    """Convert a scipy sparse matrix into a (coords, values, shape) tuple."""
    if not sp.isspmatrix_coo(sparse_mx):
        sparse_mx = sparse_mx.tocoo()
    coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose()
    values = sparse_mx.data
    shape = sparse_mx.shape
    return coords, values, shape
def preprocess_graph(adj):
    """Symmetrically normalize the adjacency matrix, with self-loops added."""
    adj = sp.coo_matrix(adj)
    adj_ = adj + sp.eye(adj.shape[0])
    degree_mat_inv_sqrt = sp.diags(np.power(np.array(adj_.sum(1)), -0.5).flatten())
    adj_normalized = adj_.dot(degree_mat_inv_sqrt).transpose().dot(degree_mat_inv_sqrt)
    return sparse_to_tuple(adj_normalized)
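# Hedged sketch (added for illustration, not part of the original module):
# preprocess_graph implements the symmetric GCN normalization
#     A_norm = D^{-1/2} (A + I) D^{-1/2},
# where D is the degree matrix of A + I. The 3-node path graph below is a
# hypothetical example of the expected output.
def _preprocess_graph_example():
    adj = sp.csr_matrix(np.array([[0., 1., 0.],
                                  [1., 0., 1.],
                                  [0., 1., 0.]]))
    coords, values, shape = preprocess_graph(adj)
    # For this graph, the diagonal entries of A_norm are [1/2, 1/3, 1/2] and
    # the entries linking nodes 0-1 and 1-2 are both 1/sqrt(6).
    return coords, values, shape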
def construct_feed_dict(adj_normalized, adj_normalized_layer2, adj_orig, features, deg_matrix, placeholders):
    # Construct feed dictionary
    feed_dict = dict()
    feed_dict.update({placeholders['features']: features})
    feed_dict.update({placeholders['adj']: adj_normalized})
    feed_dict.update({placeholders['adj_layer2']: adj_normalized_layer2})
    feed_dict.update({placeholders['adj_orig']: adj_orig})
    if not FLAGS.simple:
        feed_dict.update({placeholders['degree_matrix']: deg_matrix})
    return feed_dict
def mask_test_edges(adj, test_percent=10., val_percent=5.):
    """
    Randomly removes some edges from the original graph to create
    test and validation sets for the link prediction task
    :param adj: complete sparse adjacency matrix of the graph
    :param test_percent: percentage of edges in test set
    :param val_percent: percentage of edges in validation set
    :return: train incomplete adjacency matrix, validation and test sets
    """
    # Remove diagonal elements
    adj = adj - sp.dia_matrix((adj.diagonal()[None, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()
    edges_positive, _, _ = sparse_to_tuple(adj)
    # Filter out edges from the lower triangle of the adjacency matrix
    edges_positive = edges_positive[edges_positive[:, 1] > edges_positive[:, 0], :]
    # Number of positive (and negative) edges in test and val sets:
    num_test = int(np.floor(edges_positive.shape[0] / (100. / test_percent)))
    num_val = int(np.floor(edges_positive.shape[0] / (100. / val_percent)))
    # Sample positive edges for test and val sets:
    edges_positive_idx = np.arange(edges_positive.shape[0])
    np.random.shuffle(edges_positive_idx)
    val_edge_idx = edges_positive_idx[:num_val]
    test_edge_idx = edges_positive_idx[num_val:(num_val + num_test)]
    test_edges = edges_positive[test_edge_idx]  # positive test edges
    val_edges = edges_positive[val_edge_idx]  # positive val edges
    train_edges = np.delete(edges_positive, np.hstack([test_edge_idx, val_edge_idx]), axis=0)  # positive train edges
    # The above strategy for sampling without replacement will not work for
    # sampling negative edges on large graphs, because the pool of negative
    # edges is much larger due to sparsity. Therefore, we use the following
    # strategy:
    # 1. sample random linear indices from the adjacency matrix WITH REPLACEMENT
    #    (without replacement is super slow). Sample more than we need, so we
    #    will probably have enough after all the filtering steps.
    # 2. remove any edges that have already been added to the other edge lists
    # 3. convert to (i, j) coordinates
    # 4. swap i and j where i > j, to ensure they are upper triangle elements
    # 5. remove any duplicate elements, if there are any
    # 6. remove any diagonal elements
    # 7. if we don't have enough edges, repeat this process until we do
    positive_idx, _, _ = sparse_to_tuple(adj)  # (i, j) coordinate pairs for all true edges
    positive_idx = positive_idx[:, 0]*adj.shape[0] + positive_idx[:, 1]  # linear indices
    test_edges_false = np.empty((0, 2), dtype='int64')
    idx_test_edges_false = np.empty((0,), dtype='int64')
    while len(test_edges_false) < len(test_edges):
        # step 1:
        idx = np.random.choice(adj.shape[0]**2, 2*(num_test - len(test_edges_false)), replace=True)
        # step 2:
        idx = idx[~np.in1d(idx, positive_idx, assume_unique=True)]
        idx = idx[~np.in1d(idx, idx_test_edges_false, assume_unique=True)]
        # step 3:
        rowidx = idx // adj.shape[0]
        colidx = idx % adj.shape[0]
        coords = np.vstack((rowidx, colidx)).transpose()
        # step 4:
        lowertrimask = coords[:, 0] > coords[:, 1]
        coords[lowertrimask] = coords[lowertrimask][:, ::-1]
        # step 5:
        coords = np.unique(coords, axis=0)  # note: coords are now sorted lexicographically
        np.random.shuffle(coords)  # not anymore
        # step 6:
        coords = coords[coords[:, 0] != coords[:, 1]]
        # step 7:
        coords = coords[:min(num_test, len(idx))]
        test_edges_false = np.append(test_edges_false, coords, axis=0)
        idx = idx[:min(num_test, len(idx))]
        idx_test_edges_false = np.append(idx_test_edges_false, idx)
    val_edges_false = np.empty((0, 2), dtype='int64')
    idx_val_edges_false = np.empty((0,), dtype='int64')
    while len(val_edges_false) < len(val_edges):
        # step 1:
        idx = np.random.choice(adj.shape[0]**2, 2*(num_val - len(val_edges_false)), replace=True)
        # step 2:
        idx = idx[~np.in1d(idx, positive_idx, assume_unique=True)]
        idx = idx[~np.in1d(idx, idx_test_edges_false, assume_unique=True)]
        idx = idx[~np.in1d(idx, idx_val_edges_false, assume_unique=True)]
        # step 3:
        rowidx = idx // adj.shape[0]
        colidx = idx % adj.shape[0]
        coords = np.vstack((rowidx, colidx)).transpose()
        # step 4:
        lowertrimask = coords[:, 0] > coords[:, 1]
        coords[lowertrimask] = coords[lowertrimask][:, ::-1]
        # step 5:
        coords = np.unique(coords, axis=0)  # note: coords are now sorted lexicographically
        np.random.shuffle(coords)  # not anymore
        # step 6:
        coords = coords[coords[:, 0] != coords[:, 1]]
        # step 7:
        coords = coords[:min(num_val, len(idx))]
        val_edges_false = np.append(val_edges_false, coords, axis=0)
        idx = idx[:min(num_val, len(idx))]
        idx_val_edges_false = np.append(idx_val_edges_false, idx)
    # Rebuild the (train) adjacency matrix
    data = np.ones(train_edges.shape[0])
    adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
    adj_train = adj_train + adj_train.T
    return adj_train, val_edges, val_edges_false, test_edges, test_edges_false
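# Hedged usage sketch (illustration only, not part of the original module):
# how mask_test_edges is typically called for the link prediction task; the
# wrapper name below is hypothetical.
def _mask_test_edges_example(adj):
    adj_train, val_edges, val_edges_false, test_edges, test_edges_false = \
        mask_test_edges(adj, test_percent=10., val_percent=5.)
    # adj_train is a symmetric, diagonal-free adjacency matrix keeping the
    # remaining ~85% of the upper-triangle edges; the four arrays hold the
    # positive and sampled negative node pairs used to score link prediction.
    return adj_train, val_edges, val_edges_false, test_edges, test_edges_false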
def preprocess_degree(adj, is_simple):
    """
    Preprocessing degree-based term for modularity loss
    :param adj: sparse adjacency matrix of the graph
    :param is_simple: "simple" boolean flag for modularity
    :return: degree-based term matrices
    """
    if is_simple:
        deg_matrix = None
        deg_matrix_init = None
    else:
        if FLAGS.verbose:
            print("Preprocessing on degree matrices")
        deg = np.sum(adj, 1)
        deg_matrix = (1.0 / np.sum(adj)) * deg.dot(np.transpose(deg))
        #deg_matrix = deg_matrix - np.diag(np.diag(deg_matrix))
        deg_matrix_init = sp.csr_matrix(deg_matrix)
        deg_matrix = sparse_to_tuple(deg_matrix_init)
        if FLAGS.verbose:
            print("Done! \n")
    return deg_matrix, deg_matrix_init
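# Note added for clarity (not part of the original module): with d the degree
# vector of adj and np.sum(adj) = 2m the total edge weight of the symmetric
# graph, the dense matrix built above is
#     deg_matrix[i, j] = d[i] * d[j] / (2m),
# i.e. the null-model term appearing in the modularity
#     Q = (1/2m) * sum_{i,j} (A[i, j] - d[i] * d[j] / (2m)) * delta(c_i, c_j),
# which is the degree-based quantity used by the modularity-aware loss.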
def introductory_message():
    """
    An introductory message to display when launching experiments
    """
    print("\n \n \n \n[MODULARITY-AWARE GRAPH AUTOENCODERS]\n \n \n \n")
    print("EXPERIMENTAL SETTING \n")
    print("- Graph dataset:", FLAGS.dataset)
    print("- Model name:", FLAGS.model)
    print("- Number of models to train:", FLAGS.nb_run)
    print("- Number of training iterations for each model:", FLAGS.iterations)
    print("- Learning rate:", FLAGS.learning_rate)
    print("- Dropout rate:", FLAGS.dropout)
    print("- Use of node features in the input layer:", FLAGS.features)
    if FLAGS.model in ("gcn_ae", "gcn_vae"):
        print("- Dimension of the GCN hidden layer:", FLAGS.hidden)
    print("- Dimension of the output layer:", FLAGS.dimension)
    print("- lambda:", FLAGS.lamb)
    print("- beta:", FLAGS.beta)
    print("- gamma:", FLAGS.gamma)
    print("- s:", FLAGS.s_reg)
    print("Final embedding vectors will be evaluated on:")
    if FLAGS.task == 'task_1':
        print('- Task 1, i.e., pure community detection')
    else:
        print('- Task 2, i.e., joint community detection and link prediction')
    print("\n \n \n \n")
class EarlyStopping:
    '''Early stops the training if the validation loss doesn't improve after a given patience'''
    def __init__(self, patience=10, verbose=False,
                 delta=0.01, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model, save_dir):
        score = val_loss
        if self.best_score is None:
            self.best_score = score
            #self.save_checkpoint(val_loss, model, save_dir)
        elif score > self.best_score - self.delta:
            # No improvement of at least delta over the best validation loss
            self.counter += 1
            #print("self.best_score:" + str(self.best_score) + " score:" + str(score) + " delta:" + str(self.delta))
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
                #self.save_checkpoint(val_loss, model, save_dir)
        else:
            self.best_score = score
            #self.save_checkpoint(val_loss, model, save_dir)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, save_dir):
        '''Save the model when the validation loss decreases'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} -> {val_loss:.6f}). Saving model.')
        # Note: tf.train.Saver.save expects the active tf.Session as its first argument.
        saver_to_save = tf.train.Saver()
        saver_to_save.save(model, save_dir + 'best')
        print('save success')
        self.val_loss_min = val_loss

    def show_info(self):
        '''Return the memory (USS) used by the current process, in MB'''
        pid = os.getpid()
        p = psutil.Process(pid)
        info = p.memory_full_info()
        memory = info.uss / 1024 / 1024
        return memory
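# Hedged usage sketch (illustration only, not part of the original module):
# a minimal loop showing how the EarlyStopping helper behaves. The validation
# losses below are hypothetical; save_checkpoint is not triggered here since
# the checkpoint calls are commented out in __call__.
def _early_stopping_example():
    early_stopping = EarlyStopping(patience=3, delta=0.01, verbose=True)
    # Hypothetical validation losses: improvement stalls after the third value.
    for val_loss in [1.00, 0.80, 0.75, 0.76, 0.76, 0.77, 0.78]:
        early_stopping(val_loss, model=None, save_dir='./')
        if early_stopping.early_stop:
            print("Early stopping triggered.")
            break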