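"""Centrality metrics for a bipartite reviewer-restaurant review graph.

Computes degree, betweenness, and closeness centrality for each bipartite
node set (falling back to sampling / Floyd-Warshall approximations on large
graphs), pickles the results, and plots their distributions.
"""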
import gc
import pickle

import pandas as pd
import numpy as np
import scipy.sparse
import scipy.sparse.csgraph  # floyd_warshall lives in the csgraph submodule
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

def general_info(review_bipart_network: nx.MultiGraph, network_name: str) -> None:
    """Print whether the network is bipartite and whether it is connected."""
    bip = 'is' if bipartite.is_bipartite(review_bipart_network) else 'is not'
    print(f'The {network_name} {bip} bipartite')
    conn = 'is' if nx.is_connected(review_bipart_network) else 'is not'
    print(f'The {network_name} {conn} connected')

def get_node_sets(review_bipart_network: nx.MultiGraph) -> tuple:
    """Split the nodes into the two bipartite node sets (bipartite=0 and bipartite=1)."""
    top_node_set = set(n for n, d in review_bipart_network.nodes(data=True) if d['bipartite'] == 0)
    bottom_node_set = set(n for n, d in review_bipart_network.nodes(data=True) if d['bipartite'] == 1)
    return top_node_set, bottom_node_set
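
# Example (hypothetical toy graph; nodes must carry the 'bipartite' attribute,
# 0 for the top set (reviewers) and 1 for the bottom set (restaurants)):
#   B = nx.Graph()
#   B.add_nodes_from(['u1', 'u2'], bipartite=0)
#   B.add_nodes_from(['r1'], bipartite=1)
#   B.add_edges_from([('u1', 'r1'), ('u2', 'r1')])
#   top, bottom = get_node_sets(B)  # {'u1', 'u2'}, {'r1'}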

def get_degree_centrality(G: nx.Graph) -> dict:
    return nx.degree_centrality(G)

def get_betweenness_centrality(G: nx.Graph, approximation: bool = False) -> dict:
    if approximation:
        # estimate betweenness from a sample of 2000 pivot nodes instead of all nodes
        betweenness_cen = nx.betweenness_centrality(G, k=2000)
    else:
        betweenness_cen = nx.betweenness_centrality(G)
    return betweenness_cen

def get_closeness_centrality(G: nx.Graph, approximation: bool = False) -> dict:
    if approximation:
        # compute all shortest-path lengths at once with Floyd-Warshall on the
        # sparse adjacency matrix, then apply the Wasserman-Faust scaling that
        # nx.closeness_centrality uses with wf_improved=True:
        #   C(u) = ((r - 1) / total_dist) * ((r - 1) / (n - 1))
        # where r is the number of nodes reachable from u and total_dist is
        # the sum of the shortest-path distances from u to them
        node_list = np.array(G.nodes())
        A = nx.adjacency_matrix(G).tolil()
        D = scipy.sparse.csgraph.floyd_warshall(A, directed=False, unweighted=False)
        n = D.shape[0]
        closeness_by_index = {}
        for r in tqdm(range(n)):
            cc = 0.0
            possible_paths = list(enumerate(D[r, :]))
            # keep only the reachable nodes (finite distances)
            shortest_paths = dict(filter(lambda x: not x[1] == np.inf, possible_paths))
            total = sum(shortest_paths.values())
            n_shortest_paths = len(shortest_paths) - 1  # exclude the node itself
            if total > 0 and n > 1:
                s = n_shortest_paths / (n - 1)
                cc = (n_shortest_paths / total) * s
            closeness_by_index[r] = cc
        # map matrix row indices back to node labels
        closeness_cen = {node: closeness_by_index[i] for i, node in enumerate(node_list)}
    else:
        closeness_cen = nx.closeness_centrality(G, wf_improved=True)
    return closeness_cen
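
# Sanity check (hypothetical example): on a small graph the approximate branch
# should match the exact Wasserman-Faust values, e.g.
#   G = nx.path_graph(4)
#   get_closeness_centrality(G, approximation=True)
#   # ~= nx.closeness_centrality(G, wf_improved=True)
#   # -> {0: 0.5, 1: 0.75, 2: 0.75, 3: 0.5}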

def measures_for_centrality(G: nx.Graph) -> tuple:
    # calculate node sets
    G_top_node_set = set(n for n, d in G.nodes(data=True) if d['bipartite'] == 0)
    G_bottom_node_set = set(G) - G_top_node_set
    print('Calculating degree centrality')
    degree_cen = get_degree_centrality(G)
    if len(G_top_node_set) < 2000:
        print('Calculating betweenness centrality. Please wait...')
        betweenness_cen = get_betweenness_centrality(G)
    else:
        # approximate betweenness centrality on a sample of nodes
        print('Calculating approximation of betweenness centrality. Please wait...')
        betweenness_cen = get_betweenness_centrality(G, approximation=True)
    if len(G_top_node_set) < 2000:
        print('Calculating closeness centrality. Please wait...')
        closeness_cen = get_closeness_centrality(G)
    else:
        print('Calculating approximation of closeness centrality. Please wait...')
        # approximation using the Floyd-Warshall method on the adjacency matrix
        # https://medium.com/@pasdan/closeness-centrality-via-networkx-is-taking-too-long-1a58e648f5ce
        closeness_cen = get_closeness_centrality(G, approximation=True)
    # page_rank = nx.pagerank(G, alpha=0.8)
    # hubs, authorities = nx.hits(G)
    # keep centrality measures per node set
    top_node_set_centralities = {}
    bottom_node_set_centralities = {}
    # for centrality_dict, centrality_name in zip([degree_cen, betweenness_cen, closeness_cen, page_rank],
    #                                             ['Degree_Centrality', 'Betweenness_Centrality', 'Closeness_Centrality', 'Page_Rank']):
    for centrality_dict, centrality_name in zip([degree_cen, betweenness_cen, closeness_cen],
                                                ['Degree_Centrality', 'Betweenness_Centrality', 'Closeness_Centrality']):
        # centrality measures of nodes in the top node set
        top_node_centralities = {node_id: centrality for node_id, centrality in centrality_dict.items() if node_id in G_top_node_set}
        # centrality measures of nodes in the bottom node set
        bottom_node_centralities = {node_id: centrality for node_id, centrality in centrality_dict.items() if node_id in G_bottom_node_set}
        # drop the local reference and nudge the garbage collector
        centrality_dict = None
        gc.collect()
        # save in distinct dictionaries
        top_node_set_centralities[centrality_name] = top_node_centralities
        bottom_node_set_centralities[centrality_name] = bottom_node_centralities
    return top_node_set_centralities, bottom_node_set_centralities
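
# The returned dictionaries are keyed by measure name (illustrative sketch,
# assuming B is the bipartite review graph):
#   top, bottom = measures_for_centrality(B)
#   top['Degree_Centrality']     # {node_id: centrality, ...} for bipartite=0 nodes
#   bottom['Degree_Centrality']  # same, for bipartite=1 nodes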

def plot_centralities(node_set_centralities: dict, node_set_name: str,
                      split_centralities_on_flag: bool = False, flagged_ids: list = None,
                      savefig: bool = False, figname: str = None) -> None:
    if not split_centralities_on_flag:
        fig, axs = plt.subplots(nrows=1, ncols=len(node_set_centralities), sharey=True, sharex=False, figsize=(14, 4))
        for plot_counter, centrality_measure in enumerate(node_set_centralities.keys()):
            sns.histplot(list(node_set_centralities[centrality_measure].values()), bins=30, ax=axs[plot_counter])
            axs[plot_counter].set_xlabel(centrality_measure)
        plt.yscale('log', base=10)  # matplotlib >= 3.3 uses base= instead of basey=
        plt.suptitle(f'Histograms of centrality measures of {node_set_name} node set', size=12)
        axs[0].set_ylabel('Number of nodes')
        plt.tight_layout()
    else:
        if not flagged_ids:
            raise ValueError('flagged_ids list is not defined')
        # transform the data into a pandas dataframe
        df = get_dataframe_from_centralities_dict(node_set_centralities, node_set_name)
        # add flag column
        df[f'flagged_{node_set_name}'] = df[node_set_name].isin(flagged_ids)
        fig, axs = plt.subplots(nrows=1, ncols=len(node_set_centralities), sharey=True, sharex=False, figsize=(14, 4))
        for plot_counter, centrality_measure in enumerate(node_set_centralities.keys()):
            sns.histplot(df[~df[f'flagged_{node_set_name}']][centrality_measure],
                         bins=30, color='steelblue', alpha=0.5, label='Real', ax=axs[plot_counter])
            sns.histplot(df[df[f'flagged_{node_set_name}']][centrality_measure],
                         bins=30, color='firebrick', alpha=0.5, label='Fake', ax=axs[plot_counter])
            axs[plot_counter].set_xlabel(centrality_measure)
            axs[plot_counter].legend(loc='upper right')
        plt.yscale('log', base=10)
        plt.suptitle(f'Histograms of centrality measures of {node_set_name} node set per flagged group', size=12)
        axs[0].set_ylabel('Number of nodes')
        plt.tight_layout()
    if savefig:
        plt.savefig(f'graph_plots/{figname}.png', bbox_inches='tight')
    plt.show()

def get_dataframe_from_centralities_dict(node_set_centralities: dict, node_set_name: str) -> pd.DataFrame:
    # the dictionary contains one dictionary of centralities per measure;
    # all of them share the same node keys
    centrality_name = list(node_set_centralities.keys())[0]
    # initialize with the node ids
    centralities_df = pd.DataFrame({node_set_name: list(node_set_centralities[centrality_name].keys())})
    for centrality_name in node_set_centralities.keys():
        df = pd.DataFrame.from_dict(data=node_set_centralities[centrality_name], orient='index')
        df = df.reset_index()
        df.columns = [node_set_name, centrality_name]
        # merge the new column
        centralities_df = centralities_df.merge(df, on=node_set_name, how='inner')
    return centralities_df
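
# Resulting frame has one row per node; with the three measures computed above
# the columns are:
#   [node_set_name, 'Degree_Centrality', 'Betweenness_Centrality', 'Closeness_Centrality']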

def common_neighbors_plot(G: nx.Graph, node_set: set, node_set_name: str, other_node_set_name: str,
                          savefig: bool = False, figname: str = None) -> None:
    # initialize a second node set that shrinks as we iterate
    G_set = node_set
    G_n_neighbors = []
    print('Calculating common neighbors. Be patient...')
    for u in tqdm(node_set):
        # remove the current node so each unordered pair is counted once
        G_set = G_set - {u}
        if len(G_set) > 0:
            G_n_neighbors.append([len(list(nx.common_neighbors(G, u, v))) for v in G_set])
    # flatten the list of lists
    G_n_neighbors = [item for sublist in G_n_neighbors for item in sublist]
    # plot the results
    plt.figure(figsize=(6, 5))
    sns.histplot(G_n_neighbors)
    plt.yscale('log', base=10)
    plt.xscale('log', base=10)
    plt.title(f"Log-log distribution plot of {node_set_name}' common {other_node_set_name}")
    if savefig:
        plt.savefig(f'graph_plots/{figname}.png', bbox_inches='tight')
    plt.show()
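
# Note: this compares every unordered pair in node_set, i.e. O(|node_set|^2)
# nx.common_neighbors calls, so it is only practical for the smaller node set.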

def main():
    # load the saved bipartite review graph (nx.read_gpickle was removed in
    # networkx 3.0; with newer versions, unpickle the file directly)
    G = nx.read_gpickle('graph_data/review_graph.gpickle')
    # collapse to a simple Graph (drops parallel edges if G is a MultiGraph)
    B = nx.Graph(G)
    # get centrality values for every node set in the network
    reviewer_centralities, restaurant_centralities = measures_for_centrality(B)
    # save the centrality dictionaries
    for centrality_dict, dict_name in zip([reviewer_centralities, restaurant_centralities], ['reviewer_centralities', 'restaurant_centralities']):
        with open(f'graph_data/{dict_name}.pkl', 'wb') as f:
            pickle.dump(centrality_dict, f)
    # plot the centrality value distributions
    plot_centralities(reviewer_centralities, 'reviewer', savefig=True, figname='reviewer_centralities')
    plot_centralities(restaurant_centralities, 'restaurants', savefig=True, figname='restaurant_centralities')
    # # plot number of common reviewers per restaurant
    # restaurant_nodes = set(n for n, d in B.nodes(data=True) if d['bipartite'] == 1)
    # common_neighbors_plot(B, restaurant_nodes, 'restaurants', 'reviewers', True, 'common_neighbors')


if __name__ == '__main__':
    main()