-
Notifications
You must be signed in to change notification settings - Fork 2
/
graph_creation_helpers.py
86 lines (54 loc) · 3.1 KB
/
graph_creation_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import sqlalchemy
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite
import pickle
def sqlite_to_table(db_file: str, table_name: str) -> pd.DataFrame:
    """Read one table of a SQLite database into a DataFrame.

    NOTE: a same-named function is defined again further down in this
    module; being later, that definition is the one bound at import time.

    Args:
        db_file: Path of the database file *without* its ``.db`` suffix;
            the suffix is appended when the connection URL is built.
        table_name: Name of the table to read.

    Returns:
        The table contents as produced by ``pandas.read_sql_table``.
    """
    engine = sqlalchemy.create_engine(f'sqlite:///{db_file}.db')
    try:
        return pd.read_sql_table(table_name, engine)
    finally:
        # A fresh engine is built on every call, so release its connection
        # pool here instead of leaving the SQLite file handle to the GC.
        engine.dispose()
def review_network_from_pandas(df: pd.DataFrame) -> nx.MultiGraph:
    """Build a reviewer/restaurant multigraph from a table of reviews.

    Args:
        df: Frame holding at least the columns ``reviewerID``,
            ``restaurantID`` and ``rating``; one row per review.

    Returns:
        A ``MultiGraph`` whose nodes are the distinct reviewer IDs
        (``bipartite=0``, ``label='reviewer'``) and the distinct restaurant
        IDs (``bipartite=1``, ``label='restaurant'``), with one edge per row
        carrying the row's rating in the ``rating`` edge attribute.

    Raises:
        KeyError: If any of the three required columns is missing.
    """
    required = ['reviewerID', 'restaurantID', 'rating']

    # Guard clause: bail out before touching the graph if a column is missing.
    if not all(name in df.columns for name in required):
        raise KeyError('The dataframe does not contain the correct variable to create a review network')

    graph = nx.MultiGraph()

    # Register the two node sets.
    # NOTE(review): assumes a reviewer ID never equals a restaurant ID,
    # otherwise both would map onto the same node -- confirm against the data.
    reviewer_ids = df['reviewerID'].unique().tolist()
    restaurant_ids = df['restaurantID'].unique().tolist()
    graph.add_nodes_from(reviewer_ids, bipartite=0, label='reviewer')
    graph.add_nodes_from(restaurant_ids, bipartite=1, label='restaurant')

    # One (reviewer, restaurant, rating) triple per review row.
    review_rows = df[required].values.tolist()
    graph.add_weighted_edges_from(list(map(tuple, review_rows)), weight='rating')

    return graph
# function that connects to the database and fetches the appropriate table
def sqlite_to_table(db_file: str, table_name: str) -> pd.DataFrame:
    """Connect to a SQLite database and fetch one table as a DataFrame.

    Args:
        db_file: Path of the database file *without* its ``.db`` suffix;
            the suffix is appended when the connection URL is built.
        table_name: Name of the table to read.

    Returns:
        The table contents as produced by ``pandas.read_sql_table``.
    """
    engine = sqlalchemy.create_engine(f'sqlite:///{db_file}.db')
    try:
        return pd.read_sql_table(table_name, engine)
    finally:
        # A fresh engine is built on every call, so release its connection
        # pool here instead of leaving the SQLite file handle to the GC.
        engine.dispose()
def load_restaurant_data(db_file: str) -> pd.DataFrame:
    """Load the ``restaurant`` table and prepare it for merging.

    A ``state`` column is derived from ``location``, then every column except
    ``restaurantID`` is prefixed with ``restaurant_`` to mark its source
    table (so the derived column ends up as ``restaurant_state``).

    Args:
        db_file: Database path without the ``.db`` suffix, as expected by
            ``sqlite_to_table``.

    Returns:
        The restaurant table with the added state column and prefixed names.
    """
    df = sqlite_to_table(db_file, 'restaurant')

    # The state is the text after the last comma of the location string.
    # NOTE(review): presumably locations look like '<city>, <state>' --
    # confirm against the data. Null locations stay null instead of raising.
    df['state'] = df['location'].str.split(',').str[-1].str.strip()

    # Rename by column name rather than by position, so the result is
    # correct even when 'restaurantID' is not the first column of the table.
    prefixed = {
        col_name: f'restaurant_{col_name}'
        for col_name in df.columns
        if col_name != 'restaurantID'
    }
    return df.rename(columns=prefixed)
def load_reviewer_data(db_file: str) -> pd.DataFrame:
    """Load the ``reviewer`` table and prepare it for merging.

    The ``location`` column is dropped, then every column except
    ``reviewerID`` is prefixed with ``reviewer_`` to mark its source table.

    Args:
        db_file: Database path without the ``.db`` suffix, as expected by
            ``sqlite_to_table``.

    Returns:
        The reviewer table without ``location`` and with prefixed names.
    """
    df = sqlite_to_table(db_file, 'reviewer')

    # The reviewer's own location is treated as irrelevant to the review.
    df = df.drop(columns='location')

    # Rename by column name rather than by position, so the result is
    # correct even when 'reviewerID' is not the first column of the table.
    prefixed = {
        col_name: f'reviewer_{col_name}'
        for col_name in df.columns
        if col_name != 'reviewerID'
    }
    return df.rename(columns=prefixed)
# function that keeps reviewers and restaurants with at least N reviews
def keep_top_entries(df: pd.DataFrame, N: int) -> pd.DataFrame:
    """Keep reviews of restaurants, then of reviewers, with at least N ratings.

    The filter runs as a single pass: restaurants are filtered first and
    reviewers are then counted on the remaining rows. It is not iterated, so
    a kept restaurant may end up with fewer than ``N`` rows once sparse
    reviewers have been removed.

    Args:
        df: Review table with ``restaurantID``, ``reviewerID`` and ``rating``
            columns; it is not modified.
        N: Minimum number of (non-null) ratings an entity needs to be kept.

    Returns:
        The filtered rows of ``df``, with their original index labels.
    """
    orig_shape = df.shape[0]

    # Pass 1: restaurants with at least N ratings.
    reviews_per_restaurant = df.groupby('restaurantID')['rating'].count()
    restaurants_of_interest = reviews_per_restaurant[reviews_per_restaurant >= N].index
    df = df[df['restaurantID'].isin(restaurants_of_interest)]

    # Pass 2: reviewers with at least N ratings among the remaining rows.
    reviews_per_reviewer = df.groupby('reviewerID')['rating'].count()
    reviewers_of_interest = reviews_per_reviewer[reviews_per_reviewer >= N].index
    # The result must be assigned back, otherwise the reviewer filter is lost.
    df = df[df['reviewerID'].isin(reviewers_of_interest)]

    # An empty input keeps 0% rather than dividing by zero.
    kept_pct = df.shape[0] / orig_shape * 100 if orig_shape else 0.0
    print(f'Keeping reviewers and restaurants with at least {N} reviews, we keep {kept_pct:.2f}% of the data')
    return df