# VotingReg.py
"""Predict outfield players' overall rating with a VotingRegressor that
averages a RandomForestRegressor and a LinearRegression model."""
print(__doc__)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
# read the file
df = pd.read_csv("Datasets/Outfield_Players_features.csv")
print(df.head())
# check for missing values that need to be imputed
print(df.isnull().sum())
# the missing values in the Outfield Players dataset need to be imputed;
# the check above reveals the following counts:
# release_clause_eur 55
# team_position 21 -> not too relevant; dropped or encoded later on
# dribbling 25
# passing 25
# shooting 25
# pace 25
# impute release_clause_eur with its mean value
mean = df['release_clause_eur'].mean()
df['release_clause_eur'] = df['release_clause_eur'].fillna(mean)
# impute dribbling with its mean value
mean = df['dribbling'].mean()
df['dribbling'] = df['dribbling'].fillna(mean)
# impute passing with its mean value
mean = df['passing'].mean()
df['passing'] = df['passing'].fillna(mean)
# now for shooting
mean = df['shooting'].mean()
df['shooting'] = df['shooting'].fillna(mean)
# the same is done for pace
mean = df['pace'].mean()
df['pace'] = df['pace'].fillna(mean)
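# (optional check, not in the original script) confirm that the five imputed
# columns no longer contain missing values before moving on:
for col in ['release_clause_eur', 'dribbling', 'passing', 'shooting', 'pace']:
    assert df[col].isnull().sum() == 0, "{0} still has missing values".format(col)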
# team_position is not really that important, so it could simply be dropped here;
# instead, categorical columns are converted to numeric values below
# df.drop(['team_position'], axis=1, inplace=True)
# one-hot encode any remaining categorical columns (team_position included)
df = pd.get_dummies(df)
print(df.isnull().sum(), "\n\n")
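# (optional, not in the original) one-hot encoding widens the frame; report the
# resulting shape so the number of added dummy columns is visible:
print("shape after one-hot encoding:", df.shape)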
# split data into training (80%) and test set (20%)
train, test = train_test_split(df, test_size = 0.2)
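# note: the split above is random on every run; for a reproducible experiment
# a seed could be passed, e.g. (hypothetical value):
# train, test = train_test_split(df, test_size = 0.2, random_state = 0)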
# print(test[0:1])
# save the cleaned data to csv for future use
df.to_csv("Datasets/cleaned_dataset.csv")
# split the features (x) from the target label 'overall' (y) for both sets
x_train = train.drop('overall', axis = 1)
y_train = train['overall']
x_test = test.drop('overall', axis = 1)
y_test = test['overall']
# build the voting regressor from a RandomForestRegressor and a LinearRegression
reg1 = RandomForestRegressor(random_state = 0, n_estimators = 10)
reg2 = LinearRegression()
vreg = VotingRegressor([ ('rf', reg1), ('lr', reg2) ])
# fit the individual regressors and the voting ensemble
reg1.fit(x_train, y_train)
reg2.fit(x_train, y_train)
vreg.fit(x_train, y_train)
# score the models (.score() returns R^2 on the test set)
rfscore = reg1.score(x_test, y_test)
lrscore = reg2.score(x_test, y_test)
vrscore = vreg.score(x_test, y_test)
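# (optional, not in the original output) print the R^2 scores to the console and,
# using mean_squared_error and sqrt imported above, a rough RMSE comparison on
# the same test set:
print("R^2  -> rf: {0:.3f}, lr: {1:.3f}, voting: {2:.3f}".format(rfscore, lrscore, vrscore))
print("RMSE -> rf: {0:.3f}, lr: {1:.3f}, voting: {2:.3f}".format(
    sqrt(mean_squared_error(y_test, reg1.predict(x_test))),
    sqrt(mean_squared_error(y_test, reg2.predict(x_test))),
    sqrt(mean_squared_error(y_test, vreg.predict(x_test)))))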
# make predictions and print intermediate results
print("RandomForestRegressor Predictions:\n", reg1.predict(x_test), "\n")
print("Linear Regression Predictions:\n", reg2.predict(x_test), "\n")
print("Voting Regressor Predictions:\n", vreg.predict(x_test), "\n")
# visualization and comparison
plt.figure()
plt.plot(reg1.predict(x_test), 'b^', label = 'RandomForestRegressor @score {0:.2f}'.format(rfscore))
plt.plot(reg2.predict(x_test), 'ys', label = 'LinearRegression @score {0:.2f}'.format(lrscore))
plt.plot(vreg.predict(x_test), 'r*', label = 'VotingRegressor @score {0:.2f}'.format(vrscore))
# plt.tick_params(axis = 'x', which = 'both', top = False, labelbottom = True)
plt.tick_params(axis = 'x', which = 'both', bottom = False, top = False, labelbottom = False)
plt.ylabel('predicted')
plt.xlabel('test samples')
plt.legend(loc="best")
plt.title('Comparison of individual predictions with averaged')
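# (optional, not in the original) persist the figure before plt.show();
# the file name below is an assumption, not taken from the original script:
# plt.savefig("Datasets/voting_regressor_comparison.png", dpi=150)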
plt.show()
# write the predictions to csv, since there are several predictors to compare
actual_col = np.array(y_test)
rf_col = np.array(reg1.predict(x_test))
lr_col = np.array(reg2.predict(x_test))
vr_col = np.array(vreg.predict(x_test))
save_df = pd.DataFrame({
    'actual_overall': actual_col,
    'rforest_reg_predicted': rf_col,
    'linear_reg_predicted': lr_col,
    'voting_reg_predicted': vr_col,
})
save_df.to_csv("Datasets/VotingRegressorSubmission.csv", index = False)
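# (optional, not in the original) quick read-back check that the submission file
# was written and holds the four expected columns:
check_df = pd.read_csv("Datasets/VotingRegressorSubmission.csv")
print(check_df.columns.tolist(), "-", len(check_df), "rows written")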