#!/usr/bin/env python
# coding: utf-8
# In[111]:
#importing necessary packages beforehand
#for mathematical and matrix operations
import numpy as np
import pandas as pd
#for data visualisation
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
#for pre-processing data
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.datasets import make_regression  #samples_generator was removed from recent scikit-learn versions
import pylab
from scipy import stats
import random
# In[112]:
# importing the training dataset
train_df = pd.read_csv("train.csv")
#description of the dataset in terms of basic measures
train_df.describe()
# In[113]:
#finding out the data types of all fields
train_df.dtypes
# In[114]:
#importing the test dataset
test_df = pd.read_csv("test.csv")
#description of the dataset in terms of basic measures
test_df.describe()
# In[115]:
df = train_df
df.describe()
# In[116]:
#to check if any values of any of the fields are null.
df.isnull().sum()
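# Not part of the original script: a lightweight guard, assuming the data are expected to be complete;
# it raises if the check above did report any missing values.
assert df.isnull().sum().sum() == 0, "unexpected missing values in the training data"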
# In[117]:
#drop() returns a new DataFrame; without assignment (or inplace=True) df itself is left unchanged
df.drop(['post_day', 'basetime_day'], axis = 1)
#make dummy variables for them later and add these fields too (see the sketch below).
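# Not part of the original script: a minimal sketch of the dummy-variable step mentioned above,
# assuming 'post_day' and 'basetime_day' hold categorical values (e.g. weekday names);
# pd.get_dummies expands each category into its own 0/1 indicator column.
df_dummies = pd.get_dummies(df, columns=['post_day', 'basetime_day'], drop_first=True)
df_dummies.head()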
# In[118]:
#making a boxplot for outlier detection
plt.boxplot(df.target)
# In[125]:
#finding out the correlation between features
corr1 = df.corr()
# In[127]:
corr1
# In[128]:
#plotting a heat map for easier detection of correlation amongst variables
plt.subplots(figsize=(10,10))
sns.heatmap(corr1)  #reuse the correlation matrix computed above
# In[129]:
### selecting 20 attributes by correlation with the target:
### take the 30 most positively correlated columns, then drop the 10 highest of those
### (which include target itself), keeping the columns ranked 11th-30th
feature_list = corr1["target"].sort_values().tail(30).head(20)
feature_list.index
# In[130]:
df[feature_list.index].corr()
# In[131]:
#number of instances for each distinct comment count (the target value)
print(df['target'].value_counts())
# In[132]:
#Pie chart of the distribution of the output variable (target)
labels = df['target'].astype('category').cat.categories.tolist()
counts = df['target'].value_counts()
sizes = [counts[var_cat] for var_cat in labels]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True) #autopct shows the percentage share on the plot
ax1.axis('equal')
plt.show()
# In[133]:
X = df.drop(['post_day', 'basetime_day' ], axis = 1)
# deciding on the final features
Features_final = ['base_time', 'page_category', 'post_length', 'h_target', 'promotion',
'page_checkin', 'page_likes', 'c3', 'c5', 'share_count',
'daily_crowd', 'F1', 'c1','target']
df_final = df[Features_final]
df_final.describe()
# In[134]:
# outlier removal by z-score (Gaussian standard score)
df_final = df_final[(np.abs(stats.zscore(df_final)) < 3).all(axis=1)]
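#the filter above keeps only rows whose value in every column lies within 3 standard deviations
#of that column's mean; a row with any |z-score| >= 3 is treated as an outlier and dropped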
df_final.describe()
# In[135]:
df_final.head()
# In[136]:
X = df_final.iloc[:,0:13]
Y = df_final["target"]
## Splitting data train and test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 0, test_size = 0.1)
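#random_state = 0 makes the split reproducible; test_size = 0.1 holds out 10% of the rows for testing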
#applying feature scaling, specifically min-max scaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
#using training data's min and max for scaling the testing data
X_train.shape
# In[151]:
#simple gradient descent algorithm
def gradient_descent(alpha, x, y, numIterations):
    # number of samples
    m = x.shape[0]
    theta = np.ones(14)
    x_transpose = x.transpose()
    for iter in range(0, numIterations):
        hypothesis = np.dot(x, theta)
        loss = hypothesis - y
        ## Cost Function J to be calculated
        J = np.sum(loss ** 2) / (2 * m)
        print("iter %s | J: %.3f" % (iter, J))
        gradient = np.dot(x_transpose, loss) / m
        ## Batch update
        theta = theta - alpha * gradient
    print(theta)
    return theta
if __name__ == '__main__':
    x, y = X_train, Y_train
    m, n = np.shape(x)
    x = np.c_[ np.ones(m), x] # insert column
    alpha = 0.30 # learning rate
    theta = gradient_descent(alpha, x, y, 3000)
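# Not part of the original script: a quick sanity check of the learned coefficients against
# scikit-learn's closed-form linear regression fitted on the same scaled training data;
# after enough iterations theta[0] should approach lr.intercept_ and theta[1:] should approach lr.coef_.
from sklearn.linear_model import LinearRegression
lr = LinearRegression().fit(X_train, Y_train)
print("sklearn intercept:", lr.intercept_)
print("sklearn coefficients:", lr.coef_)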
# In[152]:
#redefining gradient descent so that it returns the train / test errors instead of printing
def gradient_descent(alpha, x, y, numIterations):
    m = x.shape[0] # number of samples
    x = np.c_[ np.ones(m), x]  # insert bias column of ones
    theta = np.ones(14)
    x_transpose = x.transpose()
    for iter in range(0, numIterations):
        hypothesis = np.dot(x, theta)
        loss = hypothesis - y
        ## Cost Function J to be calculated
        J = np.sum(loss ** 2) / (2 * m)
        gradient = np.dot(x_transpose, loss) / m
        theta = theta - alpha * gradient
    theta1 = theta
    y_train_pred = np.dot(x, theta1)
    # root mean square error on the training set
    rms_train = ((((y_train_pred - np.array(y))**2).sum())/m)**0.5
    # X_test and Y_test come from the enclosing (notebook) scope
    x1 = np.c_[ np.ones(X_test.shape[0]), X_test]
    y_test_pred = np.dot(x1, theta)
    rms_test = ((((y_test_pred - np.array(Y_test))**2).sum())/(x1.shape[0]))**0.5
    xyz = (alpha, rms_train, rms_test)
    return xyz
# In[153]:
#running the algorithm for different values of alpha
df_exp1 = pd.DataFrame()
for alpha in (0.01, 0.1, 0.3, 1.1, 1.15, 1.2):
    alpha, rms_train, rms_test = gradient_descent(alpha, X_train, Y_train, 3000)
    row = {"alpha": [alpha], "rms_train": [rms_train], "rms_test": [rms_test]}
    df1 = pd.DataFrame(data = row)
    #DataFrame.append was removed in recent pandas; concat is the equivalent
    df_exp1 = pd.concat([df_exp1, df1], ignore_index = True)
# In[154]:
df_exp1
# In[155]:
plt.plot(df_exp1.alpha, df_exp1.rms_train, label="train RMSE")
plt.plot(df_exp1.alpha, df_exp1.rms_test, label="test RMSE")
plt.axis([0, 1.5, 7.5, 9.5])
plt.xlabel("alpha-->")
plt.ylabel("root mean square error")
plt.legend()  #labels are needed for the legend to display entries
plt.show()
# In[156]:
#saving the value of cost function at each step
costs = list()
#regularised (ridge) gradient descent with a user-provided lambda
def gradient_descent(alpha, x, y, numIterations, lmbda):
    m = x.shape[0] # number of samples
    x = np.c_[ np.ones(m), x]  # insert bias column of ones
    theta = np.ones(14)
    x_transpose = x.transpose()
    for iter in range(0, numIterations):
        hypothesis = np.dot(x, theta)
        loss = hypothesis - y
        # regularised cost: J = (1/2m) * [sum of squared errors + lambda * sum(theta_j^2)], bias term excluded
        J = (np.sum(loss ** 2) / (2 * m)) + ((lmbda * np.sum(theta[1:]**2)) / (2 * m))
        costs.append(J)
        gradient = np.dot(x_transpose, loss) / m
        # the penalty must also enter the update; shrink every coefficient except the bias
        gradient[1:] += (lmbda / m) * theta[1:]
        theta = theta - alpha * gradient
    theta1 = theta
    y_train_pred = np.dot(x, theta1)
    # root mean square error
    rms_train = ((((y_train_pred - np.array(y))**2).sum())/m)**0.5
    #mean square error
    mse_train = ((((y_train_pred - np.array(y))**2).sum())/m)
    # X_test and Y_test come from the enclosing (notebook) scope
    x1 = np.c_[ np.ones(X_test.shape[0]), X_test]
    y_test_pred = np.dot(x1, theta)
    mse_test = ((((y_test_pred - np.array(Y_test))**2).sum())/(x1.shape[0]))
    rms_test = ((((y_test_pred - np.array(Y_test))**2).sum())/(x1.shape[0]))**0.5 ## root mean square error
    xyz = (alpha, rms_train, rms_test, mse_train, mse_test)
    return xyz
# In[157]:
#running the regularised algorithm for different values of alpha (lambda fixed at 0.5)
df_exp1 = pd.DataFrame()
for alpha in (0.005, 0.05, 0.025, 0.01, 0.1, 0.3):
    alpha, rms_train, rms_test, mse_train, mse_test = gradient_descent(alpha, X_train, Y_train, 10000, 0.5)
    row = {"alpha": [alpha], "rms_train": [rms_train], "rms_test": [rms_test], "mse_train": [mse_train], "mse_test": [mse_test]}
    df1 = pd.DataFrame(data = row)
    #DataFrame.append was removed in recent pandas; concat is the equivalent
    df_exp1 = pd.concat([df_exp1, df1], ignore_index = True)
# In[158]:
df_exp1
# In[159]:
costs
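# Not part of the original script: plotting the recorded cost values gives a quick convergence check;
# note that `costs` accumulates J across every run of the loop above, so the curve concatenates
# all six alpha settings back to back (10000 iterations each).
plt.plot(costs)
plt.xlabel("iteration (all runs concatenated)")
plt.ylabel("cost J")
plt.show()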
# In[160]:
plt.plot(df_exp1.alpha, df_exp1.rms_train, label="train RMSE")
plt.plot(df_exp1.alpha, df_exp1.rms_test, label="test RMSE")
plt.axis([0.005, 1.5, 7.5, 9.5])
plt.xlabel("alpha-->")
plt.ylabel("root mean square error")
plt.legend()  #labels are needed for the legend to display entries
plt.show()
# In[ ]: