-
Notifications
You must be signed in to change notification settings - Fork 1
/
feature_engineering.py
211 lines (134 loc) · 6.79 KB
/
feature_engineering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
"""
Module for feature engineering
Author: Son Gyo Jung
Email: sgj13@cam.ac.uk
"""
import os
import numpy as np
import pandas as pd
import joblib
import itertools
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
class engineering():
"""
Class is used to engineer use features
Two methods present: (a) brute force method, (b) feature marker
Note: this should be used before scaling the features and before oversampling.
args:
(1) path_to_file (type:str) - location of the data file with features; last column is taken as the target feature
(2) path_to_save (type:str) - loacation to save data
(3) target (type:str) - name of target variable
(4) features (list) - list of exploratory features
(5) csv (type:bool) - whether to save as csv
return:
(1) pandas.Dataframe with newly engineered features
"""
def __init__(self, path_to_file, path_to_save, target, features, csv = False):
self.path_to_save = path_to_save
self.csv = csv
self.sample_train = joblib.load(path_to_file)
# Define input and target variables
if isinstance(features, list):
self.features = features
else:
self.features = joblib.load(features)
self.target = target
print('Name of target column: ', self.target)
print('No. of exploratory features: ', len(self.features))
def movecol(self, dataframe, cols_to_move = [], ref_col = '', place = 'after'):
"""
Function to rearrange columns
arg:
(a) cols_to_move (list) - list of columns to move
(b) ref_col (type:str) - reference column
(c) place (type:str) - whether to move the specified columns 'before' or 'after' the reference column (set to 'after' as default)
return:
(a) pandas.Dataframe
"""
cols = dataframe.columns.tolist()
if place == 'after':
s1 = cols[:list(cols).index(ref_col) + 1]
s2 = cols_to_move
if place == 'before':
s1 = cols[:list(cols).index(ref_col)]
s2 = cols_to_move + [ref_col]
s1 = [i for i in s1 if i not in s2]
s3 = [i for i in cols if i not in s1 + s2]
return dataframe[s1 + s2 + s3]
def brute_force(self, feature_list):
"""
Feature engineering using brute force method
Use features identified to have statistical significance
args:
(1) feature_list (list) - list of features to use
return:
(1) pandas.Dataframe with newly engineered features
"""
# List of all permutations
all_perm = list()
# Find possible permutations between a pair of features
for i in itertools.permutations(set(feature_list), r=2):
all_perm.append(list(i))
print('Total number of permutation:', len(all_perm))
# Original columns
original_cols = self.sample_train.columns.tolist()
self.sample_train.replace({False: 0, True: 1}, inplace=True)
# Compute the feature values of the new features
for f in all_perm:
if str(f[0]) != str(f[1]):
self.sample_train[str(f[0]) + '/' + str(f[1])] = self.sample_train[f[0]] / self.sample_train[f[1]]
print('Invalid operations:', len(all_perm) - (len(self.sample_train.columns.tolist())-len(original_cols)))
# Move target to last column
self.sample_train = self.movecol(
self.sample_train,
cols_to_move = [self.target],
ref_col = self.sample_train.columns.tolist()[-1],
place = 'after'
)
# New columns
latest_cols = self.sample_train.columns.tolist()
self.new_cols = []
for c in latest_cols:
if c not in original_cols:
self.new_cols.append(c)
# Replace str to Nan
self.sample_train[self.new_cols] = self.sample_train[self.new_cols].applymap(lambda x: x if not isinstance(x, str) else np.nan)
# Replace NaN with zeros
self.sample_train = self.sample_train.fillna(0)
# Replace the infinites to zeros
self.sample_train = self.sample_train.replace([np.inf, -np.inf], 0)
return self.sample_train, self.new_cols
def feature_markers(self, feature_list):
"""
Create feature markers (1 or 0) to indicate the presence or absence of a feature, respectively
Use features identified to have statistical significance
args:
(1) feature_list (list) - list of features to use
return:
(1) pandas.Dataframe with newly engineered features
"""
# Compute the feature values of the new features
for f in feature_list:
self.sample_train[str(f) + '_marker'] = self.sample_train[f].apply(lambda x: 1 if x != 0 else 0)
# Move target to last column
self.sample_train = self.movecol(self.sample_train, cols_to_move=[self.target], ref_col=self.sample_train.columns.tolist()[-1], place='after')
return self.sample_train
def save(self, test_size=0.2, random_state=42):
"""
Save data file with new features
"""
#Save data as csv
if self.csv == True:
self.sample_train.to_csv(os.path.join(self.path_to_save, r'engineered_features_' + self.target + '.csv'))
print('Result saved as: engineered_features_' + self.target + '.csv')
joblib.dump(self.sample_train, os.path.join(self.path_to_save, r'df_' + self.target + '_engineered_features.pkl'))
print('Result saved as: df_' + self.target + '_engineered_features.pkl')
joblib.dump(self.new_cols, os.path.join(self.path_to_save, r'features_' + self.target + '_engineered.pkl'))
print('Result saved as: features_' + self.target + '_engineered.pkl')
# Split dataset
df_train, df_test = train_test_split(self.sample_train, test_size=test_size, random_state=random_state)
joblib.dump(df_train, os.path.join(self.path_to_save, r'df_train_' + self.target + '_engineered.pkl'))
joblib.dump(df_test, os.path.join(self.path_to_save, r'df_test_' + self.target + '_engineered.pkl'))
print('Result saved as: df_train_' + self.target + '_engineered.pkl')
print('Result saved as: df_test_' + self.target + '_engineered.pkl')