-
Notifications
You must be signed in to change notification settings - Fork 30
/
Features.py
51 lines (42 loc) · 1.85 KB
/
Features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import pandas as pd
import scipy.stats as stats
class CategoricalFeature():
    """Expose a categorical column as ready-made bins next to the label.

    Each distinct value of ``feature`` is its own bin; missing values are
    collected into a bin named ``'MISSING'``.
    """

    def __init__(self, df, feature):
        """Store the source frame and the name of the column to bin.

        Args:
            df: DataFrame holding the ``feature`` column and a ``'label'``
                column.
            feature: Name of the categorical column in ``df``.
        """
        self.df = df
        self.feature = feature

    @property
    def df_lite(self):
        """Return a new two-column frame ``['bin', 'label']``.

        ``'bin'`` is the feature column with missing values replaced by the
        string ``'MISSING'``. The frame stored on the instance is left
        untouched.

        Raises:
            KeyError: If ``feature`` or ``'label'`` is not a column of the
                stored frame.
        """
        # Work on a copy of just the needed columns: assigning 'bin' directly
        # on self.df would silently add a column to the caller's DataFrame.
        df_lite = self.df[[self.feature, 'label']].copy()
        df_lite['bin'] = df_lite[self.feature].fillna('MISSING')
        return df_lite[['bin', 'label']]
class ContinuousFeature():
    """Bin a continuous column into quantile buckets next to the label.

    The number of quantile buckets is reduced step by step until the
    per-bin label totals are perfectly rank-correlated with the bin edges,
    every bin is larger than ``bin_min_size`` and every bin contains both
    label classes.

    NOTE(review): the class-balance check assumes ``'label'`` is a 0/1
    indicator -- confirm against the callers.
    """

    def __init__(self, df, feature):
        """Store the source frame and derive the minimum bin size.

        Args:
            df: DataFrame holding the ``feature`` column and a ``'label'``
                column.
            feature: Name of the continuous column in ``df``.
        """
        self.df = df
        self.feature = feature
        # A bin must hold more than 5% of all rows to be accepted.
        self.bin_min_size = int(len(self.df) * 0.05)

    def __generate_bins(self, bins_num):
        """Return a copy of ``[feature, 'label']`` with a float ``'bin'`` column.

        ``'bin'`` is the left edge of the quantile interval each row falls
        into; rows whose feature value is missing get NaN. Duplicate
        quantile edges are dropped, so fewer than ``bins_num`` bins may be
        produced.
        """
        # Explicit copy: adding a column to a bare column-slice triggers
        # SettingWithCopyWarning and is ambiguous about touching self.df.
        df = self.df[[self.feature, 'label']].copy()
        df['bin'] = pd.qcut(df[self.feature], bins_num, duplicates='drop') \
                      .apply(lambda x: x.left) \
                      .astype(float)
        return df

    def __generate_correct_bins(self, bins_max=20):
        """Search from ``bins_max`` bins downwards for an acceptable binning.

        Returns the first binning that passes all checks. If none passes,
        the coarsest attempt (two requested bins) is returned as a fallback.

        Raises:
            ValueError: If ``bins_max`` is smaller than 2, because no
                binning would be attempted at all.
        """
        if bins_max < 2:
            raise ValueError('bins_max must be at least 2')

        for bins_num in range(bins_max, 1, -1):
            df = self.__generate_bins(bins_num)
            # Rows with a NaN bin are excluded by groupby's default dropna.
            df_grouped = df.groupby('bin') \
                           .agg({self.feature: 'count',
                                 'label': 'sum'}) \
                           .reset_index()

            # Rank correlation needs at least two bins; a degenerate
            # grouping can never be accepted, so try a coarser split.
            if len(df_grouped) < 2:
                continue

            r, _p = stats.spearmanr(df_grouped['bin'], df_grouped['label'])

            bin_sizes = df_grouped[self.feature]
            label_sums = df_grouped['label']

            # Compare with a tolerance: the coefficient is a float and may
            # miss 1.0 by rounding. NaN (constant input) fails the test.
            is_monotonic = abs(abs(r) - 1.0) < 1e-9
            # Every bin must exceed the minimum size (5% of the rows).
            is_big_enough = bin_sizes.min() > self.bin_min_size
            # Neither class may be absent from a bin: the label sum must
            # be neither 0 nor equal to the bin's row count.
            has_both_classes = not (
                (label_sums == bin_sizes) | (label_sums == 0)
            ).any()

            if is_monotonic and is_big_enough and has_both_classes:
                break

        return df

    @property
    def df_lite(self):
        """Return a two-column frame ``['bin', 'label']``.

        ``'bin'`` holds the left edge of the chosen quantile interval, or
        the string ``'MISSING'`` for rows whose feature value is missing.
        """
        df_lite = self.__generate_correct_bins()
        # Reassign instead of a chained in-place fillna: the chained form
        # does not update the frame under pandas Copy-on-Write, and filling
        # a float column with a string in place is an incompatible-dtype
        # set.
        df_lite['bin'] = df_lite['bin'].fillna('MISSING')
        return df_lite[['bin', 'label']]