# Load and clean data
import json
import warnings

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SVMSMOTE
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 (registers IterativeImputer)
from sklearn.impute import IterativeImputer
from sklearn.model_selection import StratifiedKFold

from exploratory_analysis import loops

SEED = 2023


def load_train_data(filepath="Kaggle_download/train.csv", seed=SEED):
    """
    Loads, cleans, and engineers new variables in the Kaggle data, then
    collapses it to the household level.

    Inputs:
        filepath (str, optional): path to the data; defaults to the train CSV
        seed (int, optional): random seed

    Returns:
        df (DataFrame): cleaned household-level data, including the Target column
        train_indices (dict): fold number -> training row indexes
        valid_indices (dict): fold number -> validation row indexes
    """
    # Load data
    df = pd.read_csv(filepath)
    # Clean a couple of data fields
    ###########################################################################
    # see here for info: https://www.kaggle.com/competitions/costa-rican-household-poverty-prediction/discussion/61751
    # edjefe: "yes"/"no" are miscodings for 1/0
    df.loc[df.loc[:, "edjefe"] == "yes", "edjefe"] = 1
    df.loc[df.loc[:, "edjefe"] == "no", "edjefe"] = 0
    df["edjefe"] = df["edjefe"].astype(str).astype(int)
    # edjefa
    df.loc[df.loc[:, "edjefa"] == "yes", "edjefa"] = 1
    df.loc[df.loc[:, "edjefa"] == "no", "edjefa"] = 0
    df["edjefa"] = df["edjefa"].astype(str).astype(int)
    # ASSUME DEPENDENCY HAS THE SAME MISCODING
    # https://www.kaggle.com/competitions/costa-rican-household-poverty-prediction/discussion/73055
    df.loc[df.loc[:, "dependency"] == "yes", "dependency"] = 1
    df.loc[df.loc[:, "dependency"] == "no", "dependency"] = 0
    df["dependency"] = df["dependency"].astype(str).astype(float)
    # Fix NAs for number of tablets owned
    df.loc[:, "v18q1"] = df.loc[:, "v18q1"].fillna(0)
    # Create new individual-level variables based on lit review
    ###########################################################################
    # highest level of education in household
    def get_max_education_level(row):
        education_levels = [
            row["instlevel1"],
            row["instlevel2"],
            row["instlevel3"],
            row["instlevel4"],
            row["instlevel5"],
            row["instlevel6"],
            row["instlevel7"],
            row["instlevel8"],
            row["instlevel9"],
        ]
        return max(education_levels)

    # Create a new column representing the highest education level in a household
    df["max_education_level"] = df.apply(get_max_education_level, axis=1)
    # if there is a marriage in the household
    df.loc[:, "hh_has_marriage"] = (
        df.loc[:, "estadocivil3"].groupby(df.loc[:, "idhogar"]).transform("max")
    )
    # max age in household
    df.loc[:, "hh_max_age"] = (
        df.loc[:, "age"].groupby(df.loc[:, "idhogar"]).transform("max")
    )
    # sex ratio in household: #male / #female
    df.loc[:, "hh_sex_ratio"] = df.loc[:, "r4h3"] / df.loc[:, "r4m3"]
    # child/woman ratio in household
    # (children defined as under 12, women as 12 and over)
    df.loc[:, "hh_child_woman_ratio_12"] = df.loc[:, "r4t1"] / df.loc[:, "r4m3"]
    # child/adult ratio in household
    # (children defined as under 12, adults as 12 and over)
    df.loc[:, "hh_child_adult_ratio_12"] = df.loc[:, "r4t1"] / df.loc[:, "r4t2"]
    # child/woman ratio in household
    # (children defined as under 19, women as 12 and over)
    # NOTE: DATA QUALITY ISSUE -- these categories aren't mutually exclusive
    df.loc[:, "hh_child_woman_ratio_19"] = df.loc[:, "hogar_nin"] / df.loc[:, "r4m2"]
    # child/adult ratio in household
    # (children defined as under 19, adults as 19 and over)
    df.loc[:, "hh_child_adult_ratio_19"] = df.loc[:, "hogar_nin"] / df.loc[:, "hogar_adul"]
    # Reshape the data to be at household level rather than individual level
    ###########################################################################
    # pick the head of the household
    # (initialize to 0 so the fallbacks below can detect households with no
    # flagged head; otherwise the column is NaN for non-heads and the
    # hh_head_exists == 0 checks never fire)
    df.loc[:, "hh_head"] = 0
    df.loc[df.loc[:, "parentesco1"] == 1, "hh_head"] = 1
    # create a temp var to determine if a household head exists
    df.loc[:, "hh_head_exists"] = df.groupby([df.loc[:, "idhogar"]])[
        "hh_head"
    ].transform("max")
    # in instances where there isn't a head of household, pick the oldest male
    df.loc[
        (
            (df.loc[:, "hh_head_exists"] == 0)
            & (df.loc[:, "age"] == df.loc[:, "hh_max_age"])
            & (df.loc[:, "male"] == 1)
        ),
        "hh_head",
    ] = 1
    # update the temp hh head flag var
    df.loc[:, "hh_head_exists"] = df.groupby([df.loc[:, "idhogar"]])[
        "hh_head"
    ].transform("max")
    # in instances where there isn't an oldest male, pick the oldest
    df.loc[
        (
            (df.loc[:, "hh_head_exists"] == 0)
            & (df.loc[:, "age"] == df.loc[:, "hh_max_age"])
        ),
        "hh_head",
    ] = 1
    # collapse the data to one row per household
    df = df.loc[df.loc[:, "hh_head"] == 1]
    # drop the temp var and other household head vars
    df = df.drop(columns=["hh_head_exists", "parentesco1", "hh_head"])
    # Create household-level variables
    ###########################################################################
    with open("var_descriptions.json", "r") as f:
        # Load JSON data as a dictionary
        var_desc = json.load(f)
    features_to_include = [
        x
        for x in var_desc.keys()
        if x
        not in [
            "Id",
            "idhogar",
            "dependency",
            "rez_esc",
            "hh_head",
            "parentesco1",
            "hh_head_exists",
        ]
    ]
    df_subset = df[features_to_include]
    # impute missing rent values while suppressing warning messages
    with warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=Warning)
        imp_mean = IterativeImputer(random_state=0, n_nearest_features=5)
        imp_mean.fit(df_subset)
        mean_subset = imp_mean.transform(df_subset)
    # replace v2a1 (monthly rent) with column 0 of the imputed matrix
    # (assumes v2a1 is the first key in var_descriptions.json),
    # clipping negative imputed values to 0
    df.loc[:, "v2a1"] = mean_subset[:, 0]
    df.loc[df.loc[:, "v2a1"] < 0, "v2a1"] = 0
    # the logged value of v2a1 provides a better distribution
    df["v2a1_log"] = np.log1p(df["v2a1"])
    # Clean up NAs and inf values
    cols_to_drop = ["Id", "idhogar", "rez_esc"]
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    for col in cols_to_drop:
        if col in df.columns:
            df.drop(col, axis=1, inplace=True)
    df.fillna(df.mean(), inplace=True)
    train_indices, valid_indices = implement_kfold(df, random_state=seed)
    return df, train_indices, valid_indices
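
# Example usage (assumes the Kaggle CSV and var_descriptions.json are present
# at their default paths; the fold indexes are positional, so .iloc applies):
#
#     df, train_indices, valid_indices = load_train_data()
#     fold0_train = df.iloc[train_indices[0]]  # rows for fold 0 training
#     fold0_valid = df.iloc[valid_indices[0]]  # rows for fold 0 validation
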
def implement_kfold(df, n_splits=5, shuffle=True, random_state=SEED):
    """
    This helper function implements stratified k-fold cross-validation.
    Primarily called within load_train_data but can be called independently.

    Inputs:
        df (DataFrame): a dataframe with features and a Target column
        n_splits (int, optional): k, the number of splits to make in the dataframe
        shuffle (bool, optional): whether to shuffle each class's samples
            before splitting into batches
        random_state (int, optional): the random seed to set for replicability

    Outputs:
        train_indices (dict): maps each fold number (from k-fold) to the row
            indexes to include as training data for that pass
        valid_indices (dict): maps each fold number (from k-fold) to the row
            indexes to include as validation data for that pass
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    indices = skf.split(df.drop(columns="Target"), df.loc[:, ["Target"]])
    train_indices = {}
    valid_indices = {}
    for i, (train_index, valid_index) in enumerate(indices):
        train_indices[i] = train_index
        valid_indices[i] = valid_index
    return train_indices, valid_indices
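
# Because folds are stratified on Target, each validation fold should roughly
# reproduce the full data's class proportions; a quick sanity check:
#
#     train_idx, valid_idx = implement_kfold(df)
#     print(df["Target"].iloc[valid_idx[0]].value_counts(normalize=True))
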
def gen_oversample_data(df, seed=SEED):
    """
    Generate randomly oversampled dataframes.

    Inputs:
        df (DataFrame): data and labels
        seed (int): optional seed

    Returns:
        train_X_resampled (DataFrame): the resampled data
        train_y_resampled (Series): the resampled labels
    """
    # drop the label column explicitly rather than assuming it is last
    X = df.drop(columns="Target")
    y = df.loc[:, "Target"]
    ros = RandomOverSampler(random_state=seed)
    train_X_resampled, train_y_resampled = ros.fit_resample(X, y)
    return train_X_resampled, train_y_resampled
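
# RandomOverSampler duplicates minority-class rows until every class matches
# the majority class count, so the resampled labels come out balanced:
#
#     X_res, y_res = gen_oversample_data(df)
#     print(y_res.value_counts())  # all four Target classes equal-sized
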
def gen_SMOTE_data(df, seed=SEED):
    """
    Generate SMOTE dataframes (using the SVMSMOTE variant).

    Inputs:
        df (DataFrame): data and labels
        seed (int): optional seed

    Returns:
        X_smote (DataFrame): the resampled data
        y_smote (Series): the resampled labels
    """
    X = df.drop(columns="Target")
    y = df.loc[:, "Target"]
    sm = SVMSMOTE(random_state=seed)
    X_smote, y_smote = sm.fit_resample(X, y)
    return X_smote, y_smote
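
# Unlike random oversampling, SVMSMOTE synthesizes new minority-class samples
# near the SVM-estimated class boundary rather than duplicating rows, so
# X_smote can contain feature values not present in the original data:
#
#     X_smote, y_smote = gen_SMOTE_data(df)
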
def two_step(model, df, train_indices, valid_indices, oversample=None, var_thresh=False):
    """
    Run a two-step classification: first run the model over the full data
    (reported as "Classification for 4"), then restrict to households with
    Target in {1, 2, 3} and rerun with a fresh stratified k-fold split.
    Note the first pass always runs without oversampling or variance
    thresholding; oversample and var_thresh apply only to the second pass.

    Inputs:
        model: an instantiated sklearn-style classifier
        df (DataFrame): data and labels
        train_indices, valid_indices (dict): fold -> row indexes, as produced
            by implement_kfold
        oversample (optional): oversampling setting passed through to
            loops.loop_model for the second pass
        var_thresh (bool, optional): variance-threshold flag passed through to
            loops.loop_model for the second pass
    """
    print("Classification for 4")
    loops.loop_model(model, df, train_indices, valid_indices, oversample=None, var_thresh=False)
    not_pred = df.loc[df.loc[:, "Target"].isin([1, 2, 3]), :]
    train_indices_not, valid_indices_not = implement_kfold(
        not_pred, n_splits=5, shuffle=True, random_state=SEED
    )
    print("Classification for 1,2,3")
    loops.loop_model(
        model,
        not_pred,
        train_indices_not,
        valid_indices_not,
        oversample=oversample,
        var_thresh=var_thresh,
    )
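

if __name__ == "__main__":
    # Illustrative end-to-end run; the RandomForestClassifier is a stand-in,
    # not necessarily the model used elsewhere in this project.
    from sklearn.ensemble import RandomForestClassifier

    df, train_indices, valid_indices = load_train_data()
    two_step(RandomForestClassifier(random_state=SEED), df, train_indices, valid_indices)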