-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.py
182 lines (151 loc) · 6.39 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import pandas as pd
import os
import glob
import errno
def clean(filename):
# Read csv into a df
df = pd.read_csv(INPUT_DIR + filename)
print("=" * len(filename))
print(filename)
print("=" * len(filename))
print()
new_column_names = {
# Common
'Street Address': 'street_address',
'Creation Date': 'creation_date',
'Completion Date': 'completion_date',
'Status': 'status',
'Service Request Number': 'request_number',
'Street Address': 'street_address',
'Zip Code': 'zip_code',
'Zip Codes': 'zip_codes',
'Ward': 'ward',
'Wards': 'wards',
'Location': 'location',
'Historical Wards 2003-2015': 'historical_wards',
'Community Area': 'community_area',
'Community Areas': 'community_areas',
'Census Tracts': 'census_tracts',
'Police District': 'police_district',
'Type Of Service Request': 'request_type',
'Latitude': 'latitude',
'Longitude': 'longitude',
'X Coordinate': 'x_coordinate',
'Y Coordinate': 'y_coordinate',
# RequestType-specific
'Current Activity': 'current_activity',
'Most Recent Action': 'most_recent_action',
'Zip': 'zip_code',
'Ssa': 'ssa',
'License Plate': 'license_plate',
'Vehicle Color': 'vehicle_color',
'Vehicle Make/Model': 'vehicle_model',
'How Many Days Has The Vehicle Been Reported As Parked?': 'days_parked',
'Number Of Black Carts Delivered': 'num_of_black_carts',
'What Type Of Surface Is The Graffiti On?': 'graffiti_surface',
'Where Is The Graffiti Located?': 'graffiti_location',
'Number Of Potholes Filled On Block': 'potholes_filled',
'Number Of Premises Baited': 'premises_baited',
'Number Of Premises With Garbage': 'premises_with_garbage',
'Number Of Premises With Rats': 'premises_with_rats',
'What Is The Nature Of This Code Violation?': 'nature',
'If Yes, Where Is The Debris Located?': 'debris_location',
'Location Of Trees': 'tree_location'
}
# Output filenames (they correspond to relation names)
outnames = {
'311-service-requests-abandoned-vehicles.csv': 'vehicle_request.csv',
'311-service-requests-alley-lights-out.csv': 'alley_lights_request.csv',
'311-service-requests-garbage-carts.csv': 'garbage_request.csv',
'311-service-requests-graffiti-removal.csv': 'graffiti_request.csv',
'311-service-requests-pot-holes-reported.csv': 'pothole_request.csv',
'311-service-requests-rodent-baiting.csv': 'rodent_request.csv',
'311-service-requests-sanitation-code-complaints.csv': 'sanitation_request.csv',
'311-service-requests-street-lights-all-out.csv': 'lights_all_request.csv',
'311-service-requests-street-lights-one-out.csv': 'lights_one_request.csv',
'311-service-requests-tree-debris.csv': 'tree_debris_request.csv',
'311-service-requests-tree-trims.csv': 'tree_trim_request.csv'
}
# Columns that exist in all csv files
common_columns = [
'request_id',
'street_address',
'creation_date',
'completion_date',
'status',
'request_number',
'zip_code',
'zip_codes',
'ward',
'wards',
'location',
'historical_wards',
'community_area',
'community_areas',
'census_tracts',
'police_district',
'request_type',
'latitude',
'longitude',
'x_coordinate',
'user_reg_id',
'y_coordinate']
# Remove duplicate rows
df = df[~df.duplicated()]
# Rename column names to match column names in our DB
df.columns = map(str.title, df.columns)
df.rename(columns=new_column_names, inplace=True)
# Drop location column [redundant]
df.drop(columns=['location'], inplace=True)
# Remove 'T' from dates
df['creation_date'] = df['creation_date'].str.replace('T', ' ')
df['completion_date'] = df['completion_date'].str.replace('T', ' ')
# Applies only on './data/311-service-requests-street-lights-one-out.csv'
df['request_type'] = df['request_type'].str.replace('Street Light Out', 'Street Light - 1/Out')
# Applies only on './data/311-service-requests-pot-holes-reported.csv'
df['request_type'] = df['request_type'].str.replace('Pot Hole in Street', 'Pothole in Street')
# Filter license plates
if 'license_plate' in df.columns:
df['license_plate'] = df[df['license_plate'].str.len() < 10]['license_plate']
df['longitude'] = df['longitude'].map(lambda x: '%20.17f' % x)
df['latitude'] = df['latitude'].map(lambda x: '%20.17f' % x)
# Add user id column
df["user_reg_id"] = ""
# Change index
df.reset_index(drop=True, inplace=True)
df.index += clean.start_pos
# Increment starting position for the next call
clean.start_pos += df.shape[0]
# Make index one of the columns with name 'request_id'
df['request_id'] = df.index + 1
# Create csv with specific columns for this request type [i.e.: exclude common columns]
current_df = df[df.columns[~df.columns.isin([c for c in common_columns if c != 'request_id'])]]
current_df = current_df[['request_id'] + sorted([a for a in current_df.columns.to_list() if a != 'request_id'])]
current_df.to_csv(OUTPUT_DIR + outnames[filename], index=False, header=False, float_format='%.f')
# Keep only common columns
df = df[df.columns[df.columns.isin(common_columns)]]
# Append rows to request_df
clean.request_df = pd.concat([clean.request_df,df])
INPUT_DIR = './data/'
OUTPUT_DIR = './out/'
input_files = [f for f in os.listdir(INPUT_DIR) if f.endswith('.csv')]
input_files.remove('311-service-requests-vacant-and-abandoned-buildings-reported.csv')
# Create OUTPUT_DIR if not exist
try:
os.makedirs(OUTPUT_DIR)
except OSError as e:
if e.errno != errno.EEXIST:
raise
# Delete all files in OUTPUT_DIR
for f in glob.glob(OUTPUT_DIR + '*'):
os.remove(f)
# Produce clean csv files
clean.start_pos = 0
clean.request_df = pd.DataFrame()
for f in input_files:
clean(f)
# Produce general request.csv
df = clean.request_df
excluded_cols = ['request_id', 'user_reg_id']
df = df[['request_id'] + sorted([a for a in df.columns.to_list() if a not in excluded_cols]) + ['user_reg_id']]
df.to_csv(OUTPUT_DIR + 'request.csv', header=False, index=False, float_format='%.f')