-
Notifications
You must be signed in to change notification settings - Fork 0
/
experiment.py
87 lines (71 loc) · 3.04 KB
/
experiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from data import *
# `pd` is not imported here directly; it is presumably pandas re-exported by
# `from data import *` (TODO confirm). Setting the option to None turns off the
# chained-assignment (SettingWithCopy) check.
pd.options.mode.chained_assignment = None
# Destination bucket name and object key. In the code visible in this file
# they are referenced only by the commented-out upload_data(...) calls.
bucket = 'darkanita'
key = 'Safety_GPS1.csv'
def first_preprocessing(data, app_id, app_code):
    """Run the first cleaning/enrichment pass over the raw incident reports.

    The steps are delegated, in order, to helpers from the ``data`` module:
    ``format_date`` on 'INCIDENT DATE', ``drop_duplicates`` (first on the '#'
    column, then on the report content columns), ``create_category_columns``,
    ``add_data_location``, ``translate_columns`` on 'DESCRIPTION', and
    ``normalize_text`` on 'INCIDENT TITLE' and 'DESCRIPTION'. The table shape
    and each helper's secondary return value are printed for inspection.

    Args:
        data: Table of incident reports (presumably a pandas DataFrame with
            the columns named below -- confirm against ``load_data``).
        app_id: Forwarded unchanged to ``add_data_location``.
        app_code: Forwarded unchanged to ``add_data_location``.

    Returns:
        The table returned by the last processing step.
    """
    data = format_date(data, 'INCIDENT DATE')

    # De-duplicate on the '#' column first, then on the report content itself.
    data = drop_duplicates(data, ['#'])
    print(data.shape)
    content_columns = [
        'INCIDENT TITLE', 'INCIDENT DATE', 'LOCATION', 'DESCRIPTION',
        'CATEGORY', 'LATITUDE', 'LONGITUDE', 'More Info',
    ]
    data = drop_duplicates(data, content_columns)
    print(data.shape)

    data, category = create_category_columns(data)
    print(data.shape)
    print(category)

    data, problems = add_data_location(data, app_id, app_code)
    print(problems)
    print(data.shape)

    data, problems_description = translate_columns(data, 'DESCRIPTION')
    print(problems_description)

    # Normalize each text column exactly once. The previous version ran the
    # 'INCIDENT TITLE' pass twice in a row (copy-paste duplicate), which
    # repeated the work and printed the same word list twice.
    data, title_words = normalize_text(data, 'INCIDENT TITLE')
    print(list(set(title_words)))
    data, description_words = normalize_text(data, 'DESCRIPTION')
    print(list(set(description_words)))

    # The '<column> WORDS' columns are expected to have been added by
    # normalize_text (assumption -- the helper is defined in the data module).
    print(data[['INCIDENT TITLE', 'INCIDENT TITLE WORDS']].head())
    print(data[['DESCRIPTION', 'DESCRIPTION WORDS']].head())
    return data
def main(aws_access_key_id, aws_secret_access_key, app_id, app_code):
    """Load the prepared dataset and preview the reports whose country is India.

    Prints the shape of the loaded table, then the first rows of the
    'INCIDENT TITLE', 'CITY' and 'DESCRIPTION' columns for rows where
    'COUNTRY' equals 'India'.

    Args:
        aws_access_key_id: Not used by the active code; only the disabled
            upload step referenced it.
        aws_secret_access_key: Not used by the active code; only the disabled
            upload step referenced it.
        app_id: Not used by the active code; only the disabled preprocessing
            step referenced it.
        app_code: Not used by the active code; only the disabled preprocessing
            step referenced it.
    """
    # Disabled experiment: preprocess the raw export from scratch.
    #   raw_url = 'https://darkanita.s3-sa-east-1.amazonaws.com/Safecity+Reports+-+28072019.csv'
    #   reports = first_preprocessing(load_data(raw_url), app_id, app_code)

    source_url = 'https://darkanita.s3-sa-east-1.amazonaws.com/Safety_GPS1.csv'
    reports = load_data(source_url)
    print(reports.shape)

    is_india = reports['COUNTRY'] == 'India'
    india_reports = reports[is_india]
    # Disabled narrowing: india_reports[india_reports['CITY'].isin(['Delhi','Mumbai'])]

    preview_columns = ['INCIDENT TITLE', 'CITY', 'DESCRIPTION']
    print(india_reports[preview_columns].head())

    # Disabled experiments kept for reference:
    #   reports, problemsT = translate_columns(reports, 'INCIDENT TITLE'); print(problemsT)
    #   obj = upload_data(reports, bucket, key, aws_access_key_id, aws_secret_access_key); print(obj)
    #   print(reports[reports['DESCRIPTION WORDS'].isna()]['DESCRIPTION'].head())
    #   print(india_reports['INCIDENT TITLE'].unique()[:100])
    #   print(reports['STATE'].unique())
    #   print(reports['COUNTY'].unique())
    #   print(reports['CITY'].unique())
if __name__ == '__main__':
    # Script entry point: read the four credentials from the command line
    # and hand them to main(). All options are optional, so any that are
    # omitted reach main() as None.
    import argparse

    # (long flag, short flag, help text) for every supported option.
    cli_options = (
        ('--aws_access_key_id', '-aws_id', "aws_access_key_id"),
        ('--aws_secret_access_key', '-aws_key', "aws_secret_access_key"),
        ('--here_id', '-app_id', "app_id"),
        ('--here_code', '-app_code', "app_code"),
    )

    parser = argparse.ArgumentParser(description="Prepare dataset")
    for long_flag, short_flag, help_text in cli_options:
        parser.add_argument(long_flag, short_flag, help=help_text)

    args = parser.parse_args()
    main(
        args.aws_access_key_id,
        args.aws_secret_access_key,
        args.here_id,
        args.here_code,
    )