-
Notifications
You must be signed in to change notification settings - Fork 3
/
data.py
163 lines (137 loc) · 6.39 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
"""
Part of the SafeDroid v2.0 FrameWork.
Author : Arygriou Marios
Year : 2018
The framework is distributed under the GNU General Public License v3.0
"""
from vectors import Applications, AppToApi, API
from config import Config
import pandas as pd
import numpy as np
import os
from sklearn import preprocessing
import cPickle as pickle
from joblib import Parallel, delayed
import multiprocessing
import collections
"""
class Data represents database entries and correlations to dataframes.
Dataframes are attributes of the class and logic is applied to them at creation time.
"""
class Data:
    """In-memory view of the SafeDroid database CSV exports.

    Loads the application, API and app-to-API tables, filters APIs by the
    malicious-ratio thresholds from the config file, and derives the
    training data sets, label targets and per-app API-call lists at
    creation time.
    """

    def __init__(self, dir_name):
        """Load every table from `dir_name` and build all derived attributes.

        :param dir_name: directory (relative to cwd) holding the CSV exports.
        """
        # Expected CSV column layouts, index-aligned with _files below.
        _columnNames = [['api', 'malicious', 'benign', 'ratio'],  # api
                        ['name', 'md5', 'isMalicious'],  # application
                        ['appid', 'apiid'],  # apptoapi
                        ['appid', 'prmid'],  # apptoprm
                        ['permission', 'malicious', 'benign', 'ratio']]  # permission
        _files = ['api.csv', 'applications.csv',
                  'apptoapi.csv', 'apptoprm.csv', 'permission.csv']
        ''' app , api , apptoapi hold the relative data extracted from the database
        apiVector holds the filtered apis with a malicious ratio above threshold
        apisOfApps holds the api calls of each of the apps found in the app matrix
        '''
        self.config = Config('model_training.config')
        # Info dicts of data sets built so far; used as a duplicate guard.
        self.sizes = []
        self.app = self.init_app(os.path.join(
            os.getcwd(), dir_name, _files[1]), _columnNames[1])
        self.api = API(os.path.join(os.getcwd(), dir_name, _files[0]), _columnNames[0], [
            self.app.getMaliciousSize(), self.app.getBenignSize()])
        self.apptoapi = self.init_apptoapi(os.path.join(
            os.getcwd(), dir_name, _files[2]), _columnNames[2])
        # apis after reduction
        self.apiVector = self.createApiVectors()
        # names of apis in apiVector
        self.feature_names = self.setFeatureNames()
        # data sets built with respect to the config file inputs;
        # list of dicts, dict keys = ['data_set'], ['info']
        self.data_sets = self.formDataSets()
        self.target = self.setTargets()
        self.target_names = self.app.target_names
        # list of dicts, keys = 'apisOfApps', 'info'
        self.apisOfApps = self.apisOfAppListDF()
        return

    def createApiVectors(self):
        """Return one filtered API dataframe per configured ratio threshold."""
        return [self.api.getFiltered(threshold=threshold)
                for threshold in self.config.threshold()]

    def addDataSetSizeInfo(self, mal_s, mal_r, ben_s, ben_r, overal_s, overal_r):
        """Pack the size/ratio statistics of one data set into a dict."""
        return dict(mal_size=mal_s, mal_ratio=mal_r, ben_size=ben_s,
                    ben_ratio=ben_r, overall_size=overal_s, overall_ratio=overal_r)

    def _random_subset(self, frame, size):
        """Randomly keep on average `size` rows of `frame` via a Bernoulli mask.

        Each row is kept independently with probability size/len(frame), so
        the resulting length is approximate, as in the original sampling.
        """
        if len(frame) == 0:
            # Guard: np.random.choice would divide by zero on an empty frame.
            return frame
        keep = float(size) / len(frame)
        mask = np.random.choice([False, True], len(frame), p=[1 - keep, keep])
        return frame.iloc[mask]

    def formDataSets(self):
        """Build the shuffled malicious/benign data sets requested by the config.

        For every (reduce_size, malicious_size) combination from the config,
        draws a random malicious subset and a random benign subset, then
        concatenates and shuffles them.

        :return: list of dicts with keys 'data_set' (shuffled dataframe) and
                 'info' (size statistics from addDataSetSizeInfo).
        """
        ds = []
        for reduction in self.config.reduce_size():
            new_overall_size = int(self.app.getOverallSize() * reduction)
            for malicious_perc in self.config.malicious_size():
                # Clamp requested sizes to what is actually available.
                malicious_size = min(int(new_overall_size * malicious_perc),
                                     len(self.app.malicious))
                benign_size = min(new_overall_size - malicious_size,
                                  len(self.app.benign))
                malicious_set = self._random_subset(self.app.malicious,
                                                    malicious_size)
                benign_set = self._random_subset(self.app.benign, benign_size)
                total = len(malicious_set) + len(benign_set)
                if total == 0:
                    # Guard: an empty draw would divide by zero below.
                    continue
                mal_ratio = float(len(malicious_set)) / total
                # Concatenate the two sets and shuffle the entries.
                dic = {
                    'data_set': pd.concat([malicious_set, benign_set],
                                          axis=0).sample(frac=1),
                    'info': self.addDataSetSizeInfo(
                        len(malicious_set), mal_ratio,
                        len(benign_set), 1 - mal_ratio,
                        total, reduction),
                }
                # BUGFIX: the original loop compared dic['info'] against the
                # recorded sizes one by one and appended + broke on the first
                # NON-matching entry, so duplicates could still slip through
                # (and a match did not prevent a later append). A plain
                # membership test implements the intended duplicate filter.
                if dic['info'] in self.sizes:
                    print('Same values already computed')
                else:
                    self.sizes.append(dic['info'])
                    ds.append(dic)
        return ds

    def setTargets(self):
        """Return one numpy label array (isMalicious column) per data set."""
        return [np.array(entry['data_set'].isMalicious)
                for entry in self.data_sets]

    def setFeatureNames(self):
        """Return one numpy array of API names per filtered API vector."""
        return [np.array(vector.api) for vector in self.apiVector]

    def init_api(self, filepath, columnNames):
        """Load the API table and compute its malicious/benign ratio."""
        api = API(filepath, columnNames)
        api.setRatio()
        return api

    def init_app(self, filepath, columnNames):
        """Load the applications table."""
        return Applications(filepath, columnNames)

    def init_apptoapi(self, filepath, columnNames):
        """Load the application-to-API relation table."""
        return AppToApi(filepath, columnNames)

    def extractApis(self, index, api, apptoapi):
        """Return the rows of `api` linked to application `index` via `apptoapi`.

        BUGFIX: the original definition omitted `self`, so calling it as an
        instance method shifted every argument by one position.
        """
        return api.loc[apptoapi[apptoapi.appid == index].apiid]

    def apisOfAppListDF(self):
        """For every data set, collect the API calls of each application.

        :return: list of dicts with keys 'apisOfApps' (one dataframe of API
                 rows per application in the data set) and 'info' (the data
                 set's size statistics).
        """
        df = []
        for ds in self.data_sets:
            matrix = ds['data_set']
            print(len(matrix))
            # Consistency: reuse extractApis instead of re-inlining its logic.
            application = [
                self.extractApis(ID, self.api._matrix, self.apptoapi._matrix)
                for ID in matrix.index
            ]
            df.append({'apisOfApps': application, 'info': ds['info']})
        return df