-
Notifications
You must be signed in to change notification settings - Fork 44
/
sklearn_tree.py
69 lines (54 loc) · 2.25 KB
/
sklearn_tree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/python3
import os
import json
import sys
import sklearn
import pydotplus
from sklearn import cross_validation, grid_search, tree
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.externals import joblib
def train_tree_classifer(features, labels, model_output_path,
                         tree_pdf_path='best_tree.pdf'):
    """
    Train a DecisionTree with a cross-validated grid search and report on it.

    20% of the samples are held out for evaluation. On the remaining 80% a
    grid search over max_depth (10-fold cross validation) picks the best
    tree, which is then
      * dumped with joblib to model_output_path,
      * exported in graphviz dot format and written as a PDF to tree_pdf_path,
      * evaluated on the held-out samples; the best parameters, the
        confusion matrix and the classification report are printed.

    features: 2D array of each input feature for each sample
    labels: array of string labels classifying each sample
    model_output_path: path for storing the trained tree model
    tree_pdf_path: path of the PDF drawing of the best tree
                   (defaults to 'best_tree.pdf' in the working directory)

    NOTE(review): cross_validation and grid_search are the old scikit-learn
    module names; newer releases expose the same tools through
    sklearn.model_selection. Confirm which version this script targets.
    """
    # save 20% of data for performance evaluation
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        features, labels, test_size=0.2)

    param = [
        {
            "max_depth": [None, 10, 100, 1000, 10000]
        }
    ]

    dtree = tree.DecisionTreeClassifier(random_state=0)

    # 10-fold cross validation; every fold and every parameter set can be
    # trained independently, so up to 20 jobs are run in parallel
    clf = grid_search.GridSearchCV(dtree, param,
                                   cv=10, n_jobs=20, verbose=3)
    clf.fit(X_train, y_train)

    # The model file normally does not exist before the first run, so the
    # check has to look at the destination directory rather than at the file
    # itself (checking the file would skip saving every fresh model).
    output_dir = os.path.dirname(os.path.abspath(model_output_path))
    if os.path.isdir(output_dir):
        joblib.dump(clf.best_estimator_, model_output_path)
    else:
        print("Cannot save trained tree model to {0}.".format(model_output_path))

    # Render the winning tree as a PDF for visual inspection.
    dot_data = tree.export_graphviz(clf.best_estimator_, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf(tree_pdf_path)

    print("\nBest parameters set:")
    print(clf.best_params_)

    y_predict = clf.predict(X_test)

    # Fix the class order so the printed header matches the matrix rows/columns.
    class_names = sorted(set(labels))

    print("\nConfusion matrix:")
    print("Labels: {0}\n".format(",".join(class_names)))
    print(confusion_matrix(y_test, y_predict, labels=class_names))

    print("\nClassification report:")
    print(classification_report(y_test, y_predict))
def main():
    """
    Train a decision tree from the JSON feature file named on the command line.

    The file is read from sys.argv[1] and must contain an 'apps' mapping whose
    entries each provide a 'vector' (the feature vector) and a 'malicious'
    field. A sample is labelled '1' when 'malicious' equals [1, 0] and '0'
    otherwise. The trained model is stored as 'model.out'.

    Exits with a usage message when no input file is given.
    """
    if len(sys.argv) < 2:
        # sys.exit with a string prints it to stderr and exits with status 1
        sys.exit("Usage: {0} <features.json>".format(sys.argv[0]))

    # load the feature data from a file
    with open(sys.argv[1]) as infile:
        dataset = json.load(infile)

    # Materialise the entries once so vectors and labels stay aligned.
    apps = list(dataset['apps'].values())
    feature_vectors = [app['vector'] for app in apps]
    labels = ['1' if app['malicious'] == [1, 0] else '0' for app in apps]

    train_tree_classifer(feature_vectors, labels, 'model.out')
# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()