-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
100 lines (77 loc) · 2.95 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#Detecting malicious URL with ML
#Importing packages
import pandas as pd
import numpy as np
import streamlit as st
import random
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# --- Load the URL dataset ---
# Expects urldata.csv with at least 'url' and 'label' columns
# (both are consumed by the training section below) — TODO confirm schema.
data = pd.read_csv("urldata.csv")
# Inspect the loaded object's type. The original bare `type(data)`
# expression was a no-op when run as a script (only the REPL echoes it),
# so print it explicitly to match the comment's intent.
print(type(data))
# Preview the first rows to sanity-check the load
print(data.head())
#now let's vectorize using tfidVectorizer
## we have to create a token
def createTokens(f):
tkns_BySlash = str(f.encode('utf-8')).split('/') # make tokens after splitting by slash
total_Tokens = []
for i in tkns_BySlash:
tokens = str(i).split('-') # make tokens after splitting by dash
tkns_ByDot = []
for j in range(0,len(tokens)):
temp_Tokens = str(tokens[j]).split('.') # make tokens after splitting by dot
tkns_ByDot = tkns_ByDot + temp_Tokens
total_Tokens = total_Tokens + tokens + tkns_ByDot
total_Tokens = list(set(total_Tokens)) #remove redundant tokens
if 'com' in total_Tokens:
total_Tokens.remove('com') #removing .com since it occurs a lot of times and it should not be included in our features
return total_Tokens
# --- Train and evaluate a logistic-regression URL classifier ---
# Target labels (e.g. good/bad per URL) — column name taken from the CSV.
y = data["label"]
# Feature column: the raw URL strings
url_list = data["url"]
# TF-IDF vectorizer using the custom URL tokenizer defined above
vectorizer = TfidfVectorizer(tokenizer=createTokens)
# Fit the vectorizer and transform URLs into the sparse feature matrix
x = vectorizer.fit_transform(url_list)
# 80/20 train/test split; fixed random_state for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Logistic regression; max_iter raised so the solver converges on
# high-dimensional sparse TF-IDF features
logic = LogisticRegression(max_iter=3000)
logic.fit(x_train, y_train)
# Persist the trained model. Use a context manager so the file handle is
# closed deterministically — the original `pickle.dump(logic, open(...))`
# leaked the handle.
with open('final_model.pkl', 'wb') as model_file:
    pickle.dump(logic, model_file)
# Held-out accuracy on the 20% test split
print("Accuracy ", logic.score(x_test, y_test))
# --- Smoke-test the trained model on a few hand-picked URLs ---
# A mix of benign search URLs and known-malicious download/phishing links.
X_predict = [
    'google.com/search=jcharistech',
    'google.com/search=faizanahmad',
    'pakistanifacebookforever.com/getpassword.php/',
    'www.radsport-voggel.de/wp-admin/includes/log.exe',
    'ahrenhei.without-transfer.ru/nethost.exe',
    'www.itidea.it/centroesteticosothys/img/_notes/gum.exe',
]
# Vectorize with the already-fitted TF-IDF vectorizer, then classify
# each URL with the trained logistic-regression model.
X_predict = vectorizer.transform(X_predict)
New_predict = logic.predict(X_predict)
print(New_predict)
# --- Interactive prediction: classify URLs typed by the user ---
# (removed: a commented-out int-conversion/sum experiment left over from a
# list tutorial — dead code unrelated to URL classification)
input_string = input('Enter elements of a list separated by space ')
print("\n")
# Split the raw input on whitespace into individual URL strings
user_list = input_string.split()
print('list: ', user_list)
X_predict1 = user_list
# Guard against empty input: transform/predict on an empty list is at best
# a zero-row matrix and at worst an sklearn error, so skip classification.
if X_predict1:
    X_predict1 = vectorizer.transform(X_predict1)
    New_predict1 = logic.predict(X_predict1)
    print(New_predict1)
else:
    print("No URLs entered; nothing to classify.")