spam_filter.py
#!/usr/bin/env python3
import os
import random
from collections import Counter
from nltk import NaiveBayesClassifier, classify
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
# Read every email in a folder into a list of raw text strings
def init_lists(folder):
    a_list = []
    for a_file in os.listdir(folder):
        # the Enron files are not all valid UTF-8, so ignore decoding errors here
        with open(os.path.join(folder, a_file), 'r', errors='ignore') as f:
            a_list.append(f.read())
    return a_list
# Lowercase, tokenize and lemmatize, for example change Apple, apples to apple
def preprocess(sentence):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(sentence)]
# Extract words from an email without stopwords:
# 'bow' keeps word counts, any other setting only records word presence
def get_features(text, setting):
    if setting == 'bow':
        return {word: count for word, count in Counter(preprocess(text)).items() if word not in stoplist}
    else:
        return {word: True for word in preprocess(text) if word not in stoplist}
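# For illustration (not in the original script): get_features('Apples are great,
# apples are healthy', 'bow') would give roughly {'apple': 2, 'great': 1,
# 'healthy': 1, ',': 1}, while any other setting gives {'apple': True,
# 'great': True, 'healthy': True, ',': True}; 'are' is dropped as a stopword.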
# Train a Naive Bayes classifier on the first samples_proportion of the feature sets
def train(features, samples_proportion):
    train_size = int(len(features) * samples_proportion)
    train_set, test_set = features[:train_size], features[train_size:]
    print('Training set size = ' + str(len(train_set)) + ' emails')
    print('Test set size = ' + str(len(test_set)) + ' emails')
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier
def evaluate(train_set, test_set, classifier):
    # check how the classifier performs on the training and test sets
    print('Accuracy on the training set = ' + str(classify.accuracy(classifier, train_set)))
    print('Accuracy on the test set = ' + str(classify.accuracy(classifier, test_set)))
    # check which words are most informative for the classifier
    classifier.show_most_informative_features(20)
def main():
    # read the spam and ham emails of the Enron corpus in
    spam = init_lists('/Users/fniu/Downloads/enron1/spam/')
    ham = init_lists('/Users/fniu/Downloads/enron1/ham/')
    all_emails = [(email, 'spam') for email in spam] + [(email, 'ham') for email in ham]
    random.shuffle(all_emails)
    print('Corpus size = ' + str(len(all_emails)) + ' emails')
    # extract the features (word-presence features, since the setting is not 'bow')
    all_features = [(get_features(email, ''), label) for (email, label) in all_emails]
    print('Collected ' + str(len(all_features)) + ' feature sets')
    # train the classifier on 80% of the corpus
    train_set, test_set, classifier = train(all_features, 0.8)
    # evaluate its performance
    evaluate(train_set, test_set, classifier)
if __name__ == '__main__':
    main()
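# Usage sketch (not part of the original script): once a classifier has been
# trained as in main(), a new message could be labelled with the same feature
# extractor. The sample text below is made up.
#
#   features = get_features('Congratulations, you have won a free prize!', '')
#   print(classifier.classify(features))   # prints 'spam' or 'ham'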