doc2vec.py
import sys
import re
import string
import os
import numpy as np
import codecs
# Stop-word list from scikit-learn, which got the words from:
# http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
ENGLISH_STOP_WORDS = frozenset([
"a", "about", "above", "across", "after", "afterwards", "again", "against",
"all", "almost", "alone", "along", "already", "also", "although", "always",
"am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
"any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
"around", "as", "at", "back", "be", "became", "because", "become",
"becomes", "becoming", "been", "before", "beforehand", "behind", "being",
"below", "beside", "besides", "between", "beyond", "bill", "both",
"bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
"could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
"down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
"elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
"everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
"find", "fire", "first", "five", "for", "former", "formerly", "forty",
"found", "four", "from", "front", "full", "further", "get", "give", "go",
"had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
"hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
"how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
"interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
"latterly", "least", "less", "ltd", "made", "many", "may", "me",
"meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
"move", "much", "must", "my", "myself", "name", "namely", "neither",
"never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
"nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
"once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
"ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
"please", "put", "rather", "re", "same", "see", "seem", "seemed",
"seeming", "seems", "serious", "several", "she", "should", "show", "side",
"since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
"something", "sometime", "sometimes", "somewhere", "still", "such",
"system", "take", "ten", "than", "that", "the", "their", "them",
"themselves", "then", "thence", "there", "thereafter", "thereby",
"therefore", "therein", "thereupon", "these", "they", "thick", "thin",
"third", "this", "those", "though", "three", "through", "throughout",
"thru", "thus", "to", "together", "too", "top", "toward", "towards",
"twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
"very", "via", "was", "we", "well", "were", "what", "whatever", "when",
"whence", "whenever", "where", "whereafter", "whereas", "whereby",
"wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
"who", "whoever", "whole", "whom", "whose", "why", "will", "with",
"within", "without", "would", "yet", "you", "your", "yours", "yourself",
"yourselves"])
def load_glove(filename):
    """
    Read all lines from the indicated file and return a dictionary
    mapping word:vector where vectors are of numpy `array` type.
    GloVe file lines are of the form:

        the 0.418 0.24968 -0.41242 0.1217 ...

    So split each line on spaces into a list; the first element is the word
    and the remaining elements are the vector components. The length of the
    vector should not matter; read vectors of any length.
    """
    GloVe = {}
    with open(filename) as f:
        lines = f.readlines()
    for line in lines:
        item = line.split()
        GloVe[item[0]] = np.array(item[1:], dtype=float)
    return GloVe
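# Illustrative usage (not part of the original file; the GloVe path below is
# a hypothetical example):
#
#   gloves = load_glove("glove.6B.50d.txt")
#   gloves["the"]          # numpy array, e.g. shape (50,) for 50-d vectors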
def filelist(root):
    """Return a fully-qualified list of filenames under root directory"""
    allfiles = []
    for path, subdirs, files in os.walk(root):
        for name in files:
            allfiles.append(os.path.join(path, name))
    return allfiles
def get_text(filename):
    """
    Load and return the text of a text file, assuming latin-1 encoding as that
    is what the BBC corpus uses. Use the codecs.open() function, not open().
    """
    f = codecs.open(filename, encoding='latin-1', mode='r')
    s = f.read()
    f.close()
    return s
def words(text):
    """
    Given a string, return a list of words normalized as follows.
    First use a regex built with re.compile() from string.punctuation plus
    digits, carriage returns, tabs, and newlines to replace all of those
    characters with a space, then split on spaces to get the word list.
    Ignore words < 3 chars long.
    Lowercase all words.
    Remove English stop words.
    """
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)
    words = nopunct.split(" ")
    words = [w for w in words if len(w) > 2]
    words = [w.lower() for w in words]
    words = [w for w in words if w not in ENGLISH_STOP_WORDS]
    return words
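# Illustrative example (made-up input, for clarity only):
#
#   words("U.S. stocks rose 3% in 2004, analysts said.")
#   # -> ['stocks', 'rose', 'analysts', 'said']
#   # punctuation and digits become spaces; short words and stop words drop out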
def load_articles(articles_dirname, gloves):
    """
    Load all .txt files under articles_dirname and return a table (list of lists)
    where each record is a list of:

        [filename, title, article-text-minus-title, wordvec-centroid-for-article-text]

    We use the gloves parameter to compute the word vectors and centroid.
    The filename is stripped of the articles_dirname prefix, which is pulled in
    as script parameter sys.argv[2]. E.g., filename will be "business/223.txt".
    """
    filenames = filelist(articles_dirname)
    filenames = [name for name in filenames if name.endswith('.txt')]
    # print(filenames[:10])
    table = [[] for x in range(len(filenames))]
    for i, fname in enumerate(filenames):
        text = get_text(fname)
        filename = fname.split('bbc/')[1]
        title = text.split("\n")[0]
        body = text.split(title + "\n")[1]
        centroid = doc2vec(text, gloves)
        table[i] = [filename, title, body, centroid]
    return table
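# Illustrative record shape (assumed BBC-style layout, not from the original
# file): with articles_dirname pointing at a directory ending in "bbc" (the
# split on 'bbc/' above relies on that), each table entry looks roughly like
#
#   ["business/223.txt", "<first line as title>", "<rest of the text>", centroid]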
def doc2vec(text, gloves):
    """
    Return the word vector centroid for the text. Sum the word vectors
    for each word and then divide by the number of words. Ignore words
    not in gloves.
    """
    text = words(text)
    text = [w for w in text if w in gloves]  # ignore words not in gloves
    centroid = np.sum([gloves[w] for w in text], axis=0) / len(text)  # sum and divide
    return centroid
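# Minimal sketch of the centroid computation on made-up 3-d vectors
# (assumed values, for illustration only):
#
#   toy = {"cat": np.array([1., 0., 0.]), "dog": np.array([0., 1., 0.])}
#   doc2vec("The cat and the dog", toy)   # -> array([0.5, 0.5, 0. ])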
def distances(article, articles):
    """
    Compute the euclidean distance from article to every other article and return
    a list of (distance, index) tuples for every other article a in articles.
    The article is one of the elements (records) from the articles list.
    """
    # euclidean distance = np.linalg.norm(a[3] - article[3])
    dist = [(np.linalg.norm(a[3] - article[3]), i)
            for i, a in enumerate(articles) if a[0] != article[0]]
    return dist
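# Sketch of the distance used above (illustrative): each record stores its
# centroid at index 3, so for two records a and b the euclidean distance is
#
#   np.linalg.norm(a[3] - b[3])
#
# and the (distance, index) pairs can then be sorted by distance.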
def recommended(article, articles, n):
    """
    Return a list of the n articles (records with filename, title, etc...)
    closest to article's word vector centroid. The article is one of the elements
    (records) from the articles list.
    """
    dist = distances(article, articles)
    idx = [a[1] for a in sorted(dist, key=lambda t: t[0])[:n]]
    return [articles[i] for i in idx]
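# Minimal driver sketch (not part of the original file). Per the load_articles
# docstring, sys.argv[2] is assumed to be the BBC articles root; treating
# sys.argv[1] as the GloVe file path is an additional assumption.
if __name__ == '__main__':
    glove_filename = sys.argv[1]     # e.g. path to a GloVe .txt file (assumed)
    articles_dirname = sys.argv[2]   # e.g. a path ending in .../bbc (assumed)
    gloves = load_glove(glove_filename)
    articles = load_articles(articles_dirname, gloves)
    # Show the 5 articles closest to the first article's centroid
    for rec in recommended(articles[0], articles, 5):
        print(rec[0], rec[1])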