# -*- coding: utf-8 -*-
# Maximum Entropy Part-of-Speech Tagger for NLTK (Natural Language Toolkit)
# Author: Arne Neumann
# Licence: GPL 3
#__docformat__ = 'epytext en'
"""
A I{part-of-speech tagger} that uses NLTK's built-in L{Maximum Entropy
models<nltk.MaxentClassifier>} to find the most likely I{part-of-speech
tag} (POS) for each word in a given sequence.
The tagger will be trained on a corpus of tagged sentences. For every word
in the corpus, a C{tuple} consisting of a C{dictionary} of features from
the word's context (e.g. preceding/succeeding words and tags, word
prefixes/suffixes etc.) and the word's tag will be generated.
The maximum entropy classifier will learn a model from these tuples that
will be used by the tagger to find the most likely POS-tag for any given
word, even unseen ones.
The tagger and the featuresets chosen for training are implemented as described
in Ratnaparkhi, Adwait (1996). A Maximum Entropy Model for Part-Of-Speech
Tagging. In Proceedings of the ARPA Human Language Technology Workshop. Pages
250-255.
Usage notes:
============
Please install the MEGAM package (http://hal3.name/megam),
otherwise training will take forever.
To use the demo, please install either the 'brown' or the 'treebank' corpus with::
import nltk
nltk.download()
in the Python interpreter. Proper usage of demo() and all other functions and
methods is described below.
"""
import time
import re
from collections import defaultdict
from nltk import TaggerI, FreqDist, untag, config_megam
from nltk.classify.maxent import MaxentClassifier
PATH_TO_MEGAM_EXECUTABLE = "/usr/bin/megam"
config_megam(PATH_TO_MEGAM_EXECUTABLE)
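# If megam is not installed at PATH_TO_MEGAM_EXECUTABLE, config_megam() fails
# because NLTK cannot locate the binary. A hedged fallback sketch (assuming
# you are willing to use NLTK's much slower built-in algorithms) would be:
#
#     try:
#         config_megam(PATH_TO_MEGAM_EXECUTABLE)
#     except LookupError:
#         print "megam not found -- pass algorithm='gis' to train() instead"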
class MaxentPosTagger(TaggerI):
"""
MaxentPosTagger is a part-of-speech tagger based on Maximum Entropy models.
"""
def train(self, train_sents, algorithm='megam', rare_word_cutoff=5,
rare_feat_cutoff=5, uppercase_letters='[A-Z]', trace=3,
**cutoffs):
"""
MaxentPosTagger trains a Maximum Entropy model from a C{list} of tagged
sentences.
@type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
@param train_sents: A list of tagged sentences. Each sentence is
represented by a list of tuples. Each tuple holds two strings, a
word and its tag, e.g. ('company','NN').
@type algorithm: C{str}
@param algorithm: The algorithm that is used by
L{nltk.MaxentClassifier.train()} to train and optimise the model. It is
B{strongly recommended} to use the C{LM-BFGS} algorithm provided by the
external package U{megam<http://hal3.name/megam/>} as it is much faster
and uses less memory than any of the algorithms provided by NLTK (i.e.
C{GIS}, C{IIS}) or L{scipy} (e.g. C{CG} and C{BFGS}).
@type rare_word_cutoff: C{int}
        @param rare_word_cutoff: Words with fewer occurrences than
C{rare_word_cutoff} will be treated differently by L{extract_feats}
than non-rare words (cf. Ratnaparkhi 1996).
@type rare_feat_cutoff: C{int}
        @param rare_feat_cutoff: ignore features that occur fewer than
        C{rare_feat_cutoff} times during training.
@type uppercase_letters: C{regex}
@param uppercase_letters: a regular expression that covers all
uppercase letters of the language of your corpus (e.g. '[A-ZÄÖÜ]' for
German)
@type trace: C{int}
@param trace: The level of diagnostic output to produce. C{0} doesn't
produce any output, while C{3} will give all the output that C{megam}
produces plus the time it took to train the model.
@param cutoffs: Arguments specifying various conditions under
which the training should be halted. When using C{MEGAM}, only
C{max_iter} should be relevant. For other cutoffs see
L{nltk.MaxentClassifier}
- C{max_iter=v}: Terminate after C{v} iterations.
"""
self.uppercase_letters = uppercase_letters
self.word_freqdist = self.gen_word_freqs(train_sents)
self.featuresets = self.gen_featsets(train_sents,
rare_word_cutoff)
self.features_freqdist = self.gen_feat_freqs(self.featuresets)
self.cutoff_rare_feats(self.featuresets, rare_feat_cutoff)
t1 = time.time()
self.classifier = MaxentClassifier.train(self.featuresets, algorithm,
trace, **cutoffs)
t2 = time.time()
if trace > 0:
print "time to train the classifier: {0}".format(round(t2-t1, 3))
def gen_feat_freqs(self, featuresets):
"""
Generates a frequency distribution of joint features (feature, tag)
tuples. The frequency distribution will be used by the tagger to
determine which (rare) features should not be considered during
training (feature cutoff).
        This is what joint features look like::
(('t-2 t-1', 'IN DT'), 'NN')
(('w-2', '<START>'), 'NNP')
(('w+1', 'of'), 'NN')
        @type featuresets: C{list} of C{tuples} of (C{dict}, C{str})
@param featuresets: a list of tuples that contain the featureset of a
word from the training set and its POS tag.
        @rtype: C{defaultdict}
        @return: a C{defaultdict(int)} that maps each (context information
        feature, tag) tuple to the number of times it occurs in the
        training sentences.
"""
features_freqdist = defaultdict(int)
for (feat_dict, tag) in featuresets:
for (feature, value) in feat_dict.items():
features_freqdist[ ((feature, value), tag) ] += 1
return features_freqdist
def gen_word_freqs(self, train_sents):
"""
Generates word frequencies from the training sentences for the feature
extractor.
@type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
@param train_sents: A list of tagged sentences.
@rtype: C{FreqDist}
@return: a L{frequency distribution<nltk.FreqDist()>},
counting how often each word occurs in the training sentences.
"""
word_freqdist = FreqDist()
for tagged_sent in train_sents:
for (word, _tag) in tagged_sent:
word_freqdist[word] += 1
return word_freqdist
def gen_featsets(self, train_sents, rare_word_cutoff):
"""
Generates featuresets for each token in the training sentences.
@type train_sents: C{list} of C{list} of tuples of (C{str}, C{str})
@param train_sents: A list of tagged sentences.
@type rare_word_cutoff: C{int}
        @param rare_word_cutoff: Words with fewer occurrences than
C{rare_word_cutoff} will be treated differently by L{extract_feats}
than non-rare words (cf. Ratnaparkhi 1996).
        @rtype: C{list} of C{tuples} of (C{dict}, C{str})
@return: a list of tuples that contains the featureset of
a token and its POS-tag.
"""
featuresets = []
for tagged_sent in train_sents:
history = []
untagged_sent = untag(tagged_sent)
for (i, (_word, tag)) in enumerate(tagged_sent):
featuresets.append( (self.extract_feats(untagged_sent, i,
history, rare_word_cutoff), tag) )
history.append(tag)
return featuresets
def cutoff_rare_feats(self, featuresets, rare_feat_cutoff):
"""
Cuts off rare features to reduce training time and prevent overfitting.
Example
=======
        Let's say the suffixes of this featureset are too rare to learn.
>>> featuresets[46712]
({'suffix(1)': 't',
'prefix(1)': 'L',
'prefix(2)': 'Le',
'prefix(3)': 'Lem',
'suffix(3)': 'ont',
'suffix(2)': 'nt',
'contains-uppercase': True,
'prefix(4)': 'Lemo',
'suffix(4)': 'mont'},
'NNP')
C{cutoff_rare_feats} would then remove the rare joint features::
(('suffix(1)', 't'), 'NNP')
(('suffix(3)', 'ont'), 'NNP')
            (('suffix(2)', 'nt'), 'NNP')
(('suffix(4)', 'mont'), 'NNP')
and return a featureset that only contains non-rare features:
>>> featuresets[46712]
({'prefix(1)': 'L',
'prefix(2)': 'Le',
'prefix(3)': 'Lem',
'contains-uppercase': True,
'prefix(4)': 'Lemo'},
'NNP')
        @type featuresets: C{list} of C{tuples} of (C{dict}, C{str})
@param featuresets: a list of tuples that contain the featureset of a
word from the training set and its POS tag
@type rare_feat_cutoff: C{int}
@param rare_feat_cutoff: if a (context information feature, tag)
tuple occurs less than C{rare_feat_cutoff} times in the training
set, then its corresponding feature will be removed from the
C{featuresets} to be learned.
"""
never_cutoff_features = set(['w','t'])
for (feat_dict, tag) in featuresets:
for (feature, value) in feat_dict.items():
feat_value_tag = ((feature, value), tag)
if self.features_freqdist[feat_value_tag] < rare_feat_cutoff:
if feature not in never_cutoff_features:
feat_dict.pop(feature)
def extract_feats(self, sentence, i, history, rare_word_cutoff=5):
"""
Generates a featureset from a word (in a sentence). The features
were chosen as described in Ratnaparkhi (1996) and his Java
software package U{MXPOST<ftp://ftp.cis.upenn.edu/pub/adwait/jmx>}.
The following features are extracted:
- features for all words: last tag (C{t-1}), last two tags (C{t-2
t-1}), last words (C{w-1}) and (C{w-2}), next words (C{w+1}) and
(C{w+2})
- features for non-rare words: current word (C{w})
- features for rare words: word suffixes (last 1-4 letters),
word prefixes (first 1-4 letters),
word contains number (C{bool}), word contains uppercase character
(C{bool}), word contains hyphen (C{bool})
Ratnaparkhi experimented with his tagger on the Wall Street Journal
corpus (Penn Treebank project). He found that the tagger yields
        better results when words that occur fewer than 5 times are treated
        as rare. As your mileage may vary, please adjust
        C{rare_word_cutoff} accordingly.
Examples
========
        1. This is a featureset extracted from the non-rare (word, tag)
tuple ('considerably', 'RB')
>>> featuresets[22356]
({'t-1': 'VB',
't-2 t-1': 'TO VB',
'w': 'considerably',
'w+1': '.',
'w+2': '<END>',
'w-1': 'improve',
'w-2': 'to'},
'RB')
        2. A featureset extracted from the rare tuple ('Lemont', 'NNP')
>>> featuresets[46712]
({'suffix(1)': 't',
'prefix(1)': 'L',
'prefix(2)': 'Le',
'prefix(3)': 'Lem',
'suffix(3)': 'ont',
'suffix(2)': 'nt',
'contains-uppercase': True,
'prefix(4)': 'Lemo',
'suffix(4)': 'mont'},
'NNP')
@type sentence: C{list} of C{str}
@param sentence: A list of words, usually a sentence.
@type i: C{int}
@param i: The index of a word in a sentence, where C{sentence[0]} would
represent the first word of a sentence.
        @type history: C{list} of C{str}
@param history: A list of POS-tags that have been assigned to the
preceding words in a sentence.
@type rare_word_cutoff: C{int}
        @param rare_word_cutoff: Words with fewer occurrences than
C{rare_word_cutoff} will be treated differently than non-rare words
(cf. Ratnaparkhi 1996).
@rtype: C{dict}
@return: a dictionary of features extracted from a word's
context.
"""
features = {}
hyphen = re.compile("-")
        number = re.compile(r"\d")
uppercase = re.compile(self.uppercase_letters)
#get features: w-1, w-2, t-1, t-2.
#takes care of the beginning of a sentence
if i == 0: #first word of sentence
features.update({"w-1": "<START>", "t-1": "<START>",
"w-2": "<START>", "t-2 t-1": "<START> <START>"})
elif i == 1: #second word of sentence
features.update({"w-1": sentence[i-1], "t-1": history[i-1],
"w-2": "<START>",
"t-2 t-1": "<START> %s" % (history[i-1])})
else:
features.update({"w-1": sentence[i-1], "t-1": history[i-1],
"w-2": sentence[i-2],
"t-2 t-1": "%s %s" % (history[i-2], history[i-1])})
#get features: w+1, w+2. takes care of the end of a sentence.
for inc in [1, 2]:
try:
features["w+%i" % (inc)] = sentence[i+inc]
except IndexError:
features["w+%i" % (inc)] = "<END>"
if self.word_freqdist[sentence[i]] >= rare_word_cutoff:
#additional features for 'non-rare' words
features["w"] = sentence[i]
else: #additional features for 'rare' or 'unseen' words
features.update({"suffix(1)": sentence[i][-1:],
"suffix(2)": sentence[i][-2:], "suffix(3)": sentence[i][-3:],
"suffix(4)": sentence[i][-4:], "prefix(1)": sentence[i][:1],
"prefix(2)": sentence[i][:2], "prefix(3)": sentence[i][:3],
"prefix(4)": sentence[i][:4]})
            if hyphen.search(sentence[i]) is not None:
                #set True, if regex is found at least once
                features["contains-hyphen"] = True
            if number.search(sentence[i]) is not None:
                features["contains-number"] = True
            if uppercase.search(sentence[i]) is not None:
                features["contains-uppercase"] = True
return features
def tag(self, sentence, rare_word_cutoff=5):
"""
Attaches a part-of-speech tag to each word in a sequence.
@type sentence: C{list} of C{str}
@param sentence: a list of words to be tagged.
@type rare_word_cutoff: C{int}
        @param rare_word_cutoff: words with fewer occurrences than
C{rare_word_cutoff} will be treated differently than non-rare words
(cf. Ratnaparkhi 1996).
@rtype: C{list} of C{tuples} of (C{str}, C{str})
@return: a list of tuples consisting of a word and its corresponding
part-of-speech tag.
"""
history = []
for i in xrange(len(sentence)):
featureset = self.extract_feats(sentence, i, history,
rare_word_cutoff)
tag = self.classifier.classify(featureset)
history.append(tag)
return zip(sentence, history)
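# A minimal usage sketch (assumes the 'treebank' corpus has been downloaded
# via nltk.download(); the sentence count is illustrative). demo() below does
# the same thing and adds an accuracy evaluation:
#
#     from nltk.corpus import treebank
#     tagger = MaxentPosTagger()
#     tagger.train(treebank.tagged_sents()[:500])
#     print tagger.tag(["Stocks", "rose", "sharply", "today", "."])
#     # -> a list of (word, tag) tuples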
def demo(corpus, num_sents):
"""
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains a tagger on them, tests the tagger's accuracy and tags an
    unseen sentence.
@type corpus: C{str}
@param corpus: Name of the corpus to load, either C{brown} or C{treebank}.
@type num_sents: C{int}
@param num_sents: Number of sentences to load from a corpus. Use a small
number, as training might take a while.
"""
if corpus.lower() == "brown":
from nltk.corpus import brown
tagged_sents = brown.tagged_sents()[:num_sents]
elif corpus.lower() == "treebank":
from nltk.corpus import treebank
tagged_sents = treebank.tagged_sents()[:num_sents]
    else:
        print "Please load either the 'brown' or the 'treebank' corpus."
        return
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
maxent_tagger = MaxentPosTagger()
maxent_tagger.train(train_sents)
print "tagger accuracy (test %i sentences, after training %i):" % \
(size, (num_sents - size)), maxent_tagger.evaluate(test_sents)
print "\n\n"
print "classify unseen sentence: ", maxent_tagger.tag(["This", "is", "so",
"slow", "!"])
print "\n\n"
print "show the 10 most informative features:"
    maxent_tagger.classifier.show_most_informative_features(10)
if __name__ == '__main__':
demo("treebank", 200)
#~ featuresets = demo_debugger("treebank", 10000)
print "\n\n\n"