-
Notifications
You must be signed in to change notification settings - Fork 28
/
utils_pos.py
93 lines (75 loc) · 2.44 KB
/
utils_pos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import numpy as np
import string
# Set of ASCII punctuation characters, for per-character membership tests
punct = set(string.punctuation)

# Word endings used by assign_unk to guess a class for out-of-vocabulary words
noun_suffix = [
    "action", "age", "ance", "cy", "dom", "ee", "ence",
    "er", "hood", "ion", "ism", "ist", "ity", "ling",
    "ment", "ness", "or", "ry", "scape", "ship", "ty",
]
verb_suffix = [
    "ate", "ify", "ise", "ize",
]
adj_suffix = [
    "able", "ese", "ful", "i", "ian", "ible",
    "ic", "ish", "ive", "less", "ly", "ous",
]
adv_suffix = [
    "ward", "wards", "wise",
]

# Additive smoothing parameter
alpha = 0.001
def assign_unk(tok):
    """
    Assign unknown word tokens

    Map a word to one of the ``--unk...--`` placeholder labels. The checks
    run in the order below and the first one that matches wins:

    1. any character is a digit            -> ``"--unk_digit--"``
    2. any character is in ``punct``       -> ``"--unk_punct--"``
    3. any character is upper-case         -> ``"--unk_upper--"``
    4. ends with an entry of ``noun_suffix`` -> ``"--unk_noun--"``
    5. ends with an entry of ``verb_suffix`` -> ``"--unk_verb--"``
    6. ends with an entry of ``adj_suffix``  -> ``"--unk_adj--"``
    7. ends with an entry of ``adv_suffix``  -> ``"--unk_adv--"``
    8. otherwise                           -> ``"--unk--"``

    ``punct`` and the four suffix lists are module-level names.

    Args:
        tok: the word to classify (a string).

    Returns:
        The placeholder label as a string.
    """
    # Character-level checks come first, so e.g. "nation2" is a digit token
    # even though it also carries a noun suffix.
    if any(char.isdigit() for char in tok):
        return "--unk_digit--"
    if any(char in punct for char in tok):
        return "--unk_punct--"
    if any(char.isupper() for char in tok):
        return "--unk_upper--"

    # Suffix checks, in priority order. str.endswith accepts a tuple of
    # candidate suffixes and is true when any of them matches.
    suffix_classes = (
        (noun_suffix, "--unk_noun--"),
        (verb_suffix, "--unk_verb--"),
        (adj_suffix, "--unk_adj--"),
        (adv_suffix, "--unk_adv--"),
    )
    for suffixes, label in suffix_classes:
        if tok.endswith(tuple(suffixes)):
            return label

    return "--unk--"
def get_word_tag(line, vocab):
    """
    Split one "word tag" line into its word and tag

    Args:
        line: a string holding a word and a tag separated by whitespace,
            or a blank / whitespace-only string.
        vocab: container of known words; only ``in`` membership is used.

    Returns:
        ``(word, tag)``. A blank line yields ``("--n--", "--s--")``. A word
        that is not in ``vocab`` is replaced by the ``--unk...--`` label
        returned by ``assign_unk``; the tag is returned unchanged.

    Raises:
        ValueError: if a non-blank line does not contain exactly two
            whitespace-separated fields.
    """
    fields = line.split()

    # Blank or whitespace-only line: return the placeholder pair
    if not fields:
        return "--n--", "--s--"

    # Same exception type the bare tuple-unpacking would raise, but with a
    # message that shows the offending line.
    if len(fields) != 2:
        raise ValueError("expected a 'word tag' line, got %r" % (line,))

    word, tag = fields
    if word not in vocab:
        # Handle unknown words
        word = assign_unk(word)
    return word, tag
def preprocess(vocab, data_fp):
    """
    Preprocess data

    Read ``data_fp`` line by line and build two parallel lists holding
    exactly one entry per line of the file.

    Args:
        vocab: container of known words; only ``in`` membership is used.
        data_fp: path of the text file to read.

    Returns:
        ``(orig, prep)``:

        * ``orig`` - every line with surrounding whitespace stripped
          (a blank line becomes ``""``).
        * ``prep`` - the same entry, except that a blank line becomes
          ``"--n--"`` and an entry that is not in ``vocab`` becomes the
          ``--unk...--`` label returned by ``assign_unk``.
    """
    orig = []
    prep = []

    # Read data
    with open(data_fp, "r") as data_file:
        for line in data_file:
            # Lines keep their trailing newline when iterating a file, so
            # strip once and use the stripped token everywhere below.
            token = line.strip()
            orig.append(token)

            if not token:
                # End of sentence
                prep.append("--n--")
            elif token not in vocab:
                # Handle unknown words. The stripped token is passed on
                # purpose: with the newline still attached, none of the
                # suffix checks in assign_unk could ever match.
                prep.append(assign_unk(token))
            else:
                prep.append(token)

    # Each line appends exactly one entry to both lists, so their lengths
    # equal the file's line count by construction; no need to reopen and
    # re-read the file to check it.
    return orig, prep