-
Notifications
You must be signed in to change notification settings - Fork 0
/
validation_utils.py
257 lines (186 loc) · 8.09 KB
/
validation_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
import nltk
from rdflib import Namespace, URIRef
# Accepted development-status labels for a lexical unit (LU).
STATUS = {'Unknown', 'FN1_Sent', 'Test', 'Add_Annotation',
          'New', 'Finished_Checked', 'FN1_NoSent',
          'Rules_Defined', 'SC_Defined', 'In_Use', 'Finished_Initial',
          'BTDT', 'Finished_X-Gov', 'Insufficient_Attestations', 'Created',
          'Problem', 'Needs_SCs', 'Pre-Marked'}
# Accepted part-of-speech tags for LUs and lexemes (FrameNet-style tags).
POS = {'PRON', 'NUM', 'A', 'PREP', 'IDIO', 'N', 'INTJ', 'V', 'ART', 'SCON', 'ADV', 'C',
       'I'}
# Characters not allowed in an agent / provenance string
# (they would break URIs or file paths built from it — TODO confirm with callers).
ILLEGAL_CHARS_IN_AGENT = {'#', '/', ' '}
# Attributes that every lexeme dict must carry.
LEXEME_ATTRS = {
    'order',
    'headword',
    'breakBefore',
    'POS',
    'name'
}
# Attributes a lexeme dict may additionally carry.
OPTIONAL_LEXEME_ATTRS = {
    'incorporatedFE',
    'lu_id'
}
# Accepted LU types.
TYPES = {
    'singleton',
    'phrasal',
    'idiom',
    'endocentric compound',
    'exocentric compound'
}
def create_lemma(lexemes, separator=''):
    """Reconstruct the lemma string of an LU from its lexemes.

    Lexemes are sorted by their 'order' attribute (stored as a string of an
    integer, normally '1'..'N' — see validate_order_attr) and their 'name'
    values are joined with *separator*.

    Improvement over the previous implementation: the manual dict-and-loop
    construction is replaced with sort + str.join, which also avoids silently
    dropping lexemes whose order is below 1 and silently deduplicating
    duplicate order values.

    :param lexemes: iterable of lexeme dicts with 'order' and 'name' keys
    :param separator: string inserted between consecutive lexeme names
    :return: the reconstructed lemma (empty string for no lexemes)
    """
    ordered = sorted(lexemes, key=lambda lexeme: int(lexeme['order']))
    return separator.join(lexeme['name'] for lexeme in ordered)
# Module-level smoke test for create_lemma: a two-lexeme phrasal verb
# ('give' + 'up') should be reassembled as 'give up' with a space separator.
the_lexemes = [{
    'order': '1',
    'headword': 'false',
    'breakBefore': 'false',
    'POS': 'V',
    'name': 'give'
},
    {
        'order': '2',
        'headword': 'false',
        'breakBefore': 'false',
        'POS': 'A',
        'name': 'up'
    }
]
assert create_lemma(lexemes=the_lexemes, separator=' ') == 'give up'
def validate_status(status):
    """Assert that *status* is one of the accepted LU status labels.

    :param status: status label to check against the STATUS constant
    :raises AssertionError: if the label is unknown
    """
    assert status in STATUS, \
        f'{status} not part of accepted set: {STATUS}'


# module-level smoke test
validate_status(status='New')
def validate_pos(pos):
    """Assert that *pos* is one of the accepted part-of-speech tags.

    :param pos: part-of-speech tag to check against the POS constant
    :raises AssertionError: if the tag is unknown
    """
    assert pos in POS, \
        f'{pos} not part of accepted set: {POS}'


# module-level smoke test
validate_pos(pos='N')
def validate_frame(your_fn, frame_name):
    """Assert that *frame_name* exists in the given FrameNet.

    :param your_fn: an NLTK-style FrameNet corpus reader
    :param frame_name: label of the frame to look up
    :raises KeyError: if the frame is not part of *your_fn*
    """
    try:
        your_fn.frame_by_name(frame_name)
    except nltk.corpus.reader.framenet.FramenetError:
        raise KeyError(f'{frame_name} not part of your FrameNet.')
def validate_lexeme(my_fn, lexeme, lu_type):
    """Validate a single lexeme dict of an LU.

    Checks that all required attributes are present (POS is optional for
    'phrasal' and 'exocentric compound' LUs), that no unknown attributes are
    used, that an optional lu_id refers to an existing LU in *my_fn*, and
    that the individual attribute values are well-formed.

    Fixes over the previous version: 'atribute' typo and missing closing
    parenthesis in the error message; `all([...])` replaced by a plain
    boolean expression; `type(x) == str` replaced by isinstance.

    :param my_fn: FrameNet object providing lu_ids_and_names()
    :param lexeme: dict of lexeme attributes (string keys and values)
    :param lu_type: the LU type (one of TYPES)
    :raises AssertionError: on any failed validation
    :raises ValueError: if the 'order' attribute is not an integer string
    """
    for lexeme_attr in LEXEME_ATTRS:
        # a lexeme of a phrasal verb does not need to have a POS attribute
        # we do not specify it for the verb particle
        if lexeme_attr == 'POS' and lu_type in {'phrasal', 'exocentric compound'}:
            continue
        assert lexeme_attr in lexeme, \
            f'missing attribute {lexeme_attr} in {lexeme} (required are {LEXEME_ATTRS})'
    for lexeme_attr, value in lexeme.items():
        assert lexeme_attr in LEXEME_ATTRS | OPTIONAL_LEXEME_ATTRS, \
            f'{lexeme_attr} not part of allowed attributes. Please inspect.'
        if lexeme_attr == 'lu_id':
            # the referenced LU must exist in the user's FrameNet
            assert int(value) in my_fn.lu_ids_and_names(), \
                f'lu id {value} not found in your FrameNet. Please inspect.'
    int(lexeme['order'])  # raises ValueError if 'order' is not an integer string
    assert lexeme['headword'] in {'true', 'false'}, \
        f'possible values for headword are "true" and "false". You specified {lexeme["headword"]}'
    assert lexeme['breakBefore'] in {'true', 'false'}, \
        f'possible values for breakBefore are "true" and "false". You specified {lexeme["breakBefore"]}'
    if 'POS' in lexeme:
        validate_pos(pos=lexeme["POS"])
    name = lexeme['name']
    assert isinstance(name, str), f'the name of lexeme should be a string, you provided a {type(name)}.'
def validate_order_attr(lexemes):
    """Verify that the 'order' attributes of *lexemes* are exactly '1'..'N'.

    :param lexemes: list of lexeme dicts, each with an 'order' key
    :raises AssertionError: if the provided orders are not the full range
    """
    expected_orders = {str(position) for position in range(1, len(lexemes) + 1)}
    provided_orders = {lexeme['order'] for lexeme in lexemes}
    assert expected_orders == provided_orders, f'Please inspect order attribute: {lexemes}'
def validate_lexemes(my_fn, lexemes, lu_type):
    """Validate every lexeme of an LU plus cross-lexeme constraints.

    Single-lexeme LUs must not carry the lu_id attribute, which is reserved
    for the components of multi-lexeme expressions.

    :param my_fn: FrameNet object, forwarded to validate_lexeme
    :param lexemes: list of lexeme dicts
    :param lu_type: the LU type (one of TYPES)
    :raises AssertionError: on any failed validation
    """
    for one_lexeme in lexemes:
        validate_lexeme(my_fn=my_fn, lexeme=one_lexeme, lu_type=lu_type)
    if len(lexemes) != 1:
        return
    lexeme = lexemes[0]
    assert 'lu_id' not in lexeme, \
        f'the optional attribute lu_id is only allowed in multi-lexeme expressions. Please inspect.'
def frames_with_lemma_pos_in_lexicon(your_fn, lemma, pos):
    """Map frame name -> frame object for all frames matching lemma.pos.

    :param your_fn: an NLTK-style FrameNet corpus reader
    :param lemma: the lemma to look up
    :param pos: part-of-speech tag (lower-cased for the lookup)
    :return: dict from frame name to frame object
    """
    matching_frames = your_fn.frames_by_lemma(f'{lemma}.{pos.lower()}')
    return {one_frame.name: one_frame for one_frame in matching_frames}
def validate_incorporated_fe(fn_en,
                             frame_label,
                             incorporated_fe):
    """
    we validate that the incorporated_fe is part of the frame
    that the LU is added to.
    (for more information about incorporation, we refer to
    Subsubsection 3.2.4 from the FrameNet book
    https://framenet2.icsi.berkeley.edu/docs/r1.7/book.pdf)
    :param fn_en:
    :param frame_label:
    :param incorporated_fe:
    :return:
    """
    target_frame = fn_en.frame_by_name(frame_label)
    frame_elements = target_frame.FE.keys()
    assert incorporated_fe in frame_elements, f'{incorporated_fe} not part of frame {frame_label}'
def validate_incorporate_fe_lu_and_lexemes(incorporated_fe,
                                           lexemes):
    """Check that LU-level and lexeme-level incorporatedFE agree.

    For multi-lexeme LUs, the set of incorporatedFE values found on the
    lexemes must equal the (at most one-element) set implied by the LU-level
    incorporatedFE. Single-lexeme LUs are exempt from this check.

    :param incorporated_fe: the LU-level incorporated FE, or None
    :param lexemes: list of lexeme dicts
    :raises AssertionError: on a mismatch
    """
    if len(lexemes) == 1:
        # no cross-checking needed for single-lexeme LUs
        return
    lexeme_level = {lexeme['incorporatedFE']
                    for lexeme in lexemes
                    if lexeme.get('incorporatedFE') is not None}
    lu_level = set() if incorporated_fe is None else {incorporated_fe}
    assert lexeme_level == lu_level,\
        f'mismatch between incorporatedFE at LU level and in the lexemes: {incorporated_fe} {lexemes}'
def validate_lu_type(lu_type):
    """Assert that *lu_type* is one of the accepted LU types (see TYPES).

    :param lu_type: LU type label to check
    :raises AssertionError: if the type is unknown
    """
    assert lu_type in TYPES, \
        f'type {lu_type} is not part of the accepted set: {TYPES}'
def validate_lu_pos(lu_pos, pos):
    """Assert that the POS in the lu_name matches the LU's pos attribute.

    :param lu_pos: lower-cased POS taken from the lu_name (e.g. 'v')
    :param pos: POS attribute of the LU (e.g. 'V'), lower-cased for comparison
    :raises AssertionError: on a mismatch
    """
    matches = lu_pos == pos.lower()
    assert matches, f'different POS provided for lu_name and pos of lu: {lu_pos} and {pos}'
def validate_num_lexemes(lexemes, lu_type):
    """Check that the number of lexemes is consistent with the LU type.

    Singletons must have exactly one lexeme; phrasal verbs and endocentric
    compounds at least two. Other types are not constrained here.

    :param lexemes: list of lexeme dicts
    :param lu_type: the LU type (one of TYPES)
    :raises AssertionError: if the count does not fit the type
    """
    count = len(lexemes)
    if lu_type == 'singleton':
        assert count == 1, f'for lu_type {lu_type} the number of lexemes should be one, you provided {len(lexemes)}.'
    elif lu_type in {'phrasal', 'endocentric compound'}:
        assert count >= 2, f'for lu_type {lu_type} the number of lexemes should be 2>, you provided {len(lexemes)}.'
def validate_lexemes_vs_luname(lexemes, lu_type, lu_lemma):
    """Check that the lexeme names are consistent with the LU lemma.

    - singleton: the single lexeme name must equal the lemma;
    - endocentric compound: the lexemes, concatenated without separator,
      must recreate the lemma exactly;
    - idiom / phrasal / exocentric compound: each lexeme name must occur
      as a substring of the lemma.

    :param lexemes: list of lexeme dicts with a 'name' key
    :param lu_type: the LU type (one of TYPES)
    :param lu_lemma: the lemma part of the lu_name
    :raises AssertionError: on any inconsistency
    """
    if lu_type == 'singleton':
        lexeme = lexemes[0]['name']
        assert lexeme == lu_lemma, f'for lu_type singleton, the lu_name ({lu_lemma}) and lexeme ({lexeme}) should match.'
    elif lu_type == 'endocentric compound':
        recreated_lemma = create_lemma(lexemes=lexemes, separator='')
        error_message = '\n'.join([
            f'recreated lemma from lexemes ({recreated_lemma}) does not match the lu_name ({lu_lemma})',
            f'for the chosen lu_type ({lu_type}), this is needed.',
        ])
        assert lu_lemma == recreated_lemma, error_message
    elif lu_type in {'idiom', 'phrasal', 'exocentric compound'}:
        for one_lexeme in lexemes:
            name = one_lexeme['name']
            assert name in lu_lemma, f'lexeme: {name} is not part of the lu_lemma ({lu_lemma})'
def validate_skos(skos_predicate_to_external_references, skos):
    """Validate the SKOS predicates used for external references.

    :param skos_predicate_to_external_references: mapping/iterable of SKOS
        predicate names (keys are iterated below); may be empty or falsy
    :param skos: an rdflib Graph holding the SKOS vocabulary, or None
    :return: the namespace bound to the 'skos' prefix in the graph
        (as a plain string), or None when *skos* is None
    :raises AssertionError: if predicates are given without a graph, or a
        predicate is not a subject in the graph
    """
    skos_namespace = None
    if skos_predicate_to_external_references:
        # external references are requested, so the SKOS graph is mandatory
        assert skos is not None, f'skos is None. Please provide FrameNetNLTK.skos'
    if skos is not None:
        # find the namespace bound to the 'skos' prefix
        # NOTE(review): if no 'skos' prefix is bound, skos_namespace stays
        # None and Namespace(None) is built below — confirm this is intended
        for prefix, namespace in skos.namespaces():
            if prefix == 'skos':
                skos_namespace = namespace.toPython()
        SKOS = Namespace(skos_namespace)
        for predicate in skos_predicate_to_external_references:
            # each requested predicate must exist as a subject in the graph
            pred_uriref = URIRef(SKOS + predicate)
            assert pred_uriref in skos.subjects(), f'{predicate} ({pred_uriref}) not part of skos.'
    return skos_namespace
def validate_agent(agent):
    """Assert that *agent* contains none of the illegal characters.

    Bug fix: the previous error message printed the literal text
    "repr(<char>)" instead of the character's repr, and used plural "are"
    for a single character.

    :param agent: the agent string to validate
    :raises AssertionError: if agent contains '#', '/', or a space
    """
    for illegal_char in ILLEGAL_CHARS_IN_AGENT:
        assert illegal_char not in agent, \
            f'character ({illegal_char!r}) is not allowed in agent: {agent}'
def validate_provenance(provenance):
    """Assert that *provenance* contains none of the illegal characters.

    Bug fixes: the previous error message printed the literal text
    "repr(<char>)" instead of the character's repr, used plural "are" for a
    single character, and said "agent" although it validates a provenance.

    :param provenance: the provenance string to validate
    :raises AssertionError: if provenance contains '#', '/', or a space
    """
    for illegal_char in ILLEGAL_CHARS_IN_AGENT:
        assert illegal_char not in provenance, \
            f'character ({illegal_char!r}) is not allowed in provenance: {provenance}'