-
Notifications
You must be signed in to change notification settings - Fork 3
/
vrt-old-sparv-cstlemma
executable file
·510 lines (422 loc) · 17.3 KB
/
vrt-old-sparv-cstlemma
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
#! /usr/bin/env python3
# -*- mode: Python; -*-
import codecs
from itertools import groupby, count
from queue import Queue
from subprocess import Popen, PIPE
from threading import Thread
import enum, os, re, sys, traceback
from vrtargslib import trans_args, trans_main
from vrtargslib import BadData, BadCode
from vrtnamelib import binxname, binxrest
from vrtnamelib import isbinnames as isnames
from vrtnamelib import binnamelist as namelist, nameindex, nameindices
from vrtnamelib import bininsertnames as insertnames
from vrtdatalib import binasrecord as asrecord
from vrtdatalib import binescape as escape, binunescape as unescape
from outsidelib import CSTLEMMA, CSTLEMMAMODELS as MODELS
# Base command line for the cstlemma process; the encoding option
# and the model options are appended to this list in main.
PROC = [
    CSTLEMMA,
    # input is tagged, word TAB tag NL (this format is a raw string,
    # so the backslash escapes reach cstlemma as written)
    '-t',
    '-I', r'$w\t$t\n',
    # output is word TAB info TAB dictionary lemma TAB rule-based
    # lemma NL; lemmas turned out to be *sometimes* not UTF-8 (but
    # Latin-1?), which still needs to be investigated but is worked
    # around for now; note that this was observed, and worked around,
    # for UTF-8 input/output (-eU) to cstlemma before attempting to
    # use the other encodings, because there were missing letters in
    # UTF-8 output from it - maybe it does not quite work?
    # From cstlemma -h for the option -c:
    # $i info: - full form not in dictionary.
    # + full form has more than one lemma in the dictionary.
    # (If that is the case, the lemmatiser also generates lemma(s)
    # using the flex pattern rules, so you can choose.)
    # (blank) full form in dictionary.
    # (this format is *not* a raw string, so it contains actual TAB
    # and NL characters)
    '-c', '$w\t$i\t$b\t$B\n',
    '-b', '$w',
    '-B', '$w',
    # separate alternative lemmas with unit separator, U+001F, rather
    # than the default vertical pipe, because cannot prevent vertical
    # pipes from occurring in the input (it must be one argument: -s
    # alone is another option); will split cstlemma output at U+001F
    # and then replace any vertical bars, U+007C, in base forms with
    # broken vertical bars, U+00A6, and join alternative lemmas with
    # vertical bars (iso-8859-2 does not contain broken bar at all)
    # NOTE(review): this is a raw string, so cstlemma receives the
    # four characters \x1f and presumably interprets the escape
    # itself - confirm
    R'-s\x1f',
    # encoding and models are appended according to options
]
# Maps each value of the --encoding option to the corresponding
# cstlemma command-line option (a list, to be appended to PROC).
ENCODING = {
    # $ cstlemma -h
    # -e<n> ISO8859 Character encoding. 'n' is one of 1,2,7 and 9 (ISO8859-1,2, etc).
    # -eU turns out to produce garbage (missing and malcoded letters)
    'Latin-1' : [ '-e1' ],
    'Latin-2' : [ '-e2' ],
    'Latin/Greek' : [ '-e7' ],
    'Latin/Turkish' : [ '-e9' ],
    'UTF-8' : [ '-eU' ]
}
def parsearguments():
    '''Build the command-line parser on top of trans_args, parse the
    command line, and return the parsed arguments with the program
    name attached as args.prog (for use in messages).

    '''

    description = '''
Pass word forms and specially prepared features in a flat vrt
document through cstlemma using models based on the tagged saldo
and suc word lists in the sparv distribution. Insert after each
word a lemma, and when verbose also a dictionary-match indicator
(lemma.match: full, form, none), dictionary lemmas (lemma.dict),
and rule-derived lemmas (lemma.flex).
'''

    argparser = trans_args(description=description)

    # names of the input fields and of the main output field
    argparser.add_argument('--word', '-w',
                           metavar='name',
                           type=binxname,
                           default=b'word',
                           help='input word field name (default word)')
    argparser.add_argument('--tag', '-t',
                           metavar='name',
                           type=binxname,
                           default=b'msd',
                           help='input tag field name (default msd)')
    argparser.add_argument('--lemma',
                           metavar='name',
                           type=binxname,
                           default=b'lemma',
                           help='output field name (default lemma)')

    argparser.add_argument('--verbose',
                           action='store_true',
                           help='''
in addition to the chosen lemma, produce also
the output fields from cstlemma
''')

    # so the defaults are not valid values of the type?
    argparser.add_argument('--prefix', '-p',
                           metavar='fix',
                           type=binxname,
                           default=b'',
                           help='prefix to output field names')
    argparser.add_argument('--suffix', '-s',
                           metavar='fix',
                           type=binxrest,
                           default=b'',
                           help='suffix to output field names')

    argparser.add_argument('--model',
                           choices=['saldo', 'suc', 'sucsaldo'],
                           default='saldo',
                           help='''
lemmatisation models to use (saldo is based on
a huge list, suc on a short list; default is
to use saldo; sucsaldo combines both lists)
''')

    argparser.add_argument('--strict',
                           action='store_true',
                           help='''
highlight all invalid bytes in lemmas as
"[UTF-8?xx]" (default is to interpret hex
a0-ff as Latin-1)
''')

    argparser.add_argument('--encoding',
                           choices=[
                               'Latin-1',
                               'Latin-2',
                               'Latin/Greek',    # iso-8859-7
                               'Latin/Turkish',  # iso-8859-9
                               'UTF-8',
                           ],
                           default='Latin-1',
                           help='''
internal re-encoding of input for cstlemma;
where not possible, surface form is used as
base form; the available encodings are
ISO-8859-1,2,7,9 as indicated in the help text
of cstlemma 7.36 (default is Latin-1, since
UTF-8 does not seem to work for all lemmas)
''')

    args = argparser.parse_args()
    args.prog = argparser.prog
    return args
def message(args, mess):
    '''Print mess in stderr, preceded by the program name and a colon.'''
    label = args.prog + ':'
    print(label, mess, file = sys.stderr)
def terminate(proc):
    '''Ask the process to terminate. It is not an error if the process
    cannot be found any more.

    '''
    try:
        proc.terminate()
    except ProcessLookupError:
        # nothing left to terminate
        return
def highlight_errors(exn):
    '''Error handler for UnicodeDecodeError: replace the undecodable
    bytes with a visible marker, "[UTF-8?xx]", where xx is the bytes
    in hex, and resume decoding right after them. Needed because
    some but not all occurrences of the usual letters are
    mal-encoded in the output of cstlemma.

    '''
    offending = exn.object[exn.start:exn.end]
    marker = '[UTF-8?{}]'.format(offending.hex())
    return marker, exn.end
def interpret_errors(exn):
    '''Error handler for UnicodeDecodeError: read the undecodable
    bytes as Latin-1 when every one of them is above hex a0, and
    otherwise highlight them the way highlight_errors does. Needed
    because some but not all occurrences of the usual letters are
    mal-encoded in the output of cstlemma.

    '''
    offending = exn.object[exn.start:exn.end]

    # only graphic characters are interpreted, anything at or below
    # hex a0 is left to be highlighted
    if any(byte <= 0xa0 for byte in offending):
        return highlight_errors(exn)

    return offending.decode('iso-8859-1'), exn.end
# Make the two handlers available by name as the errors argument of
# bytes.decode: 'highlight' is used when --strict, 'interpret' when
# not (see utf8).
codecs.register_error('highlight', highlight_errors)
codecs.register_error('interpret', interpret_errors)
def utf8(args, word):
    '''Bring the word (bytes) to this millennium: return it as valid
    UTF-8, with any invalid bytes handled by the 'highlight' error
    handler when args.strict and by the 'interpret' error handler
    otherwise.

    '''
    handler = 'highlight' if args.strict else 'interpret'
    return word.decode('UTF-8', errors = handler).encode('UTF-8')
def main(args, inf, ouf):
    '''Run cstlemma as a subprocess, feed it the words and tags from
    inf in this thread, and let a combine thread merge the responses
    of cstlemma with a queued copy of the input and write the result
    to ouf.

    Returns 0 on success, 1 on failure.

    '''

    # command line is PROC plus the encoding option plus the model
    # options; cstlemma shares stderr with this process
    with Popen(PROC
               + ENCODING[args.encoding or 'UTF-8']
               + MODELS[args.model],
               stdin = PIPE,
               stdout = PIPE,
               stderr = sys.stderr.buffer) as cstlemma:

        # copy carries meta groups and data groups from this thread
        # to the combine thread
        copy = Queue()
        Thread(target = combine, args = (args, cstlemma, copy, ouf)).start()

        # status stays 1 unless implement_main returns normally
        status = 1
        try:
            implement_main(args, inf, cstlemma, copy)
            status = 0
        except BadData as exn:
            message(args, exn)
        except BrokenPipeError as exn:
            message(args, 'broken pipe in main thread')
        except KeyboardInterrupt as exn:
            message(args, 'keyboard interrupt in main thread')
        except Exception as exn:
            print(traceback.format_exc(), file = sys.stderr)

        if status:
            terminate(cstlemma)
        else:
            # closing stdin lets cstlemma see the end of its input;
            # then wait until the combine thread has marked every
            # queued group as done
            # NOTE(review): the nesting of this try inside the else
            # branch is reconstructed - confirm against the original
            cstlemma.stdin.close()
            try:
                copy.join()
            except KeyboardInterrupt:
                message(args, 'keyboard interrupt in main thread')
                status = 1

    return status
def implement_main(args, inf, cstlemma, copy):
    '''Send each sentence of the flat vrt input to cstlemma and put a
    copy of everything in the copy queue for the combine thread.

    Each "word" and "tag" go to cstlemma, one token per line, with a
    sentinel line after each sentence (it cannot be an empty line
    because those vanish). Everything goes to copy in alternating
    groups of meta and data, starting and ending with a (possibly
    empty) meta group, with the new field names inserted in any
    field-name comment.

    args -- parsed options (word, tag, prefix, suffix, verbose,
            encoding are used)
    inf -- iterable of input lines (bytes)
    cstlemma -- the cstlemma process, written to through its stdin
    copy -- the queue shared with the combine thread

    Raises BadCode if args.encoding is not recognized, BadData if a
    token line occurs before any field names.

    '''

    wordix, posix = None, None

    # Python codec to re-encode the word in for each 8-bit encoding
    # that cstlemma can be told to expect
    target = {
        'Latin-1' : 'iso-8859-1',
        'Latin-2' : 'iso-8859-2',
        'Latin/Greek' : 'iso-8859-7',
        'Latin/Turkish' : 'iso-8859-9',
    }

    if args.encoding == 'UTF-8':
        # UTF-8 as is - does not work right
        def recode(word): return word
    elif args.encoding in target:
        codec = target[args.encoding]
        def recode(word):
            # input must be valid UTF-8; a character that the target
            # codec does not have is replaced (with a question mark)
            return ( word
                     .decode('UTF-8', errors = 'strict')
                     .encode(codec, errors = 'replace')
            )
    else:
        raise BadCode('unrecognized --encoding in main')

    def send(sentence):
        '''Write the word and tag of each record to cstlemma, then the
        sentinel line that marks the end of the sentence, and flush.'''
        write = cstlemma.stdin.write
        for record in sentence:
            write(recode(unescape(record[wordix])))
            write(b'\t')
            write(unescape(record[posix]))
            write(b'\n')
        write(b'<< >>\t_\n')
        cstlemma.stdin.flush()

    def setnames(line):
        '''Return the meta line as is, except return a field-name
        comment with the new names inserted after the word field; also
        note the positions of the word and tag fields.'''
        nonlocal wordix, posix
        if isnames(line):
            wordix, posix = nameindices(namelist(line), args.word, args.tag)
            return insertnames(line, args.word,
                               args.prefix + b'lemma' + args.suffix,
                               *((args.prefix + b'lemma.match' + args.suffix,
                                  args.prefix + b'lemma.dict' + args.suffix,
                                  args.prefix + b'lemma.flex' + args.suffix)
                                 if args.verbose
                                 else ()))
        return line

    def issome(line): return not line.isspace()
    def ismeta(line): return line.startswith(b'<')

    first = True

    # bound already here so that the check after the loop also works
    # when there are no input lines at all (the loop would otherwise
    # leave the name unbound and the check would raise)
    groupismeta = False

    for groupismeta, group in groupby(filter(issome, inf), ismeta):

        if groupismeta:
            meta = tuple(map(setnames, group))
            copy.put(meta)
            first = False
            continue

        # groupisdata

        if first:
            # there shall always be previous meta
            copy.put(())
            first = False

        if wordix is None:
            raise BadData('error: token before field names')

        # the copy is queued before the sentence is sent so that it
        # is there when the combine thread reads the response
        sentence = tuple(map(asrecord, group))
        copy.put(sentence)
        send(sentence)

    if not groupismeta:
        # there shall always be final meta
        copy.put(())
def combine(args, cstlemma, copy, out):
    '''Thread target: read cstlemma output (word TAB info TAB lemma
    TAB lemma NL) and flat vrt from the copy queue, and insert the
    lemmas from cstlemma to the vrt after the named word field.

    This consumes the output of the cstlemma process and syncs it
    with the copy queue. Preceding meta and corresponding data were
    put in the queue before the data was sent to cstlemma, so they
    will always be there when a sentence is read out of cstlemma.

    The cstlemma process is terminated if the work does not complete
    normally, whether the exception is handled here or not.

    '''

    succeeded = False
    try:
        implement_combine(args, cstlemma, copy, out)
    except BrokenPipeError:
        message(args, 'broken pipe in combine thread')
    except StopIteration:
        # not sure when this can happen now
        message(args, 'stop iteration in combine thread')
    except ValueError as exn:
        # sometimes keyboard interruption in main thread produces here
        # a readline of closed file (or at least it did in one stage
        # in development)
        message(args, 'value error in combine thread ' + str(exn))
    else:
        succeeded = True
    finally:
        if not succeeded:
            terminate(cstlemma)
def implement_combine(args, cstlemma, copy, out):
    '''Merge the responses of cstlemma with the queued copy of the
    input and write the result to out. Thread may find pipe closed.

    For each sentence read from cstlemma, the preceding meta group
    and the corresponding data group are taken from the copy queue;
    the lemma (and when args.verbose also the match indicator, the
    dictionary lemmas and the rule-derived lemmas) is inserted in
    each data record right after the word field. The final meta
    group is shipped after cstlemma output ends.

    Raises BadCode if args.encoding is not recognized, BadData if a
    sentence arrives before any field names were seen.

    '''

    # separator of alternative base forms becomes vertical bar; any
    # vertical bars in base forms become broken bars
    repipe = str.maketrans('|\x1f', '¦|')

    # Python codec to decode cstlemma output in for each 8-bit
    # encoding
    legacy = {
        'Latin-1' : 'iso-8859-1',
        'Latin-2' : 'iso-8859-2',
        'Latin/Greek' : 'iso-8859-7',
        'Latin/Turkish' : 'iso-8859-9',
    }

    if args.encoding == 'UTF-8':
        # UTF-8 as is ... except those pipes, and with error handling
        # according to args.strict or not args.strict :/ because
        # cstlemma seems broken with -eU (possibly it is losing bytes
        # rather than characters and sometimes just happens to lose
        # the first byte of a double-byte UTF-8 character)
        def recode(word):
            text = utf8(args, word).decode('UTF-8')
            return text.translate(repipe).encode('UTF-8')
    elif args.encoding in legacy:
        codec = legacy[args.encoding]
        def recode(word):
            text = word.decode(codec)
            return text.translate(repipe).encode('UTF-8')
    else:
        raise BadCode('unrecognized --encoding in combine')

    def issentinel(line): return line.startswith(b'<< >>\t')

    # names for the dictionary-match indicator of cstlemma
    matchname = {
        b'-' : b'none', # full form not in dictionary
        b'+' : b'many', # has many lemmas in dictionary
        b' ' : b'one', # update: "(blank)" seems to be this one!
        b'' : b'one', # should not happen, nor any other
    }

    # runs of cstlemma output lines between the sentinel lines
    sentences = (
        lines
        for sentinel, lines
        in groupby(cstlemma.stdout, issentinel)
        if not sentinel
    )

    at = None # word field index, after which insert new

    for analyses in sentences:

        meta = copy.get_nowait()
        data = copy.get_nowait()
        copy.task_done()
        copy.task_done()

        for line in meta:
            if isnames(line):
                at = nameindex(namelist(line), args.word)

        if at is None:
            raise BadData('combine thread: data before names')

        shipmeta(meta, out)

        for new, old in zip(analyses, data):
            word, info, dilemma, rulemma = asrecord(new)
            dilemma8 = utf8(args, recode(dilemma))
            rulemma8 = utf8(args, recode(rulemma))

            # presumably dilemma8 or rulemma8 is always non-empty so
            # that the last choice is never used; not using word for
            # that because word can contain the separator; the value
            # is multivalued, so it is enclosed in vertical bars
            chosen = dilemma8 or rulemma8
            lemma = b'|' + chosen + b'|' if chosen else b'|'

            # new fields in their final order after the word field
            fields = [ escape(lemma) ]
            if args.verbose:
                fields.append(matchname.get(info, info))
                fields.append(escape(dilemma8) or b'_')
                fields.append(escape(rulemma8) or b'_')
            old[at + 1:at + 1] = fields

        shipdata(data, out)

    # should this have a timeout? or could it be .get_nowait()?
    # added flush and made get get_nowait because a few processes
    # seemed to make no progress till they timed out - were they
    # stalled here?
    out.flush()
    shipmeta(copy.get_nowait(), out)
    copy.task_done()
def shipmeta(meta, out):
    '''Write each line of the meta group to out, as is.'''
    write = out.write
    for line in meta:
        write(line)
from datetime import datetime
def shipdata(data, out):
    '''Write each record of the data group to out as a line of
    TAB-separated fields. A group of more than 2000 records is also
    reported in stderr, with a timestamp and the length.

    '''

    size = len(data)
    if size > 2000:
        # [this is a remnant from the marmot tool in td pipeline]
        # where even marmot appeared to behave badly with long
        # "sentences" (though only much longer than the parser -
        # present limit of 2000 is taken out of thin air and could
        # probably have been much larger but then it only controls a
        # warning in stderr)
        warning = b''.join((
            b'shipping data at ',
            datetime.now().isoformat().encode(),
            b' of len ',
            str(size).encode(),
            b'\n',
        ))
        sys.stderr.buffer.write(warning)
        sys.stderr.buffer.flush()

    for record in data:
        out.write(b'\t'.join(record) + b'\n')
# Script entry point: parse the command line and hand the arguments
# and main over to trans_main.
# NOTE(review): in_as_text/out_as_text = False presumably make
# trans_main pass binary streams to main (the code here reads and
# writes bytes) - confirm in vrtargslib
if __name__ == '__main__':
    trans_main(parsearguments(), main,
               in_as_text = False,
               out_as_text = False)