-
Notifications
You must be signed in to change notification settings - Fork 3
/
vrt-old-sparv-huntag
executable file
·298 lines (247 loc) · 10.3 KB
/
vrt-old-sparv-huntag
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#! /usr/bin/env python3
# -*- mode: Python; -*-
from argparse import ArgumentParser
from itertools import groupby, count
from subprocess import Popen, PIPE
from tempfile import mkstemp
from threading import Thread
import enum, os, re, sys, traceback
from outsidelib import HUNPOSTAG, HUNPOSMODELS
from vrtnamelib import isbinnames
from vrtnamelib import binnamelist, nameindex, bininsertnames
from vrtargslib import VERSION
# Argument vector for the tagger subprocess: the program to run,
# then (presumably) the model file that outsidelib registers under
# the key 'sparv'.
# could use a morphtable here? from sparv
HUNPOS = [HUNPOSTAG, HUNPOSMODELS['sparv']]
# Command line interface. Field names are held as bytes because the
# document is read and written as bytes.
parser = ArgumentParser(description = '''
Pass the word field in a flat vrt document through hunpos using a
Swedish SUC (Stockholm-Umeå Corpus) model from Sparv (compatible with
Swemalt). Insert the tags in a new field after the word field. The vrt
document must have position names in a comment before any content. (A
"flat" vrt document has no markup or comment lines inside sentence
elements.)
''')

parser.add_argument('infile', nargs = '?', metavar = 'file',
                    help = 'input file (default stdin)')
parser.add_argument('--out', '-o',
                    dest = 'outfile', metavar = 'file',
                    help = 'output file (default stdout)')
parser.add_argument('--in-place', '-i',
                    dest = 'inplace', action = 'store_true',
                    help = 'overwrite input file with output')
parser.add_argument('--backup', '-b', metavar = 'bak',
                    help = 'keep input file with suffix bak')

# The converter is str.encode, not bytes: argparse calls it with the
# command line string as its only argument, and bytes(str) without an
# encoding raises TypeError, which argparse reports as an invalid
# value. The bytes defaults are used as they are.
parser.add_argument('--tag', '-t', metavar = 'name',
                    type = str.encode, default = b'msd',
                    help = 'output field name (default msd)')
parser.add_argument('--word', '-w', metavar = 'name',
                    type = str.encode, default = b'word',
                    help = 'input field name (default word)')

parser.add_argument('--version', action = 'store_true',
                    help = 'print {} and exit'.format(VERSION))
# old values are passed unescaped to the underlying process,
# new values are inserted escaped to the output stream

# The characters that are represented as entity references in field
# values, and those references, in corresponding order.
bare = (b'&', b'<', b'>')
code = (b'&amp;', b'&lt;', b'&gt;')

def encm(m, d = dict(zip(bare, code))):
    '''Return the entity reference for the bare character that the
    match object m matched. The mapping d is built once, when the
    function is defined, and is only ever read.'''
    return d[m.group()]

def decm(m, d = dict(zip(code, bare))):
    '''Return the bare character for the entity reference that the
    match object m matched. The mapping d is built once, when the
    function is defined, and is only ever read.'''
    return d[m.group()]

def escape(value):
    '''Return the bytes value with each &, <, > replaced by its
    entity reference (&amp;, &lt;, &gt;).'''
    return re.sub(b'[&<>]', encm, value)

def unescape(value):
    '''Return the bytes value with each &amp;, &lt;, &gt; replaced by
    the character it stands for. Done in a single pass, so the result
    of a replacement is never decoded a second time.'''
    return re.sub(b'&(amp|lt|gt);', decm, value)
class BadData(Exception):
    '''Raised when the input document is not acceptable. Meant to be
    reported by its message only: a stack trace is just noise.'''
class BadCode(Exception):
    '''Raised on reaching a state that cannot happen.'''
@enum.unique
class Kind(enum.Enum):
    '''Kind of a line, and so of a group of consecutive lines of the
    same kind.'''

    meta = 1     # markup line other than a sentence start tag
    data = 2     # anything that does not start with '<'
    begin = 3    # sentence start tag
    names = 4    # comment that carries the field names
    comment = 5  # any other comment
def identify(line):
    '''Return the Kind of the bytes line. This is the key function
    that groupby uses to split a document into groups of lines.'''

    # comments first: a sentence tag or other markup cannot start so
    if line.startswith(b'<!--'):
        return Kind.names if isbinnames(line) else Kind.comment

    # the sentence start tag is singled out from other markup
    if line.startswith(b'<sentence'):
        return Kind.begin

    return Kind.meta if line.startswith(b'<') else Kind.data
def combine(hunpos, copy, out):
    '''Thread target that consumes the output of both subprocesses:
    the hunpos output (word TAB msd TAB NL / NL) and the flat vrt that
    comes back from the copy process. The msd from hunpos is inserted
    in the vrt at the named position and the result is written to out.

    The failures that are expected when the pipeline is torn down are
    reported on stderr, not raised. The stdout ends of both processes
    are closed in any case.

    '''

    def report(*what):
        # one diagnostic line on stderr, prefixed with program name
        print(parser.prog + ':', *what, file = sys.stderr)

    try:
        # wonder if just setting the signal thing would work here
        implement_combine(hunpos, copy, out)
    except BrokenPipeError:
        report('in combine thread: Broken Pipe')
    except StopIteration:
        # from next(response) when main thread got keyboard interruption
        report('in combine thread: Stop Iteration')
    except ValueError as exn:
        # sometimes keyboard interruption in main thread produces here
        # a readline of closed file
        report('in combine thread: Value Error:', exn)
    finally:
        # closing their stdouts should send hunpos and copy the signal
        # to shut down, right? and then the main thread should get
        # broken pipe to indicate that it can no longer write to them?
        hunpos.stdout.close() # no need to wait for it?
        copy.stdout.close() # no need to wait for it or something?
def implement_combine(hunpos, copy, out):
    '''Merge the tags read from hunpos.stdout into the document read
    from copy.stdout and write the result to out. Thread may find pipe
    closed.'''

    # hunpos ends each sentence with an empty line: keep the runs of
    # non-empty lines, one run for each tagged sentence
    tagged = (sentence
              for blank, sentence
              in groupby(hunpos.stdout, bytes.isspace)
              if not blank)

    pos = None
    insentence = False
    for kind, lines in groupby(copy.stdout, identify):

        # only the data that directly follows a sentence start tag
        # was sent to hunpos
        if kind is Kind.begin:
            insentence = True
        elif kind is not Kind.data:
            insentence = False

        if (kind is Kind.data) and insentence:
            # pair each tagged token with its line in the document
            for analysis, token in zip(next(tagged), lines):
                # exactly three tab-separated parts, or ValueError
                _form, msd, _nl = analysis.split(b'\t')
                fields = token.rstrip(b'\n').split(b'\t')
                fields.insert(pos, escape(msd))
                out.write(b'\t'.join(fields))
                out.write(b'\n')
        elif kind is Kind.names:
            # the names already include the new field: find its place
            for line in lines:
                pos = nameindex(binnamelist(line), args.tag)
                out.write(line)
        else:
            # everything else passes through as it is
            for line in lines:
                out.write(line)
def getword(line, pos):
    '''Return the field at index pos of the tab-separated bytes line,
    ignoring any newline at the end of the line.'''
    # todo use datalib for this too
    fields = line.rstrip(b'\n').split(b'\t')
    return fields[pos]
def wrap_main():
    '''Check the combination of command line options in args, open the
    input and output streams, run main on them, and exit with the
    status that main returns.

    When the output goes to a file (--out or --in-place), it is first
    written to a temporary file in the directory of the target. The
    temporary file is moved in place (and the input file renamed for
    --backup) only if main returns 0. Otherwise the temporary file is
    removed, so a failed run never replaces the input with incomplete
    output and leaves no stray file behind.

    Exits with status 1 on a usage error or an IOError.

    '''

    def fail(message):
        print(message, file = sys.stderr)
        sys.exit(1)

    if args.backup is not None:
        if '/' in args.backup:
            fail('usage: --backup suffix cannot contain /')
        if not args.backup:
            fail('usage: --backup suffix cannot be empty')
        if not args.inplace:
            fail('usage: --backup requires --in-place')

    if args.inplace and (args.infile is None):
        fail('usage: --in-place requires input file')

    if args.inplace and (args.outfile is not None):
        fail('usage: --in-place not allowed with --out')

    if (args.outfile is not None) and os.path.exists(args.outfile):
        # easier to check this than that output file is different than
        # input file, though it be annoying when overwrite is wanted
        fail('usage: --out file must not exist')

    temp = None
    try:
        if args.inplace or (args.outfile is not None):
            # temporary file next to the target so that the final
            # rename stays within one directory
            head, tail = os.path.split(args.infile
                                       if args.inplace
                                       else args.outfile)
            fd, temp = mkstemp(dir = head, prefix = tail)
            os.close(fd)

        with (open(args.infile, mode = 'br')
              if args.infile
              else sys.stdin.buffer) as inf:
            with (open(temp, mode = 'bw')
                  if temp
                  else sys.stdout.buffer) as ouf:
                status = main(inf, ouf)

        if status == 0:
            if args.backup:
                os.rename(args.infile, args.infile + args.backup)
            if args.inplace:
                os.rename(temp, args.infile)
            if args.outfile:
                os.rename(temp, args.outfile)
    except IOError as exn:
        print(exn, file = sys.stderr)
        status = 1
    finally:
        # still there if not moved in place: failure, or empty --out
        if (temp is not None) and os.path.exists(temp):
            os.remove(temp)

    sys.exit(status)
def main(inf, ouf):
    '''Run the document from inf through a hunpos process and a copy
    process, with a combine thread writing the merged result to ouf.
    Return 0 if all of the input was fed to the processes, else 1
    after reporting the failure on stderr.'''

    pipes = dict(stdin = PIPE, stdout = PIPE, stderr = sys.stderr)

    # is this a "useless use of cat"?
    with Popen(HUNPOS, **pipes) as hunpos, Popen(['cat'], **pipes) as copy:

        combiner = Thread(target = combine, args = (hunpos, copy, ouf))
        combiner.start()

        try:
            implement_main(inf, hunpos, copy)
        except BadData as exn:
            print(parser.prog + ':', exn, file = sys.stderr)
            status = 1
        except BrokenPipeError:
            # when combine thread gets broken pipe, it closes the
            # output side of hunpos and copy, then they close and
            # this thread gets broken pipe - right?
            print(parser.prog + ':', 'in main thread: Broken Pipe',
                  file = sys.stderr)
            status = 1
        except KeyboardInterrupt:
            print(parser.prog + ':', 'in main thread: Keyboard Interrupt',
                  file = sys.stderr)
            status = 1
        except Exception:
            print(traceback.format_exc(), file = sys.stderr)
            status = 1
        else:
            status = 0

        # end of input for the processes, then wait for the combine
        # thread to finish with their output
        hunpos.stdin.close()
        copy.stdin.close()
        combiner.join()

        return status
def implement_main(inf, hunpos, copy):
    '''Feed the document from inf to the two processes: each "word"
    goes to hunpos, with an empty line after each sentence; every line
    that is not empty goes to copy, with the new field name added to
    the names.

    Raises BadData if sentence data is met before the field names.

    '''

    pos = None
    insentence = False
    content = (line for line in inf if not line.isspace())
    for kind, lines in groupby(content, identify):

        # only the data that directly follows a sentence start tag
        # is sent to hunpos
        if kind is Kind.begin:
            insentence = True
        elif kind is not Kind.data:
            insentence = False

        if kind is Kind.data and insentence:
            if pos is None:
                raise BadData('error: unexpected data: '
                              'token line before field names')
            for line in lines:
                hunpos.stdin.write(unescape(getword(line, pos)))
                hunpos.stdin.write(b'\n')
                copy.stdin.write(line)
            # empty line ends the sentence for hunpos; flush both so
            # that the combine thread gets the whole sentence
            hunpos.stdin.write(b'\n')
            hunpos.stdin.flush()
            copy.stdin.flush()
        elif kind is Kind.names:
            for line in lines:
                pos = nameindex(binnamelist(line), args.word)
                copy.stdin.write(bininsertnames(line, args.word,
                                                args.tag))
        elif kind in (Kind.begin, Kind.meta, Kind.data, Kind.comment):
            # passed on as they are
            for line in lines:
                copy.stdin.write(line)
        else:
            raise BadCode('this cannot happen')
if __name__ == '__main__':
    # args is a module-level name that the functions above read
    args = parser.parse_args()
    if args.version:
        print('vrt tools', VERSION)
        exit(0)
    wrap_main()