-
Notifications
You must be signed in to change notification settings - Fork 3
/
vrt-paragraize
executable file
·58 lines (49 loc) · 2.23 KB
/
vrt-paragraize
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#! /usr/bin/env python3
# -*- mode: Python; -*-
# This script inserts paragraph tags into plaintext within VRT markup
# using lines of all whitespace as boundary indicators within blocks
# of plaintext. The input plaintext itself is not yet in the VRT form,
# only the markup is assumed to be. An important structural assumption
# is that tag lines start with < and other lines do not start with <.
# The intended mechanism to achieve that is to have < as &lt; in
# content lines.
# Original markup of a particular corpus might support smarter
# detection of paragraph boundaries.
import argparse, os, sys
from itertools import chain, groupby
import signal
# Restore the default OS dispositions for SIGINT and SIGPIPE. With
# SIG_DFL the process is terminated by the signal itself instead of
# Python raising KeyboardInterrupt or BrokenPipeError (presumably so
# the tool exits quietly when interrupted or when a downstream pipe
# reader closes early).
signal.signal(signal.SIGINT, signal.SIG_DFL)
signal.signal(signal.SIGPIPE, signal.SIG_DFL)
def paragraize(source, target):
    '''Copy source to target, wrapping plaintext paragraphs in tags.

    Lines that start with '<' are treated as markup and written through
    unchanged. Each maximal run of other lines is split at lines that
    consist of whitespace only; those separator lines are dropped, and
    every remaining run of lines is written between a '<paragraph>'
    line and a '</paragraph>' line.

    source -- iterable of str lines (such as a text file open for reading)
    target -- object with a write(str) method (such as a text file open
              for writing)

    Lines are written one at a time, so neither a long markup run nor a
    long paragraph is collected in memory first.
    '''

    def is_tag(line):
        return line.startswith('<')

    write = target.write
    for ismeta, part in groupby(source, is_tag):
        if ismeta:
            for line in part:
                write(line)
            continue
        for isblank, para in groupby(part, str.isspace):
            if isblank:
                continue
            write('<paragraph>\n')
            for line in para:
                # The final line of the input may lack its newline;
                # terminate it so that the closing tag still starts a
                # line of its own instead of being glued to the text.
                write(line if line.endswith('\n') else line + '\n')
            write('</paragraph>\n')
def main():
    '''Parse the command line and run paragraize on the chosen streams.

    The optional positional argument names the input file (default
    stdin) and --out/-o names the output file (default stdout); named
    files are opened as UTF-8. With --version, a version indicator is
    printed and the process exits with status 0 without processing any
    input.
    '''
    parser = argparse.ArgumentParser(description = '''
    Segments plaintext within VRT markup into paragraph
    elements, using runs of all-whitespace lines as the
    indication of an intended boundary.''')

    parser.add_argument('arg', metavar = 'FILE', nargs = '?',
                        type = argparse.FileType('r', encoding = 'UTF-8'),
                        default = sys.stdin,
                        help = 'input file (default stdin)')
    parser.add_argument('--out', '-o', metavar = 'outfile',
                        type = argparse.FileType('w', encoding = 'UTF-8'),
                        default = sys.stdout,
                        help = 'output file (default stdout)')
    parser.add_argument('--version', action = 'store_true',
                        help = 'print a version indicator and exit')

    args = parser.parse_args()

    if args.version:
        print('vrt-paragraize 0.1a (FIN-CLARIN 2018)')
        # sys.exit rather than the bare exit builtin: the latter is
        # injected by the site module for interactive use and is not
        # guaranteed to exist (e.g. under python -S).
        sys.exit(0)

    # Both streams are closed when the block is left, including the
    # standard streams when they are the defaults.
    with args.arg as source, args.out as target:
        paragraize(source, target)
# Run the command-line interface only when executed as a script, not
# when the file is imported as a module.
if __name__ == '__main__':
    main()