# For YLE SV material, meant to be adaptable to other materials.
# sane(s, *, asis) = low-level sanitization
# delink(s, *, info) = removal of [text](link) markup, keeping the text
# splitby(s) = separation of a multiparagraph (or paragraph) and a byline
# ??normalspaced(s) = normalization of whitespace
import html, re, sys
# only such rare instances as were actually observed in the text
broken_markup_remains = re.compile(R'''
<em>
| </em>
| <strong>
| </strong>
| </a> # some source attribute - also <a href=...> but leave that
| /nobr>
| b>
| p>
| >P>
''', re.VERBOSE)
# such rare but actually observed control characters as should not be
# there at all - not at all sure what they were meant to be or how
# they came to be there, either - and the couple of "angle brackets"
# that seem to occur as remnants of some markup about as often as in
# some creative ASCII-art construction: sigh - too expensive to chase.
# Update: these control codes are Microsoft (Windows-1252) character
# codes encoded in UTF-8 as such instead of being translated to the
# intended Unicode points first.
character_encoding = str.maketrans({
# U+0091, U+0092 probably meant to be U+2018, U+2019 but make both U+0027
# according to the advice that both should look the same in ... Finnish?
# but this is Swedish ... and also that the latter (either way) can also
# occur as an apostrophe
'\u0002' : '', # STX (start of text)
'\u0003' : '', # ETX (end of text)
'\u0008' : '', # BS (sic)
'\u007f' : "'", # DEL -> '
'\u0080' : '\u20ac', # -> EURO SIGN
'\u008f' : '', # SS3 (Single Shift Three)
'\u0090' : '', # DCS (Device Control String)
'\u0091' : "'", # PRIVATE USE ONE -> left single quotation mark (U+2018)
'\u0092' : "'", # PRIVATE USE TWO -> right single quotation mark (U+2019)
'\u0093' : '"', # -> left double quotation mark (U+201C)
'\u0094' : '"', # CANCEL CHARACTER -> right double quotation mark (U+201D)
'\u0095' : '\u2022', # MESSAGE WAITING -> bullet
'\u0096' : '-', # -> en dash (U+2013)
'\u0097' : '-', # -> em dash (U+2014)
'\u009a' : '\u0161', # SINGLE CHARACTER INTRODUCER to s caron (Windows 1252)
'\u009e' : '\u017e', # PRIVACY MESSAGE to z caron (Windows 1252)
# private use characters
'\uf02d' : '',
'\uf04a' : '',
'\uf0bd' : '',
})
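# A sketch (an assumption, not verified against the raw data) of how
# such code points may have arisen: bytes written in Windows-1252 but
# read as if each byte were a code point of its own land in the C1
# control range, which the table above then repairs.
# >>> '\u2013'.encode('cp1252').decode('latin-1')
# '\x96'
# >>> '10\u009615'.translate(character_encoding)
# '10-15'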
last_angles = str.maketrans({
'<' : ' ', # still in <3
'>' : ' ', # still in => -> >> and > (some clearly wrong?)
})
def sane(text, *, asis = False):
'''Undo XML character entities (there are a few), remove other
remnants of XML markup (a few half-tags and such), eliminate
garbage characters (random control characters? not many).
There are _both_ raw ampersands and ampersands as entities. When
an irresistible force meets an immovable object, something's gotta
give.
'''
# early return option in case there is need to observe insane text
# for debugging, probably in combination with args.parainfo to
# observe just the phenomena that are being sanitized right here
if asis: return text
text = text.translate(character_encoding)
# Note: &mdash; becomes U+2014 EM DASH; it is used to introduce
# some bylines(?) at the end of a paragraph
text = html.unescape(text)
text = broken_markup_remains.sub(' ', text)
text = text.translate(last_angles)
return text
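# Illustrative examples of sane (doctest style; made-up input, since
# the actual material is not reproduced here). Note the tags are
# replaced by spaces, which can leave doubled spaces behind:
# >>> sane('R&amp;D <em>satsar</em> stort')
# 'R&D  satsar  stort'
# >>> sane('Priset 10\u009615 euro')
# 'Priset 10-15 euro'
# >>> sane('R&amp;D <em>satsar</em> stort', asis = True)
# 'R&amp;D <em>satsar</em> stort'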
# se [denna sida](http://example.com); => se denna sida;
# pattern captures full match and the replacement text
delink_pattern = re.compile(R'''
( \[ ( [^\[\]]+ ) \] \( [^\(\)]+ \) )
''', re.VERBOSE)
def delink(text, *, info = False):
'''Replace every [text](link) in text with just text. Used before
actual tokenization to remove highly non-linguistic material that
is not even visible to an ordinary reader.
'''
if info:
first = True
for full, keep in delink_pattern.findall(text):
if first:
print('----', file = sys.stderr)
first = False
print(keep, '<=', full, file = sys.stderr)
retext = delink_pattern.sub(r'\2', text)
# to see the full effect on the whole text - much output
# if text != retext:
# print('----', file = sys.stderr)
# print(repr(text), file = sys.stderr)
# print('->', file = sys.stderr)
# print(repr(retext), file = sys.stderr)
return retext
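# Example (made-up input): the link target vanishes, the anchor text
# stays. Note that targets containing parentheses are not matched by
# delink_pattern and are therefore left alone.
# >>> delink('se [denna sida](http://example.com); tack')
# 'se denna sida; tack'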
# splitby(text) to split away a paragraph-final "byline" (if that be
# what it is called), rather heuristically tailored to what is
# observed, because such an element is not properly an element of the
# last sentence but at best of the paragraph as a whole - probably
# ship them as a kind of sentence? or leave them out? decide later.
# Decided to ship them as another paragraph. That may be more proper
# anyway, and it is not worth the hassle to do otherwise.
byparentheses = re.compile(R'''
# Like one of these at the very end of a multiparagraph:
#
# (Av Maria von Kraemer)
# (Av Susanne Nylund-Torp)
# (Av Rabbe Nilsson)
# (Av Åboland)
# (Av intfree01)
#
# But there are also such as use a hyphen or an em-dash :(
# There may or may not be whitespace just before the byline.
# Or anything!
[(]
Av
(?: [ ] \w+ (?: -\w+)? ) {1,3}
[)]
$
''', re.VERBOSE)
def splitby(text):
'''Return actual text (a possibly empty multi-paragraph) together with
a byline paragraph (as a string) split off at the end.
'''
mo = byparentheses.search(text)
if mo:
text = mo.string[:mo.start()].rstrip()
by = mo.group()
return text, by # [ (dict(type = 'by'), by.split()) ]
return text, ''
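if __name__ == '__main__':
    # Minimal smoke test of one plausible pipeline order (sane, then
    # delink, then splitby - an assumed order: sane is low-level and
    # delink is said to precede tokenization). The input is made up;
    # the byline is one of the observed forms quoted in the comments
    # above.
    raw = 'Se &quot;[listan](http://example.com)&quot; <em>nu</em>. (Av Maria von Kraemer)'
    text, by = splitby(delink(sane(raw)))
    print(repr(text))  # 'Se "listan"  nu .'
    print(repr(by))    # '(Av Maria von Kraemer)'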