forked from AylaRT/D-Terminer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepro_normalise.py
executable file
·109 lines (90 loc) · 3.93 KB
/
prepro_normalise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""script for normalisation
WARNING: exact copy of part of prepro_normalise.py in ID_TERM repo!
"""
import unicodedata
# Two-character sequences that imitate a double quote. They are replaced
# before the per-character pass, in exactly this order.
_DOUBLED_QUOTE_SEQUENCES = (",,", "''", "‘’", "’’")

# Single-character translation table: dash variants -> "-", double-quote
# variants -> '"', single-quote variants -> "'". The plain ASCII characters
# themselves need no entry because they would map onto themselves.
_DASH_QUOTE_TABLE = str.maketrans({
    **dict.fromkeys("−‐", "-"),
    **dict.fromkeys("“”„", '"'),
    **dict.fromkeys("`´’‘", "'"),
})


def normalise_dashes_quotes(text):
    """
    Map typographic dash and quote variants in a string onto plain ASCII ones.

    The work is done in two passes:
    1. every two-character sequence that imitates a double quote (two commas,
       two apostrophes, or a pair of curly single quotes) is replaced by '"';
    2. the remaining characters are translated one-to-one: the minus sign and
       the Unicode hyphen become "-", curly and low double quotes become '"',
       and backtick, acute accent and curly single quotes become "'".

    All other characters are returned unchanged.
    NOTE(review): en and em dashes are not part of the dash set, so they pass
    through untouched - confirm this is intended.

    :param text: string to normalise
    :return: the normalised string
    """
    for sequence in _DOUBLED_QUOTE_SEQUENCES:
        text = text.replace(sequence, '"')
    # One C-level pass instead of growing a result string character by character.
    return text.translate(_DASH_QUOTE_TABLE)
def normalise_accented_i(text):
    """
    Swap every dotted capital I ("İ") in the string for a plain capital "I".

    The dotted capital is troublesome when the data is lowercased later on,
    because the ordinary lowercase "i" already carries a dot.

    :param text: input string (any length)
    :return: copy of the input in which each "İ" has become "I"
    """
    return text.replace("İ", "I")
def normalise_unicode(text):
    """
    Bring a string into Unicode Normalization Form C (NFC) with the
    standard-library unicodedata module, so that equivalent character
    sequences share one representation (avoids encoding problems).

    :param text: input string
    :return: the NFC-normalised version of the input string
    """
    return unicodedata.normalize("NFC", text)
def normalise_text_standard(text, unidecoded=True, dashes_quotes=True, accented_i=True, lowercase=False):
    """
    Run the standard normalisation pipeline over a string. Each step can be
    switched off individually; the enabled steps always run in this order:
    1. Unicode NFC normalisation (normalise_unicode)
    2. dash and quote normalisation (normalise_dashes_quotes)
    3. replacement of "İ" by "I"
    4. lowercasing

    :param text: input string
    :param unidecoded: apply step 1 when true
    :param dashes_quotes: apply step 2 when true
    :param accented_i: apply step 3 when true
    :param lowercase: apply step 4 when true
    :return: the string after all enabled steps
    """
    result = text
    if unidecoded:
        result = normalise_unicode(result)
    if dashes_quotes:
        result = normalise_dashes_quotes(result)
    if accented_i:
        # Done before lowercasing so the dotted capital never reaches .lower().
        result = result.replace("İ", "I")
    return result.lower() if lowercase else result
def normalise_file_standard(in_fp, out_fp, unidecoded=True, dashes_quotes=True, accented_i=True, lowercase=False):
    """
    Read a UTF-8 text file, run the standard normalisation procedure on its
    whole content and write the result to another UTF-8 text file.
    The individual steps and their order are those of normalise_text_standard.

    :param in_fp: path of the file holding the original data
    :param out_fp: path the normalised data is written to
    :param unidecoded: whether to use normalise_unicode
    :param dashes_quotes: whether to normalise dashes and quotes
    :param accented_i: whether to normalise dot-accented, uppercase I's
    :param lowercase: whether to lowercase the text
    """
    step_flags = {
        "unidecoded": unidecoded,
        "dashes_quotes": dashes_quotes,
        "accented_i": accented_i,
        "lowercase": lowercase,
    }
    with open(in_fp, "rt", encoding="utf-8") as source:
        raw_content = source.read()
    cleaned_content = normalise_text_standard(raw_content, **step_flags)
    with open(out_fp, "wt", encoding="utf-8") as target:
        target.write(cleaned_content)