-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleaners.py
142 lines (116 loc) · 5.36 KB
/
cleaners.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import regex
class DummyLinesCleaner:
    """Filters boilerplate lines out of a list of LaTeX source lines."""

    # A line whose stripped text starts with any of these is discarded.
    _SKIPPED_PREFIXES = (
        "\\documentclass",
        "\\usepackage",
        "\\newcommand",
        "\\renewcommand",
        "\\setlength",
        "\\hypersetup",
        "\\makeatletter",
        "\\makeatother",
        "%",
        "\\providecommand",
        "\\liststyle",
        "\\pagestyle",
    )

    @staticmethod
    def clean_dummy_lines(tex_lines):
        """Return a new list with the unwanted lines removed.

        Each input line is stripped and runs of spaces are collapsed to a
        single space.  A line is dropped when it starts with one of the
        skipped prefixes, or when it starts with ``{`` and opens more braces
        than it closes.  If a line closes more braces than it opens and ends
        with ``}``, that one trailing brace is removed.

        Every kept line is emitted with a trailing newline; a line starting
        with ``\\sub`` is additionally followed by an empty string entry.

        :param tex_lines: iterable of LaTeX source lines (strings).
        :return: list of cleaned lines.
        """
        kept = []
        for raw in tex_lines:
            # The pattern must go through the regex engine: str.replace would
            # treat " +" literally, leaving space runs alone and deleting
            # genuine " +" text such as "a + b".
            line = regex.sub(" +", " ", raw.strip())
            if line.startswith(DummyLinesCleaner._SKIPPED_PREFIXES):
                continue

            opened = line.count("{")
            closed = line.count("}")
            if line.startswith("{") and opened > closed:
                # Opening of a group that is closed on a later line.
                continue
            if opened < closed and line.endswith("}"):
                # Drop one unmatched closing brace at the end of the line.
                line = line[:-1]

            kept.append(line + "\n")
            if line.startswith("\\sub"):
                kept.append("")
        return kept
class DummyCharactersCleaner:
    r"""Typographic clean-up of LaTeX source lines.

    ``clear_dummy_characters`` is the entry point.  ``clear_left_bracket`` and
    ``clear_right_bracket`` move stray whitespace out of ``\name{...}``
    groups.  The substitution tables below are applied strictly in order;
    later rules rely on the output of earlier ones.
    """

    # Literal substitutions applied first: control spaces, typographic quotes,
    # dashes, and ties around a few abbreviations (presumably Polish ones such
    # as "s." / "r." -- confirm with the document language).
    _LITERAL_FIXES = (
        ("\\ ", ""),
        ("„", ",,"),
        ("”", "''"),
        ("’", "'"),
        ("“", "``"),
        (" s. ", " s.~"),
        (" ss. ", " ss.~"),
        (" t. ", " t.~"),
        (" rozdz. ", " rozdz.~"),
        # Tie goes before "r."; the space after it must survive, otherwise the
        # following word is glued onto the abbreviation.
        (" r. ", "~r. "),
        (" – ", " -- "),
        ("—", "---"),
    )

    # Regex substitutions applied after the literal pass.
    _PATTERN_FIXES = (
        # Unwrap colour / font wrappers, keeping only their content.
        (r"\\textcolor\{[^}]*\}\{([^}]*)\}", r"\1"),
        (r"\\textrm\{([^}]*)\}", r"\1"),
        (r"\\textstyleDomylnaczcionkaakapitu\{([^}]*)\}", r"\1"),
        # Tie a single ASCII letter to the word that follows it.
        (r"(\s)([a-zA-Z])\s", r"\1\2~"),
        (r"(\{)([a-zA-Z])\s", r"\1\2~"),
        (r"^([a-zA-Z])\s", r"\1~"),
        # Drop the optional [short title] and a leading "N. " from headings.
        (r"(\\(?:sub)*section)(\[.{1,}\])?\{(\s?\d\.\s)?(.{1,})\}", r"\1{\4}"),
        # Remove whitespace before punctuation / closing quotes.
        (r"(\s)([\.,?!;]{1}|'')(\s)", r"\2\3"),
        # Remove whitespace after opening quotes.
        (r"(,,)(\s)", r"\1"),
        # English ordinal suffixes become superscripts.
        (r"\b(\d+)(st|nd|rd|th)\b", r"\1\\textsuperscript{\2}"),
    )

    # Empty or blank emphasis groups left behind by the previous passes.
    _EMPTY_STYLE_FIXES = (
        ("\\textbf{ }", " "),
        ("\\textit{ }", " "),
        ("\\textbf{}", ""),
        ("\\textit{}", ""),
    )

    @staticmethod
    def clear_dummy_characters(tex_lines):
        """Clean every line of ``tex_lines`` in place.

        :param tex_lines: mutable sequence of LaTeX lines; each element is
            replaced by its cleaned version.
        :return: ``None`` -- the result is written back into ``tex_lines``.
        """
        for index, line in enumerate(tex_lines):
            tex_lines[index] = DummyCharactersCleaner._clean_line(line)

    @staticmethod
    def _squeeze(text):
        """Strip ``text`` and collapse runs of spaces into one space."""
        return regex.sub(" +", " ", text.strip())

    @staticmethod
    def _apply_literals(text, table):
        """Apply each ``(old, new)`` literal replacement of ``table`` in order."""
        for old, new in table:
            text = text.replace(old, new)
        return text

    @staticmethod
    def _apply_patterns(text, table):
        """Apply each ``(pattern, replacement)`` regex of ``table`` in order."""
        for pattern, replacement in table:
            text = regex.sub(pattern, replacement, text)
        return text

    @staticmethod
    def _clean_line(line):
        """Return the cleaned version of a single line."""
        cleaner = DummyCharactersCleaner
        s = cleaner._squeeze(line)
        s = cleaner._apply_literals(s, cleaner._LITERAL_FIXES)
        # Remove \par only as a complete control word, so \paragraph, \part,
        # \parbox and similar commands are left intact.
        s = regex.sub(r"\\par(?![a-zA-Z])", "", s)
        s = cleaner._squeeze(s)
        s = cleaner._apply_patterns(s, cleaner._PATTERN_FIXES)
        s = cleaner._squeeze(s)
        s = cleaner._apply_literals(s, cleaner._EMPTY_STYLE_FIXES)
        for environment in ("textit", "textbf"):
            s = cleaner.clear_left_bracket(environment, s)
            s = cleaner.clear_right_bracket(environment, s)
        # Any remaining command with an empty (or single-space) argument.
        s = regex.sub(r"\\[a-zA-Z]{1,}\{\s?\}", "", s)
        s = s.replace("\\footnote{ ", "\\footnote{")
        # Strip multi-level numbering ("1.2. ") from heading titles.
        s = regex.sub(
            r"(\\(?:sub)*section\{)(?:\d+(?:\.\d+)*\.?\s*)?(.*?)\}", r"\1\2}", s
        )
        # A tie already provides the space; no ordinary space may follow it.
        return s.replace("~ ", "~")

    @staticmethod
    def clear_left_bracket(environment, text):
        r"""Move whitespace that directly follows ``\environment{`` in front of it.

        :param environment: command name without the backslash, e.g. ``textit``.
        :param text: line to process.
        :return: the processed line.
        """
        pattern = r"(\\" + regex.escape(environment) + r"\{)(\s*)"
        return regex.sub(pattern, r"\2\1", text)

    @staticmethod
    def clear_right_bracket(environment, text):
        r"""Move a trailing ``~`` or whitespace out of a ``\environment{...}`` group.

        The tie or whitespace found just before the closing brace(s) is placed
        after them instead.

        :param environment: command name without the backslash, e.g. ``textbf``.
        :param text: line to process.
        :return: the processed line.
        """
        pattern = r"(\\" + regex.escape(environment) + r"\{[^}]*?)(~|\s*)(\}+)"
        return regex.sub(pattern, r"\1\3\2", text)
class BibFileCleaner:
    """Removes selected fields from the lines of a BibTeX file."""

    # Names of the fields to drop; compared case-insensitively.
    fields_to_remove = [
        'month',
        'file',
        'abstract',
        'copyright',
        'language',
        'isbn',
        'issn',
        'keywords',
    ]

    def __init__(self):
        # Lines kept so far; successive clean_bib_content calls append here.
        self.cleaned_content = []

    def clean_bib_content(self, bib_content):
        """Append the lines of ``bib_content`` minus the unwanted fields.

        A field starts on a line of the form ``name = value`` whose name is
        listed in ``fields_to_remove``.  The whole value is skipped, however
        many lines it spans: it ends once its braces are balanced and no
        top-level double-quoted string is still open.  If the line that ends
        the field also closes the enclosing entry, the entry's closing
        brace(s) are kept so the entry stays balanced.

        :param bib_content: iterable of lines of a BibTeX file.
        :return: ``self.cleaned_content``, which accumulates across calls.
        """
        unwanted = {name.lower() for name in self.fields_to_remove}
        skipping = False
        depth = 0
        in_quotes = False

        for line in bib_content:
            if skipping:
                fragment = line
            else:
                name, equals, fragment = line.partition("=")
                # Require "name =" so that a wrapped line of a kept field that
                # merely begins with e.g. "language" is not mistaken for one.
                if not equals or name.strip().lower() not in unwanted:
                    self.cleaned_content.append(line)
                    continue
                skipping, depth, in_quotes = True, 0, False

            if not fragment.strip():
                # Nothing of the value on this line; it cannot end here.
                continue

            depth, in_quotes = self._scan_value(fragment, depth, in_quotes)
            if depth > 0 or in_quotes:
                continue

            skipping = False
            if depth < 0:
                # The entry was closed on the same line as the removed field.
                line_ending = line[len(line.rstrip("\r\n")):]
                self.cleaned_content.append("}" * -depth + line_ending)
        return self.cleaned_content

    @staticmethod
    def _scan_value(text, depth, in_quotes):
        """Update brace depth and top-level quote state across ``text``.

        Double quotes only toggle the quote state at brace depth 0, so quotes
        nested in braces (for example an accent written as ``{\\"o}``) are
        ignored.
        """
        for char in text:
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
            elif char == '"' and depth == 0:
                in_quotes = not in_quotes
        return depth, in_quotes