-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
264 lines (224 loc) · 11.1 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import csv
import fitz
import unicodedata, re
import os
import nltk
from nltk.corpus import words
# nltk.download('words')
def highlight_punctuation_errors(input_file:str, output_filename_end:str, summary_filename:str, pages:list=None, skip_chars:str="", skip_japanese:bool=False, skip_hyphens:bool=False):
    """Check every selected page of a PDF and save an annotated copy.

    Args:
        input_file: Path of the PDF to open with ``fitz.open``.
        output_filename_end: Suffix handed to ``save_output_file`` for the
            name of the annotated copy.
        summary_filename: Name for the CSV summary. Currently unused because
            the summary export is disabled (see the note near the end).
        pages: Optional collection of 1-based page numbers to check. When
            empty or ``None`` every page is checked.
        skip_chars: Characters that must never be reported.
        skip_japanese: Forwarded to ``check_punctuation_errors``.
        skip_hyphens: Forwarded to ``check_punctuation_errors``.
    """
    annotation_author = "PunctChecker"
    ignored_chars = set(skip_chars or ())
    summary_rows = []
    document = fitz.open(input_file)

    # Page numbers are 1-based so they line up with what the caller passes
    # in ``pages``.
    for page_number, page in enumerate(document, start=1):
        if pages and page_number not in pages:
            continue
        page_text = page.get_text("text")
        findings = check_punctuation_errors(page_text, summary_rows, ignored_chars, skip_japanese, skip_hyphens)
        highlight_errors(findings, page_text, page, annotation_author, summary_rows)

    # NOTE: the CSV summary export is switched off for now:
    # export_summary(summary_rows, summary_filename)
    save_output_file(input_file, document, output_filename_end)
# Lower-cased NLTK vocabulary, built lazily by _english_words().
_ENGLISH_WORDS = None


def _english_words():
    """Return the lower-cased NLTK word list, building it on first use only."""
    global _ENGLISH_WORDS
    if _ENGLISH_WORDS is None:
        # Lower-case every entry: the parts are lower-cased before lookup, so
        # capitalised corpus entries could otherwise never match.
        _ENGLISH_WORDS = frozenset(entry.lower() for entry in words.words())
    return _ENGLISH_WORDS


def check_hyphenation_errors(text):
    """Report hyphenated words where a part is not a known English word.

    A hyphenated word is a run of ASCII letters, one hyphen, and another run
    of ASCII letters. It is reported when any of its parts is missing from
    the NLTK ``words`` corpus (compared case-insensitively).

    Args:
        text: The text to scan.

    Returns:
        A set of ``(hyphenated_word, description)`` tuples.
    """
    hyphenated_words = re.findall(r'\b[a-zA-Z]+-[a-zA-Z]+\b', text)
    if not hyphenated_words:
        # Nothing to look up, so do not load the corpus at all.
        return set()

    # The vocabulary is cached: rebuilding it for every page is wasted work.
    vocabulary = _english_words()
    flagged = set()
    for compound in hyphenated_words:
        if any(part.lower() not in vocabulary for part in compound.split('-')):
            flagged.add((compound, '不適切なハイフネーション'))
    return flagged
def check_full_width_chars(text, skip_chars, skip_japanese):
    """Collect characters whose East Asian width suggests a full-width glyph.

    Args:
        text: The text to scan, one character at a time.
        skip_chars: Extra characters to ignore, passed to
            ``check_excluded_chars``.
        skip_japanese: Passed to ``check_excluded_chars``.

    Returns:
        A set of ``(character, description)`` tuples. The description depends
        on the character's ``unicodedata.east_asian_width`` category.
    """
    fullwidth_ascii = re.compile("[\uFF01-\uFF5E]+")
    # Keyed by East Asian width category.
    notes_by_width = {
        'W': '全角を半角に',  # Wide
        'F': '半角に',  # Full-width
        'A': '半角か確認',  # Ambiguous
    }
    ignored = check_excluded_chars(skip_chars, skip_japanese)

    flagged = set()
    for symbol in text:
        if symbol in ignored:
            continue
        width = unicodedata.east_asian_width(symbol)
        if width not in notes_by_width and not fullwidth_ascii.search(symbol):
            continue
        # The fallback text covers a regex hit whose category is not W/F/A.
        flagged.add((symbol, notes_by_width.get(width, 'エラー不明')))
    return flagged
def check_excluded_chars(skip_chars:set, skip_japanese:bool=False):
    """Build the set of characters that must never be reported.

    Args:
        skip_chars: Caller-supplied characters to ignore.
        skip_japanese: When true, also ignore the code point ranges used
            here for hiragana, katakana and kanji.

    Returns:
        A new set holding the always-ignored quote/dash characters, the
        caller's characters and, optionally, the Japanese ranges.
    """
    always_ignored = (
        '\u0022',  # Half-width double quote mark (")
        '\u0027',  # Half-width single quote mark / apostrophe (')
        '\u2018',  # Left single quotation mark
        '\u2019',  # Right single quotation mark
        '\u201C',  # Left double quotation mark
        '\u201D',  # Right double quotation mark
        '\u2014',  # Em dash
    )
    exempt = set(always_ignored)
    exempt.update(skip_chars)

    if skip_japanese:
        # Each pair is (first code point, stop code point); stop is exclusive.
        japanese_ranges = (
            (0x3040, 0x30A0),  # Hiragana
            (0x30A0, 0x3100),  # Katakana
            (0x4E00, 0x9FB0),  # Kanji
        )
        for first, stop in japanese_ranges:
            exempt.update(map(chr, range(first, stop)))
    return exempt
def check_punctuation_patterns(text):
    """Find punctuation problems that can be expressed as regex rules.

    The rules are tried in the order listed at every position, so an earlier
    rule wins when two could match at the same place.

    Args:
        text: The text to scan.

    Returns:
        A set of ``(matched_text, description)`` tuples.
    """
    # (group name, sub-pattern, description shown to the reviewer)
    rules = (
        # Straight quotes
        ("straight_quotes", r"['\"]", 'オタマジャクシ型に'),
        # Space before and after punctuation
        ("space_around_punct", r"\s[.,;:?!'\[\]{}()“”‘’%$¥—-]\s", '記号の前後のスペースを詰める'),
        # Space before a closing quotation mark that is followed by a character
        ("space_before_closing_quote", r"\s[’”](?=[a-zA-Z0-9])", '開始のクォーテーションマークに変更'),
        # Same punctuation mark twice in a row
        ("repeated_punct", r"(?:(?P<punct>[.,;:?!'\[\]{}()“”‘’&%$¥—-]))(?P=punct)", 'マーク重複'),
        # Yen sign and the word "yen" used together
        ("yen_symbol_and_word", r"¥[\w.,]+\syen", '¥ かyen か選択'),
        # Year abbreviation written with an opening single quote
        ("incorrect_year_abbr", r"‘\d{2}", 'シングルクォーテーションの向きが逆'),
        # Apostrophe in a decade
        ("apostrophe_in_decade", r"\b\d{2,4}'s\b", 'クォーテーション不要'),
        # No space after sentence punctuation between letters
        ("missing_space_after_sent", r"[a-zA-Z]([.?!])[a-zA-Z]", '文章間の句読点の後にスペースがない'),
    )
    combined = re.compile("|".join(f"(?P<{name}>{body})" for name, body, _ in rules))
    notes = {name: note for name, _, note in rules}

    # ``lastgroup`` names the rule whose outer group matched.
    return {
        (hit.group(), notes.get(hit.lastgroup, 'エラー不明'))
        for hit in combined.finditer(text)
    }
def check_incomplete_pairs(text):
    """Report brackets and double quotes that have no matching partner.

    Opening marks are kept on a stack. A closing mark is accepted only when
    it matches the mark on top of the stack; otherwise it is reported and the
    stack is left as it was. Opening marks still on the stack at the end are
    reported too.

    Args:
        text: The text to scan.

    Returns:
        A set of ``(snippet, description)`` tuples, where ``snippet`` starts
        at the unmatched mark and runs a few characters past it.
    """
    closer_for = {
        '(': ')',
        '[': ']',
        '{': '}',
        '“': '”',
    }
    opener_for = {closer: opener for opener, closer in closer_for.items()}
    message = '対応するマークがない。要確認'
    snippet_len = 4  # characters taken from the text for each report

    open_marks = []  # (opening mark, index) pairs not yet closed
    problems = set()
    for index, mark in enumerate(text):
        if mark in closer_for:
            open_marks.append((mark, index))
        elif mark in opener_for:
            if open_marks and open_marks[-1][0] == opener_for[mark]:
                open_marks.pop()
            else:
                # Unmatched closer: the mark itself plus snippet_len more
                # characters, clipped to the end of the text.
                stop = min(index + 1 + snippet_len, len(text))
                problems.add((text[index:stop], message))

    # Whatever is still open never found its closing mark.
    for _, start in reversed(open_marks):
        stop = min(start + snippet_len, len(text))
        problems.add((text[start:stop], message))
    return problems
def check_punctuation_errors(text, summary, skip_chars, skip_japanese=False, skip_hyphens=False):
    """Run every checker on ``text`` and record the findings in ``summary``.

    Args:
        text: The text to check.
        summary: List of summary entries, updated in place through
            ``update_summary``.
        skip_chars: Characters to ignore in the full-width check.
        skip_japanese: Passed to the full-width check.
        skip_hyphens: When true, the hyphenation check is not run.

    Returns:
        A list of ``[snippet, description]`` pairs, one per distinct finding.
    """
    findings = (
        check_full_width_chars(text, skip_chars, skip_japanese)
        | check_punctuation_patterns(text)
        | check_incomplete_pairs(text)
    )
    if not skip_hyphens:
        findings |= check_hyphenation_errors(text)

    report = []
    for snippet, note in findings:
        report.append([snippet, note])
        update_summary(summary, snippet, note)
    return report
def update_summary(summary:list, char, description, count=1):
    """Add ``count`` to the matching summary entry, creating it if needed.

    Args:
        summary: List of dicts with ``char``, ``count`` and ``description``
            keys; modified in place.
        char: The reported text.
        description: The description attached to the report.
        count: Amount to add to the entry's counter.
    """
    for row in summary:
        if row['char'] == char and row['description'] == description:
            row['count'] += count
            return
    # No entry for this (char, description) pair yet.
    summary.append({'char': char, 'count': count, 'description': description})
def get_positions(target_chars, text, page, page_highlights):
    """Locate each reported snippet on the page and record its rectangles.

    Args:
        target_chars: Iterable of ``(snippet, description)`` pairs.
        text: The page text the snippets were found in.
        page: Page object providing ``search_for``.
        page_highlights: Dict updated in place through ``handle_matches``.
    """
    for char, description in target_chars:
        # Only search for snippets that occur in the extracted text.
        if char not in text:
            continue
        # One search per snippet is enough. Searching again for every
        # occurrence returns the same rectangles, which handle_matches
        # discards as duplicates anyway.
        matches = page.search_for(char)
        if matches:
            handle_matches(matches, char, description, page_highlights)
def handle_matches(matches, char, description, page_highlights):
    """Store new match rectangles for ``char`` in ``page_highlights``.

    Args:
        matches: Rectangles found for ``char``.
        char: The reported text; used as the dictionary key.
        description: Stored with the entry when it is first created.
        page_highlights: Dict mapping each reported text to a dict with
            ``matches`` and ``description`` keys; modified in place.
    """
    for candidate in matches:
        entry = page_highlights.get(char)
        if entry is None:
            page_highlights[char] = {"matches": [candidate], "description": description}
            continue
        known = entry["matches"]
        # Skip rectangles that coincide with one already stored (each
        # coordinate within 1 unit).
        if any(rects_are_equal(candidate, seen, threshold=1) for seen in known):
            continue
        known.append(candidate)
def rects_are_equal(rect1, rect2, threshold=1e-6):
    """Return True when the first four coordinates differ by less than ``threshold``.

    Args:
        rect1: Indexable object with at least four numeric entries.
        rect2: Indexable object with at least four numeric entries.
        threshold: Exclusive upper bound for each coordinate difference.
    """
    for position in range(4):
        gap = abs(rect1[position] - rect2[position])
        if not gap < threshold:
            return False
    return True
def add_highlights(page_highlights:dict, page, comment_name, error_summary:list):
    """Add a highlight annotation with a comment for every stored rectangle.

    Rectangles that fail ``rect_is_valid`` are not highlighted; they are
    noted in ``error_summary`` instead.

    Args:
        page_highlights: Dict mapping each reported text to a dict with
            ``matches`` (rectangles) and ``description`` keys.
        page: Page object the annotations are added to.
        comment_name: Value written to the annotation's ``title`` field.
        error_summary: Summary list updated through ``update_summary``.
    """
    for char, details in page_highlights.items():
        description = details["description"]
        for rect in details["matches"]:
            if rect_is_valid(rect):
                highlight = page.add_highlight_annot(rect)
                metadata = highlight.info
                metadata["title"] = comment_name
                metadata["content"] = f"「{char}」{description}"  # PDF comment template
                highlight.set_info(metadata)
                highlight.update()
            else:
                update_summary(error_summary, char, f"{description}, error on page {page.number+1} NOT highlighted!", count=1)
def rect_is_valid(rect):
    """Return True when the rectangle has positive width and height.

    Args:
        rect: Iterable of exactly four coordinates ``(x1, y1, x2, y2)``.
    """
    left, top, right, bottom = rect
    if not left < right:
        return False
    return top < bottom
def highlight_errors(target_char, text, page, comment_name, error_summary):
    """Find the reported snippets on the page, then annotate them.

    Args:
        target_char: ``(snippet, description)`` pairs to look for.
        text: The page text the snippets came from.
        page: Page object to search and annotate.
        comment_name: Title given to every annotation.
        error_summary: Summary list passed on to ``add_highlights``.
    """
    rects_by_snippet = {}
    get_positions(target_char, text, page, rects_by_snippet)
    add_highlights(rects_by_snippet, page, comment_name, error_summary)
def export_summary(error_summary:list, summary_filename):
    """Write the summary entries to ``input_files/<summary_filename>.csv``.

    Args:
        error_summary: List of dicts with ``char``, ``count`` and
            ``description`` keys.
        summary_filename: File name without directory or extension.
    """
    header = ['Character', 'Count', 'Description']
    with open(f"input_files/{summary_filename}.csv", mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        writer.writerows(
            [row['char'], row['count'], row['description']]
            for row in error_summary
        )
def save_output_file(input_file, input_pdf, output_filename_end):
    """Save the document next to its source, then close it.

    The output name is the input path without its extension, followed by a
    space, ``output_filename_end`` and ``.pdf``.

    Args:
        input_file: Path of the source PDF.
        input_pdf: Open document object providing ``save`` and ``close``.
        output_filename_end: Suffix appended to the source file's stem.
    """
    # Strip only the final extension. Splitting on the first "." would
    # truncate paths such as "./dir/file.pdf" or "report.v2.pdf".
    stem, _extension = os.path.splitext(input_file)
    output_file_name = f"{stem} {output_filename_end}.pdf"
    try:
        input_pdf.save(output_file_name, garbage=3, deflate=True)
    finally:
        # Release the document even when saving fails.
        input_pdf.close()
def process_directory(dir_name:str, output_filename_end:str, pages:list=None, skip_chars:str="", skip_japanese:bool=False, skip_hyphens:bool=False):
    """Check every PDF in a directory that has not been processed yet.

    A file counts as processed when it is itself an output file (its name
    ends with ``output_filename_end`` + ``.pdf``) or when such an output
    file exists for it. Every other entry in the directory is skipped with
    a printed notice.

    Args:
        dir_name: Directory to scan.
        output_filename_end: Suffix that marks output files.
        pages: Forwarded to ``highlight_punctuation_errors``.
        skip_chars: Forwarded to ``highlight_punctuation_errors``.
        skip_japanese: Forwarded to ``highlight_punctuation_errors``.
        skip_hyphens: Forwarded to ``highlight_punctuation_errors``.
    """
    entries = os.listdir(dir_name)
    output_suffix = output_filename_end + ".pdf"

    # Collect output files together with the source names they came from.
    already_done = set()
    for entry in entries:
        if not entry.endswith(output_suffix):
            continue
        already_done.add(entry)
        already_done.add(entry.replace(f" {output_filename_end}.pdf", ".pdf"))

    for entry in entries:
        if not entry.endswith(".pdf") or entry in already_done:
            print(f'Skipping {entry}')
            continue
        source_path = os.path.join(dir_name, entry)
        summary_filename = entry[:-4] + " error_summary"
        highlight_punctuation_errors(source_path, output_filename_end, summary_filename, pages, skip_chars, skip_japanese, skip_hyphens)
if __name__ == '__main__':
    # Folder scanned for PDFs. A Google Drive folder was used as an alternative:
    # "/Users/daryl-villalobos/Google Drive/My Drive/Punct Checker Magic Box"
    dir_name = "input_files"
    # Suffix appended to the name of every annotated copy.
    output_filename_end = "punct_checker"
    process_directory(
        dir_name,
        output_filename_end,
        skip_chars="•",
        skip_japanese=True,
        skip_hyphens=False,
    )