-
Notifications
You must be signed in to change notification settings - Fork 0
/
postprocessing.py
161 lines (130 loc) · 6.63 KB
/
postprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 27 14:55:27 2024
@author: pmchozas
"""
import json
import os
from translation_class import read_lines, read_file_content, Translation, TranslationH,get_source_identifiers, separate_sentences, is_sentence_to_translate, extract_quoted_terms, find_last_term_and_remove, translate_text_google, detect_different_translations, check_repetition_percentage
import shutil
import traceback
#SourcePath = 'datasets/doc_translations/SemEval2010_GTranslate_Annotated'
SourcePath = 'datasets/doc_translations/errors_translation/'
#OutPath = 'datasets/doc_translations/SemEval2017_GTranslateReviewedAuto/'
OutPath='datasets/doc_translations/SemEval2010_GTranslate_Postprocessed'
sourcedocs = os.listdir(SourcePath)
targetdocs= os.listdir(OutPath)
source_identifiers = get_source_identifiers(sourcedocs)
target_identifiers = get_source_identifiers(targetdocs)
error_folder='datasets/doc_translations/errors_postprocess'
fatal_errors= open('datasets/doc_translations/fatal_errors_post')
'''
#SI ES UN ARCHIVO FATAL ERROR LO MUEVES DE LA CARPETA TARGET A LA DE ERRORS
for error in fatal_errors:
new_error=error.rstrip("\n")
if new_error in target_identifiers:
output_file= OutPath + '/' + new_error+'.json'
error_file= error_folder + '/' + new_error+'.json'
shutil.move(output_file, error_file)
else:
continue
'''
#POSTPROCESSING SEMEVAL2010
translation = Translation()
fatal_errors = []
key_errors=[]
for identifier in source_identifiers:
if identifier in target_identifiers:
continue # si está en los ya traducidos pasamos
try:
print(identifier)
source_file= SourcePath + '/' + identifier+'.json'
output_file= OutPath + '/' + identifier+'.json'
shutil.copy(source_file, output_file)
with open(source_file, 'r', encoding='utf-8') as file:
data=json.load(file)
for key in data['keys']:
print(key)
translated_terms=[]
modified_translated_sentences=[]
if 'translated_annotated_samples' in data['keys'][key] and data['keys'][key]['translated_annotated_samples'] is not None and data['keys'][key]['translated_annotated_samples'] !=[] :
for translated in data['keys'][key]['translated_annotated_samples']:
res=find_last_term_and_remove(translated)
if res:
print(res)
term=res[0]
modified_translated_sentences.append(res[1])
if term != None:
translated_terms.append(term.strip())
else:
id_key =(identifier, key)
key_errors.append(id_key)
continue
print('esto es term')
print(term)
terms=extract_quoted_terms(translated)
for t in terms:
translated_terms.append(t)
print('Todas las candidates')
print(translated_terms)
if detect_different_translations(translated_terms):
translated_key=translated_terms[0]
print('esto es translated key '+ translated_key)
else:
translated_key=""
with open(output_file, "r", encoding="utf-8") as out_json_file:
output_data = json.load(out_json_file)
output_data['keys'][key]['translated_key']=translated_key
output_data['keys'][key]['candidates']=translated_terms
output_data['keys'][key]['translated_annotated_samples']=modified_translated_sentences
with open(output_file, "w", encoding="utf-8") as out_json_file:
json.dump(output_data, out_json_file, ensure_ascii=False, indent=4)
else:
print(key + ' ---- NO CANDIDATES, esto es translated keyword sin contexto')
trans_key=translate_text_google(key, src_lang='en', dest_lang='es')
print(trans_key)
with open(output_file, "r", encoding="utf-8") as out_json_file:
output_data = json.load(out_json_file)
output_data['keys'][key]['translated_key']=trans_key
with open(output_file, "w", encoding="utf-8") as out_json_file:
json.dump(output_data, out_json_file, ensure_ascii=False, indent=4)
except Exception as e:
print("FATAL ERROR IN "+ str(identifier))
fatal_errors.append((identifier))
print(e)
print(traceback.format_exc())
'''
#PARA AUTOMATICAMENTE ASIGNAR UNA TRADUCCIÓN DADA SU FRECUENCIA EN LAS CANDIDATES
for identifier in source_identifiers:
if identifier in target_identifiers:
continue
try:
print(identifier)
source_file= SourcePath + '/' + identifier+'.json'
output_file= OutPath + '/' + identifier+'.json'
shutil.copy(source_file, output_file)
with open(source_file, 'r', encoding='utf-8') as file:
data=json.load(file)
for key in data['keys']:
translation=data['keys'][key]['translated_key']
if isinstance(translation, list):
print(key)
print(translation)
result=check_repetition_percentage(translation)
for term, percent in result.items():
print(f"{term}: {percent:.2f}%")
if percent >= 80:
print('GANA')
print(term, percent)
with open(output_file, "r", encoding="utf-8") as out_json_file:
output_data = json.load(out_json_file)
output_data['keys'][key]['translated_key']=term
with open(output_file, "w", encoding="utf-8") as out_json_file:
json.dump(output_data, out_json_file, ensure_ascii=False, indent=4)
except Exception as e:
print("FATAL ERROR IN "+ str(identifier))
fatal_errors.append((identifier))
print(e)
print(traceback.format_exc())
'''