-
Notifications
You must be signed in to change notification settings - Fork 0
/
Google_dataset_translation.py
147 lines (110 loc) · 6.06 KB
/
Google_dataset_translation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
from translation_class import read_lines, read_file_content, Translation, TranslationH,get_source_identifiers, translate_text_google, separate_sentences, is_sentence_to_translate
import json
import shutil
import traceback
#from translation_helsinki import translate_keyword,translate_text_original
#InputPath= 'datasets/Inspec/'
#OutputPath = 'datasets/doc_translations/Helsinki/Inspec/' #'datasets/translation_test/trans'
# --- Batch configuration -----------------------------------------------------
# Input/output folders for this run; the commented alternatives are earlier
# corpora (Inspec / SemEval2010) processed by previous runs.
InputPath = 'datasets/doc_translations/errors_test'
#InputPath= 'datasets/annotated/SemEval2010'
#OutputPath = 'datasets/doc_translations/SemEval2010_GTranslate_Annotated' #'datasets/translation_test/trans'
OutputPath = 'datasets/doc_translations/errors_translation'

# Documents whose identifier already appears in OutputPath are skipped by the
# main loop below.
sourcedocs = os.listdir(InputPath)
targetdocs = os.listdir(OutputPath)
source_identifiers = get_source_identifiers(sourcedocs)
target_identifiers = get_source_identifiers(targetdocs)  # for filtering already-translated docs

# FIX: the original did `fatal_errors = open(...)` and never closed the handle;
# the name is later shadowed by `fatal_errors = []`, so the descriptor leaked
# for the whole run. Read the lines eagerly inside a `with` block instead —
# iterating the resulting list yields the same newline-terminated lines the
# file object would have.
with open('datasets/doc_translations/fatal_errors') as _fatal_errors_file:
    fatal_errors = _fatal_errors_file.readlines()
error_folder = 'datasets/doc_translations/errors_semeval'
'''
for error in fatal_errors:
new_error=error.rstrip("\n")
if new_error in target_identifiers:
output_file= OutputPath + '/' + new_error+'.json'
error_file= error_folder + '/' + new_error+'.json'
shutil.move(output_file, error_file)
else:
continue
'''
# Accumulators for this run: documents whose translation failed entirely, and
# (identifier, sentence) pairs skipped because the sentence was too short.
fatal_errors = []
sentence_errors = []

# Translate the 'original_text' of every not-yet-processed source document,
# sentence by sentence, and store the result under 'original_translation' in a
# copy of the source JSON placed in OutputPath.
for identifier in source_identifiers:
    if identifier in target_identifiers:
        continue  # already translated in a previous run — skip it
    try:
        print(identifier)
        translation = Translation()
        file_path = InputPath + '/' + identifier + '.json'
        output_file = OutputPath + '/' + identifier + '.json'
        # Copy first so the output document keeps every field of the source;
        # NOTE(review): if translation then fails, the copy remains and the
        # identifier will look "done" on the next run — confirm this is the
        # intended error-triage workflow (errors are moved out separately).
        shutil.copy(file_path, output_file)
        with open(file_path, "r") as json_file:
            source_data = json.load(json_file)
        if 'original_text' in source_data:
            text = source_data['original_text']
            translated_sentences = []
            sentences = separate_sentences(text)
            for sentence in sentences:
                # Very short fragments (< 30 chars) are treated as
                # segmentation errors and recorded instead of translated.
                if len(sentence) >= 30:
                    print(sentence)
                    tr_sentence = translate_text_google(sentence, src_lang='en', dest_lang='es')
                    translated_sentences.append(tr_sentence)
                else:
                    sentence_errors.append((identifier, sentence))
            translation.original_translation = ' '.join(translated_sentences)
            # Read the copied output back, add the translation, rewrite it.
            with open(output_file, "r", encoding="utf-8") as out_json_file:
                output_data = json.load(out_json_file)
            output_data['original_translation'] = translation.original_translation
            with open(output_file, "w", encoding="utf-8") as out_json_file:
                json.dump(output_data, out_json_file, ensure_ascii=False, indent=4)
    except Exception as e:
        # Broad catch is deliberate: one bad document must not abort the batch.
        print("FATAL ERROR IN " + str(identifier))
        fatal_errors.append(identifier)
        print(e)
        print(traceback.format_exc())
        #break
# NOTE(review): dead code below. The triple-quoted block is a disabled draft
# that translated each keyword's annotated sentences (stopping after 10
# <br>-marked sentences per key) and appended the results to
# output_data['keys'][key]['translated_annotated_samples']. It references
# source_data / output_data from the loop above and would need indentation
# repair before re-enabling. Kept verbatim as a string literal.
'''
if 'keys' in source_data:
for key in source_data['keys']:
translated_br=[]
source_br=[]
counter=0
if 'original_annotated_sentences' in source_data['keys'][key]:
print('yes')
for sentence in source_data['keys'][key]['original_annotated_sentences']:
br=is_sentence_to_translate(sentence)
if br==True:
source_br.append(br)
counter+=1
#print("true")
if counter<=10:
replaced_sentence=sentence.replace("<br>", "\"")
replaced_sentence2=replaced_sentence.replace("</br>", "\"")
new_sentence=replaced_sentence2 + " " + key
tr_sentence=translate_text_google(new_sentence, src_lang='en', dest_lang='es')
translated_br.append(tr_sentence)
if counter>=10:
print('COUNTER 10')
break
# with open(output_file, "r") as out_json_file:
# output_data = json.load(out_json_file)
for b in translated_br:
output_data['keys'][key]['translated_annotated_samples'].append(b)
with open(output_file, 'w', encoding='utf-8') as out_json_file:
json.dump(output_data, out_json_file, ensure_ascii=False, indent=4)
print('--------------FINISH---------------')
'''
# the object
#translation = TranslationH(identifier,textdoc, textkeys)
## annotation and first translation
#translation.generate_annotated_sentences()
# translation.translated_text_sentences= translate_text_original(translation.original_text_sentences)
# translation.translated_text = " ".join(translation.translated_text_sentences)
# for key in translation.keys:
# #tr = translate_text_google(annotated, src_lang='en', dest_lang='es')
# tr= translate_keyword(key,translation.translated_text_sentences)
# #translation.translated_annotated_text.append(tr)
# translation.compare_annotated_keywords()
#translation.write_json(OutputPath)
#print("ERRORS:",translation.error_count)
#break