-
Notifications
You must be signed in to change notification settings - Fork 1
/
Open_WebUI.py
370 lines (333 loc) · 14.6 KB
/
Open_WebUI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
# --- Command-line interface and model setup ---------------------------------
import argparse
import torch  # NOTE(review): not referenced directly below; presumably kept so CUDA support is loaded — confirm before removing
parser = argparse.ArgumentParser()
parser.add_argument('--repo-id','-m', default='guillaumekln/faster-whisper-large-v2', help='Path to model repo (default: guillaumekln/faster-whisper-large-v2)')
parser.add_argument('--device','-d', default='cuda', help='Device to use for inference (default: cuda)')
parser.add_argument('--compute-type', default='float16', help='Compute type for inference (default: float16)')
parser.add_argument('--no-translate', action='store_true', help='Disable automatic translation')
# parser.add_argument('--trans-word-ts', action='store_true', help='If set, the program will generate word-level timestamps for translations. It may be unreliable.')
parser.add_argument('--force-overwrite','-f', action='store_true', help='If set, the program will overwrite any existing output files. If not set (default behavior), the program will skip writing to an output file that already exists.')
parser.add_argument('--translate-lang','-t', default=None, help='Translate to another language other than English. This is not an official behavior.')
parser.add_argument('--live', action='store_true', help='Enable live update of the output text')
parser.add_argument('--cache-dir', default=None, help='Directory of the folder to download models. Ex: "models" will make/use a folder named models in the same directory as this program (~\\models\\). The default directory is C:\\Users\\[username]\\.cache\\huggingface\\hub\\')
parser.add_argument("--autolaunch", action='store_true', help="open the webui URL in the system's default browser upon launch", default=False)
args = parser.parse_args()
print(args)
import glob
import os
# Resolve the local folder the model should live in: either a repo-named
# subfolder of the user-supplied cache dir, or a folder named after the repo id.
if args.cache_dir:
    cache_dir = (args.cache_dir+"/"+args.repo_id.split("/")[-1])
else:
    cache_dir = args.repo_id
#os.environ['TRANSFORMERS_CACHE'] = cache_dir
#os.environ['PYTORCH_TRANSFORMERS_CACHE'] = '~/.cache/huggingfaces/'
#print(os.getenv('TRANSFORMERS_CACHE'))
#print(os.path.isdir(cache_dir))
if os.path.isdir(cache_dir):
    # The folder already exists locally: load from it without downloading.
    model_path = cache_dir
else:
    import huggingface_hub
    print("Downloading model...")
    kwargs = {}
    if cache_dir is not None:  # NOTE(review): cache_dir is always a non-empty str at this point, so this branch always runs
        kwargs["local_dir"] = cache_dir
        kwargs["cache_dir"] = cache_dir
        # kwargs["local_dir"] = "C:/AI/MyThings/nimple Speech Recognition/ssd"
        # kwargs["cache_dir"] = "C:/AI/MyThings/nimple Speech Recognition/sfw"
        kwargs["local_dir_use_symlinks"] = False
    # Restrict the snapshot to the files faster-whisper actually loads.
    allow_patterns = ["config.json","model.bin","tokenizer.json","vocabulary.txt",]
    model_path=huggingface_hub.snapshot_download(args.repo_id,allow_patterns=allow_patterns,**kwargs)#tqdm_class=disabled_tqdm,,
print("Loading model...")
from faster_whisper import WhisperModel
# local_files_only=True: everything needed was fetched (or found) above.
model = WhisperModel(model_path, device=args.device, compute_type=args.compute_type, local_files_only=True)
print("Model loaded.")
# Human-readable language names offered in the two Gradio dropdowns.
# The leading empty string means "auto-detect" / "no translation target".
supported_languages = [
    "",
    "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Assamese",
    "Azerbaijani", "Bashkir", "Basque", "Belarusian", "Bengali", "Bosnian",
    "Breton", "Bulgarian", "Burmese", "Castilian", "Catalan", "Chinese",
    "Croatian", "Czech", "Danish", "Dutch", "English", "Estonian", "Faroese",
    "Finnish", "Flemish", "French", "Galician", "Georgian", "German", "Greek",
    "Gujarati", "Haitian", "Haitian Creole", "Hausa", "Hawaiian", "Hebrew",
    "Hindi", "Hungarian", "Icelandic", "Indonesian", "Italian", "Japanese",
    "Javanese", "Kannada", "Kazakh", "Khmer", "Korean", "Lao", "Latin",
    "Latvian", "Letzeburgesch", "Lingala", "Lithuanian", "Luxembourgish",
    "Macedonian", "Malagasy", "Malay", "Malayalam", "Maltese", "Maori",
    "Marathi", "Moldavian", "Moldovan", "Mongolian", "Myanmar", "Nepali",
    "Norwegian", "Nynorsk", "Occitan", "Panjabi", "Pashto", "Persian",
    "Polish", "Portuguese", "Punjabi", "Pushto", "Romanian", "Russian",
    "Sanskrit", "Serbian", "Shona", "Sindhi", "Sinhala", "Sinhalese", "Slovak",
    "Slovenian", "Somali", "Spanish", "Sundanese", "Swahili", "Swedish",
    "Tagalog", "Tajik", "Tamil", "Tatar", "Telugu", "Thai", "Tibetan",
    "Turkish", "Turkmen", "Ukrainian", "Urdu", "Uzbek", "Valencian",
    "Vietnamese", "Welsh", "Yiddish", "Yoruba"
]
# Maps the display names above to the ISO-639-1-style codes Whisper expects.
# Several aliases intentionally share a code (e.g. Flemish/Dutch -> "nl",
# Castilian/Spanish -> "es"); "" maps to "" for the auto-detect choice.
language_codes = {
    "": "",
    "Afrikaans": "af",
    "Albanian": "sq",
    "Amharic": "am",
    "Arabic": "ar",
    "Armenian": "hy",
    "Assamese": "as",
    "Azerbaijani": "az",
    "Bashkir": "ba",
    "Basque": "eu",
    "Belarusian": "be",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Breton": "br",
    "Bulgarian": "bg",
    "Burmese": "my",
    "Castilian": "es",
    "Catalan": "ca",
    "Chinese": "zh",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Estonian": "et",
    "Faroese": "fo",
    "Finnish": "fi",
    "Flemish": "nl",
    "French": "fr",
    "Galician": "gl",
    "Georgian": "ka",
    "German": "de",
    "Greek": "el",
    "Gujarati": "gu",
    "Haitian": "ht",
    "Haitian Creole": "ht",
    "Hausa": "ha",
    "Hawaiian": "haw",
    "Hebrew": "he",
    "Hindi": "hi",
    "Hungarian": "hu",
    "Icelandic": "is",
    "Indonesian": "id",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jv",
    "Kannada": "kn",
    "Kazakh": "kk",
    "Khmer": "km",
    "Korean": "ko",
    "Lao": "lo",
    "Latin": "la",
    "Latvian": "lv",
    "Letzeburgesch": "lb",
    "Lingala": "ln",
    "Lithuanian": "lt",
    "Luxembourgish": "lb",
    "Macedonian": "mk",
    "Malagasy": "mg",
    "Malay": "ms",
    "Malayalam": "ml",
    "Maltese": "mt",
    "Maori": "mi",
    "Marathi": "mr",
    "Moldavian": "mo",
    "Moldovan": "mo",
    "Mongolian": "mn",
    "Myanmar": "my",
    "Nepali": "ne",
    "Norwegian": "no",
    "Nynorsk": "nn",
    "Occitan": "oc",
    "Panjabi": "pa",
    "Pashto": "ps",
    "Persian": "fa",
    "Polish": "pl",
    "Portuguese": "pt",
    "Punjabi": "pa",
    "Pushto": "ps",
    "Romanian": "ro",
    "Russian": "ru",
    "Sanskrit": "sa",
    "Serbian": "sr",
    "Shona": "sn",
    "Sindhi": "sd",
    "Sinhala": "si",
    "Sinhalese": "si",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Somali": "so",
    "Spanish": "es",
    "Sundanese": "su",
    "Swahili": "sw",
    "Swedish": "sv",
    "Tagalog": "tl",
    "Tajik": "tg",
    "Tamil": "ta",
    "Tatar": "tt",
    "Telugu": "te",
    "Thai": "th",
    "Tibetan": "bo",
    "Turkish": "tr",
    "Turkmen": "tk",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Valencian": "ca",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Yiddish": "yi",
    "Yoruba": "yo"
}
#from assgen import gen_subtitles
# Iterate through each audio file provided in the command line arguments
# NOTE(review): the triple-quoted block below is the original CLI batch loop,
# parked as a module-level string so it never executes; kept for reference.
"""
for entry in audio_files:
    for audio_file in glob.glob(entry):
        # Extract the name of the file without its extension
        name = '.'.join(audio_file.split('.')[:-1])
        print('Transcribing '+name)
        if args.language is None:
            # Transcribe the audio using the provided model
            segments, info = model.transcribe(audio_file, beam_size=5, word_timestamps=True)
            # Print the detected language and its probability
            print(f"Detected language '{info.language}' with probability {info.language_probability}")
            language = info.language
        else:
            segments, info = model.transcribe(audio_file, beam_size=5, word_timestamps=True, language=args.language)
            language = args.language
        # Generate subtitles file with the same name as the original audio file and the detected language as the extension
        output_file = f"{name}.{language}.ass"
        if args.force_overwrite or not os.path.exists(output_file):
            gen_subtitles(segments, output_file)
            # If the detected language is not English, transcribe the audio using translation
            if not args.no_translate and (
                (args.translate_lang is None and language != 'en') or
                (args.translate_lang is not None and language != args.translate_lang)
            ):
                if args.translate_lang is not None:
                    segments, info = model.transcribe(audio_file, beam_size=5, language=args.translate_lang, word_timestamps=True) #args.trans_word_ts)
                else:
                    segments, info = model.transcribe(audio_file, beam_size=5, task='translate', word_timestamps=True)
                # output_file = f"{name}.en.translated"
                # Append English translation
                gen_subtitles(segments, output_file, append=True)
            # Print the name of the output subtitle file
            print(f"Subtitles saved to {output_file}")
        else:
            print(f"Skipping {output_file} (file already exists). Pass -f to overwrite existing files.")"""
# Global stop flag: set by the UI's "Stop Stuff" button, cleared when a job starts.
ohNoButton=False
def iterate_segments(segments,output_txtFile,useNewLines,outputFileName):
    """Stream transcription segments, yielding the accumulated text after each.

    Parameters:
      segments       -- iterable of segment objects exposing a ``.text`` str
                        (as produced by faster-whisper's ``transcribe``)
      output_txtFile -- when truthy, append each formatted segment to
                        ``outputFileName``
      useNewLines    -- 0: newline after each sentence (., !, ?),
                        1: newline after each segment,
                        anything else: no newlines added
      outputFileName -- path of the text file to append to (only used when
                        ``output_txtFile`` is truthy)

    Yields the whole text accumulated so far, enabling live UI updates.
    Stops early when the global ``ohNoButton`` flag is set.

    Fix over the original: empty segment texts no longer raise IndexError
    (``text[0]`` / ``text[-1]`` replaced with startswith/endswith, which are
    safe on "").
    """
    whole_text = ""
    prev_newline = True  # True when the emitted text currently ends a sentence/line
    for segment in segments:
        if ohNoButton:
            break  # user pressed "Stop Stuff"
        text = segment.text
        if useNewLines == 0:
            # Per-sentence newlines: break after sentence-ending punctuation.
            segment_text = text.replace(". ", ".\n").replace("! ", "!\n").replace("? ", "?\n")
            # Whisper segments usually start with a space; drop it at line starts.
            if text.startswith(" ") and prev_newline:
                segment_text = segment_text.replace(" ", "", 1)
            prev_newline = False
            if text.endswith((".", "!", "?")):
                segment_text = segment_text + "\n"
                prev_newline = True
        elif useNewLines == 1:
            # Per-segment newlines; drop the leading space Whisper adds.
            segment_text = text + "\n"
            if text.startswith(" "):
                segment_text = segment_text.replace(" ", "", 1)
        else:
            # No reformatting: emit the segment text verbatim.
            segment_text = text
        if output_txtFile:
            # Re-open in append mode per segment so partial output survives a stop.
            with open(outputFileName, 'a', encoding="utf-8") as f:
                f.write(segment_text)
        print(segment_text)
        whole_text = f"{whole_text}{segment_text}"
        yield whole_text
    print("__________DONE__________")
    return whole_text
def generate_subtitles(audio_files, translate_lang='', language='', output_txtFile=True, useNewLines=2 , translate=False, beam_size=5): #word_ts=True, attach=False,
    """Transcribe (and optionally translate) an uploaded file, yielding live text.

    Generator wired to the Gradio "Do Stuff" button: every yielded string is
    the full text accumulated so far, so the output textbox updates live.

    Parameters mirror the UI controls:
      audio_files    -- Gradio file object; ``.name`` is the temp-file path
      translate_lang -- target language display name ('' = translate to English)
      language       -- forced source language display name ('' = auto-detect)
      output_txtFile -- also write <name>_transcript.txt / <name>_translate.txt
      useNewLines    -- 0 per sentence, 1 per segment, otherwise no newlines
      translate      -- run a second, translating pass when source != target
      beam_size      -- beam size forwarded to WhisperModel.transcribe
    """
    #clear txt files
    global ohNoButton
    if ohNoButton:
        # Reset a leftover stop request from a previous run.
        ohNoButton=False
    #print(audio_files.orig_name)
    #print(audio_files.name)
    if output_txtFile:
        # Base output name: the uploaded file's stem (path and extension stripped).
        outputFileName = (audio_files.name.split('.')[-2].split('\\')[-1])
        # Truncate any previous transcript before appending segment by segment.
        open(f'{outputFileName}_transcript.txt','w').close()
        if translate and ((translate_lang == '' and language != 'en') or (translate_lang != '' and language != translate_lang)):
            open(f'{outputFileName}_translate.txt','w').close()
    else:
        outputFileName=None
    #outputFileName=audio_files.split
    audio_files_path=audio_files.name.replace("\\","/")
    wholeText=""
    #print(audio_files)
    #print(audio_files_path)
    for audio_file in glob.glob(audio_files_path):
        name = '.'.join(audio_file.split('.')[:-1])
        print('Transcribing '+name)
        if language == '':
            # Transcribe the audio using the provided model
            segments, info = model.transcribe(audio_file, beam_size=beam_size, word_timestamps=True)
            # Print the detected language and its probability
            print(f"Detected language '{info.language}' with probability {info.language_probability}")
            # NOTE(review): this stores a language *code* (e.g. 'en'), but the
            # else-branch below looks ``language`` up in language_codes, which is
            # keyed by display *names* — if the glob matches a second file this
            # likely raises KeyError. TODO confirm and fix.
            language = info.language
        else:
            segments, info = model.transcribe(audio_file, beam_size=beam_size, word_timestamps=True, language=language_codes[language])
            language = language  # NOTE(review): self-assignment, no effect
        # Stream the transcript; each partial result is forwarded to the UI.
        for wholeText_n in iterate_segments(segments,output_txtFile,useNewLines,outputFileName=f'{outputFileName}_transcript.txt'):
            yield wholeText_n
        # Second pass: translate unless the source already matches the target.
        if translate and (
            (translate_lang == '' and language != 'en') or
            (translate_lang != '' and language != translate_lang)):
            if translate_lang != '':
                # Unofficial trick: forcing the output language instead of task='translate'.
                print("______________BEGINNING TRANSLATION______________")
                segments, info = model.transcribe(audio_file, beam_size=beam_size, language=language_codes[translate_lang], word_timestamps=True) #args.trans_word_ts)
            else:
                segments, info = model.transcribe(audio_file, beam_size=beam_size, task='translate', word_timestamps=True)
            for wholeText_n in iterate_segments(segments,output_txtFile,useNewLines,outputFileName=f'{outputFileName}_translate.txt'):
                yield wholeText_n
    # Clear any stop request so the next run starts fresh.
    ohNoButton=False
    return wholeText
#generate_subtitles(args.audio_files, args.translate_lang)
def ohNo():
    """Request a stop: raise the global flag that iterate_segments polls.

    Bound to the UI's "Stop Stuff" button; the running transcription loop
    notices the flag at its next segment and breaks out.
    """
    global ohNoButton
    print("Stopping stuff")
    ohNoButton = True
def restartRecompile():
    """Replace the current process with a fresh run of this script.

    Bound to the UI's "Restart Program" button. Drops --autolaunch from the
    re-exec'd argument list so the restart does not open another browser tab.
    """
    import sys
    if '--autolaunch' in sys.argv:
        sys.argv.remove('--autolaunch')
    # NOTE(review): the quotes wrapped around __file__ become literal characters
    # of the argv entry on POSIX (execl does not go through a shell); presumably
    # intended for Windows paths containing spaces — confirm on both platforms.
    os.execl(sys.executable, 'python', '"'+__file__+'"', *sys.argv[1:])
# --- Gradio web UI ----------------------------------------------------------
import gradio as gr
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # Left column: input controls, mirroring generate_subtitles' params.
            audio_files = gr.File(label="Audio or video files to transcribe",type="filepath")#,file_types=['audio', 'video', '.flv'])
            translate_lang = gr.Dropdown(choices=supported_languages,label="Translate to language",allow_custom_value=True,value='')
            language = gr.Dropdown(choices=supported_languages, label="Force language",allow_custom_value=True,value='')
            output_txtFile = gr.Checkbox(label="Write output to text file", value=True)
            # type="index" makes the radio pass 0/1/2 — matching useNewLines' encoding.
            useNewLines = gr.Radio(["Per sentence", "Per computed segment", "Don\'t put Newlines"], label="Where to put new lines", info="Applies to both .txt file output (if enabled) and text box output", type="index", value="Don\'t put Newlines")
            translate = gr.Checkbox(label="Automatic translation", value=False)
            #word_ts = gr.Checkbox(label="Word-level timestamps", value=True)
            #attach = gr.Checkbox(label="Produce a video with embedded subtitles")
            beam_size = gr.Number(label="Beam size", value=5, precision=0)
            button = gr.Button("Do Stuff")
            stopButton = gr.Button("Stop Stuff",variant="stop")
            restartButton = gr.Button("Restart Program",variant="primary")
        with gr.Column():
            # Right column: live transcript output.
            output_text = gr.Textbox(label='Output subtitle files',max_lines=30,interactive=True)
    # generate_subtitles is a generator, so the textbox streams partial results.
    button.click(generate_subtitles,
        inputs=[
            audio_files,
            translate_lang,
            language,
            output_txtFile,
            useNewLines,
            translate,
            #word_ts,
            #attach,
            beam_size
        ], outputs=output_text)
    # queue=False so stop/restart fire immediately instead of waiting in the queue.
    stopButton.click(ohNo, None, None, queue=False)
    restartButton.click(restartRecompile, None, None, queue=False)
demo.queue(max_size=30)
demo.launch(inbrowser=args.autolaunch, show_error=True, share=False)