From f3f351fed310e56592f5621eb3a716d93575b89e Mon Sep 17 00:00:00 2001 From: Robert M Ochshorn Date: Fri, 27 May 2016 00:58:50 +0200 Subject: [PATCH] remove sharded_transcribe and simplify POST requirements --- examples/sharded_transcribe.py | 60 ---------------------------------- serve.py | 10 ++---- 2 files changed, 3 insertions(+), 67 deletions(-) delete mode 100644 examples/sharded_transcribe.py diff --git a/examples/sharded_transcribe.py b/examples/sharded_transcribe.py deleted file mode 100644 index 453defe1..00000000 --- a/examples/sharded_transcribe.py +++ /dev/null @@ -1,60 +0,0 @@ -import gentle.standard_kaldi as standard_kaldi -import numm3 -from Queue import Queue -from multiprocessing.pool import ThreadPool as Pool -import json -import math -import sys - -AUDIOPATH = sys.argv[1] -JSON_OUT = sys.argv[2] - -N_THREADS = 4 -T_PER_CHUNK = 10 -OVERLAP_T = 2 - -kaldi_queue = Queue() -for i in range(N_THREADS): - kaldi_queue.put(standard_kaldi.Kaldi()) - -# Preload entire audio -audiobuf = numm3.sound2np(AUDIOPATH, R=8000, nchannels=1) -n_chunks = int(math.ceil(len(audiobuf) / (8000.0 * (T_PER_CHUNK-OVERLAP_T)))) - -print 'sharding into %d chunks' % (n_chunks) - -chunks = [] # (idx, [words]) - -def transcribe_chunk(idx): - st = idx * (T_PER_CHUNK-OVERLAP_T) * 8000 - end= st + T_PER_CHUNK * 8000 - - buf = audiobuf[st:end] - print buf.shape - - k = kaldi_queue.get() - - # # Break into 2s chunks - # n_buf_chunks = int(buf.shape[0] / 16000.0 - - k.push_chunk(buf.tostring()) - - ret = k.get_final() - print ' '.join([X['word'] for X in ret]) - k.reset() - - chunks.append({"start": idx*(T_PER_CHUNK-OVERLAP_T), "words": ret}) - - print '%d chunks (of %d)' % (len(chunks), n_chunks) - - kaldi_queue.put(k) - - -pool = Pool(N_THREADS) -pool.map(transcribe_chunk, range(n_chunks)) -pool.close() -pool.join() - -chunks.sort(key=lambda x: x['start']) - -json.dump(chunks, open(JSON_OUT, 'w'), indent=2) diff --git a/serve.py b/serve.py index e574b3db..119442a1 100644 --- a/serve.py +++ b/serve.py @@ -6,18 +6,14 @@ import json import logging -import math import multiprocessing -from multiprocessing.pool import ThreadPool as Pool import os from Queue import Queue import shutil -import subprocess -import sys import uuid import wave -from gentle.paths import get_binary, get_resource, get_datadir +from gentle.paths import get_resource, get_datadir from gentle.transcription import to_csv, MultiThreadedTranscriber from gentle.cyst import Insist from gentle.ffmpeg import to_wav @@ -206,7 +202,7 @@ def getChild(self, uid, req): def render_POST(self, req): uid = self.transcriber.next_id() - tran = req.args['transcript'][0] + tran = req.args.get('transcript', [''])[0] audio = req.args['audio'][0] async = True @@ -230,7 +226,7 @@ def render_POST(self, req): if not async: def write_result(result): '''Write JSON to client on completion''' - req.headers["Content-Type"] = "application/json" + req.setHeader("Content-Type", "application/json") req.write(json.dumps(result, indent=2)) req.finish() result_promise.addCallback(write_result)