-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
0e7eeec
commit 1b18b19
Showing
8 changed files
with
3,021 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
name: build | ||
|
||
on: | ||
push: | ||
branches: | ||
- build | ||
jobs: | ||
build: | ||
|
||
env: | ||
BUILD_TYPE: Release | ||
|
||
runs-on: ubuntu-latest | ||
|
||
steps: | ||
|
||
- name: Prepare | ||
run: | | ||
sudo apt-get update | ||
- name: Checkout | ||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 | ||
|
||
- name: Build | ||
run: | | ||
mkdir build | ||
cd build | ||
cmake -DCMAKE_BUILD_TYPE=$BUILD_TYPE .. | ||
make | ||
- uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b | ||
with: | ||
name: binaries | ||
path: | | ||
build/kws_cli | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1 @@ | ||
CMakeLists.txt.user | ||
CMakeCache.txt | ||
CMakeFiles | ||
CMakeScripts | ||
Testing | ||
Makefile | ||
cmake_install.cmake | ||
install_manifest.txt | ||
compile_commands.json | ||
CTestTestfile.cmake | ||
_deps | ||
build/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
cmake_minimum_required(VERSION 3.16.3) | ||
|
||
project(kws_cli) | ||
|
||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -s") | ||
|
||
include_directories(include) | ||
set(LIBS m) | ||
|
||
add_compile_options(-Wno-deprecated-declarations | ||
-Wall -Werror -fPIC) | ||
|
||
add_executable(kws_cli src/kws_cli.c src/fbank.c) | ||
target_link_libraries(kws_cli ${LIBS}) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,51 @@ | ||
# kws_cli | ||
# kws_cli | ||
|
||
[![build](https://github.com/mryndzionek/kws_cli/actions/workflows/build.yml/badge.svg)](https://github.com/mryndzionek/kws_cli/actions/workflows/build.yml) | ||
|
||
## About | ||
|
||
Speech recognition in ~300kB of code. | ||
|
||
Small footprint, standalone, zero dependency, offline | ||
keyword spotting (KWS) CLI tool. Might be useful in | ||
some automation tasks. Accepts audio on stdin and recognizes the | ||
following words: `up`, `down`, `left`, `right`, `stop`. | ||
|
||
Here is an example invocation: | ||
|
||
``` | ||
rec -q -t raw -c1 -e signed -b 16 -r16k - | ./kws_cli | ||
``` | ||
|
||
Make sure you have a decent microphone and the system audio | ||
is at a decent level. | ||
|
||
Individual WAV files can be piped (e.g. for testing) using: | ||
|
||
``` | ||
sox -S ../untitled.wav -t raw -c1 -e signed -b 16 -r16k - | ./kws_cli | ||
``` | ||
|
||
## More details | ||
|
||
Speech recognition is based on [this](https://github.com/microsoft/EdgeML/blob/master/docs/publications/Sha-RNN.pdf) | ||
model and examples from the same repository. | ||
This is a simple model with three layers: 2x LSTM + 1x fully connected. | ||
The model is trained in PyTorch and exported to ONNX. | ||
Then [onnx2c](https://github.com/kraiskil/onnx2c) | ||
is used to convert the model to a bunch of C code. | ||
LSTM layers have become mainstream in recent years and are well | ||
supported in different frameworks. The model is small, so it might | ||
be possible to run it on Cortex-M4/M7, or ESP32 (future work). | ||
|
||
## Building | ||
|
||
The usual CMake routine: | ||
|
||
``` | ||
mkdir build | ||
cd build | ||
cmake -DCMAKE_BUILD_TYPE=Release .. | ||
make | ||
``` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#ifndef FBANK_H
#define FBANK_H

/* size_t is used in the prototypes below; include it here so the header is
 * self-contained and does not depend on the includer's include order. */
#include <stddef.h>

/* Number of input samples in one analysis window passed to fbank(). */
#define SAMPLE_LEN (16000UL)
/* Number of feature frames fbank() produces for one window. */
#define NUM_FRAMES (99UL)
/* Number of values per feature frame.
 * NOTE(review): presumably the number of filter-bank channels - confirm. */
#define NUM_FILT (32UL)

/*
 * Converts one window of SAMPLE_LEN samples into a NUM_FRAMES x NUM_FILT
 * feature matrix written to `output`.
 */
void fbank(float input[SAMPLE_LEN], float output[NUM_FRAMES][NUM_FILT]);

/*
 * Operates in place on a single frame of NUM_FILT values.
 * NOTE(review): the exact normalization is defined in the implementation,
 * which is not visible here - confirm before relying on it.
 */
void fbank_norm(float inputoutput[NUM_FILT]);

/*
 * Classifies a feature matrix. Writes a label index to `*label` and an
 * associated score to `*logit`. The CLI treats label 0 as "nothing detected"
 * and any label > 0 as a recognized keyword.
 */
void fbank_speech_detect(float input[NUM_FRAMES][NUM_FILT], size_t *label, float *logit);

/*
 * NOTE(review): presumably a debugging aid that reports the value range of a
 * feature matrix - confirm against the implementation.
 */
void fbank_print_min_max(float input[NUM_FRAMES][NUM_FILT]);

/*
 * Returns a printable name for a label index produced by
 * fbank_speech_detect().
 */
char const *const fbank_label_idx_to_str(size_t label);

#endif // FBANK_H
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#include <stdio.h> | ||
#include <stdint.h> | ||
#include <stdbool.h> | ||
|
||
#include <signal.h> | ||
#include <string.h> | ||
|
||
#include "fbank.h" | ||
|
||
#define CHUNK_SIZE (SAMPLE_LEN / 2) | ||
|
||
/*
 * Set from the SIGINT handler to ask the main loop to stop.
 * The type is volatile sig_atomic_t because that is the object type the C
 * standard guarantees can be safely written from an asynchronous signal
 * handler.
 */
static volatile sig_atomic_t byebye = 0;

/*
 * SIGINT handler: only raises the stop flag; all other work is left to the
 * main loop.
 *
 * sig: the delivered signal number (unused).
 */
static void INThandler(int sig)
{
    (void)sig;
    byebye = 1;
}
|
||
int main(int argc, char *argv[]) | ||
{ | ||
int16_t in[CHUNK_SIZE] = {0.0}; | ||
float input[SAMPLE_LEN] = {0.0}; | ||
float fbins[NUM_FRAMES][NUM_FILT]; | ||
size_t n = 0; | ||
size_t label; | ||
float logit; | ||
bool debounce_active = false; | ||
|
||
struct sigaction sa = {.sa_handler = INThandler, .sa_flags = 0}; | ||
sigaction(SIGINT, &sa, 0); | ||
|
||
fprintf(stderr, "\nStarting...\n"); | ||
|
||
while (!byebye) | ||
{ | ||
size_t m = fread(&in[n], sizeof(int16_t), CHUNK_SIZE - n, stdin); | ||
n += m; | ||
|
||
if (m == 0) | ||
{ | ||
break; | ||
} | ||
|
||
if (n == CHUNK_SIZE) | ||
{ | ||
if (CHUNK_SIZE < SAMPLE_LEN) | ||
{ | ||
memmove(input, &input[CHUNK_SIZE], (SAMPLE_LEN - CHUNK_SIZE) * sizeof(float)); | ||
} | ||
|
||
for (size_t i = 0; i < CHUNK_SIZE; i++) | ||
{ | ||
input[SAMPLE_LEN - CHUNK_SIZE + i] = in[i]; | ||
} | ||
|
||
fbank(input, fbins); | ||
fbank_speech_detect(fbins, &label, &logit); | ||
if (debounce_active) | ||
{ | ||
if (label == 0) | ||
{ | ||
debounce_active = false; | ||
} | ||
} | ||
else | ||
{ | ||
if (label > 0) | ||
{ | ||
fprintf(stderr, "label: '%s', label_idx: %ld, logit: %f\n", fbank_label_idx_to_str(label), label, logit); | ||
fflush(stderr); | ||
debounce_active = true; | ||
} | ||
} | ||
n = 0; | ||
} | ||
} | ||
} |