This repository has been archived by the owner on May 9, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 0899aa5
Showing
11 changed files
with
2,690 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
* | ||
!Makefile | ||
!src | ||
!static |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
on: [push, pull_request, workflow_dispatch] | ||
|
||
jobs: | ||
build: | ||
runs-on: ubuntu-latest | ||
steps: | ||
|
||
- name: Shallow-clone this repo | ||
uses: actions/checkout@v4 | ||
with: | ||
fetch-depth: 0 | ||
|
||
- name: Shallow-clone data repo | ||
uses: actions/checkout@v4 | ||
with: | ||
repository: toaq/data | ||
token: ${{ secrets.PAT_TOKEN }} | ||
fetch-depth: 0 | ||
path: data | ||
|
||
- name: Build via Docker | ||
run: docker build -t data . | ||
|
||
- name: Extract container image | ||
run: | | ||
shopt -s extglob | ||
rm -rf data/!(.git) | ||
docker create --name data data - | ||
docker export data | tar x data | ||
rm -f data/.dockerenv | ||
- name: Commit new data | ||
id: commit | ||
if: github.ref == 'refs/heads/main' | ||
run: | | ||
cd data | ||
git config --local user.email "github-actions[bot]@users.noreply.github.com" | ||
git config --local user.name "github-actions[bot]" | ||
git add -A | ||
# shield against empty commits | ||
[ -n "$(git diff --shortstat --staged)" ] || exit 0 | ||
git commit -m https://github.com/toaq/data-gen/commit/$GITHUB_SHA | ||
- name: Push changes | ||
uses: ad-m/github-push-action@master | ||
if: github.ref == 'refs/heads/main' | ||
with: | ||
github_token: ${{ secrets.PAT_TOKEN }} | ||
repository: toaq/data | ||
directory: data | ||
force: true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
/data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
FROM alpine AS build-stage | ||
RUN apk add --no-cache curl findutils jq make python3 | ||
|
||
WORKDIR /work | ||
COPY . . | ||
RUN make | ||
|
||
FROM scratch AS export-stage | ||
COPY --from=build-stage /work/data /data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
.ONESHELL: | ||
.SHELLFLAGS = -ce | ||
.PHONY: checkdeps clean redownload | ||
|
||
all: checkdeps clean redownload data | ||
|
||
checkdeps: | ||
@curl --version >/dev/null | ||
find --version >/dev/null | ||
jq --version >/dev/null | ||
python3 --version >/dev/null | ||
|
||
clean: | ||
rm -rf data | ||
|
||
data: data/toadua $(shell find static/*) | ||
cp -a static/* data | ||
touch $@ | ||
|
||
data/toadua: data/toadua/basic.json data/toadua/dump.json data/toadua/glosses.json | ||
touch $@ | ||
|
||
redownload: | ||
make -B data/toadua/dump.json | ||
|
||
data/toadua/dump.json: | ||
mkdir -p $(shell dirname $@) | ||
curl https://toadua.uakci.pl/api \ | ||
-X POST -H 'Content-Type: application/json' \ | ||
-d '{"action": "search", "query": ["term", ""]}' \ | ||
-o $@ | ||
jq -e .success $@ >/dev/null || exit 1 | ||
cp $@ $@.temp | ||
jq .results $@.temp > $@ | ||
rm $@.temp | ||
|
||
data/toadua/basic.json: data/toadua/dump.json | ||
jq -c ' | ||
[ .[] | ||
| select(.scope == "en" | ||
and .score >= 0 | ||
and (.head | index(" ") | not)) | ||
| {head, body} ] | ||
' $< > $@ | ||
|
||
data/toadua/glosses.json: data/toadua/basic.json | ||
python3 ./scripts/extract-glosses.py < $< > $@ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# `data-gen` | ||
|
||
This repository contains a number of scripts for generating processed | ||
data for use in downstream Toaq applications. This data is then | ||
published to the [`data` repository](https://github.com/toaq/data) for | ||
use with git submodules. | ||
|
||
* `scripts/` houses support scripts for the `Makefile` logic. | ||
* `static/` contains files that are copied verbatim (1:1) into the `data` repository. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import json | ||
import re | ||
import sys | ||
|
||
|
||
def extract_gloss(text):
    """Reduce a definition fragment to a bare gloss.

    Only the portion before the first "/" is kept. Surrounding whitespace
    is trimmed, and every parenthesis and double-quote character is removed.
    """
    first_alternative = text.partition("/")[0].strip()
    return re.sub('[()"]', "", first_alternative)
|
||
|
||
def gloss(body, head):
    """Derive a short gloss from a dictionary definition body.

    Args:
        body: The definition text of an entry.
        head: The headword of the entry. Accepted for interface
            compatibility; it is not consulted by the current heuristics.

    Returns:
        The extracted gloss string, or None when no heuristic matches.
    """
    # A quoted lowercase phrase immediately followed by ";" is returned as-is.
    m = re.search("['‘’\"“”]([a-z .]+)['‘’\"“”];", body)
    if m:
        return m.group(1)

    # Otherwise work on the first ";"-separated clause only.
    body = body.split(";")[0].strip()
    # Raw strings: "\." and "\(" are invalid escapes in ordinary literals.
    body = re.sub(r"\.$", "", body)
    body = re.sub(r"\(.+\)$", "", body)

    # With three or more argument slots, keep the text up to the second slot.
    if body.count("▯") >= 3:
        body = "▯".join(body.split("▯")[:2]) + "▯"
    body = body.strip()

    # Drop a trailing "<preposition> ▯".
    # NOTE(review): as written, this pattern needs two spaces before "▯" when
    # no preposition is present, so a plain trailing " ▯" is left for the
    # final pattern below to absorb.
    body = re.sub(r" (of|for|to|by|from)? ▯$", "", body)

    # "▯ is/are [a|an|the] X" -> X
    m = re.search(r"^▯ (?:is|are) (?:(?:a|an|the) )?([^▯]+)$", body)
    if m:
        return extract_gloss(m.group(1))

    # "▯ X" or "▯ X ▯" -> X
    m = re.search(r"^▯ ([^▯]+)( ▯)?$", body)
    if m:
        return extract_gloss(m.group(1))
    return None
|
||
|
||
# Read the entry list from stdin, visit it in headword order, and keep a
# gloss for every entry whose gloss and headword are both short enough.
glosses = {}
ordered_entries = sorted(json.load(sys.stdin), key=lambda item: item["head"])
for item in ordered_entries:
    word = item["head"]
    definition = item["body"]
    candidate = gloss(definition, word)
    if not candidate:
        continue
    # Length limits: gloss at most 22 characters, headword at most 30.
    if len(candidate) > 22 or len(word) > 30:
        continue
    glosses[word] = candidate

# Emit the headword -> gloss mapping as a single JSON object on stdout.
print(json.dumps(glosses))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# `data` | ||
|
||
This repository contains various lexical data for use by Toaq | ||
applications. **Do not** commit to this repository directly: it is regenerated automatically by [`data-gen`](https://github.com/toaq/data-gen), and manual changes will be overwritten. |
Oops, something went wrong.