Skip to content
This repository has been archived by the owner on May 9, 2024. It is now read-only.

Commit

Permalink
initial
Browse files Browse the repository at this point in the history
  • Loading branch information
uakci committed Oct 26, 2023
0 parents commit 0899aa5
Show file tree
Hide file tree
Showing 11 changed files with 2,690 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Exclude everything from the Docker build context by default, then
# re-include only what the build needs.
*
!Makefile
# The Makefile runs scripts/extract-glosses.py, so scripts/ must be part
# of the build context.
!scripts
!src
!static
53 changes: 53 additions & 0 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Build the data set inside Docker and, on main, publish the result to the
# toaq/data repository.
on: [push, pull_request, workflow_dispatch]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:

      - name: Shallow-clone this repo
        uses: actions/checkout@v4
        with:
          # Only the current tree is needed for the Docker build.
          fetch-depth: 1

      - name: Clone data repo (full history)
        uses: actions/checkout@v4
        with:
          repository: toaq/data
          token: ${{ secrets.PAT_TOKEN }}
          # 0 fetches the complete history of the target repository.
          fetch-depth: 0
          path: data

      - name: Build via Docker
        run: docker build -t data .

      - name: Extract container image
        run: |
          shopt -s extglob
          # Wipe the previous contents of the checkout but keep its .git directory.
          rm -rf data/!(.git)
          # The final image stage is built FROM scratch without a command, so a
          # dummy command ("-") is passed; the container is never started.
          docker create --name data data -
          docker export data | tar x data
          rm -f data/.dockerenv

      - name: Commit new data
        id: commit
        if: github.ref == 'refs/heads/main'
        run: |
          cd data
          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"
          git add -A
          # shield against empty commits, and tell the push step there is nothing to do
          if [ -z "$(git diff --shortstat --staged)" ]; then
            echo "changed=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          git commit -m "https://github.com/toaq/data-gen/commit/$GITHUB_SHA"
          echo "changed=true" >> "$GITHUB_OUTPUT"

      - name: Push changes
        uses: ad-m/github-push-action@master
        # Only push when the previous step actually created a commit.
        if: github.ref == 'refs/heads/main' && steps.commit.outputs.changed == 'true'
        with:
          github_token: ${{ secrets.PAT_TOKEN }}
          repository: toaq/data
          directory: data
          force: true
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Generated output directory of the Makefile; the CI workflow also checks
# out the toaq/data repository at this path.
/data
9 changes: 9 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Stage 1: install the tools the Makefile needs and run the build.
FROM alpine AS build-stage
RUN apk add --no-cache \
        curl \
        findutils \
        jq \
        make \
        python3

WORKDIR /work
COPY . ./
RUN make

# Stage 2: an otherwise empty image that carries only the generated data tree.
FROM scratch AS export-stage
COPY --from=build-stage /work/data /data
47 changes: 47 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Run each recipe in a single shell so multi-line commands work;
# -e aborts the recipe on the first failing command.
.ONESHELL:
.SHELLFLAGS = -ce
# Remove a target file whose recipe failed, so a truncated output is never
# mistaken for an up-to-date one.
.DELETE_ON_ERROR:
# `all` relies on its prerequisites running strictly left to right
# (clean, then download, then build), so parallel execution is disabled.
.NOTPARALLEL:
.PHONY: all checkdeps clean redownload

# Default goal: verify tools, wipe old output, fetch fresh data, rebuild.
all: checkdeps clean redownload data

# Fail early if any external tool used by the recipes is missing.
# Relies on .ONESHELL (multi-line loop) and the shell's -e flag (abort on
# the first tool that cannot be run).
checkdeps:
	@for tool in curl find jq python3; do
		$$tool --version >/dev/null
	done

# Remove the whole generated output tree.
clean:
	$(RM) -r data

# Assemble the output tree: the generated Toadua files plus a verbatim copy
# of everything under static/. The directory is touched afterwards,
# presumably so its timestamp is newer than its prerequisites.
data: data/toadua $(shell find static/*)
	cp -a static/* $@
	touch $@

# Aggregate target for all generated Toadua files; the directory itself is
# touched once they have been built.
data/toadua: \
		data/toadua/basic.json \
		data/toadua/dump.json \
		data/toadua/glosses.json
	touch $@

# Force a fresh download of the dictionary dump even if a copy already
# exists (-B rebuilds unconditionally). $(MAKE) is used instead of a literal
# `make` so that flags such as -n and the jobserver reach the sub-make.
redownload:
	$(MAKE) -B data/toadua/dump.json

# Download every dictionary entry from the Toadua API and keep only the
# `.results` array of the response.
# The raw response goes to a temporary file first, so the target is only
# written once the response has been validated; the temporary file is
# removed however the recipe ends.
data/toadua/dump.json:
	mkdir -p $(@D)
	trap 'rm -f $@.temp' EXIT
	curl --fail https://toadua.uakci.pl/api \
		-X POST -H 'Content-Type: application/json' \
		-d '{"action": "search", "query": ["term", ""]}' \
		-o $@.temp
	# Abort unless the API reported success.
	jq -e .success $@.temp >/dev/null || exit 1
	jq .results $@.temp > $@

# Reduce the raw dump to a compact array of {head, body} objects, keeping
# only entries whose scope is "en", whose score is not negative, and whose
# head contains no space. The multi-line jq program relies on .ONESHELL.
data/toadua/basic.json: data/toadua/dump.json
	jq -c '
	[ .[]
	| select(.scope == "en"
	and .score >= 0
	and (.head | index(" ") | not))
	| {head, body} ]
	' $< > $@

# Derive a head -> gloss mapping from the basic word list.
# The script is listed as a prerequisite so that editing it triggers a
# rebuild; $< is still the first prerequisite (basic.json).
data/toadua/glosses.json: data/toadua/basic.json scripts/extract-glosses.py
	python3 ./scripts/extract-glosses.py < $< > $@
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# `data-gen`

This repository contains a number of scripts for generating processed
data for use in downstream Toaq applications. This data is then
published to the [`data` repository](https://github.com/toaq/data) for
use with git submodules.

* `scripts/` houses support scripts for the `Makefile` logic.
* `static/` contains files that are transplanted to the repo 1:1.
42 changes: 42 additions & 0 deletions scripts/extract-glosses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import json
import re
import sys


def extract_gloss(text):
    """Reduce a definition fragment to a bare gloss.

    Keeps only the part before the first "/", trims surrounding whitespace,
    and removes parentheses and double quotes.
    """
    first_alternative = text.split("/")[0].strip()
    return re.sub('[()"]', "", first_alternative)


def gloss(body, head):
    """Derive a short gloss from a definition body, or return None.

    Args:
        body: definition text; "▯" marks an argument slot.
        head: the defined word. Unused here; kept so callers need not change.

    Returns:
        The extracted gloss string, or None when no heuristic applies.
    """
    # An explicitly quoted lowercase gloss followed by ";" wins outright.
    m = re.search("['‘’\"“”]([a-z .]+)['‘’\"“”];", body)
    if m:
        return m.group(1)
    # Otherwise work on the first ";"-separated clause only.
    body = body.split(";")[0].strip()
    body = re.sub(r"\.$", "", body)
    body = re.sub(r"\(.+\)$", "", body)
    # Cut definitions with three or more slots down to the first two.
    if body.count("▯") >= 3:
        body = "▯".join(body.split("▯")[:2]) + "▯"
    body = body.strip()
    # Drop a trailing slot together with an optional preposition before it.
    # The preposition group carries its own leading space, so a bare
    # trailing " ▯" is stripped as well.
    body = re.sub(r"(?: (?:of|for|to|by|from))? ▯$", "", body)
    # "▯ is/are [a|an|the] X" -> X
    m = re.search(r"^▯ (?:is|are) (?:(?:a|an|the) )?([^▯]+)$", body)
    if m:
        return extract_gloss(m.group(1))
    # "▯ X" or "▯ X ▯" -> X
    m = re.search(r"^▯ ([^▯]+)( ▯)?$", body)
    if m:
        return extract_gloss(m.group(1))
    return None


# Read the entry list from stdin, derive a gloss per head word (visiting
# entries in head order), and print the resulting mapping as JSON.
glosses = {}
for item in sorted(json.load(sys.stdin), key=lambda item: item["head"]):
    word = item["head"]
    candidate = gloss(item["body"], word)
    # Skip entries for which no gloss could be derived.
    if not candidate:
        continue
    # Skip glosses longer than 22 characters and heads longer than 30.
    if len(candidate) > 22 or len(word) > 30:
        continue
    glosses[word] = candidate

sys.stdout.write(json.dumps(glosses) + "\n")
4 changes: 4 additions & 0 deletions static/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# `data`

This repository contains various lexical data for use by Toaq
applications. **Do not** commit to this repository, as it is updated automatically by [`data-gen`](https://github.com/toaq/data-gen).
Loading

0 comments on commit 0899aa5

Please sign in to comment.