This repository has been archived by the owner on Nov 1, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocr.js
75 lines (63 loc) · 1.76 KB
/
ocr.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import dayjs from "dayjs";
import fs from "fs/promises";
import { createWorker } from "tesseract.js";
import utc from "dayjs/plugin/utc.js";
import csv from "./csv.js";
import json from "./json.js";
import sqlite from "./sqlite.js";
dayjs.extend(utc);
// Field definitions that drive the OCR pass.
// ./fields.json is parsed as an array of entries; entries with a truthy
// `active` flag contribute their `fields` list, and from the flattened result
// only fields with a truthy `tracked` flag are kept.
const fieldGroupsJson = await fs.readFile("./fields.json", {
  encoding: "utf-8",
});
const activeFieldGroups = JSON.parse(fieldGroupsJson).filter(
  (group) => group.active
);
const fields = activeFieldGroups
  .flatMap((group) => group.fields)
  .filter((field) => field.tracked);
// Single Tesseract worker used by every recognize() call in this script.
// The logger callback ignores its input and returns false, so no progress
// output is printed.
const ocrLanguage = "eng";
const worker = createWorker({
  logger() {
    return false;
  },
});
// These run strictly one after another, in this order, before any recognition.
await worker.load();
await worker.loadLanguage(ocrLanguage);
await worker.initialize(ocrLanguage);
/**
 * Comma-separated date string passed to the csv/json/sqlite writers.
 *
 * Runs OCR on ./screenshots/date.png, pulls out the date that follows
 * "Data last updated on", and produces three YYYY-MM-DD dates joined by
 * commas, in this order: that date, the date 8 days before it, and the date
 * 2 days before it.
 *
 * The digits are read as month/day/year, which is how the earlier
 * `Date`-based parsing of this string interpreted them.
 *
 * @throws {Error} If the OCR text does not contain the
 *   "Data last updated on <date> and" phrase, or if the digits that were read
 *   do not form a real calendar date.
 */
const dates = await (async () => {
  const {
    data: { text },
  } = await worker.recognize("./screenshots/date.png");

  const match = text.match(
    /Data last updated on (\d{1,2})\/(\d{1,2})\/(\d{4}) and/i
  );
  if (!match) {
    // Fail with the OCR output in the message instead of an opaque
    // "not iterable" TypeError from destructuring null.
    throw new Error(
      `Could not find "Data last updated on <date>" in OCR text: ${JSON.stringify(
        text
      )}`
    );
  }

  const [, month, day, year] = match;
  // Hand Day.js an ISO date. A non-ISO string such as "7/4/2021" makes it fall
  // back to `new Date(string)`, which parses in LOCAL time; formatting that
  // instant in UTC can then land on the previous day east of UTC.
  const isoDate = `${year}-${month.padStart(2, "0")}-${day.padStart(2, "0")}`;
  const latest = dayjs.utc(isoDate);

  // Out-of-range parts (e.g. month 13 from an OCR misread) silently roll over
  // into another date, so require the parsed date to round-trip unchanged.
  if (!latest.isValid() || latest.format("YYYY-MM-DD") !== isoDate) {
    throw new Error(
      `OCR produced an impossible "last updated" date: ${match[0]}`
    );
  }

  return [0, 8, 2]
    .map((daysBack) => latest.subtract(daysBack, "days").format("YYYY-MM-DD"))
    .join(",");
})();
/**
 * Async generator that runs OCR on one screenshot per tracked field.
 *
 * Fields are visited from the last entry of `fields` to the first. For each
 * one, ./screenshots/<id>.png is recognized, the text is stripped of known
 * label fragments and every character that is not a digit or ".", and the
 * remainder is converted to a number.
 *
 * @yields {Object<string, number>} A single-key object `{ [id]: value }`.
 *   Text with no digits left converts to 0, and text that is not a valid
 *   number after clean-up (e.g. two dots) converts to NaN.
 */
const ocrNextImage = async function* () {
  // Applied in order; each step receives the output of the previous one.
  const cleanupSteps = [
    [/\n/g, " "],
    [/7 (per)?day/gi, ""],
    [/per 100k residents/gi, ""],
    [/covid-?\S+/gi, ""],
    [/as ?of ?\d+\/\d+\/\d+/gi, ""],
    [/\([^)]*\)/g, ""],
    [/[^\d.]/g, ""],
  ];

  // Snapshot the ids up front and walk them back-to-front.
  const pendingIds = fields.map((field) => field.id).reverse();

  for (const id of pendingIds) {
    const recognition = await worker.recognize(`./screenshots/${id}.png`);

    let numericText = recognition.data.text;
    for (const [pattern, replacement] of cleanupSteps) {
      numericText = numericText.replace(pattern, replacement);
    }

    yield { [id]: Number(numericText) };
  }
};
// Drain the OCR generator into one id -> value object. Keys appear in the
// order they were first yielded; a repeated id keeps its first position and
// takes the later value.
const ocrEntries = [];
for await (const reading of ocrNextImage()) {
  ocrEntries.push(...Object.entries(reading));
}
const allData = Object.fromEntries(ocrEntries);

// All recognition is finished, so the worker is shut down before writing.
await worker.terminate();

// Start the three writers (csv, json, sqlite, in that order) with the same
// arguments and wait for all of them; the first rejection rejects the whole.
const writers = [csv, json, sqlite];
await Promise.all(writers.map((write) => write(dates, allData)));