https://www.cortsfoundation.org/digitheek-en#CD
mkdir ppm
for file in /path/to/folder/*.pdf; do
prefix=$(basename "$file" .pdf)
pdftoppm -r 300 "$file" /path/to/folder/ppm/"$prefix"
done
Using Tesseract version 5.3.1
$ tesseract --version
tesseract 5.3.1
leptonica-1.82.0
libgif 5.2.1 : libjpeg 8d (libjpeg-turbo 2.1.5.1) : libpng 1.6.39 : libtiff 4.5.0 : zlib 1.2.11 : libwebp 1.3.0 : libopenjp2 2.5.0
Found SSE4.1
Found libarchive 3.6.2 zlib/1.2.11 liblzma/5.4.1 bz2lib/1.0.8 liblz4/1.9.4 libzstd/1.5.4
Found libcurl/7.87.0 SecureTransport (LibreSSL/3.3.6) zlib/1.2.11 nghttp2/1.51.0
mkdir txt
for f in images/*.ppm; do
output_file="txt/$(basename "${f%.ppm}")"
tesseract "$f" "$output_file" \
-l nld \
-c tessedit_char_whitelist="\"-()'’.,1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ;:" \
-c preserve_interword_spaces=1;
done
sourcedir="txt/"
prefixes=$(find "$sourcedir" -type f -name "*.txt" -exec basename {} \; | awk -F '-' '{print $1}' | sort -u)
for prefix in $prefixes; do
folder_path="$sourcedir/$prefix"
mkdir -p "$folder_path"
mv "$sourcedir/$prefix"* "$folder_path/"
done
- contains sections in modern and old Dutch that may be difficult to separate (the modern sections are predominantly the indented introductions to the treaties)
- contains footnotes