OCR-D · bertsky · Jul 6, 2024 · Jul 6, 2024 · Jul 7, 2024 · Jul 7, 2024
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -22,12 +22,13 @@ jobs:
       - run: make install
       - run: make deps-test
       - run: mkdir test-results
-      - run: make test PYTEST_ARGS=--junitxml=test-results/test.xml
+      - run: make test PYTEST_ARGS="-vv --junitxml=test-results/test.xml"
       - store_test_results:
           path: test-results
       - run: make test-cli
       - run: make coverage
       - codecov/upload
+    resource_class: large
 
   build-docker:
     docker:
@@ -75,7 +76,7 @@ workflows:
       - test-python:
           matrix:
             parameters:
-              python-version: ['3.7', '3.8', '3.9', '3.10']
+              python-version: ['3.8', '3.9', '3.10', '3.11']
       - build-docker
 
   deploy:

diff --git a/.pylintrc b/.pylintrc
@@ -21,8 +21,5 @@ disable =
     wrong-import-order,
     duplicate-code
 
-# allow indented whitespace (as required by interpreter):
-no-space-check=empty-line
-
 # allow non-snake-case identifiers:
 good-names=n,i
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+Changed:
+
+ * adapt to ocrd 3.0, #216
+
 ## [0.19.1] - 2024-07-01
 
 Fixed:

diff --git a/Makefile b/Makefile
@@ -141,13 +141,12 @@ install: deps
 	ocrd resmgr download ocrd-tesserocr-recognize osd.traineddata
 	ocrd resmgr download ocrd-tesserocr-recognize equ.traineddata
 
+test test-cli coverage: export OCRD_MISSING_OUTPUT := ABORT
+
 # Run unit tests
 test: test/assets deps-test
 	@# declare -p HTTP_PROXY
-	#$(PYTHON) -m pytest -n auto --continue-on-collection-errors test $(PYTEST_ARGS)
-	# workaround for pytest-xdist not isolating setenv calls in click.CliRunner from each other:
-	$(PYTHON) -m pytest --continue-on-collection-errors test/test_cli.py $(PYTEST_ARGS)
-	$(PYTHON) -m pytest --continue-on-collection-errors test/test_{segment_{region,table,line,word},recognize}.py $(PYTEST_ARGS)
+	$(PYTHON) -m pytest test --durations=0 --continue-on-collection-errors $(PYTEST_ARGS)
 
 # Run unit tests and determine test coverage
 coverage:

diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py
@@ -1,40 +1,34 @@
 from __future__ import absolute_import
 
+from typing import Optional
 import os.path
 from tesserocr import (
     PyTessBaseAPI,
     PSM, RIL
 )
 
-from ocrd_utils import (
-    getLogger,
-    assert_file_grp_cardinality,
-    make_file_id,
-    MIMETYPE_PAGE
-)
-from ocrd_modelfactory import page_from_file
 from ocrd_models.ocrd_page import (
     AlternativeImageType,
     TextRegionType,
-    to_xml
+    OcrdPage
 )
+from ocrd.processor import OcrdPageResult, OcrdPageResultImage
 
-from .config import OCRD_TOOL
 from .recognize import TesserocrRecognize
 
-TOOL = 'ocrd-tesserocr-binarize'
-
 class TesserocrBinarize(TesserocrRecognize):
-    def __init__(self, *args, **kwargs):
-        kwargs.setdefault('ocrd_tool', OCRD_TOOL['tools'][TOOL])
-        super().__init__(*args, **kwargs)
-        if hasattr(self, 'parameter'):
-            self.logger = getLogger('processor.TesserocrBinarize')
+    @property
+    def executable(self):
+        return 'ocrd-tesserocr-binarize'  # same tool name the removed module-level TOOL constant carried
+
+    def _init(self):
+        # use default model (eng) with vanilla tesserocr API
+        self.tessapi = PyTessBaseAPI()  # NOTE(review): held on the instance now (the removed code used a `with PyTessBaseAPI()` block); where it gets closed is not visible here - confirm
 
-    def process(self):
+    def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
         """Performs binarization of the region / line with Tesseract on the workspace.
 
-        Open and deserialize PAGE input files and their respective images,
+        Open and deserialize PAGE input file and its respective images,
         then iterate over the element hierarchy down to the requested level.
 
         Set up Tesseract to recognize the segment image's layout, and get
@@ -47,109 +41,86 @@ def process(self):
 
         Produce a new output file by serialising the resulting hierarchy.
         """
-        assert_file_grp_cardinality(self.input_file_grp, 1)
-        assert_file_grp_cardinality(self.output_file_grp, 1)
 
         sepmask = self.parameter['tiseg']
         oplevel = self.parameter['operation_level']
 
-        with PyTessBaseAPI() as tessapi:
-            for n, input_file in enumerate(self.input_files):
-                file_id = make_file_id(input_file, self.output_file_grp)
-                page_id = input_file.pageId or input_file.ID
-                self.logger.info("INPUT FILE %i / %s", n, page_id)
-                pcgts = page_from_file(self.workspace.download_file(input_file))
-                self.add_metadata(pcgts)
-                page = pcgts.get_Page()
-                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
-                    page, page_id)
-                if self.parameter['dpi'] > 0:
-                    dpi = self.parameter['dpi']
-                    self.logger.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
-                elif page_image_info.resolution != 1:
-                    dpi = page_image_info.resolution
-                    if page_image_info.resolutionUnit == 'cm':
-                        dpi = round(dpi * 2.54)
-                    self.logger.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
-                else:
-                    dpi = 0
-                    self.logger.info("Page '%s' images will use DPI estimated from segmentation", page_id)
-                tessapi.SetVariable('user_defined_dpi', str(dpi))
-                self.logger.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)
+        pcgts = input_pcgts[0]  # only the first input PAGE is used
+        result = OcrdPageResult(pcgts)  # derived images get collected in result.images below
+        page = pcgts.get_Page()
+        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
+            page, page_id)
+        if self.parameter['dpi'] > 0:
+            dpi = self.parameter['dpi']
+            self.logger.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
+        elif page_image_info.resolution != 1:
+            dpi = page_image_info.resolution
+            if page_image_info.resolutionUnit == 'cm':
+                dpi = round(dpi * 2.54)
+            self.logger.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
+        else:
+            dpi = 0
+            self.logger.info("Page '%s' images will use DPI estimated from segmentation", page_id)
+        self.tessapi.SetVariable('user_defined_dpi', str(dpi))
+        self.logger.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)
 
-                if oplevel == 'page':
-                    tessapi.SetPageSegMode(PSM.AUTO_ONLY)
-                    tessapi.SetImage(page_image)
-                    if sepmask:
-                        # will trigger FindLines() → SegmentPage() → AutoPageSeg()
-                        # → SetupPageSegAndDetectOrientation() → FindAndRemoveLines() + FindImages()
-                        tessapi.AnalyseLayout()
-                    page_image_bin = tessapi.GetThresholdedImage()
-                    if page_image_bin:
-                        # update METS (add the image file):
-                        file_path = self.workspace.save_image_file(page_image_bin,
-                                                                   file_id + '.IMG-BIN',
-                                                                   page_id=input_file.pageId,
-                                                                   file_grp=self.output_file_grp)
-                        # update PAGE (reference the image file):
-                        features = page_xywh['features'] + ",binarized"
-                        if sepmask:
-                            features += ",clipped"
-                        page.add_AlternativeImage(AlternativeImageType(
-                            filename=file_path, comments=features))
-                    else:
-                        self.logger.error('Cannot binarize %s', "page '%s'" % page_id)
-                else:
-                    regions = page.get_TextRegion() + page.get_TableRegion()
-                    if not regions:
-                        self.logger.warning("Page '%s' contains no text regions", page_id)
-                    for region in regions:
-                        region_image, region_xywh = self.workspace.image_from_segment(
-                            region, page_image, page_xywh)
-                        if oplevel == 'region':
-                            tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
-                            self._process_segment(tessapi, RIL.BLOCK, region, region_image, region_xywh,
-                                                  "region '%s'" % region.id, input_file.pageId,
-                                                  file_id + '_' + region.id)
-                        elif isinstance(region, TextRegionType):
-                            lines = region.get_TextLine()
-                            if not lines:
-                                self.logger.warning("Page '%s' region '%s' contains no text lines",
-                                                    page_id, region.id)
-                            for line in lines:
-                                line_image, line_xywh = self.workspace.image_from_segment(
-                                    line, region_image, region_xywh)
-                                tessapi.SetPageSegMode(PSM.SINGLE_LINE)
-                                self._process_segment(tessapi, RIL.TEXTLINE, line, line_image, line_xywh,
-                                                      "line '%s'" % line.id, input_file.pageId,
-                                                      file_id + '_' + region.id + '_' + line.id)
+        if oplevel == 'page':
+            image = self._process_segment(-1, page, page_image, page_xywh, "page '%s'" % page_id)
+            if image:
+                result.images.append(image)
+            return result  # page level: nothing below the page is touched
 
-                file_id = make_file_id(input_file, self.output_file_grp)
-                pcgts.set_pcGtsId(file_id)
-                self.workspace.add_file(
-                    file_id=file_id,
-                    file_grp=self.output_file_grp,
-                    page_id=input_file.pageId,
-                    mimetype=MIMETYPE_PAGE,
-                    local_filename=os.path.join(self.output_file_grp,
-                                                file_id + '.xml'),
-                    content=to_xml(pcgts))
+        regions = page.get_AllRegions(classes=['Text', 'Table'])  # NOTE(review): removed code used page.get_TextRegion() + page.get_TableRegion(); confirm this selects the intended set
+        if not regions:
+            self.logger.warning("Page '%s' contains no text regions", page_id)
+        for region in regions:
+            region_image, region_xywh = self.workspace.image_from_segment(
+                region, page_image, page_xywh)
+            if oplevel == 'region':
+                image = self._process_segment(RIL.BLOCK, region, region_image, region_xywh,
+                                              "region '%s'" % region.id)
+                if image:
+                    result.images.append(image)
+            elif isinstance(region, TextRegionType):
+                lines = region.get_TextLine()
+                if not lines:
+                    self.logger.warning("Page '%s' region '%s' contains no text lines",
+                                        page_id, region.id)
+                for line in lines:
+                    line_image, line_xywh = self.workspace.image_from_segment(
+                        line, region_image, region_xywh)
+                    image = self._process_segment(RIL.TEXTLINE, line, line_image, line_xywh,
+                                                  "line '%s'" % line.id)
+                    if image:
+                        result.images.append(image)
 
-    def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, file_id):
-        tessapi.SetImage(image)
+        return result
+
+    def _process_segment(self, ril, segment, image, xywh, where) -> Optional[OcrdPageResultImage]:
+        self.tessapi.SetImage(image)
+        features = xywh['features'] + ",binarized"  # becomes the comments attribute of the new AlternativeImage
         image_bin = None
-        layout = tessapi.AnalyseLayout()
-        if layout:
-            image_bin = layout.GetBinaryImage(ril)
+        if ril == -1:  # -1 is the marker the page-level caller passes in place of a RIL value
+            # page level
+            self.tessapi.SetPageSegMode(PSM.AUTO_ONLY)
+            if self.parameter['tiseg']:
+                features += ",clipped"
+                # will trigger FindLines() → SegmentPage() → AutoPageSeg()
+                # → SetupPageSegAndDetectOrientation() → FindAndRemoveLines() + FindImages()
+                self.tessapi.AnalyseLayout()  # return value unused; called only for the effect described above
+            image_bin = self.tessapi.GetThresholdedImage()
+        else:
+            if ril == RIL.BLOCK:
+                self.tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
+            if ril == RIL.TEXTLINE:
+                self.tessapi.SetPageSegMode(PSM.SINGLE_LINE)
+            layout = self.tessapi.AnalyseLayout()
+            if layout:
+                image_bin = layout.GetBinaryImage(ril)
         if not image_bin:
             self.logger.error('Cannot binarize %s', where)
-            return
-        # update METS (add the image file):
-        file_path = self.workspace.save_image_file(image_bin,
-                                    file_id + '.IMG-BIN',
-                                    page_id=page_id,
-                                    file_grp=self.output_file_grp)
+            return None
         # update PAGE (reference the image file):
-        features = xywh['features'] + ",binarized"
-        segment.add_AlternativeImage(AlternativeImageType(
-            filename=file_path, comments=features))
+        image_ref = AlternativeImageType(comments=features)  # no filename is set here, unlike the removed code
+        segment.add_AlternativeImage(image_ref)
+        return OcrdPageResultImage(image_bin, segment.id + '.IMG-BIN', image_ref)  # the image is no longer saved in this method; presumably whoever consumes the result does that - confirm