diff --git a/ocrd_tesserocr/__init__.py b/ocrd_tesserocr/__init__.py index 0ffe2f8..ab9c56b 100644 --- a/ocrd_tesserocr/__init__.py +++ b/ocrd_tesserocr/__init__.py @@ -1,10 +1,10 @@ +from .binarize import TesserocrBinarize +from .crop import TesserocrCrop +from .deskew import TesserocrDeskew from .fontshape import TesserocrFontShape from .recognize import TesserocrRecognize from .segment import TesserocrSegment -from .segment_word import TesserocrSegmentWord from .segment_line import TesserocrSegmentLine -from .segment_table import TesserocrSegmentTable from .segment_region import TesserocrSegmentRegion -from .crop import TesserocrCrop -from .deskew import TesserocrDeskew -from .binarize import TesserocrBinarize +from .segment_table import TesserocrSegmentTable +from .segment_word import TesserocrSegmentWord diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py index 84a5e5e..c37e457 100644 --- a/ocrd_tesserocr/binarize.py +++ b/ocrd_tesserocr/binarize.py @@ -1,23 +1,12 @@ from __future__ import absolute_import import os.path -from tesserocr import ( - PyTessBaseAPI, - PSM, RIL -) -from ocrd_utils import ( - getLogger, - assert_file_grp_cardinality, - make_file_id, - MIMETYPE_PAGE -) from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - AlternativeImageType, - TextRegionType, - to_xml -) +from ocrd_models.ocrd_page import AlternativeImageType, TextRegionType, to_xml +from ocrd_utils import (MIMETYPE_PAGE, assert_file_grp_cardinality, getLogger, + make_file_id) +from tesserocr import PSM, RIL, PyTessBaseAPI from .config import OCRD_TOOL from .recognize import TesserocrRecognize diff --git a/ocrd_tesserocr/cli.py b/ocrd_tesserocr/cli.py index 1193132..d2402d2 100644 --- a/ocrd_tesserocr/cli.py +++ b/ocrd_tesserocr/cli.py @@ -1,16 +1,17 @@ import click - from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor + +from ocrd_tesserocr.binarize import TesserocrBinarize +from ocrd_tesserocr.crop import TesserocrCrop +from ocrd_tesserocr.deskew import TesserocrDeskew from ocrd_tesserocr.fontshape import TesserocrFontShape from ocrd_tesserocr.recognize import TesserocrRecognize from ocrd_tesserocr.segment import TesserocrSegment +from ocrd_tesserocr.segment_line import TesserocrSegmentLine from ocrd_tesserocr.segment_region import TesserocrSegmentRegion from ocrd_tesserocr.segment_table import TesserocrSegmentTable -from ocrd_tesserocr.segment_line import TesserocrSegmentLine from ocrd_tesserocr.segment_word import TesserocrSegmentWord -from ocrd_tesserocr.crop import TesserocrCrop -from ocrd_tesserocr.deskew import TesserocrDeskew -from ocrd_tesserocr.binarize import TesserocrBinarize + @click.command() @ocrd_cli_options diff --git a/ocrd_tesserocr/config.py b/ocrd_tesserocr/config.py index 01e0b23..c2e46e8 100644 --- a/ocrd_tesserocr/config.py +++ b/ocrd_tesserocr/config.py @@ -1,4 +1,5 @@ import json + from pkg_resources import resource_string OCRD_TOOL = json.loads(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index ca66f8e..c96e487 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -1,28 +1,16 @@ from __future__ import absolute_import + import os.path import tesserocr -from ocrd_utils import ( - getLogger, - crop_image, - coordinates_for_segment, - coordinates_of_segment, - bbox_from_polygon, - bbox_from_points, - polygon_from_bbox, - points_from_polygon, - bbox_from_xywh, - make_file_id, - assert_file_grp_cardinality, - MIMETYPE_PAGE -) from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - CoordsType, - AlternativeImageType, - BorderType, - to_xml -) +from ocrd_models.ocrd_page import (AlternativeImageType, BorderType, + CoordsType, to_xml) +from ocrd_utils import (MIMETYPE_PAGE, assert_file_grp_cardinality, + bbox_from_points, bbox_from_polygon, bbox_from_xywh, + coordinates_for_segment, coordinates_of_segment, + crop_image, getLogger, make_file_id, + points_from_polygon, polygon_from_bbox) from .config import OCRD_TOOL from .recognize import TesserocrRecognize, polygon_for_parent diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index a76e203..46e6be8 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -1,28 +1,15 @@ from __future__ import absolute_import -import os.path import math -from tesserocr import ( - PyTessBaseAPI, - PSM, OEM, - Orientation, - WritingDirection, - TextlineOrder -) +import os.path -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - membername, - MIMETYPE_PAGE -) from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - AlternativeImageType, - TextLineType, TextRegionType, PageType, - to_xml -) +from ocrd_models.ocrd_page import (AlternativeImageType, PageType, + TextLineType, TextRegionType, to_xml) +from ocrd_utils import (MIMETYPE_PAGE, assert_file_grp_cardinality, getLogger, + make_file_id, membername) +from tesserocr import (OEM, PSM, Orientation, PyTessBaseAPI, TextlineOrder, + WritingDirection) from .config import OCRD_TOOL from .recognize import TesserocrRecognize diff --git a/ocrd_tesserocr/fontshape.py b/ocrd_tesserocr/fontshape.py index bfac399..c094abc 100644 --- a/ocrd_tesserocr/fontshape.py +++ b/ocrd_tesserocr/fontshape.py @@ -1,23 +1,13 @@ from __future__ import absolute_import -import os.path -from PIL import Image, ImageStat -from tesserocr import ( - RIL, PSM, OEM, - PyTessBaseAPI, - get_languages -) +import os.path -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - MIMETYPE_PAGE -) -from ocrd_models.ocrd_page import ( - TextStyleType, - to_xml) from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page import TextStyleType, to_xml +from ocrd_utils import (MIMETYPE_PAGE, assert_file_grp_cardinality, getLogger, + make_file_id) +from PIL import Image, ImageStat +from tesserocr import OEM, PSM, RIL, PyTessBaseAPI, get_languages from .config import OCRD_TOOL from .recognize import TesserocrRecognize diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index f4148e4..e2f54e0 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -1,65 +1,38 @@ from __future__ import absolute_import + +import itertools +import math from os.path import join from pathlib import Path -import math -import itertools -from PIL import Image, ImageStat -import numpy as np -from scipy.sparse.csgraph import minimum_spanning_tree -from shapely.geometry import Polygon, LineString -from shapely.ops import unary_union, nearest_points - -from tesserocr import ( - RIL, PSM, PT, OEM, - Orientation, - WritingDirection, - TextlineOrder, - tesseract_version, - PyTessBaseAPI, - get_languages) -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - shift_coordinates, - coordinates_for_segment, - polygon_from_x0y0x1y1, - polygon_from_points, - points_from_polygon, - xywh_from_polygon, - MIMETYPE_PAGE, - membername -) -from ocrd_models.ocrd_page import ( - ReadingOrderType, - RegionRefType, - RegionRefIndexedType, - OrderedGroupType, - OrderedGroupIndexedType, - UnorderedGroupType, - UnorderedGroupIndexedType, - PageType, - CoordsType, - ImageRegionType, - MathsRegionType, - SeparatorRegionType, - NoiseRegionType, - TableRegionType, - TextRegionType, - TextLineType, - WordType, - GlyphType, - TextEquivType, - AlternativeImageType, - to_xml) -from ocrd_models.ocrd_page_generateds import ( - ReadingDirectionSimpleType, - TextLineOrderSimpleType, - TextTypeSimpleType -) -from ocrd_modelfactory import page_from_file +import numpy as np from ocrd import Processor +from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page import (AlternativeImageType, CoordsType, GlyphType, + ImageRegionType, MathsRegionType, + NoiseRegionType, OrderedGroupIndexedType, + OrderedGroupType, PageType, + ReadingOrderType, RegionRefIndexedType, + RegionRefType, SeparatorRegionType, + TableRegionType, TextEquivType, + TextLineType, TextRegionType, + UnorderedGroupIndexedType, + UnorderedGroupType, WordType, to_xml) +from ocrd_models.ocrd_page_generateds import (ReadingDirectionSimpleType, + TextLineOrderSimpleType, + TextTypeSimpleType) +from ocrd_utils import (MIMETYPE_PAGE, assert_file_grp_cardinality, + coordinates_for_segment, getLogger, make_file_id, + membername, points_from_polygon, polygon_from_points, + polygon_from_x0y0x1y1, shift_coordinates, + xywh_from_polygon) +from PIL import Image, ImageStat +from scipy.sparse.csgraph import minimum_spanning_tree +from shapely.geometry import LineString, Polygon +from shapely.ops import nearest_points, unary_union +from tesserocr import (OEM, PSM, PT, RIL, Orientation, PyTessBaseAPI, + TextlineOrder, WritingDirection, get_languages, + tesseract_version) from .config import OCRD_TOOL diff --git a/test/conftest.py b/test/conftest.py index 54cba29..4014317 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,9 +1,9 @@ +from test.assets import assets as assets + from ocrd.resolver import Resolver -from ocrd_utils import pushd_popd, initLogging +from ocrd_utils import initLogging, pushd_popd from pytest import fixture -from test.assets import assets as assets - METS_KANT_BINARIZED = assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml') METS_HEROLD_SMALL = assets.url_of('SBB0000F29300010000/data/mets_one_file.xml') diff --git a/test/test_cli.py b/test/test_cli.py index c679209..18b3a09 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -1,7 +1,7 @@ -from click.testing import CliRunner - from pathlib import Path +from click.testing import CliRunner + runner = CliRunner() def test_show_resource(tmpdir, monkeypatch): diff --git a/test/test_recognize.py b/test/test_recognize.py index a7f47ae..798ed81 100644 --- a/test/test_recognize.py +++ b/test/test_recognize.py @@ -1,14 +1,13 @@ import os -from ocrd_models.constants import NAMESPACES from ocrd_modelfactory import page_from_file +from ocrd_models.constants import NAMESPACES from ocrd_utils import MIMETYPE_PAGE -from ocrd_tesserocr import TesserocrDeskew -from ocrd_tesserocr import TesserocrSegmentWord -from ocrd_tesserocr import TesserocrSegmentLine -from ocrd_tesserocr import TesserocrSegmentRegion -from ocrd_tesserocr import TesserocrRecognize -from ocrd_tesserocr import TesserocrFontShape + +from ocrd_tesserocr import (TesserocrDeskew, TesserocrFontShape, + TesserocrRecognize, TesserocrSegmentLine, + TesserocrSegmentRegion, TesserocrSegmentWord) + def test_run_modular(workspace_kant_binarized): TesserocrSegmentRegion( diff --git a/test/test_segment_line.py b/test/test_segment_line.py index 99602f2..7e91543 100644 --- a/test/test_segment_line.py +++ b/test/test_segment_line.py @@ -1,6 +1,6 @@ -from ocrd_tesserocr import TesserocrSegmentRegion -from ocrd_tesserocr import TesserocrSegmentLine -from ocrd_tesserocr import TesserocrSegment +from ocrd_tesserocr import (TesserocrSegment, TesserocrSegmentLine, + TesserocrSegmentRegion) + def test_run_modular(workspace_herold_small): TesserocrSegmentRegion( diff --git a/test/test_segment_region.py b/test/test_segment_region.py index c250ebd..071e4f4 100644 --- a/test/test_segment_region.py +++ b/test/test_segment_region.py @@ -1,5 +1,6 @@ from ocrd_tesserocr import TesserocrSegmentRegion + def test_run(workspace_herold_small): TesserocrSegmentRegion( workspace_herold_small, diff --git a/test/test_segment_word.py b/test/test_segment_word.py index 86fcc6b..f4bb2f1 100644 --- a/test/test_segment_word.py +++ b/test/test_segment_word.py @@ -1,6 +1,6 @@ -from ocrd_tesserocr import TesserocrSegmentRegion -from ocrd_tesserocr import TesserocrSegmentLine -from ocrd_tesserocr import TesserocrSegmentWord +from ocrd_tesserocr import (TesserocrSegmentLine, TesserocrSegmentRegion, + TesserocrSegmentWord) + def test_run_modular(workspace_kant_binarized): TesserocrSegmentRegion(