From 45c8918703152450faef727681872a9c7b48676d Mon Sep 17 00:00:00 2001 From: Marijn Koolen Date: Mon, 18 Mar 2024 10:22:31 +0100 Subject: [PATCH 1/7] Improve working with region areas, overlap and size New functionality - add functionality to compute area size of an element - add `EmptyRegionDoc` to allow explicit modelling of empty regions in a scan Improvements of existing functionality - extend document stats with additional categories (alpha-only words) - improve checking of text region overlap to work with points, lines and boxes Error handling and bug fixes - improve error handling for deriving coordinates for a combination of regions - fix empty word bug --- pagexml/analysis/layout_stats.py | 72 ++++++++++----- pagexml/analysis/stats.py | 22 +++-- pagexml/analysis/text_stats.py | 11 ++- pagexml/column_parser.py | 2 +- pagexml/helper/pagexml_helper.py | 67 +++++++++++--- pagexml/model/physical_document_model.py | 92 +++++++++++++++----- tests/helper-pagexml_helper_test.py | 106 ++++++++++++++++++++++- tests/physical_document_model_test.py | 61 +++++++++++-- 8 files changed, 363 insertions(+), 70 deletions(-) diff --git a/pagexml/analysis/layout_stats.py b/pagexml/analysis/layout_stats.py index 2c12980..c54ae53 100644 --- a/pagexml/analysis/layout_stats.py +++ b/pagexml/analysis/layout_stats.py @@ -96,7 +96,7 @@ def interpolate_baseline_points(points: List[Tuple[int, int]], def compute_points_distances(points1: List[Tuple[int, int]], points2: List[Tuple[int, int]], - step: int = 50): + step: int = 50) -> np.ndarray: if points1 is None or points2 is None: return np.array([]) b1_points = interpolate_baseline_points(points1, step=step) @@ -158,12 +158,15 @@ def compute_bounding_box_distances(line1: Union[pdm.PageXMLTextLine, List[pdm.Pa return distances -def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]]) -> int: +def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTextLine]], + debug: int = 0) -> int: 
"""Compute the average (mean) baseline height for comparing lines that are not horizontally aligned. :param line: a TextLine or a list of adjacent lines :type line: PageXMLTextLine + :param debug: Boolean to show debug information or not + :type debug: bool :return: the average (mean) baseline height across all its baseline points :rtype: int """ @@ -179,7 +182,8 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex # segment contributes its average height times its width total_avg += segment_avg * abs(next_point[0] - curr_point[0]) if total_avg < 0: - print(f'total_avg: {total_avg}\n') + print(f'pagexml.analysis.layout_stats.average_baseline_height - ' + f'negative total_avg {total_avg} for line {line.id}\n') # average is total of average heights divided by total width x = sorted([point[0] for point in points]) @@ -189,8 +193,9 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex else: # this should not happen, but if it does, we need to calculate # the average differently, to avoid a division by zero error - print(f"total_avg={total_avg}") - print(f"baseline.points[-1][0]={points[-1][0]}") + if debug > 0: + print(f"total_avg={total_avg}") + print(f"baseline.points[-1][0]={points[-1][0]}") xcoords = [p[0] for p in points] left_x = min(xcoords) right_x = max(xcoords) @@ -200,7 +205,7 @@ def average_baseline_height(line: Union[pdm.PageXMLTextLine, List[pdm.PageXMLTex return int(total_avg) -def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, +def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, step: int = 50, debug: int = 0) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int]]]: """Split the list of bounding polygon coordinates of a line in sets of points above and below @@ -209,6 +214,8 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, :param line: a PageXML text line :type line: PageXMLTextLine + :param step: number of pixels between interpolated points + :type 
step: int :param debug: the detail level of debug information (0 = none, higher is more) :type debug: int :return: two lists of bounding polygon points @@ -225,12 +232,14 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, return above_baseline, below_baseline if line.coords.left > line.baseline.right: return above_baseline, below_baseline - interpolated_baseline_points = [i for i in interpolate_baseline_points(line.baseline.points, step=50).items()] + interpolated_baseline_points = [i for i in interpolate_baseline_points(line.baseline.points, step=step).items()] if debug > 2: + print('sort_coords_above_below_baseline - line.id:', line.id) + print('sort_coords_above_below_baseline - line.coords.points:', line.coords.points) print('baseline_points:', line.baseline.points) print('interpolated_baseline_points:', interpolated_baseline_points) sorted_coord_points = sorted(line.coords.points, key=lambda p: p[0]) - if debug > 0: + if debug > 1: print('sorted_coord_points:', sorted_coord_points) print('len(sorted_coord_points):', len(sorted_coord_points)) if debug > 1: @@ -240,7 +249,7 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, for ci_b, curr_b in enumerate(interpolated_baseline_points): curr_bx, curr_by = curr_b next_b = interpolated_baseline_points[ci_b + 1] if ci_b + 1 < num_baseline_points else None - if debug > 0: + if debug > 1: print(f'sort_above_below - curr_b: {curr_b}') print('\tci_c:', ci_c, '\tnum_coord_points:', num_coord_points) if ci_c == num_coord_points: @@ -249,24 +258,29 @@ def sort_coords_above_below_baseline(line: pdm.PageXMLTextLine, curr_cx, curr_cy = curr_c if next_b and abs(next_b[0] - curr_cx) < abs(curr_b[0] - curr_cx): break - if debug > 0: + if debug > 1: print(f'sort_above_below - curr_c ({ci_c}): {curr_c}') ci_c += 1 if curr_cy < curr_by: - if debug > 0: + if debug > 1: print(f'sort_above_below - above') above_baseline.append(curr_c) else: - if debug > 0: + if debug > 1: print(f'sort_above_below - 
below') below_baseline.append(curr_c) + if debug > 2: + print('sort_coords_above_below_baseline - above_baseline:', above_baseline) + print('sort_coords_above_below_baseline - below_baseline:', below_baseline) return above_baseline, below_baseline def get_text_heights(line: pdm.PageXMLTextLine, step: int = 50, ignore_errors: bool = True, debug: int = 0) -> np.array: - above_baseline, below_baseline = sort_coords_above_below_baseline(line, debug=debug) + if line.baseline.width <= step: + step = 5 + above_baseline, below_baseline = sort_coords_above_below_baseline(line, step=step, debug=debug) if len(above_baseline) == 0: if ignore_errors is False: ValueError(f'line {line.id} has no bounding coordinates above baseline') @@ -276,6 +290,10 @@ def get_text_heights(line: pdm.PageXMLTextLine, step: int = 50, ValueError(f'Warning: line {line.id} has no bounding coordinates below baseline') int_base = interpolate_baseline_points(line.baseline.points, step=step) int_above = interpolate_baseline_points(above_baseline, step=step) + if debug > 1: + print('get_text_heights - line.id:', line.id) + print('get_text_heights - int_base:', int_base) + print('get_text_heights - int_above:', int_above) height = {} for x in int_base: @@ -327,7 +345,7 @@ def get_line_distances(lines: List[pdm.PageXMLTextLine]) -> List[np.ndarray]: else: distances = compute_bounding_box_distances(curr_line, next_line) all_distances.append(distances) - return all_distances + return all_distances def get_textregion_line_distances(text_region: pdm.PageXMLTextRegion) -> List[np.ndarray]: @@ -593,7 +611,7 @@ def get_line_widths(pagexml_files: List[Union[str, pdm.PageXMLTextRegion]] = Non def find_line_width_boundary_points(line_widths: List[int], line_bin_size: int = 50, - min_ratio: float = 0.25) -> List[int]: + min_ratio: float = 0.25, debug: int = 0) -> List[int]: """Find the minima in the distribution of line widths relative to the peaks in the distribution. 
These minima represent the boundaries between clusters of lines within the same line width intervals. @@ -619,29 +637,37 @@ def find_line_width_boundary_points(line_widths: List[int], line_bin_size: int = curr_max_width = None curr_min_width = None prev_freq = 0 + if debug > 0: + print(f"find_line_width_boundary_points - total_widths: {total_widths}") + print(f"find_line_width_boundary_points - max_width: {max_width}") + print(f"find_line_width_boundary_points - max_freq: {max_freq}") for w in range(0, max_width + 1, line_bin_size): f = width_freq[w] if f > curr_max_freq: - # print(f'\tfreq {f} bigger than curr max: {curr_max_freq}') + if debug > 0: + print(f'\tfreq {f} bigger than curr max: {curr_max_freq}') curr_max_freq = f curr_max_width = w if f < prev_freq and f < curr_min_freq: - # print(f'\twidth: {w}\tfreq {f} smaller than prev freq: {prev_freq} and than curr min {curr_min_freq}') + if debug > 0: + print(f'\twidth: {w}\tfreq {f} smaller than prev freq: {prev_freq} and than curr min {curr_min_freq}') curr_min_freq = f curr_min_width = w if f / num_lines > 0.01 and f > prev_freq and f > curr_min_freq: - # print(f'\twidth: {w}\tfreq {f} bigger than prev freq: {prev_freq} and than curr min {curr_min_freq}') - # if prev_freq > 0 and f / prev_freq > 1.2 and (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio: - # print('\t\tRatio:', (curr_max_freq - curr_min_freq) / curr_max_freq) + if debug > 0: + print(f'\twidth: {w}\tfreq {f} bigger than prev freq: {prev_freq} and than curr min {curr_min_freq}') + # if prev_freq > 0 and f / prev_freq > 1.2 and (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio: + print('\t\tRatio:', (curr_max_freq - curr_min_freq) / curr_max_freq) if (curr_max_freq - curr_min_freq) / curr_max_freq > min_ratio: boundary_points.append((curr_min_width, curr_min_freq)) curr_max_freq = 0 curr_max_width = 0 curr_min_freq = max_freq + 1 - # print(f"width: {w: >5}\tfreq: {f: >8}\tprev_freq: {prev_freq: >8}" - # f"\tcurr_min_freq: 
{curr_min_freq: >8}" - # f"\tcurr_max_freq: {curr_max_freq}\tboundary points: {boundary_points}") + if debug > 0: + print(f"width: {w: >5}\tfreq: {f: >8}\tprev_freq: {prev_freq: >8}" + f"\tcurr_min_freq: {curr_min_freq: >8}" + f"\tcurr_max_freq: {curr_max_freq}\tboundary points: {boundary_points}") prev_freq = f return [bp[0] for bp in boundary_points] diff --git a/pagexml/analysis/stats.py b/pagexml/analysis/stats.py index 28a77d5..432638c 100644 --- a/pagexml/analysis/stats.py +++ b/pagexml/analysis/stats.py @@ -7,6 +7,9 @@ import pagexml.model.physical_document_model as pdm +DEFAULT_ELEMENTS = ['lines', 'words', 'text_regions', 'columns', 'extra', 'pages'] + + def derive_boundary_points(pagexml_doc: pdm.PageXMLTextRegion) -> List[int]: bin_width = pagexml_doc.coords.width / 5 return [point for point in np.arange(bin_width, pagexml_doc.coords.width, bin_width)] @@ -17,11 +20,14 @@ def _init_doc_stats(line_width_boundary_points: List[int], fields = ['doc_id', 'doc_num', 'doc_width', 'doc_height', 'lines', 'words', 'text_regions', 'columns', 'extra', 'pages', - 'num_words', 'num_number_words', 'num_title_words', 'num_non_title_words', + 'num_words', 'num_alpha_words', 'num_number_words', + 'num_title_words', 'num_non_title_words', 'num_stop_words', 'num_punctuation_words', 'num_oversized_words'] doc_stats = {field: [] for field in fields} for cat_wpl in text_stats.wpl_cat_range: doc_stats[f"words_per_line_{text_stats.wpl_cat_range[cat_wpl]}"] = [] + for cat_wpl in text_stats.wpl_cat_range: + doc_stats[f"alpha_words_per_line_{text_stats.wpl_cat_range[cat_wpl]}"] = [] for length_bin in range(word_length_bin_size, max_word_length + 1, word_length_bin_size): doc_stats[f"num_words_length_{length_bin}"] = [] for width_range in layout_stats.get_boundary_width_ranges(line_width_boundary_points): @@ -70,19 +76,23 @@ def get_doc_stats(pagexml_docs: Union[pdm.PageXMLTextRegion, List[pdm.PageXMLTex for pi, pagexml_doc in enumerate(pagexml_docs): 
pagexml_doc_stats['doc_id'].append(pagexml_doc.id) pagexml_doc_stats['doc_num'].append(pi + 1) - pagexml_doc_stats['doc_width'].append(pagexml_doc.coords.width) - pagexml_doc_stats['doc_height'].append(pagexml_doc.coords.height) + pagexml_doc_stats['doc_width'].append(pagexml_doc.coords.width if pagexml_doc.coords else None) + pagexml_doc_stats['doc_height'].append(pagexml_doc.coords.height if pagexml_doc.coords else None) lines = [line for line in pagexml_doc.get_lines() if line.text is not None] words = text_stats.get_doc_words(pagexml_doc, use_re_word_boundaries=use_re_word_boundaries) word_stats = text_stats.get_word_cat_stats(words, stop_words=stop_words, max_word_length=max_word_length) - wpl_stats = text_stats.get_words_per_line(lines) - for field in pagexml_doc.stats: - pagexml_doc_stats[field].append(pagexml_doc.stats[field]) + wpl_stats = text_stats.get_words_per_line(lines, alpha_words_only=False) + awpl_stats = text_stats.get_words_per_line(lines, alpha_words_only=True) + # for field in pagexml_doc.stats: + for field in DEFAULT_ELEMENTS: + pagexml_doc_stats[field].append(pagexml_doc.stats[field] if field in pagexml_doc.stats else 0) for word_cat in word_stats: pagexml_doc_stats[word_cat].append((word_stats[word_cat])) for wpl_cat in text_stats.wpl_cat_range.values(): pagexml_doc_stats[f'words_per_line_{wpl_cat}'].append(wpl_stats[wpl_cat]) + for wpl_cat in text_stats.wpl_cat_range.values(): + pagexml_doc_stats[f'alpha_words_per_line_{wpl_cat}'].append(awpl_stats[wpl_cat]) if line_width_boundary_points is None: bin_width = pagexml_doc.coords.width / 5 line_width_boundary_points = [point for point in np.arange(bin_width, pagexml_doc.coords.width, bin_width)] diff --git a/pagexml/analysis/text_stats.py b/pagexml/analysis/text_stats.py index f3b08f1..3b18487 100644 --- a/pagexml/analysis/text_stats.py +++ b/pagexml/analysis/text_stats.py @@ -927,13 +927,16 @@ def get_typical_start_end_words(wbd: WordBreakDetector, return typical_start_words, 
typical_end_words -def get_words_per_line(lines: List[pdm.PageXMLTextLine], use_re_word_boundaries: bool = False): +def get_words_per_line(lines: List[pdm.PageXMLTextLine], use_re_word_boundaries: bool = False, + alpha_words_only: bool = False): """Return a Counter of the number of words per line of a PageXML pagexml_doc object. :param lines: a list of PageXMLTextLine objects :type lines: List[PageXMLTextLine] :param use_re_word_boundaries: whether to split words of a line using RegEx word boundaries :type use_re_word_boundaries: bool + :param alpha_words_only: whether to only count words consisting of alpha characters (e.g. no numbers) + :type alpha_words_only: bool :return: a counter of the number of words per line of a pagexml_doc :rtype: Counter """ @@ -944,9 +947,12 @@ def get_words_per_line(lines: List[pdm.PageXMLTextLine], use_re_word_boundaries: if line.text is None or line.text == '': words = [] elif use_re_word_boundaries: - words = [w.replace(' ', '') for w in re.split(r'\b', line.text) if w != ' ' and w != ''] + words = [w.replace(' ', '') for w in re.split(r'\b', line.text)] else: words = [w for w in line.text.split(' ')] + words = [w for w in words if w != ' ' and w != ''] + if alpha_words_only is True: + words = [w for w in words if w.isalpha()] # words_per_line.update([len(words)]) if len(words) in wpl_to_cat: wpl_cat = wpl_to_cat[len(words)] @@ -991,6 +997,7 @@ def get_word_cat_stats(words, stop_words=None, max_word_length: int = 30, word_length_freq = Counter([len(w) for w in words if len(w) <= max_word_length]) word_cat_stats = { 'num_words': len(words), + 'num_alpha_words': len([w for w in words if w.isalpha()]), 'num_number_words': len([w for w in words if w.isdigit()]), 'num_title_words': len([w for w in words if w.istitle()]), 'num_non_title_words': len([w for w in words if w.istitle() is False]), diff --git a/pagexml/column_parser.py b/pagexml/column_parser.py index 63ea874..0630510 100644 --- a/pagexml/column_parser.py +++ 
b/pagexml/column_parser.py @@ -79,7 +79,7 @@ def column_bounding_box_surrounds_lines(column: pdm.PageXMLColumn) -> bool: """Check if the column coordinates contain the coordinate boxes of the column lines.""" for line in column.get_lines(): - if not pagexml_helper.elements_overlap(column, line, threshold=0.6): + if not pagexml_helper.regions_overlap(column, line, threshold=0.6): return False return True diff --git a/pagexml/helper/pagexml_helper.py b/pagexml/helper/pagexml_helper.py index 9c5704b..5a9db9c 100644 --- a/pagexml/helper/pagexml_helper.py +++ b/pagexml/helper/pagexml_helper.py @@ -3,6 +3,7 @@ import re import string from collections import Counter +from enum import Enum from typing import Dict, Generator, List, Set, Tuple, Union import numpy as np @@ -13,19 +14,63 @@ import pagexml.model.physical_document_model as pdm -def elements_overlap(element1: pdm.PageXMLDoc, element2: pdm.PageXMLDoc, - threshold: float = 0.5) -> bool: - """Check if two elements have overlapping coordinates.""" - v_overlap = pdm.get_vertical_overlap(element1, element2) - h_overlap = pdm.get_horizontal_overlap(element1, element2) - if v_overlap / element1.coords.height > threshold: - if h_overlap / element1.coords.width > threshold: +def is_point_inside(point: Tuple[int, int], element: pdm.PageXMLDoc) -> bool: + x, y = point + if x < element.coords.left or x > element.coords.right: + return False + if y < element.coords.top or y > element.coords.bottom: + return False + return True + + +class RegionType(Enum): + + POINT = 1 + HLINE = 2 + VLINE = 3 + BOX = 4 + + +def get_region_type(element: pdm.PageXMLDoc) -> RegionType: + if element.coords.height == 0: + if element.coords.width == 0: + return RegionType.POINT + else: + return RegionType.HLINE + elif element.coords.width == 0: + return RegionType.VLINE + else: + return RegionType.BOX + + +def same_point(point1: Tuple[int, int], point2: Tuple[int, int]) -> bool: + """Check if two points are the same.""" + return point1[0] == 
point2[0] and point1[1] == point2[1] + + +def regions_overlap(region1: pdm.PageXMLDoc, region2: pdm.PageXMLDoc, + threshold: float = 0.5) -> bool: + """Check if two regions have overlapping coordinates. + + Assumption: points are pixels, so regions with at least one point have at least + a width, height and area of 1.""" + if region1.coords is None or region2.coords is None: + return False + + height1 = region1.coords.height + 1 + width1 = region1.coords.width + 1 + height2 = region2.coords.height + 1 + width2 = region2.coords.width + 1 + + v_overlap = pdm.get_vertical_overlap(region1, region2) + h_overlap = pdm.get_horizontal_overlap(region1, region2) + + if v_overlap / height1 > threshold: + if h_overlap / width1 > threshold: return True - if v_overlap / element2.coords.height > threshold: - if h_overlap / element2.coords.width > threshold: + if v_overlap / height2 > threshold: + if h_overlap / width2 > threshold: return True - else: - return False else: return False diff --git a/pagexml/model/physical_document_model.py b/pagexml/model/physical_document_model.py index 2c4b248..fdd1fdc 100644 --- a/pagexml/model/physical_document_model.py +++ b/pagexml/model/physical_document_model.py @@ -7,6 +7,7 @@ import numpy as np from scipy.spatial import ConvexHull from scipy.spatial import QhullError +from shapely.geometry import Polygon def parse_points(points: Union[str, List[Tuple[int, int]]]) -> List[Tuple[int, int]]: @@ -30,6 +31,7 @@ def parse_points(points: Union[str, List[Tuple[int, int]]]) -> List[Tuple[int, i class Coords: def __init__(self, points: Union[str, List[Tuple[int, int]]]): + """Coordinates of a PageXML region based on a set of points.""" self.points: List[Tuple[int, int]] = parse_points(points) self.point_string = " ".join( ",".join([str(point[0]), str(point[1])]) for point in self.points @@ -149,13 +151,13 @@ def get_horizontal_overlap(doc1: PageXMLDoc, doc2: PageXMLDoc) -> int: else: overlap_left = max([doc1.coords.left, doc2.coords.left]) 
overlap_right = min([doc1.coords.right, doc2.coords.right]) - return overlap_right - overlap_left if overlap_right > overlap_left else 0 + return overlap_right - overlap_left + 1 if overlap_right >= overlap_left else 0 def get_vertical_overlap(doc1: PageXMLDoc, doc2: PageXMLDoc) -> int: overlap_top = max([doc1.coords.top, doc2.coords.top]) overlap_bottom = min([doc1.coords.bottom, doc2.coords.bottom]) - return overlap_bottom - overlap_top if overlap_bottom > overlap_top else 0 + return overlap_bottom - overlap_top + 1 if overlap_bottom >= overlap_top else 0 def is_vertically_overlapping(region1: PageXMLDoc, @@ -299,34 +301,40 @@ def sort_lines(line1: PageXMLTextLine, line2: PageXMLTextLine, as_column: bool = def parse_derived_coords(document_list: list) -> Coords: """Derive scan coordinates for a composite document based on the list of documents it contains. A convex hull is drawn around all points of all contained documents.""" - return coords_list_to_hull_coords([document.coords for document in document_list]) + try: + return coords_list_to_hull_coords([document.coords for document in document_list]) + except (IndexError, QhullError) as err: + print('pagexml.model.physical_document_model.parse_derived_coords - ' + 'Error with coords in list of documents with the following ids:\n', + [doc.id for doc in document_list]) + raise def coords_list_to_hull_coords(coords_list): # print(coords_list) points = [point for coords in coords_list for point in coords.points] - points_array = np.array(points) + if len(points) <= 2: + return Coords(points) # print(points) try: - edges = points_to_hull_edges(points_array) + edges = points_to_hull_edges(points) # print(edges) hull_points = edges_to_hull_points(edges) return Coords(hull_points) - except IndexError: - print([coords for coords in coords_list]) - print('points:', points) + except (IndexError, QhullError): + print('pagexml.model.physical_document_model.coords_list_to_hull_coords - IndexError') + print('coords in 
coords_list:', [coords for coords in coords_list]) + print('points derived from list of coords:', points) raise - except QhullError: - print('points:', points) - return Coords([point for point in points]) -def points_to_hull_edges(points): - hull = ConvexHull(points) +def points_to_hull_edges(points: List[Tuple[int, int]]): + points_array = np.array(points) + hull = ConvexHull(points_array) edges = defaultdict(dict) for simplex in hull.simplices: - p1 = (int(points[simplex, 0][0]), int(points[simplex, 1][0])) - p2 = (int(points[simplex, 0][1]), int(points[simplex, 1][1])) + p1 = (int(points_array[simplex, 0][0]), int(points_array[simplex, 1][0])) + p2 = (int(points_array[simplex, 0][1]), int(points_array[simplex, 1][1])) edges[p2][p1] = 1 edges[p1][p2] = 1 return edges @@ -354,11 +362,11 @@ def __init__(self, doc_id: Union[None, str] = None, doc_type: Union[None, str, L self.type = "structure_doc" self.metadata = metadata if metadata else {} self.main_type = 'structure_doc' - if doc_type: - self.type = doc_type + if doc_type is not None: + self.add_type(doc_type) if isinstance(doc_type, str): - self.main_type = main_type - if main_type: + self.main_type = doc_type + if main_type is not None: self.main_type = main_type self.domain = None self.reading_order: Dict[int, str] = reading_order if reading_order else {} @@ -375,6 +383,8 @@ def add_type(self, doc_type: Union[str, List[str]]) -> None: doc_types = [doc_type] if isinstance(doc_type, str) else doc_type if isinstance(self.type, str): self.type = [self.type] + elif isinstance(self.type, set): + self.type = list(self.type) for doc_type in doc_types: if doc_type not in self.type: self.type.append(doc_type) @@ -383,6 +393,8 @@ def remove_type(self, doc_type: Union[str, List[str]]) -> None: doc_types = [doc_type] if isinstance(doc_type, str) else doc_type if isinstance(self.type, str): self.type = [self.type] + elif isinstance(self.type, set): + self.type = list(self.type) for doc_type in doc_types: if doc_type in 
self.type: self.type.remove(doc_type) @@ -423,7 +435,8 @@ def add_parent_id_to_metadata(self): def json(self) -> Dict[str, any]: json_data = { 'id': self.id, - 'type': self.type, + 'type': list(self.type) if isinstance(self.type, set) else self.type, + 'main_type': self.main_type, 'domain': self.domain, 'metadata': self.metadata } @@ -465,11 +478,27 @@ def __init__(self, doc_id: str = None, reading_order: Dict[int, str] = None): super().__init__(doc_id=doc_id, doc_type='physical_structure_doc', metadata=metadata, reading_order=reading_order) self.coords: Union[None, Coords] = coords + self._area = None if doc_type: self.main_type = doc_type self.add_type(doc_type) self.domain = 'physical' + @property + def area(self): + """Returns the size of the area represented by the convex hull of the coordinates. + + The area is calculated the first time this function is called and stored in a + private property for later calls. The reason to not call it at object instantiation + is that it probably not often needed and only computing it when needed is more + efficient.""" + if self._area is None: + if self.coords is None: + self._area = 0 + else: + self._area = poly_area(self.coords.points) + return self._area + @property def json(self) -> Dict[str, any]: doc_json = super().json @@ -491,6 +520,28 @@ def add_parent_id_to_metadata(self): self.metadata[f'{self.parent.main_type}_id'] = self.parent.id +def poly_area(points: List[Tuple[int, int]]): + """Compute the surface area of a polygon represented by a set of Points.""" + if points is None: + return 0 + if len(points) <= 2: + # two points represent a line, which has an area of zero + return 0 + hull_points = points_to_hull_edges(points) + polygon = Polygon(hull_points) + return polygon.area + + +class EmptyRegionDoc(PhysicalStructureDoc): + + def __init__(self, doc_id: str = None, doc_type: str = None, metadata: Dict[str, any] = None, + coords: Coords = None): + super().__init__(doc_id=doc_id, doc_type=doc_type, 
metadata=metadata, coords=coords) + self.add_type('empty') + if doc_type is None: + self.main_type = 'empty' + + class LogicalStructureDoc(StructureDoc): def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, @@ -689,6 +740,7 @@ def __init__(self, doc_id: str = None, doc_type: Union[str, List[str]] = None, self.set_text_regions_in_reader_order() if doc_type: self.add_type(doc_type) + self.empty_regions = [] def __repr__(self): stats = json.dumps(self.stats) diff --git a/tests/helper-pagexml_helper_test.py b/tests/helper-pagexml_helper_test.py index 55cd9fc..250f00a 100644 --- a/tests/helper-pagexml_helper_test.py +++ b/tests/helper-pagexml_helper_test.py @@ -1,16 +1,118 @@ import unittest +from typing import List, Tuple from pagexml.parser import parse_pagexml_file +import pagexml.helper.pagexml_helper as helper +import pagexml.model.physical_document_model as pdm + + +def make_region(points: List[Tuple[int, int]], doc_id: str = 'doc') -> pdm.PageXMLTextRegion: + coords = pdm.Coords(points) + return pdm.PageXMLTextRegion(doc_id=doc_id, coords=coords) + + +class TestRegionType(unittest.TestCase): + + def test_point(self): + tr = make_region([(1, 1)]) + self.assertEqual(helper.RegionType.POINT, helper.get_region_type(tr)) + + def test_hline(self): + tr = make_region([(1, 1), (2, 1)]) + self.assertEqual(helper.RegionType.HLINE, helper.get_region_type(tr)) + + def test_vline(self): + tr = make_region([(1, 1), (1, 2)]) + self.assertEqual(helper.RegionType.VLINE, helper.get_region_type(tr)) + + def test_box(self): + tr = make_region([(1, 1), (2, 2)]) + self.assertEqual(helper.RegionType.BOX, helper.get_region_type(tr)) class TestPageXMLHelper(unittest.TestCase): def setUp(self) -> None: + no_coords = None + point_coords1 = pdm.Coords([(1, 1)]) + point_coords2 = pdm.Coords([(1, 1)]) + point_coords3 = pdm.Coords([(2, 2)]) + hline_coords1 = pdm.Coords([(0, 0), (10, 0)]) + hline_coords2 = pdm.Coords([(5, 0), (15, 0)]) + hline_coords3 = pdm.Coords([(0, 
5), (10, 5)]) + vline_coords1 = pdm.Coords([(0, 0), (0, 10)]) + vline_coords2 = pdm.Coords([(0, 5), (0, 15)]) + vline_coords3 = pdm.Coords([(5, 0), (5, 10)]) + self.no_coords_region = pdm.PageXMLTextRegion(doc_id='no_coords') + self.point_coords_region1 = pdm.PageXMLTextRegion(doc_id='point_coords1', coords=point_coords1) self.page_file = 'data/example.xml' self.page_doc = parse_pagexml_file(self.page_file) - def test_something(self): - self.assertEqual(True, 1 == 1) + def test_element_overlap_no_coords(self): + tr1 = make_region([(1, 1)]) + tr2 = pdm.PageXMLTextRegion(doc_id='no_coords') + self.assertEqual(False, helper.regions_overlap(tr1, tr2)) + self.assertEqual(False, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_same_points(self): + tr1 = make_region([(1, 1)]) + tr2 = make_region([(1, 1)]) + self.assertEqual(True, helper.regions_overlap(tr1, tr2)) + self.assertEqual(True, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_different_points(self): + tr1 = make_region([(1, 1)]) + tr2 = make_region([(1, 2)]) + self.assertEqual(False, helper.regions_overlap(tr1, tr2)) + self.assertEqual(False, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_point_on_horizontal_line(self): + tr1 = make_region([(5, 1)]) + tr2 = make_region([(1, 1), (10, 1)]) + self.assertEqual(True, helper.regions_overlap(tr1, tr2)) + self.assertEqual(True, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_point_on_vertical_line(self): + tr1 = make_region([(1, 5)]) + tr2 = make_region([(1, 1), (1, 10)]) + self.assertEqual(True, helper.regions_overlap(tr1, tr2)) + self.assertEqual(True, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_point_not_on_horizontal_line(self): + tr1 = make_region([(5, 2)]) + tr2 = make_region([(1, 1), (10, 1)]) + self.assertEqual(False, helper.regions_overlap(tr1, tr2)) + self.assertEqual(False, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_point_not_on_vertical_line(self): + tr1 = 
make_region([(2, 5)]) + tr2 = make_region([(1, 1), (1, 10)]) + self.assertEqual(False, helper.regions_overlap(tr1, tr2)) + self.assertEqual(False, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_point_inside_box(self): + tr1 = make_region([(5, 5)]) + tr2 = make_region([(0, 0), (10, 0), (10, 10), (0, 10)]) + self.assertEqual(True, helper.regions_overlap(tr1, tr2)) + self.assertEqual(True, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_point_outside_box(self): + tr1 = make_region([(5, 15)]) + tr2 = make_region([(0, 0), (10, 0), (10, 10), (0, 10)]) + self.assertEqual(False, helper.regions_overlap(tr1, tr2)) + self.assertEqual(False, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_horizontal_line_through_box(self): + tr1 = make_region([(5, 5), (5, 15)]) + tr2 = make_region([(0, 0), (10, 0), (10, 10), (0, 10)]) + self.assertEqual(True, helper.regions_overlap(tr1, tr2)) + self.assertEqual(True, helper.regions_overlap(tr2, tr1)) + + def test_element_overlap_horizontal_line_outside_box(self): + tr1 = make_region([(5, 15), (5, 20)]) + tr2 = make_region([(0, 0), (10, 0), (10, 10), (0, 10)]) + self.assertEqual(False, helper.regions_overlap(tr1, tr2)) + self.assertEqual(False, helper.regions_overlap(tr2, tr1)) if __name__ == '__main__': diff --git a/tests/physical_document_model_test.py b/tests/physical_document_model_test.py index e2cdfe9..9d71b9b 100644 --- a/tests/physical_document_model_test.py +++ b/tests/physical_document_model_test.py @@ -1,8 +1,8 @@ +import math import unittest from unittest.mock import Mock import pagexml.model.physical_document_model as pdm -# from pagexml.model.physical_document_model import pdm.Coords, pdm.StructureDoc, PhysicalStructureDoc, pdm.LogicalStructureDoc class TestCoords(unittest.TestCase): @@ -24,7 +24,7 @@ def test_point_string(self): def test_invalid_points(self): with self.assertRaises(ValueError): - coords = pdm.Coords('invalid points') + pdm.Coords('invalid points') class 
TestHullCoords(unittest.TestCase): @@ -51,6 +51,19 @@ def test_list_of_line_point_coords_to_hull_of_coords(self): self.assertIn(point, hull_coords.points) +class TestHelperFunctions(unittest.TestCase): + + def test_poly_area_correctly_calculates_square_area(self): + side = 50 + square_points = [(0, 0), (0, side), (side, side), (side, 0)] + self.assertEqual(side**2, pdm.poly_area(square_points)) + + def test_poly_area_ignores_inner_points(self): + side = 50 + square_points = [(0, 0), (0, side), (side, side), (side, 0), (side/2, side/2)] + self.assertEqual(side**2, pdm.poly_area(square_points)) + + class TestStructureDoc(unittest.TestCase): def test_init(self): @@ -146,9 +159,11 @@ def test_add_parent_id_to_metadata(self): def test_json(self): doc = pdm.StructureDoc(doc_id='doc1', doc_type='book', metadata={'title': 'The Great Gatsby'}) + print('TEST_JSON - doc.main_type:', doc.main_type) json_data = doc.json self.assertEqual('doc1', json_data['id']) - self.assertEqual('book', json_data['type']) + self.assertIn('book', json_data['type']) + self.assertEqual('book', json_data['main_type']) self.assertEqual({'title': 'The Great Gatsby'}, json_data['metadata']) self.assertEqual({}, json_data.get('reading_order', {})) @@ -166,7 +181,7 @@ def setUp(self): def test_init(self): self.assertEqual('doc1', self.doc.id) - self.assertEqual(['physical_structure_doc', 'book'], self.doc.type) + self.assertEqual(['structure_doc', 'physical_structure_doc', 'book'], self.doc.type) self.assertEqual(self.metadata, self.doc.metadata) self.assertEqual(self.coords, self.doc.coords) @@ -187,7 +202,8 @@ def test_add_parent_id_to_metadata(self): def test_json(self): expected_json = { 'id': 'doc1', - 'type': ['physical_structure_doc', 'book'], + 'type': ['structure_doc', 'physical_structure_doc', 'book'], + 'main_type': 'book', 'domain': 'physical', 'metadata': {'author': 'Jane Doe'}, 'coords': [(0, 0), (0, 10), (10, 10), (10, 0)] @@ -195,6 +211,41 @@ def test_json(self): 
self.assertEqual(expected_json, self.doc.json) +class TestPhysicalDocArea(unittest.TestCase): + + def setUp(self) -> None: + points = [(0, 100), (300, 100), (300, 200), (0, 200), (150, 150)] + coords = pdm.Coords(points) + self.doc = pdm.PhysicalStructureDoc(doc_id='doc1', coords=coords) + + def test_doc_has_no_initial_area(self): + self.assertEqual(None, self.doc._area) + + def test_doc_has_area(self): + self.assertEqual(100*300, self.doc.area) + + def test_doc_area_sets_area(self): + area = self.doc.area + self.assertEqual(area, self.doc._area) + + def test_diamond_shape_has_correct_area(self): + points = [(0, 100), (100, 0), (200, 100), (100, 200)] + coords = pdm.Coords(points) + diamond = pdm.PhysicalStructureDoc(doc_id='doc1', coords=coords) + side = math.sqrt(100**2 + 100**2) + area = side * side + self.assertAlmostEqual(area, diamond.area) + + +class TestEmptyRegion(unittest.TestCase): + + def test_create_empty_region(self): + points = [(0, 100), (300, 100), (300, 200), (0, 200), (150, 150)] + coords = pdm.Coords(points) + empty_region = pdm.EmptyRegionDoc(doc_id='empty', coords=coords) + self.assertEqual(300 * 100, empty_region.area) + + class TestLogicalStructureDoc(unittest.TestCase): def setUp(self): self.doc = pdm.LogicalStructureDoc( From 41d669ab5e3b700d576500ceaddf1c5fc9045e09 Mon Sep 17 00:00:00 2001 From: Marijn Koolen Date: Mon, 18 Mar 2024 14:18:35 +0100 Subject: [PATCH 2/7] Add missing package (shapely) and update to include python 3.12 --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2ef3f75..c00898a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ classifiers = [ packages = [{ include = "pagexml" }] [tool.poetry.dependencies] -python = "^3.8,<3.12" +python = "^3.8,<3.13" fuzzy-search = "^2.0.0a" matplotlib = "^3.7.0" numpy = "^1.22.3" @@ -29,6 +29,7 @@ scipy = "^1.7.0" seaborn = "^0.12.2" tqdm = "^4.64.1" xmltodict = "^0.13.0" +shapely = "^2.0.3"
[tool.poetry.dev-dependencies] From ebc46c01ea31d0b4ce7a31743bb163f7d4d7a350 Mon Sep 17 00:00:00 2001 From: Marijn Koolen Date: Mon, 18 Mar 2024 14:21:49 +0100 Subject: [PATCH 3/7] Update to include python 3.12 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c00898a..cc6491c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ classifiers = [ packages = [{ include = "pagexml" }] [tool.poetry.dependencies] -python = "^3.8,<3.13" +python = "^3.8,<4.0" fuzzy-search = "^2.0.0a" matplotlib = "^3.7.0" numpy = "^1.22.3" From 92d8cfebc8e6b112acbca47be695d12f15b97e76 Mon Sep 17 00:00:00 2001 From: Marijn Koolen Date: Mon, 18 Mar 2024 14:24:23 +0100 Subject: [PATCH 4/7] Update to include python 3.12 --- pyproject.toml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cc6491c..47b2d9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pagexml-tools" -version = "0.3.4" +version = "0.4.2" description = "Utility functions for reading PageXML files" authors = ["Marijn Koolen ", "Bram Buitendijk "] readme = "README.md" @@ -19,14 +19,15 @@ packages = [{ include = "pagexml" }] [tool.poetry.dependencies] python = "^3.8,<4.0" -fuzzy-search = "^2.0.0a" -matplotlib = "^3.7.0" -numpy = "^1.22.3" -pandas = "^1.5.3" +fuzzy-search = "^2.1.0" +matplotlib = ">=3.7.0" +numpy = ">=1.22.3" +pandas = ">=1.5.3 <3.0.0" py7zr = "^0.20.2" python-dateutil = "^2.8.2" -scipy = "^1.7.0" -seaborn = "^0.12.2" +pyyaml = "^6.0" +scipy = ">=1.7.0" +seaborn = "^0.13.0" tqdm = "^4.64.1" xmltodict = "^0.13.0" shapely = "^2.0.3" From acbdc9a1022d1c51f6dbe40375345552b28e6798 Mon Sep 17 00:00:00 2001 From: Marijn Koolen Date: Mon, 18 Mar 2024 14:25:45 +0100 Subject: [PATCH 5/7] Update to include python 3.12 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml 
index 47b2d9f..24a6a9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ classifiers = [ packages = [{ include = "pagexml" }] [tool.poetry.dependencies] -python = "^3.8,<4.0" +python = "^3.8 <4.0" fuzzy-search = "^2.1.0" matplotlib = ">=3.7.0" numpy = ">=1.22.3" From f37773a04acbf6088a7c5f15b3cf4273b553ec6e Mon Sep 17 00:00:00 2001 From: Marijn Koolen Date: Mon, 18 Mar 2024 14:26:35 +0100 Subject: [PATCH 6/7] Update to include python 3.12 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 24a6a9b..bd8db5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ classifiers = [ packages = [{ include = "pagexml" }] [tool.poetry.dependencies] -python = "^3.8 <4.0" +python = ">=3.8 <4.0" fuzzy-search = "^2.1.0" matplotlib = ">=3.7.0" numpy = ">=1.22.3" From ba9de4e8fcb8ade8238bdfae0b0e5b967cc66297 Mon Sep 17 00:00:00 2001 From: Bram Buitendijk Date: Mon, 18 Mar 2024 14:44:54 +0100 Subject: [PATCH 7/7] fix failing test: let regions_overlap return False when none of the conditions are met --- pagexml/helper/pagexml_helper.py | 3 +-- tests/helper-pagexml_helper_test.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pagexml/helper/pagexml_helper.py b/pagexml/helper/pagexml_helper.py index c3d52d8..d66cf17 100644 --- a/pagexml/helper/pagexml_helper.py +++ b/pagexml/helper/pagexml_helper.py @@ -71,8 +71,7 @@ def regions_overlap(region1: pdm.PageXMLDoc, region2: pdm.PageXMLDoc, if v_overlap / height2 > threshold: if h_overlap / width2 > threshold: return True - else: - return False + return False def sort_regions_in_reading_order(doc: pdm.PageXMLDoc) -> List[pdm.PageXMLTextRegion]: diff --git a/tests/helper-pagexml_helper_test.py b/tests/helper-pagexml_helper_test.py index 250f00a..b078007 100644 --- a/tests/helper-pagexml_helper_test.py +++ b/tests/helper-pagexml_helper_test.py @@ -1,9 +1,9 @@ import unittest from typing import List, Tuple -from pagexml.parser 
import parse_pagexml_file import pagexml.helper.pagexml_helper as helper import pagexml.model.physical_document_model as pdm +from pagexml.parser import parse_pagexml_file def make_region(points: List[Tuple[int, int]], doc_id: str = 'doc') -> pdm.PageXMLTextRegion: