feat: Add pipeline timings and toggle visualization, establish debug settings (#183)

* Add settings to turn visualization on or off

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add profiling code to all models

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Refactor and fix profiling code

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Visualization code outputs PNGs to debug dir

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for time logging

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Optimize imports

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add start_timestamps to ProfilingItem

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
cau-git authored Oct 30, 2024
1 parent 94a5290 commit 2a2c65b
Showing 23 changed files with 1,004 additions and 777 deletions.
4 changes: 0 additions & 4 deletions docling/backend/asciidoc_backend.py
@@ -1,24 +1,20 @@
import logging
import os
import re
from io import BytesIO
from pathlib import Path
from typing import Set, Union

from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
ImageRef,
NodeItem,
Size,
TableCell,
TableData,
)
from pydantic import AnyUrl

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
2 changes: 1 addition & 1 deletion docling/datamodel/base_models.py
@@ -1,6 +1,6 @@
from enum import Enum, auto
from io import BytesIO
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
from typing import TYPE_CHECKING, Dict, List, Optional, Union

from docling_core.types.doc import (
BoundingBox,
4 changes: 3 additions & 1 deletion docling/datamodel/document.py
@@ -3,7 +3,7 @@
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union

import filetype
from docling_core.types.doc import (
@@ -52,6 +52,7 @@
Page,
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.profiling import ProfilingItem
from docling.utils.utils import create_file_hash, create_hash

if TYPE_CHECKING:
@@ -187,6 +188,7 @@ class ConversionResult(BaseModel):

pages: List[Page] = []
assembled: AssembledUnit = AssembledUnit()
timings: Dict[str, ProfilingItem] = {}

document: DoclingDocument = _EMPTY_DOCLING_DOC

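The new timings field exposes per-stage profiling data on the conversion result. A minimal sketch of reading it, assuming profiling has been switched on via the debug settings introduced below (the input file name and the print loop are illustrative only):

from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter

settings.debug.profile_pipeline_timings = True  # collect stage timings

conv_res = DocumentConverter().convert("example.pdf")  # placeholder input
for stage, item in conv_res.timings.items():
    # each ProfilingItem holds the samples recorded for one pipeline stage
    print(stage, item)
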
16 changes: 15 additions & 1 deletion docling/datamodel/settings.py
@@ -1,4 +1,5 @@
import sys
from pathlib import Path

from pydantic import BaseModel
from pydantic_settings import BaseSettings
@@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
# To force models into single core: export OMP_NUM_THREADS=1


class DebugSettings(BaseModel):
visualize_cells: bool = False
visualize_ocr: bool = False
visualize_layout: bool = False
visualize_tables: bool = False

profile_pipeline_timings: bool = False

# Path used to output debug information.
debug_output_path: str = str(Path.cwd() / "debug")


class AppSettings(BaseSettings):
perf: BatchConcurrencySettings
debug: DebugSettings


settings = AppSettings(perf=BatchConcurrencySettings())
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
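
These debug toggles live on the module-level settings singleton, so visualizations can be enabled at runtime before running a conversion. A brief sketch, assuming the singleton is mutated in place (the default output directory ./debug comes from debug_output_path above; the override path is a placeholder):

from docling.datamodel.settings import settings

# Write layout and OCR overlays as PNGs under the debug output path
# instead of opening interactive image windows.
settings.debug.visualize_layout = True
settings.debug.visualize_ocr = True
settings.debug.debug_output_path = "/tmp/docling_debug"  # optional override
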
20 changes: 12 additions & 8 deletions docling/document_converter.py
@@ -189,24 +189,35 @@ def _convert(
) -> Iterator[ConversionResult]:
assert self.format_to_options is not None

start_time = time.monotonic()

for input_batch in chunkify(
conv_input.docs(self.format_to_options),
settings.perf.doc_batch_size, # pass format_options
):
_log.info(f"Going to convert document batch...")

# parallel processing only within input_batch
# with ThreadPoolExecutor(
# max_workers=settings.perf.doc_batch_concurrency
# ) as pool:
# yield from pool.map(self.process_document, input_batch)

# Note: PDF backends are not thread-safe, thread pool usage was disabled.

for item in map(
partial(self._process_document, raises_on_error=raises_on_error),
input_batch,
):
elapsed = time.monotonic() - start_time
start_time = time.monotonic()

if item is not None:
_log.info(
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
)
yield item
else:
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")

def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
assert self.format_to_options is not None
@@ -237,15 +248,8 @@ def _process_document(
assert self.allowed_formats is not None
assert in_doc.format in self.allowed_formats

start_doc_time = time.time()

conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)

end_doc_time = time.time() - start_doc_time
_log.info(
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
)

return conv_res

def _execute_pipeline(
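Since map() evaluates lazily, each document is only processed when the surrounding loop requests the next item; resetting start_time at the top of the loop body therefore attributes the elapsed time to exactly one document. A stripped-down sketch of that pattern (names are illustrative, not part of the diff):

import time

def timed_map(process, items):
    # Lazily process items and yield (elapsed_seconds, result) pairs,
    # mirroring the per-document timing added to _convert above.
    start = time.monotonic()
    for result in map(process, items):
        elapsed = time.monotonic() - start
        start = time.monotonic()
        yield elapsed, result
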
5 changes: 4 additions & 1 deletion docling/models/base_model.py
@@ -4,11 +4,14 @@
from docling_core.types.doc import DoclingDocument, NodeItem

from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult


class BasePageModel(ABC):
@abstractmethod
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
pass


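Page models now receive the ConversionResult alongside the page batch, which is what lets them record timings against the result. A minimal sketch of a model implementing the new signature (the model itself is hypothetical; the TimeRecorder call mirrors its use in ds_glm_model.py below, and relying on its default scope here is an assumption):

from typing import Iterable

from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder


class NoOpPageModel(BasePageModel):  # hypothetical example model
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            with TimeRecorder(conv_res, "noop"):  # assumed page-scope default
                pass  # a real model would enrich the page here
            yield page
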
25 changes: 21 additions & 4 deletions docling/models/base_ocr_model.py
@@ -1,6 +1,7 @@
import copy
import logging
from abc import abstractmethod
from pathlib import Path
from typing import Iterable, List

import numpy as np
@@ -10,12 +11,15 @@
from scipy.ndimage import find_objects, label

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel

_log = logging.getLogger(__name__)


class BaseOcrModel:
class BaseOcrModel(BasePageModel):
def __init__(self, enabled: bool, options: OcrOptions):
self.enabled = enabled
self.options = options
@@ -113,7 +117,7 @@ def is_overlapping_with_existing_cells(ocr_cell):
]
return filtered_ocr_cells

def draw_ocr_rects_and_cells(self, page, ocr_rects):
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image, "RGBA")

@@ -130,8 +134,21 @@ def draw_ocr_rects_and_cells(self, page, ocr_rects):
if isinstance(tc, OcrCell):
color = "magenta"
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
image.show()

if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)

out_file = out_path / f"ocr_page_{page.page_no:05}.png"
image.save(str(out_file), format="png")

@abstractmethod
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
pass
38 changes: 27 additions & 11 deletions docling/models/ds_glm_model.py
@@ -1,5 +1,6 @@
import copy
import random
from pathlib import Path
from typing import List, Union

from deepsearch_glm.nlp_utils import init_nlp_model
@@ -27,6 +28,8 @@

from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.datamodel.settings import settings
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash


@@ -226,23 +229,24 @@ def make_spans(cell):
return ds_doc

def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)

glm_doc = self.model.apply_on_doc(ds_doc_dict)
glm_doc = self.model.apply_on_doc(ds_doc_dict)

docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental

# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no):
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
clusters_to_draw = []
image = copy.deepcopy(conv_res.pages[page_no].image)
for ix, elem in enumerate(ds_document.main_text):
if isinstance(elem, BaseText):
prov = elem.prov[0]
prov = elem.prov[0] # type: ignore
elif isinstance(elem, Ref):
_, arr, index = elem.ref.split("/")
index = int(index)
index = int(index) # type: ignore
if arr == "tables":
prov = ds_document.tables[index].prov[0]
elif arr == "figures":
@@ -256,7 +260,7 @@ def draw_clusters_and_cells(ds_document, page_no):
id=ix,
label=elem.name,
bbox=BoundingBox.from_tuple(
coord=prov.bbox,
coord=prov.bbox, # type: ignore
origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(conv_res.pages[page_no].size.height),
)
@@ -276,9 +280,21 @@ def draw_clusters_and_cells(ds_document, page_no):
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()

# draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0)
if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)

out_file = out_path / f"doc_page_{page_no:05}.png"
image.save(str(out_file), format="png")

# for item in ds_doc.page_dimensions:
# page_no = item.page
# draw_clusters_and_cells(ds_doc, page_no)

return docling_doc
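
docling/utils/profiling.py itself is not among the files shown here. Based on how ProfilingItem, ProfilingScope and TimeRecorder are used in the diffs above, a rough sketch of the shape that module could take (everything beyond the imported names is an assumption; the real implementation presumably also honors settings.debug.profile_pipeline_timings):

import time
from enum import Enum
from typing import List

from pydantic import BaseModel


class ProfilingScope(str, Enum):
    PAGE = "page"
    DOCUMENT = "document"


class ProfilingItem(BaseModel):
    scope: ProfilingScope = ProfilingScope.PAGE
    times: List[float] = []
    start_timestamps: List[float] = []  # added by this commit


class TimeRecorder:
    # Context manager that appends one timing sample per "with" block
    # into conv_res.timings under the given key.
    def __init__(self, conv_res, key: str, scope: ProfilingScope = ProfilingScope.PAGE):
        self.item = conv_res.timings.setdefault(key, ProfilingItem(scope=scope))
        self.start = 0.0

    def __enter__(self):
        self.start = time.monotonic()
        self.item.start_timestamps.append(time.time())
        return self

    def __exit__(self, *exc):
        self.item.times.append(time.monotonic() - self.start)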