feat: Add pipeline timings and toggle visualization, establish debug settings (#183)

* Add settings to turn visualization on or off

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add profiling code to all models

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Refactor and fix profiling code

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Visualization code outputs PNGs to debug dir

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for time logging

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Optimize imports

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add start_timestamps to ProfilingItem

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
cau-git authored Oct 30, 2024
1 parent 94a5290 commit 2a2c65b
Showing 23 changed files with 1,004 additions and 777 deletions.
4 changes: 0 additions & 4 deletions docling/backend/asciidoc_backend.py
@@ -1,24 +1,20 @@
import logging
import os
import re
from io import BytesIO
from pathlib import Path
from typing import Set, Union

from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
ImageRef,
NodeItem,
Size,
TableCell,
TableData,
)
from pydantic import AnyUrl

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
2 changes: 1 addition & 1 deletion docling/datamodel/base_models.py
@@ -1,6 +1,6 @@
from enum import Enum, auto
from io import BytesIO
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
from typing import TYPE_CHECKING, Dict, List, Optional, Union

from docling_core.types.doc import (
BoundingBox,
4 changes: 3 additions & 1 deletion docling/datamodel/document.py
@@ -3,7 +3,7 @@
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union

import filetype
from docling_core.types.doc import (
@@ -52,6 +52,7 @@
Page,
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.profiling import ProfilingItem
from docling.utils.utils import create_file_hash, create_hash

if TYPE_CHECKING:
@@ -187,6 +188,7 @@ class ConversionResult(BaseModel):

pages: List[Page] = []
assembled: AssembledUnit = AssembledUnit()
timings: Dict[str, ProfilingItem] = {}

document: DoclingDocument = _EMPTY_DOCLING_DOC

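The new timings field exposes per-stage profiling data on the conversion result. A minimal sketch of reading it, assuming profiling has been switched on via the debug settings introduced below (the input file name and the print loop are illustrative only):

from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter

settings.debug.profile_pipeline_timings = True  # collect stage timings

conv_res = DocumentConverter().convert("example.pdf")  # placeholder input
for stage, item in conv_res.timings.items():
    # each ProfilingItem holds the samples recorded for one pipeline stage
    print(stage, item)
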
16 changes: 15 additions & 1 deletion docling/datamodel/settings.py
@@ -1,4 +1,5 @@
import sys
from pathlib import Path

from pydantic import BaseModel
from pydantic_settings import BaseSettings
@@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
# To force models into single core: export OMP_NUM_THREADS=1


class DebugSettings(BaseModel):
visualize_cells: bool = False
visualize_ocr: bool = False
visualize_layout: bool = False
visualize_tables: bool = False

profile_pipeline_timings: bool = False

# Path used to output debug information.
debug_output_path: str = str(Path.cwd() / "debug")


class AppSettings(BaseSettings):
perf: BatchConcurrencySettings
debug: DebugSettings


settings = AppSettings(perf=BatchConcurrencySettings())
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
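
These debug toggles live on the module-level settings singleton, so visualizations can be enabled at runtime before running a conversion. A brief sketch, assuming the singleton is mutated in place (the default output directory ./debug comes from debug_output_path above; the override path is a placeholder):

from docling.datamodel.settings import settings

# Write layout and OCR overlays as PNGs under the debug output path
# instead of opening interactive image windows.
settings.debug.visualize_layout = True
settings.debug.visualize_ocr = True
settings.debug.debug_output_path = "/tmp/docling_debug"  # optional override
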
20 changes: 12 additions & 8 deletions docling/document_converter.py
@@ -189,24 +189,35 @@ def _convert(
) -> Iterator[ConversionResult]:
assert self.format_to_options is not None

start_time = time.monotonic()

for input_batch in chunkify(
conv_input.docs(self.format_to_options),
settings.perf.doc_batch_size, # pass format_options
):
_log.info(f"Going to convert document batch...")

# parallel processing only within input_batch
# with ThreadPoolExecutor(
# max_workers=settings.perf.doc_batch_concurrency
# ) as pool:
# yield from pool.map(self.process_document, input_batch)

# Note: PDF backends are not thread-safe, thread pool usage was disabled.

for item in map(
partial(self._process_document, raises_on_error=raises_on_error),
input_batch,
):
elapsed = time.monotonic() - start_time
start_time = time.monotonic()

if item is not None:
_log.info(
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
)
yield item
else:
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")

def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
assert self.format_to_options is not None
@@ -237,15 +248,8 @@ def _process_document(
assert self.allowed_formats is not None
assert in_doc.format in self.allowed_formats

start_doc_time = time.time()

conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)

end_doc_time = time.time() - start_doc_time
_log.info(
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
)

return conv_res

def _execute_pipeline(
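Since map() evaluates lazily, each document is only processed when the surrounding loop requests the next item; resetting start_time at the top of the loop body therefore attributes the elapsed time to exactly one document. A stripped-down sketch of that pattern (names are illustrative, not part of the diff):

import time

def timed_map(process, items):
    # Lazily process items and yield (elapsed_seconds, result) pairs,
    # mirroring the per-document timing added to _convert above.
    start = time.monotonic()
    for result in map(process, items):
        elapsed = time.monotonic() - start
        start = time.monotonic()
        yield elapsed, result
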
5 changes: 4 additions & 1 deletion docling/models/base_model.py
@@ -4,11 +4,14 @@
from docling_core.types.doc import DoclingDocument, NodeItem

from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult


class BasePageModel(ABC):
@abstractmethod
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
pass


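Page models now receive the ConversionResult alongside the page batch, which is what lets them record timings against the result. A minimal sketch of a model implementing the new signature (the model itself is hypothetical; the TimeRecorder call mirrors its use in ds_glm_model.py below, and relying on its default scope here is an assumption):

from typing import Iterable

from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder


class NoOpPageModel(BasePageModel):  # hypothetical example model
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            with TimeRecorder(conv_res, "noop"):  # assumed page-scope default
                pass  # a real model would enrich the page here
            yield page
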
25 changes: 21 additions & 4 deletions docling/models/base_ocr_model.py
@@ -1,6 +1,7 @@
import copy
import logging
from abc import abstractmethod
from pathlib import Path
from typing import Iterable, List

import numpy as np
@@ -10,12 +11,15 @@
from scipy.ndimage import find_objects, label

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel

_log = logging.getLogger(__name__)


class BaseOcrModel:
class BaseOcrModel(BasePageModel):
def __init__(self, enabled: bool, options: OcrOptions):
self.enabled = enabled
self.options = options
@@ -113,7 +117,7 @@ def is_overlapping_with_existing_cells(ocr_cell):
]
return filtered_ocr_cells

def draw_ocr_rects_and_cells(self, page, ocr_rects):
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image, "RGBA")

@@ -130,8 +134,21 @@ def draw_ocr_rects_and_cells(self, page, ocr_rects):
if isinstance(tc, OcrCell):
color = "magenta"
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
image.show()

if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)

out_file = out_path / f"ocr_page_{page.page_no:05}.png"
image.save(str(out_file), format="png")

@abstractmethod
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
pass
38 changes: 27 additions & 11 deletions docling/models/ds_glm_model.py
@@ -1,5 +1,6 @@
import copy
import random
from pathlib import Path
from typing import List, Union

from deepsearch_glm.nlp_utils import init_nlp_model
@@ -27,6 +28,8 @@

from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
from docling.datamodel.settings import settings
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import create_hash


@@ -226,23 +229,24 @@ def make_spans(cell):
return ds_doc

def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
ds_doc = self._to_legacy_document(conv_res)
ds_doc_dict = ds_doc.model_dump(by_alias=True)

glm_doc = self.model.apply_on_doc(ds_doc_dict)
glm_doc = self.model.apply_on_doc(ds_doc_dict)

docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental

# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no):
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
clusters_to_draw = []
image = copy.deepcopy(conv_res.pages[page_no].image)
for ix, elem in enumerate(ds_document.main_text):
if isinstance(elem, BaseText):
prov = elem.prov[0]
prov = elem.prov[0] # type: ignore
elif isinstance(elem, Ref):
_, arr, index = elem.ref.split("/")
index = int(index)
index = int(index) # type: ignore
if arr == "tables":
prov = ds_document.tables[index].prov[0]
elif arr == "figures":
@@ -256,7 +260,7 @@ def draw_clusters_and_cells(ds_document, page_no):
id=ix,
label=elem.name,
bbox=BoundingBox.from_tuple(
coord=prov.bbox,
coord=prov.bbox, # type: ignore
origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(conv_res.pages[page_no].size.height),
)
@@ -276,9 +280,21 @@ def draw_clusters_and_cells(ds_document, page_no):
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()

# draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0)
if show:
image.show()
else:
out_path: Path = (
Path(settings.debug.debug_output_path)
/ f"debug_{conv_res.input.file.stem}"
)
out_path.mkdir(parents=True, exist_ok=True)

out_file = out_path / f"doc_page_{page_no:05}.png"
image.save(str(out_file), format="png")

# for item in ds_doc.page_dimensions:
# page_no = item.page
# draw_clusters_and_cells(ds_doc, page_no)

return docling_doc
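
docling/utils/profiling.py itself is not among the files shown here. Based on how ProfilingItem, ProfilingScope and TimeRecorder are used in the diffs above, a rough sketch of the shape that module could take (everything beyond the imported names is an assumption; the real implementation presumably also honors settings.debug.profile_pipeline_timings):

import time
from enum import Enum
from typing import List

from pydantic import BaseModel


class ProfilingScope(str, Enum):
    PAGE = "page"
    DOCUMENT = "document"


class ProfilingItem(BaseModel):
    scope: ProfilingScope = ProfilingScope.PAGE
    times: List[float] = []
    start_timestamps: List[float] = []  # added by this commit


class TimeRecorder:
    # Context manager that appends one timing sample per "with" block
    # into conv_res.timings under the given key.
    def __init__(self, conv_res, key: str, scope: ProfilingScope = ProfilingScope.PAGE):
        self.item = conv_res.timings.setdefault(key, ProfilingItem(scope=scope))
        self.start = 0.0

    def __enter__(self):
        self.start = time.monotonic()
        self.item.start_timestamps.append(time.time())
        return self

    def __exit__(self, *exc):
        self.item.times.append(time.monotonic() - self.start)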