From 2a2c65bf4f89a715c27310eaa9cd9db635e0f673 Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Wed, 30 Oct 2024 15:04:19 +0100 Subject: [PATCH] feat: Add pipeline timings and toggle visualization, establish debug settings (#183) * Add settings to turn visualization on or off Signed-off-by: Christoph Auer * Add profiling code to all models Signed-off-by: Christoph Auer * Refactor and fix profiling codes Signed-off-by: Christoph Auer * Visualization codes output PNG to debug dir Signed-off-by: Christoph Auer * Fixes for time logging Signed-off-by: Christoph Auer * Optimize imports Signed-off-by: Christoph Auer * Update lockfile Signed-off-by: Christoph Auer * Add start_timestamps to ProfilingItem Signed-off-by: Christoph Auer --------- Signed-off-by: Christoph Auer --- docling/backend/asciidoc_backend.py | 4 - docling/datamodel/base_models.py | 2 +- docling/datamodel/document.py | 4 +- docling/datamodel/settings.py | 16 +- docling/document_converter.py | 20 +- docling/models/base_model.py | 5 +- docling/models/base_ocr_model.py | 25 +- docling/models/ds_glm_model.py | 38 +- docling/models/easyocr_model.py | 88 +++-- docling/models/layout_model.py | 148 +++++--- docling/models/page_assemble_model.py | 202 +++++----- docling/models/page_preprocessing_model.py | 32 +- docling/models/table_structure_model.py | 215 ++++++----- docling/models/tesseract_ocr_cli_model.py | 114 +++--- docling/models/tesseract_ocr_model.py | 102 ++--- docling/pipeline/base_pipeline.py | 137 ++++--- docling/pipeline/simple_pipeline.py | 19 +- docling/pipeline/standard_pdf_pipeline.py | 115 +++--- docling/utils/profiling.py | 62 +++ docs/examples/batch_convert.py | 7 + poetry.lock | 420 ++++++++++----------- tests/test_backend_asciidoc.py | 3 - tests/test_e2e_conversion.py | 3 - 23 files changed, 1004 insertions(+), 777 deletions(-) create mode 100644 docling/utils/profiling.py diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py index c9d2fc52..829419af 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docling/backend/asciidoc_backend.py @@ -1,24 +1,20 @@ import logging -import os import re from io import BytesIO from pathlib import Path from typing import Set, Union from docling_core.types.doc import ( - DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, GroupItem, GroupLabel, ImageRef, - NodeItem, Size, TableCell, TableData, ) -from pydantic import AnyUrl from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.datamodel.base_models import InputFormat diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index a82d86a5..d06b6097 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -1,6 +1,6 @@ from enum import Enum, auto from io import BytesIO -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Union from docling_core.types.doc import ( BoundingBox, diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 41a8af35..be4e9a12 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -3,7 +3,7 @@ from enum import Enum from io import BytesIO from pathlib import Path, PurePath -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union import filetype from docling_core.types.doc import ( @@ -52,6 +52,7 @@ Page, ) from docling.datamodel.settings import DocumentLimits +from docling.utils.profiling import ProfilingItem from docling.utils.utils import create_file_hash, create_hash if TYPE_CHECKING: @@ -187,6 +188,7 @@ class ConversionResult(BaseModel): pages: List[Page] = [] assembled: AssembledUnit = AssembledUnit() + timings: Dict[str, ProfilingItem] = {} document: DoclingDocument = _EMPTY_DOCLING_DOC diff --git a/docling/datamodel/settings.py b/docling/datamodel/settings.py index 616d41dc..7daf5047 100644 --- a/docling/datamodel/settings.py +++ b/docling/datamodel/settings.py @@ -1,4 +1,5 @@ import sys +from pathlib import Path from pydantic import BaseModel from pydantic_settings import BaseSettings @@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel): # To force models into single core: export OMP_NUM_THREADS=1 +class DebugSettings(BaseModel): + visualize_cells: bool = False + visualize_ocr: bool = False + visualize_layout: bool = False + visualize_tables: bool = False + + profile_pipeline_timings: bool = False + + # Path used to output debug information. + debug_output_path: str = str(Path.cwd() / "debug") + + class AppSettings(BaseSettings): perf: BatchConcurrencySettings + debug: DebugSettings -settings = AppSettings(perf=BatchConcurrencySettings()) +settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings()) diff --git a/docling/document_converter.py b/docling/document_converter.py index 0fae96b0..d6d4a630 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -189,24 +189,35 @@ def _convert( ) -> Iterator[ConversionResult]: assert self.format_to_options is not None + start_time = time.monotonic() + for input_batch in chunkify( conv_input.docs(self.format_to_options), settings.perf.doc_batch_size, # pass format_options ): _log.info(f"Going to convert document batch...") + # parallel processing only within input_batch # with ThreadPoolExecutor( # max_workers=settings.perf.doc_batch_concurrency # ) as pool: # yield from pool.map(self.process_document, input_batch) - # Note: PDF backends are not thread-safe, thread pool usage was disabled. + for item in map( partial(self._process_document, raises_on_error=raises_on_error), input_batch, ): + elapsed = time.monotonic() - start_time + start_time = time.monotonic() + if item is not None: + _log.info( + f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec." + ) yield item + else: + _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.") def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]: assert self.format_to_options is not None @@ -237,15 +248,8 @@ def _process_document( assert self.allowed_formats is not None assert in_doc.format in self.allowed_formats - start_doc_time = time.time() - conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error) - end_doc_time = time.time() - start_doc_time - _log.info( - f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds." - ) - return conv_res def _execute_pipeline( diff --git a/docling/models/base_model.py b/docling/models/base_model.py index dffad502..1147896c 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -4,11 +4,14 @@ from docling_core.types.doc import DoclingDocument, NodeItem from docling.datamodel.base_models import Page +from docling.datamodel.document import ConversionResult class BasePageModel(ABC): @abstractmethod - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: pass diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index da6860a8..9d26a317 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -1,6 +1,7 @@ import copy import logging from abc import abstractmethod +from pathlib import Path from typing import Iterable, List import numpy as np @@ -10,12 +11,15 @@ from scipy.ndimage import find_objects, label from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import OcrOptions +from docling.datamodel.settings import settings +from docling.models.base_model import BasePageModel _log = logging.getLogger(__name__) -class BaseOcrModel: +class BaseOcrModel(BasePageModel): def __init__(self, enabled: bool, options: OcrOptions): self.enabled = enabled self.options = options @@ -113,7 +117,7 @@ def is_overlapping_with_existing_cells(ocr_cell): ] return filtered_ocr_cells - def draw_ocr_rects_and_cells(self, page, ocr_rects): + def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False): image = copy.deepcopy(page.image) draw = ImageDraw.Draw(image, "RGBA") @@ -130,8 +134,21 @@ def draw_ocr_rects_and_cells(self, page, ocr_rects): if isinstance(tc, OcrCell): color = "magenta" draw.rectangle([(x0, y0), (x1, y1)], outline=color) - image.show() + + if show: + image.show() + else: + out_path: Path = ( + Path(settings.debug.debug_output_path) + / f"debug_{conv_res.input.file.stem}" + ) + out_path.mkdir(parents=True, exist_ok=True) + + out_file = out_path / f"ocr_page_{page.page_no:05}.png" + image.save(str(out_file), format="png") @abstractmethod - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: pass diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 2f7078d3..e63bad3a 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -1,5 +1,6 @@ import copy import random +from pathlib import Path from typing import List, Union from deepsearch_glm.nlp_utils import init_nlp_model @@ -27,6 +28,8 @@ from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement from docling.datamodel.document import ConversionResult, layout_label_to_ds_type +from docling.datamodel.settings import settings +from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.utils import create_hash @@ -226,23 +229,24 @@ def make_spans(cell): return ds_doc def __call__(self, conv_res: ConversionResult) -> DoclingDocument: - ds_doc = self._to_legacy_document(conv_res) - ds_doc_dict = ds_doc.model_dump(by_alias=True) + with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT): + ds_doc = self._to_legacy_document(conv_res) + ds_doc_dict = ds_doc.model_dump(by_alias=True) - glm_doc = self.model.apply_on_doc(ds_doc_dict) + glm_doc = self.model.apply_on_doc(ds_doc_dict) - docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental + docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental # DEBUG code: - def draw_clusters_and_cells(ds_document, page_no): + def draw_clusters_and_cells(ds_document, page_no, show: bool = False): clusters_to_draw = [] image = copy.deepcopy(conv_res.pages[page_no].image) for ix, elem in enumerate(ds_document.main_text): if isinstance(elem, BaseText): - prov = elem.prov[0] + prov = elem.prov[0] # type: ignore elif isinstance(elem, Ref): _, arr, index = elem.ref.split("/") - index = int(index) + index = int(index) # type: ignore if arr == "tables": prov = ds_document.tables[index].prov[0] elif arr == "figures": @@ -256,7 +260,7 @@ def draw_clusters_and_cells(ds_document, page_no): id=ix, label=elem.name, bbox=BoundingBox.from_tuple( - coord=prov.bbox, + coord=prov.bbox, # type: ignore origin=CoordOrigin.BOTTOMLEFT, ).to_top_left_origin(conv_res.pages[page_no].size.height), ) @@ -276,9 +280,21 @@ def draw_clusters_and_cells(ds_document, page_no): for tc in c.cells: # [:1]: x0, y0, x1, y1 = tc.bbox.as_tuple() draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) - image.show() - # draw_clusters_and_cells(ds_doc, 0) - # draw_clusters_and_cells(exported_doc, 0) + if show: + image.show() + else: + out_path: Path = ( + Path(settings.debug.debug_output_path) + / f"debug_{conv_res.input.file.stem}" + ) + out_path.mkdir(parents=True, exist_ok=True) + + out_file = out_path / f"doc_page_{page_no:05}.png" + image.save(str(out_file), format="png") + + # for item in ds_doc.page_dimensions: + # page_no = item.page + # draw_clusters_and_cells(ds_doc, page_no) return docling_doc diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index d535b593..8dff0fff 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -5,8 +5,11 @@ from docling_core.types.doc import BoundingBox, CoordOrigin from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import EasyOcrOptions +from docling.datamodel.settings import settings from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) @@ -33,58 +36,65 @@ def __init__(self, enabled: bool, options: EasyOcrOptions): download_enabled=self.options.download_enabled, ) - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: if not self.enabled: yield from page_batch return for page in page_batch: + assert page._backend is not None if not page._backend.is_valid(): yield page else: - ocr_rects = self.get_ocr_rects(page) - - all_ocr_cells = [] - for ocr_rect in ocr_rects: - # Skip zero area boxes - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) - im = numpy.array(high_res_image) - result = self.reader.readtext(im) - - del high_res_image - del im - - cells = [ - OcrCell( - id=ix, - text=line[1], - confidence=line[2], - bbox=BoundingBox.from_tuple( - coord=( - (line[0][0][0] / self.scale) + ocr_rect.l, - (line[0][0][1] / self.scale) + ocr_rect.t, - (line[0][2][0] / self.scale) + ocr_rect.l, - (line[0][2][1] / self.scale) + ocr_rect.t, - ), - origin=CoordOrigin.TOPLEFT, - ), + with TimeRecorder(conv_res, "ocr"): + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect ) - for ix, line in enumerate(result) - ] - all_ocr_cells.extend(cells) - - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + im = numpy.array(high_res_image) + result = self.reader.readtext(im) + + del high_res_image + del im + + cells = [ + OcrCell( + id=ix, + text=line[1], + confidence=line[2], + bbox=BoundingBox.from_tuple( + coord=( + (line[0][0][0] / self.scale) + ocr_rect.l, + (line[0][0][1] / self.scale) + ocr_rect.t, + (line[0][2][0] / self.scale) + ocr_rect.l, + (line[0][2][1] / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ), + ) + for ix, line in enumerate(result) + ] + all_ocr_cells.extend(cells) + + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells( + all_ocr_cells, page.cells + ) - page.cells.extend(filtered_ocr_cells) + page.cells.extend(filtered_ocr_cells) # DEBUG code: - # self.draw_ocr_rects_and_cells(page, ocr_rects) + if settings.debug.visualize_ocr: + self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects) yield page diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 009a5b92..91897df4 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -16,8 +16,11 @@ LayoutPrediction, Page, ) +from docling.datamodel.document import ConversionResult +from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel from docling.utils import layout_utils as lu +from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) @@ -271,74 +274,97 @@ def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height return clusters_out_new, cells_out_new - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: + for page in page_batch: assert page._backend is not None if not page._backend.is_valid(): yield page else: - assert page.size is not None - - clusters = [] - for ix, pred_item in enumerate( - self.layout_predictor.predict(page.get_image(scale=1.0)) - ): - label = DocItemLabel( - pred_item["label"].lower().replace(" ", "_").replace("-", "_") - ) # Temporary, until docling-ibm-model uses docling-core types - cluster = Cluster( - id=ix, - label=label, - confidence=pred_item["confidence"], - bbox=BoundingBox.model_validate(pred_item), - cells=[], - ) - clusters.append(cluster) - - # Map cells to clusters - # TODO: Remove, postprocess should take care of it anyway. - for cell in page.cells: - for cluster in clusters: - if not cell.bbox.area() > 0: - overlap_frac = 0.0 - else: - overlap_frac = ( - cell.bbox.intersection_area_with(cluster.bbox) - / cell.bbox.area() - ) - - if overlap_frac > 0.5: - cluster.cells.append(cell) - - # Pre-sort clusters - # clusters = self.sort_clusters_by_cell_order(clusters) - - # DEBUG code: - def draw_clusters_and_cells(): - image = copy.deepcopy(page.image) - draw = ImageDraw.Draw(image) - for c in clusters: - x0, y0, x1, y1 = c.bbox.as_tuple() - draw.rectangle([(x0, y0), (x1, y1)], outline="green") - - cell_color = ( - random.randint(30, 140), - random.randint(30, 140), - random.randint(30, 140), + with TimeRecorder(conv_res, "layout"): + assert page.size is not None + + clusters = [] + for ix, pred_item in enumerate( + self.layout_predictor.predict(page.get_image(scale=1.0)) + ): + label = DocItemLabel( + pred_item["label"] + .lower() + .replace(" ", "_") + .replace("-", "_") + ) # Temporary, until docling-ibm-model uses docling-core types + cluster = Cluster( + id=ix, + label=label, + confidence=pred_item["confidence"], + bbox=BoundingBox.model_validate(pred_item), + cells=[], ) - for tc in c.cells: # [:1]: - x0, y0, x1, y1 = tc.bbox.as_tuple() - draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) - image.show() - - # draw_clusters_and_cells() - - clusters, page.cells = self.postprocess( - clusters, page.cells, page.size.height - ) + clusters.append(cluster) + + # Map cells to clusters + # TODO: Remove, postprocess should take care of it anyway. + for cell in page.cells: + for cluster in clusters: + if not cell.bbox.area() > 0: + overlap_frac = 0.0 + else: + overlap_frac = ( + cell.bbox.intersection_area_with(cluster.bbox) + / cell.bbox.area() + ) + + if overlap_frac > 0.5: + cluster.cells.append(cell) + + # Pre-sort clusters + # clusters = self.sort_clusters_by_cell_order(clusters) + + # DEBUG code: + def draw_clusters_and_cells(show: bool = False): + image = copy.deepcopy(page.image) + if image is not None: + draw = ImageDraw.Draw(image) + for c in clusters: + x0, y0, x1, y1 = c.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="green") + + cell_color = ( + random.randint(30, 140), + random.randint(30, 140), + random.randint(30, 140), + ) + for tc in c.cells: # [:1]: + x0, y0, x1, y1 = tc.bbox.as_tuple() + draw.rectangle( + [(x0, y0), (x1, y1)], outline=cell_color + ) + if show: + image.show() + else: + out_path: Path = ( + Path(settings.debug.debug_output_path) + / f"debug_{conv_res.input.file.stem}" + ) + out_path.mkdir(parents=True, exist_ok=True) + + out_file = ( + out_path / f"layout_page_{page.page_no:05}.png" + ) + image.save(str(out_file), format="png") + + # draw_clusters_and_cells() + + clusters, page.cells = self.postprocess( + clusters, page.cells, page.size.height + ) - # draw_clusters_and_cells() + page.predictions.layout = LayoutPrediction(clusters=clusters) - page.predictions.layout = LayoutPrediction(clusters=clusters) + if settings.debug.visualize_layout: + draw_clusters_and_cells() yield page diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index caf168cc..9b064ead 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -12,8 +12,10 @@ Table, TextElement, ) +from docling.datamodel.document import ConversionResult from docling.models.base_model import BasePageModel from docling.models.layout_model import LayoutModel +from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) @@ -51,122 +53,122 @@ def sanitize_text(self, lines): return sanitized_text.strip() # Strip any leading or trailing whitespace - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: for page in page_batch: assert page._backend is not None if not page._backend.is_valid(): yield page else: - assert page.predictions.layout is not None - - # assembles some JSON output page by page. - - elements: List[PageElement] = [] - headers: List[PageElement] = [] - body: List[PageElement] = [] - - for cluster in page.predictions.layout.clusters: - # _log.info("Cluster label seen:", cluster.label) - if cluster.label in LayoutModel.TEXT_ELEM_LABELS: - - textlines = [ - cell.text.replace("\x02", "-").strip() - for cell in cluster.cells - if len(cell.text.strip()) > 0 - ] - text = self.sanitize_text(textlines) - text_el = TextElement( - label=cluster.label, - id=cluster.id, - text=text, - page_no=page.page_no, - cluster=cluster, - ) - elements.append(text_el) - - if cluster.label in LayoutModel.PAGE_HEADER_LABELS: - headers.append(text_el) - else: - body.append(text_el) - elif cluster.label == LayoutModel.TABLE_LABEL: - tbl = None - if page.predictions.tablestructure: - tbl = page.predictions.tablestructure.table_map.get( - cluster.id, None - ) - if ( - not tbl - ): # fallback: add table without structure, if it isn't present - tbl = Table( + with TimeRecorder(conv_res, "page_assemble"): + + assert page.predictions.layout is not None + + # assembles some JSON output page by page. + + elements: List[PageElement] = [] + headers: List[PageElement] = [] + body: List[PageElement] = [] + + for cluster in page.predictions.layout.clusters: + # _log.info("Cluster label seen:", cluster.label) + if cluster.label in LayoutModel.TEXT_ELEM_LABELS: + + textlines = [ + cell.text.replace("\x02", "-").strip() + for cell in cluster.cells + if len(cell.text.strip()) > 0 + ] + text = self.sanitize_text(textlines) + text_el = TextElement( label=cluster.label, id=cluster.id, - text="", - otsl_seq=[], - table_cells=[], - cluster=cluster, + text=text, page_no=page.page_no, + cluster=cluster, ) + elements.append(text_el) + + if cluster.label in LayoutModel.PAGE_HEADER_LABELS: + headers.append(text_el) + else: + body.append(text_el) + elif cluster.label == LayoutModel.TABLE_LABEL: + tbl = None + if page.predictions.tablestructure: + tbl = page.predictions.tablestructure.table_map.get( + cluster.id, None + ) + if ( + not tbl + ): # fallback: add table without structure, if it isn't present + tbl = Table( + label=cluster.label, + id=cluster.id, + text="", + otsl_seq=[], + table_cells=[], + cluster=cluster, + page_no=page.page_no, + ) - elements.append(tbl) - body.append(tbl) - elif cluster.label == LayoutModel.FIGURE_LABEL: - fig = None - if page.predictions.figures_classification: - fig = ( - page.predictions.figures_classification.figure_map.get( + elements.append(tbl) + body.append(tbl) + elif cluster.label == LayoutModel.FIGURE_LABEL: + fig = None + if page.predictions.figures_classification: + fig = page.predictions.figures_classification.figure_map.get( cluster.id, None ) - ) - if ( - not fig - ): # fallback: add figure without classification, if it isn't present - fig = FigureElement( - label=cluster.label, - id=cluster.id, - text="", - data=None, - cluster=cluster, - page_no=page.page_no, - ) - elements.append(fig) - body.append(fig) - elif cluster.label == LayoutModel.FORMULA_LABEL: - equation = None - if page.predictions.equations_prediction: - equation = ( - page.predictions.equations_prediction.equation_map.get( + if ( + not fig + ): # fallback: add figure without classification, if it isn't present + fig = FigureElement( + label=cluster.label, + id=cluster.id, + text="", + data=None, + cluster=cluster, + page_no=page.page_no, + ) + elements.append(fig) + body.append(fig) + elif cluster.label == LayoutModel.FORMULA_LABEL: + equation = None + if page.predictions.equations_prediction: + equation = page.predictions.equations_prediction.equation_map.get( cluster.id, None ) - ) - if ( - not equation - ): # fallback: add empty formula, if it isn't present - text = self.sanitize_text( - [ - cell.text.replace("\x02", "-").strip() - for cell in cluster.cells - if len(cell.text.strip()) > 0 - ] - ) - equation = TextElement( - label=cluster.label, - id=cluster.id, - cluster=cluster, - page_no=page.page_no, - text=text, - ) - elements.append(equation) - body.append(equation) + if ( + not equation + ): # fallback: add empty formula, if it isn't present + text = self.sanitize_text( + [ + cell.text.replace("\x02", "-").strip() + for cell in cluster.cells + if len(cell.text.strip()) > 0 + ] + ) + equation = TextElement( + label=cluster.label, + id=cluster.id, + cluster=cluster, + page_no=page.page_no, + text=text, + ) + elements.append(equation) + body.append(equation) - page.assembled = AssembledUnit( - elements=elements, headers=headers, body=body - ) + page.assembled = AssembledUnit( + elements=elements, headers=headers, body=body + ) - # Remove page images (can be disabled) - if not self.options.keep_images: - page._image_cache = {} + # Remove page images (can be disabled) + if not self.options.keep_images: + page._image_cache = {} - # Unload backend - page._backend.unload() + # Unload backend + page._backend.unload() yield page diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index 1e0032c1..63f1a4f6 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -1,10 +1,14 @@ +from pathlib import Path from typing import Iterable, Optional from PIL import ImageDraw from pydantic import BaseModel from docling.datamodel.base_models import Page +from docling.datamodel.document import ConversionResult +from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel +from docling.utils.profiling import TimeRecorder class PagePreprocessingOptions(BaseModel): @@ -15,14 +19,17 @@ class PagePreprocessingModel(BasePageModel): def __init__(self, options: PagePreprocessingOptions): self.options = options - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: for page in page_batch: assert page._backend is not None if not page._backend.is_valid(): yield page else: - page = self._populate_page_images(page) - page = self._parse_page_cells(page) + with TimeRecorder(conv_res, "page_parse"): + page = self._populate_page_images(page) + page = self._parse_page_cells(conv_res, page) yield page # Generate the page image and store it in the page object @@ -43,19 +50,30 @@ def _populate_page_images(self, page: Page) -> Page: return page # Extract and populate the page cells and store it in the page object - def _parse_page_cells(self, page: Page) -> Page: + def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page: assert page._backend is not None page.cells = list(page._backend.get_text_cells()) # DEBUG code: - def draw_text_boxes(image, cells): + def draw_text_boxes(image, cells, show: bool = False): draw = ImageDraw.Draw(image) for c in cells: x0, y0, x1, y1 = c.bbox.as_tuple() draw.rectangle([(x0, y0), (x1, y1)], outline="red") - image.show() + if show: + image.show() + else: + out_path: Path = ( + Path(settings.debug.debug_output_path) + / f"debug_{conv_res.input.file.stem}" + ) + out_path.mkdir(parents=True, exist_ok=True) + + out_file = out_path / f"cells_page_{page.page_no:05}.png" + image.save(str(out_file), format="png") - # draw_text_boxes(page.get_image(scale=1.0), cells) + if settings.debug.visualize_cells: + draw_text_boxes(page.get_image(scale=1.0), page.cells) return page diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index a3257ab3..12bc2838 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -1,6 +1,6 @@ import copy from pathlib import Path -from typing import Iterable, List +from typing import Iterable import numpy from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell @@ -8,8 +8,11 @@ from PIL import ImageDraw from docling.datamodel.base_models import Page, Table, TableStructurePrediction +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions +from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel +from docling.utils.profiling import TimeRecorder class TableStructureModel(BasePageModel): @@ -35,7 +38,13 @@ def __init__( self.tf_predictor = TFPredictor(self.tm_config) self.scale = 2.0 # Scale up table input images to 144 dpi - def draw_table_and_cells(self, page: Page, tbl_list: List[Table]): + def draw_table_and_cells( + self, + conv_res: ConversionResult, + page: Page, + tbl_list: Iterable[Table], + show: bool = False, + ): assert page._backend is not None image = ( @@ -61,9 +70,21 @@ def draw_table_and_cells(self, page: Page, tbl_list: List[Table]): fill="black", ) - image.show() + if show: + image.show() + else: + out_path: Path = ( + Path(settings.debug.debug_output_path) + / f"debug_{conv_res.input.file.stem}" + ) + out_path.mkdir(parents=True, exist_ok=True) + + out_file = out_path / f"table_struct_page_{page.page_no:05}.png" + image.save(str(out_file), format="png") - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: if not self.enabled: yield from page_batch @@ -74,98 +95,112 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: if not page._backend.is_valid(): yield page else: - - assert page.predictions.layout is not None - assert page.size is not None - - page.predictions.tablestructure = TableStructurePrediction() # dummy - - in_tables = [ - ( - cluster, - [ - round(cluster.bbox.l) * self.scale, - round(cluster.bbox.t) * self.scale, - round(cluster.bbox.r) * self.scale, - round(cluster.bbox.b) * self.scale, - ], + with TimeRecorder(conv_res, "table_structure"): + + assert page.predictions.layout is not None + assert page.size is not None + + page.predictions.tablestructure = ( + TableStructurePrediction() + ) # dummy + + in_tables = [ + ( + cluster, + [ + round(cluster.bbox.l) * self.scale, + round(cluster.bbox.t) * self.scale, + round(cluster.bbox.r) * self.scale, + round(cluster.bbox.b) * self.scale, + ], + ) + for cluster in page.predictions.layout.clusters + if cluster.label == DocItemLabel.TABLE + ] + if not len(in_tables): + yield page + continue + + tokens = [] + for c in page.cells: + for cluster, _ in in_tables: + if c.bbox.area() > 0: + if ( + c.bbox.intersection_area_with(cluster.bbox) + / c.bbox.area() + > 0.2 + ): + # Only allow non empty stings (spaces) into the cells of a table + if len(c.text.strip()) > 0: + new_cell = copy.deepcopy(c) + new_cell.bbox = new_cell.bbox.scaled( + scale=self.scale + ) + + tokens.append(new_cell.model_dump()) + + page_input = { + "tokens": tokens, + "width": page.size.width * self.scale, + "height": page.size.height * self.scale, + } + page_input["image"] = numpy.asarray( + page.get_image(scale=self.scale) ) - for cluster in page.predictions.layout.clusters - if cluster.label == DocItemLabel.TABLE - ] - if not len(in_tables): - yield page - continue - - tokens = [] - for c in page.cells: - for cluster, _ in in_tables: - if c.bbox.area() > 0: - if ( - c.bbox.intersection_area_with(cluster.bbox) - / c.bbox.area() - > 0.2 - ): - # Only allow non empty stings (spaces) into the cells of a table - if len(c.text.strip()) > 0: - new_cell = copy.deepcopy(c) - new_cell.bbox = new_cell.bbox.scaled( - scale=self.scale - ) - - tokens.append(new_cell.model_dump()) - page_input = { - "tokens": tokens, - "width": page.size.width * self.scale, - "height": page.size.height * self.scale, - } - page_input["image"] = numpy.asarray(page.get_image(scale=self.scale)) + table_clusters, table_bboxes = zip(*in_tables) - table_clusters, table_bboxes = zip(*in_tables) - - if len(table_bboxes): - tf_output = self.tf_predictor.multi_table_predict( - page_input, table_bboxes, do_matching=self.do_cell_matching - ) - - for table_cluster, table_out in zip(table_clusters, tf_output): - table_cells = [] - for element in table_out["tf_responses"]: - - if not self.do_cell_matching: - the_bbox = BoundingBox.model_validate( - element["bbox"] - ).scaled(1 / self.scale) - text_piece = page._backend.get_text_in_rect(the_bbox) - element["bbox"]["token"] = text_piece - - tc = TableCell.model_validate(element) - if self.do_cell_matching and tc.bbox is not None: - tc.bbox = tc.bbox.scaled(1 / self.scale) - table_cells.append(tc) - - # Retrieving cols/rows, after post processing: - num_rows = table_out["predict_details"]["num_rows"] - num_cols = table_out["predict_details"]["num_cols"] - otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"] - - tbl = Table( - otsl_seq=otsl_seq, - table_cells=table_cells, - num_rows=num_rows, - num_cols=num_cols, - id=table_cluster.id, - page_no=page.page_no, - cluster=table_cluster, - label=DocItemLabel.TABLE, + if len(table_bboxes): + tf_output = self.tf_predictor.multi_table_predict( + page_input, table_bboxes, do_matching=self.do_cell_matching ) - page.predictions.tablestructure.table_map[table_cluster.id] = ( - tbl - ) + for table_cluster, table_out in zip(table_clusters, tf_output): + table_cells = [] + for element in table_out["tf_responses"]: + + if not self.do_cell_matching: + the_bbox = BoundingBox.model_validate( + element["bbox"] + ).scaled(1 / self.scale) + text_piece = page._backend.get_text_in_rect( + the_bbox + ) + element["bbox"]["token"] = text_piece + + tc = TableCell.model_validate(element) + if self.do_cell_matching and tc.bbox is not None: + tc.bbox = tc.bbox.scaled(1 / self.scale) + table_cells.append(tc) + + # Retrieving cols/rows, after post processing: + num_rows = table_out["predict_details"]["num_rows"] + num_cols = table_out["predict_details"]["num_cols"] + otsl_seq = table_out["predict_details"]["prediction"][ + "rs_seq" + ] + + tbl = Table( + otsl_seq=otsl_seq, + table_cells=table_cells, + num_rows=num_rows, + num_cols=num_cols, + id=table_cluster.id, + page_no=page.page_no, + cluster=table_cluster, + label=DocItemLabel.TABLE, + ) + + page.predictions.tablestructure.table_map[ + table_cluster.id + ] = tbl # For debugging purposes: - # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values()) + if settings.debug.visualize_tables: + self.draw_table_and_cells( + conv_res, + page, + page.predictions.tablestructure.table_map.values(), + ) yield page diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index b042653b..6f939351 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -8,8 +8,11 @@ from docling_core.types.doc import BoundingBox, CoordOrigin from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractCliOcrOptions +from docling.datamodel.settings import settings from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) @@ -102,7 +105,9 @@ def _run_tesseract(self, ifilename: str): return df_filtered - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: if not self.enabled: yield from page_batch @@ -113,62 +118,67 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: if not page._backend.is_valid(): yield page else: - ocr_rects = self.get_ocr_rects(page) - - all_ocr_cells = [] - for ocr_rect in ocr_rects: - # Skip zero area boxes - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) + with TimeRecorder(conv_res, "ocr"): - with tempfile.NamedTemporaryFile( - suffix=".png", mode="w" - ) as image_file: - fname = image_file.name - high_res_image.save(fname) - - df = self._run_tesseract(fname) - - # _log.info(df) - - # Print relevant columns (bounding box and text) - for ix, row in df.iterrows(): - text = row["text"] - conf = row["conf"] - - l = float(row["left"]) - b = float(row["top"]) - w = float(row["width"]) - h = float(row["height"]) - - t = b + h - r = l + w - - cell = OcrCell( - id=ix, - text=text, - confidence=conf / 100.0, - bbox=BoundingBox.from_tuple( - coord=( - (l / self.scale) + ocr_rect.l, - (b / self.scale) + ocr_rect.t, - (r / self.scale) + ocr_rect.l, - (t / self.scale) + ocr_rect.t, - ), - origin=CoordOrigin.TOPLEFT, - ), + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect ) - all_ocr_cells.append(cell) - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + with tempfile.NamedTemporaryFile( + suffix=".png", mode="w" + ) as image_file: + fname = image_file.name + high_res_image.save(fname) + + df = self._run_tesseract(fname) + + # _log.info(df) + + # Print relevant columns (bounding box and text) + for ix, row in df.iterrows(): + text = row["text"] + conf = row["conf"] + + l = float(row["left"]) + b = float(row["top"]) + w = float(row["width"]) + h = float(row["height"]) + + t = b + h + r = l + w + + cell = OcrCell( + id=ix, + text=text, + confidence=conf / 100.0, + bbox=BoundingBox.from_tuple( + coord=( + (l / self.scale) + ocr_rect.l, + (b / self.scale) + ocr_rect.t, + (r / self.scale) + ocr_rect.l, + (t / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ), + ) + all_ocr_cells.append(cell) + + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells( + all_ocr_cells, page.cells + ) - page.cells.extend(filtered_ocr_cells) + page.cells.extend(filtered_ocr_cells) # DEBUG code: - # self.draw_ocr_rects_and_cells(page, ocr_rects) + if settings.debug.visualize_ocr: + self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects) yield page diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index f8a1fe57..42513239 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -4,8 +4,11 @@ from docling_core.types.doc import BoundingBox, CoordOrigin from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractOcrOptions +from docling.datamodel.settings import settings from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) @@ -61,7 +64,9 @@ def __del__(self): # Finalize the tesseractAPI self.reader.End() - def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: if not self.enabled: yield from page_batch @@ -72,59 +77,66 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: if not page._backend.is_valid(): yield page else: - assert self.reader is not None + with TimeRecorder(conv_res, "ocr"): - ocr_rects = self.get_ocr_rects(page) + assert self.reader is not None - all_ocr_cells = [] - for ocr_rect in ocr_rects: - # Skip zero area boxes - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) + ocr_rects = self.get_ocr_rects(page) - # Retrieve text snippets with their bounding boxes - self.reader.SetImage(high_res_image) - boxes = self.reader.GetComponentImages( - self.reader_RIL.TEXTLINE, True - ) + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect + ) - cells = [] - for ix, (im, box, _, _) in enumerate(boxes): - # Set the area of interest. Tesseract uses Bottom-Left for the origin - self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"]) - - # Extract text within the bounding box - text = self.reader.GetUTF8Text().strip() - confidence = self.reader.MeanTextConf() - left = box["x"] / self.scale - bottom = box["y"] / self.scale - right = (box["x"] + box["w"]) / self.scale - top = (box["y"] + box["h"]) / self.scale - - cells.append( - OcrCell( - id=ix, - text=text, - confidence=confidence, - bbox=BoundingBox.from_tuple( - coord=(left, top, right, bottom), - origin=CoordOrigin.TOPLEFT, - ), - ) + # Retrieve text snippets with their bounding boxes + self.reader.SetImage(high_res_image) + boxes = self.reader.GetComponentImages( + self.reader_RIL.TEXTLINE, True ) - # del high_res_image - all_ocr_cells.extend(cells) + cells = [] + for ix, (im, box, _, _) in enumerate(boxes): + # Set the area of interest. Tesseract uses Bottom-Left for the origin + self.reader.SetRectangle( + box["x"], box["y"], box["w"], box["h"] + ) - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + # Extract text within the bounding box + text = self.reader.GetUTF8Text().strip() + confidence = self.reader.MeanTextConf() + left = box["x"] / self.scale + bottom = box["y"] / self.scale + right = (box["x"] + box["w"]) / self.scale + top = (box["y"] + box["h"]) / self.scale + + cells.append( + OcrCell( + id=ix, + text=text, + confidence=confidence, + bbox=BoundingBox.from_tuple( + coord=(left, top, right, bottom), + origin=CoordOrigin.TOPLEFT, + ), + ) + ) + + # del high_res_image + all_ocr_cells.extend(cells) + + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells( + all_ocr_cells, page.cells + ) - page.cells.extend(filtered_ocr_cells) + page.cells.extend(filtered_ocr_cells) # DEBUG code: - # self.draw_ocr_rects_and_cells(page, ocr_rects) + if settings.debug.visualize_ocr: + self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects) yield page diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 8dd074cc..5013ad58 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -19,6 +19,7 @@ from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.settings import settings from docling.models.base_model import BaseEnrichmentModel +from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.utils import chunkify _log = logging.getLogger(__name__) @@ -35,13 +36,16 @@ def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionRes _log.info(f"Processing document {in_doc.file.name}") try: - # These steps are building and assembling the structure of the - # output DoclingDocument - conv_res = self._build_document(in_doc, conv_res) - conv_res = self._assemble_document(in_doc, conv_res) - # From this stage, all operations should rely only on conv_res.output - conv_res = self._enrich_document(in_doc, conv_res) - conv_res.status = self._determine_status(in_doc, conv_res) + with TimeRecorder( + conv_res, "pipeline_total", scope=ProfilingScope.DOCUMENT + ): + # These steps are building and assembling the structure of the + # output DoclingDocument + conv_res = self._build_document(conv_res) + conv_res = self._assemble_document(conv_res) + # From this stage, all operations should rely only on conv_res.output + conv_res = self._enrich_document(conv_res) + conv_res.status = self._determine_status(conv_res) except Exception as e: conv_res.status = ConversionStatus.FAILURE if raises_on_error: @@ -50,19 +54,13 @@ def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionRes return conv_res @abstractmethod - def _build_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: + def _build_document(self, conv_res: ConversionResult) -> ConversionResult: pass - def _assemble_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: + def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: return conv_res - def _enrich_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: + def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult: def _filter_elements( doc: DoclingDocument, model: BaseEnrichmentModel @@ -71,24 +69,23 @@ def _filter_elements( if model.is_processable(doc=doc, element=element): yield element - for model in self.enrichment_pipe: - for element_batch in chunkify( - _filter_elements(conv_res.document, model), - settings.perf.elements_batch_size, - ): - # TODO: currently we assume the element itself is modified, because - # we don't have an interface to save the element back to the document - for element in model( - doc=conv_res.document, element_batch=element_batch - ): # Must exhaust! - pass + with TimeRecorder(conv_res, "doc_enrich", scope=ProfilingScope.DOCUMENT): + for model in self.enrichment_pipe: + for element_batch in chunkify( + _filter_elements(conv_res.document, model), + settings.perf.elements_batch_size, + ): + # TODO: currently we assume the element itself is modified, because + # we don't have an interface to save the element back to the document + for element in model( + doc=conv_res.document, element_batch=element_batch + ): # Must exhaust! + pass return conv_res @abstractmethod - def _determine_status( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionStatus: + def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: pass @classmethod @@ -110,66 +107,68 @@ def is_backend_supported(cls, backend: AbstractDocumentBackend): class PaginatedPipeline(BasePipeline): # TODO this is a bad name. - def _apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]: + def _apply_on_pages( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: for model in self.build_pipe: - page_batch = model(page_batch) + page_batch = model(conv_res, page_batch) yield from page_batch - def _build_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: + def _build_document(self, conv_res: ConversionResult) -> ConversionResult: - if not isinstance(in_doc._backend, PdfDocumentBackend): + if not isinstance(conv_res.input._backend, PdfDocumentBackend): raise RuntimeError( - f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. " + f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. " f"Can not convert this with a PDF pipeline. " f"Please check your format configuration on DocumentConverter." ) # conv_res.status = ConversionStatus.FAILURE # return conv_res - for i in range(0, in_doc.page_count): - conv_res.pages.append(Page(page_no=i)) + with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): - try: - # Iterate batches of pages (page_batch_size) in the doc - for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size): - start_pb_time = time.time() + for i in range(0, conv_res.input.page_count): + conv_res.pages.append(Page(page_no=i)) - # 1. Initialise the page resources - init_pages = map( - functools.partial(self.initialize_page, in_doc), page_batch - ) + try: + # Iterate batches of pages (page_batch_size) in the doc + for page_batch in chunkify( + conv_res.pages, settings.perf.page_batch_size + ): + start_pb_time = time.time() - # 2. Run pipeline stages - pipeline_pages = self._apply_on_pages(init_pages) + # 1. Initialise the page resources + init_pages = map( + functools.partial(self.initialize_page, conv_res), page_batch + ) - for p in pipeline_pages: # Must exhaust! - pass + # 2. Run pipeline stages + pipeline_pages = self._apply_on_pages(conv_res, init_pages) - end_pb_time = time.time() - start_pb_time - _log.debug(f"Finished converting page batch time={end_pb_time:.3f}") + for p in pipeline_pages: # Must exhaust! + pass - except Exception as e: - conv_res.status = ConversionStatus.FAILURE - trace = "\n".join(traceback.format_exception(e)) - _log.warning( - f"Encountered an error during conversion of document {in_doc.document_hash}:\n" - f"{trace}" - ) - raise e + end_pb_time = time.time() - start_pb_time + _log.debug(f"Finished converting page batch time={end_pb_time:.3f}") + + except Exception as e: + conv_res.status = ConversionStatus.FAILURE + trace = "\n".join(traceback.format_exception(e)) + _log.warning( + f"Encountered an error during conversion of document {conv_res.input.document_hash}:\n" + f"{trace}" + ) + raise e - finally: - # Always unload the PDF backend, even in case of failure - if in_doc._backend: - in_doc._backend.unload() + finally: + # Always unload the PDF backend, even in case of failure + if conv_res.input._backend: + conv_res.input._backend.unload() return conv_res - def _determine_status( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionStatus: + def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: status = ConversionStatus.SUCCESS for page in conv_res.pages: if page._backend is None or not page._backend.is_valid(): @@ -186,5 +185,5 @@ def _determine_status( # Initialise and load resources for a page @abstractmethod - def initialize_page(self, doc: InputDocument, page: Page) -> Page: + def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page: pass diff --git a/docling/pipeline/simple_pipeline.py b/docling/pipeline/simple_pipeline.py index 0858af0b..fb985231 100644 --- a/docling/pipeline/simple_pipeline.py +++ b/docling/pipeline/simple_pipeline.py @@ -5,9 +5,10 @@ DeclarativeDocumentBackend, ) from docling.datamodel.base_models import ConversionStatus -from docling.datamodel.document import ConversionResult, InputDocument +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import PipelineOptions from docling.pipeline.base_pipeline import BasePipeline +from docling.utils.profiling import ProfilingScope, TimeRecorder _log = logging.getLogger(__name__) @@ -22,13 +23,11 @@ class SimplePipeline(BasePipeline): def __init__(self, pipeline_options: PipelineOptions): super().__init__(pipeline_options) - def _build_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: + def _build_document(self, conv_res: ConversionResult) -> ConversionResult: - if not isinstance(in_doc._backend, DeclarativeDocumentBackend): + if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend): raise RuntimeError( - f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. " + f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. " f"Can not convert this with simple pipeline. " f"Please check your format configuration on DocumentConverter." ) @@ -38,13 +37,11 @@ def _build_document( # Instead of running a page-level pipeline to build up the document structure, # the backend is expected to be of type DeclarativeDocumentBackend, which can output # a DoclingDocument straight. - - conv_res.document = in_doc._backend.convert() + with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): + conv_res.document = conv_res.input._backend.convert() return conv_res - def _determine_status( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionStatus: + def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus: # This is called only if the previous steps didn't raise. # Since we don't have anything else to evaluate, we can # safely return SUCCESS. diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 5de2e32f..65803d4f 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -7,7 +7,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import AssembledUnit, Page -from docling.datamodel.document import ConversionResult, InputDocument +from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( EasyOcrOptions, PdfPipelineOptions, @@ -27,6 +27,7 @@ from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.pipeline.base_pipeline import PaginatedPipeline +from docling.utils.profiling import ProfilingScope, TimeRecorder _log = logging.getLogger(__name__) @@ -119,73 +120,75 @@ def get_ocr_model(self) -> Optional[BaseOcrModel]: ) return None - def initialize_page(self, doc: InputDocument, page: Page) -> Page: - page._backend = doc._backend.load_page(page.page_no) # type: ignore - if page._backend is not None and page._backend.is_valid(): - page.size = page._backend.get_size() + def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page: + with TimeRecorder(conv_res, "page_init"): + page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore + if page._backend is not None and page._backend.is_valid(): + page.size = page._backend.get_size() return page - def _assemble_document( - self, in_doc: InputDocument, conv_res: ConversionResult - ) -> ConversionResult: + def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: all_elements = [] all_headers = [] all_body = [] - for p in conv_res.pages: - if p.assembled is not None: - for el in p.assembled.body: - all_body.append(el) - for el in p.assembled.headers: - all_headers.append(el) - for el in p.assembled.elements: - all_elements.append(el) - - conv_res.assembled = AssembledUnit( - elements=all_elements, headers=all_headers, body=all_body - ) - - conv_res.document = self.glm_model(conv_res) + with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT): + for p in conv_res.pages: + if p.assembled is not None: + for el in p.assembled.body: + all_body.append(el) + for el in p.assembled.headers: + all_headers.append(el) + for el in p.assembled.elements: + all_elements.append(el) + + conv_res.assembled = AssembledUnit( + elements=all_elements, headers=all_headers, body=all_body + ) - # Generate page images in the output - if self.pipeline_options.generate_page_images: - for page in conv_res.pages: - assert page.image is not None - page_no = page.page_no + 1 - conv_res.document.pages[page_no].image = ImageRef.from_pil( - page.image, dpi=int(72 * self.pipeline_options.images_scale) - ) + conv_res.document = self.glm_model(conv_res) - # Generate images of the requested element types - if ( - self.pipeline_options.generate_picture_images - or self.pipeline_options.generate_table_images - ): - scale = self.pipeline_options.images_scale - for element, _level in conv_res.document.iterate_items(): - if not isinstance(element, DocItem) or len(element.prov) == 0: - continue - if ( - isinstance(element, PictureItem) - and self.pipeline_options.generate_picture_images - ) or ( - isinstance(element, TableItem) - and self.pipeline_options.generate_table_images - ): - page_ix = element.prov[0].page_no - 1 - page = conv_res.pages[page_ix] - assert page.size is not None + # Generate page images in the output + if self.pipeline_options.generate_page_images: + for page in conv_res.pages: assert page.image is not None - - crop_bbox = ( - element.prov[0] - .bbox.scaled(scale=scale) - .to_top_left_origin(page_height=page.size.height * scale) + page_no = page.page_no + 1 + conv_res.document.pages[page_no].image = ImageRef.from_pil( + page.image, dpi=int(72 * self.pipeline_options.images_scale) ) - cropped_im = page.image.crop(crop_bbox.as_tuple()) - element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale)) + # Generate images of the requested element types + if ( + self.pipeline_options.generate_picture_images + or self.pipeline_options.generate_table_images + ): + scale = self.pipeline_options.images_scale + for element, _level in conv_res.document.iterate_items(): + if not isinstance(element, DocItem) or len(element.prov) == 0: + continue + if ( + isinstance(element, PictureItem) + and self.pipeline_options.generate_picture_images + ) or ( + isinstance(element, TableItem) + and self.pipeline_options.generate_table_images + ): + page_ix = element.prov[0].page_no - 1 + page = conv_res.pages[page_ix] + assert page.size is not None + assert page.image is not None + + crop_bbox = ( + element.prov[0] + .bbox.scaled(scale=scale) + .to_top_left_origin(page_height=page.size.height * scale) + ) + + cropped_im = page.image.crop(crop_bbox.as_tuple()) + element.image = ImageRef.from_pil( + cropped_im, dpi=int(72 * scale) + ) return conv_res diff --git a/docling/utils/profiling.py b/docling/utils/profiling.py new file mode 100644 index 00000000..0d09f17d --- /dev/null +++ b/docling/utils/profiling.py @@ -0,0 +1,62 @@ +import time +from datetime import datetime +from enum import Enum +from typing import TYPE_CHECKING, List + +import numpy as np +from pydantic import BaseModel + +from docling.datamodel.settings import settings + +if TYPE_CHECKING: + from docling.datamodel.document import ConversionResult + + +class ProfilingScope(str, Enum): + PAGE = "page" + DOCUMENT = "document" + + +class ProfilingItem(BaseModel): + scope: ProfilingScope + count: int = 0 + times: List[float] = [] + start_timestamps: List[datetime] = [] + + def avg(self) -> float: + return np.average(self.times) # type: ignore + + def std(self) -> float: + return np.std(self.times) # type: ignore + + def mean(self) -> float: + return np.mean(self.times) # type: ignore + + def percentile(self, perc: float) -> float: + return np.percentile(self.times, perc) # type: ignore + + +class TimeRecorder: + def __init__( + self, + conv_res: "ConversionResult", + key: str, + scope: ProfilingScope = ProfilingScope.PAGE, + ): + if settings.debug.profile_pipeline_timings: + if key not in conv_res.timings.keys(): + conv_res.timings[key] = ProfilingItem(scope=scope) + self.conv_res = conv_res + self.key = key + + def __enter__(self): + if settings.debug.profile_pipeline_timings: + self.start = time.monotonic() + self.conv_res.timings[self.key].start_timestamps.append(datetime.utcnow()) + return self + + def __exit__(self, *args): + if settings.debug.profile_pipeline_timings: + elapsed = time.monotonic() - self.start + self.conv_res.timings[self.key].times.append(elapsed) + self.conv_res.timings[self.key].count += 1 diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py index 6809c176..2c61336c 100644 --- a/docs/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -8,6 +8,7 @@ from docling.datamodel.base_models import ConversionStatus from docling.datamodel.document import ConversionResult +from docling.datamodel.settings import settings from docling.document_converter import DocumentConverter _log = logging.getLogger(__name__) @@ -113,6 +114,12 @@ def main(): # docs = [DocumentStream(name="my_doc.pdf", stream=buf)] # input = DocumentConversionInput.from_streams(docs) + # # Turn on inline debug visualizations: + # settings.debug.visualize_layout = True + # settings.debug.visualize_ocr = True + # settings.debug.visualize_tables = True + # settings.debug.visualize_cells = True + doc_converter = DocumentConverter() start_time = time.time() diff --git a/poetry.lock b/poetry.lock index b1fa58b7..cf758c30 100644 --- a/poetry.lock +++ b/poetry.lock @@ -913,13 +913,13 @@ tabulate = ">=0.9.0,<0.10.0" [[package]] name = "docling-ibm-models" -version = "2.0.1" +version = "2.0.2" description = "This package contains the AI models used by the Docling PDF conversion package" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "docling_ibm_models-2.0.1-py3-none-any.whl", hash = "sha256:f81c6002b7e102aa79afb8287fce48872f27d1cffb088ea4d1fbebe490364a1d"}, - {file = "docling_ibm_models-2.0.1.tar.gz", hash = "sha256:4fb0300022cfa0d0ac1fcbcb296c144e71ee9816654407f8a4d3a7b934f3065f"}, + {file = "docling_ibm_models-2.0.2-py3-none-any.whl", hash = "sha256:dd27889838319d55a45704f80eb1e75ccfe98de907e5d53f7815ef50402dffe7"}, + {file = "docling_ibm_models-2.0.2.tar.gz", hash = "sha256:5c8b7030faa171558fa83fabd3d1bade729e0319265ad776ed78b89aefbb1982"}, ] [package.dependencies] @@ -945,41 +945,41 @@ tqdm = ">=4.64.0,<5.0.0" [[package]] name = "docling-parse" -version = "2.0.0" +version = "2.0.1" description = "Simple package to extract text with coordinates from programmatic PDFs" optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_parse-2.0.0-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:269a0ba847e2faf7aa72a31861141f09ffa9a347e0b16810f45eba8f9104d2ca"}, - {file = "docling_parse-2.0.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:1bf48f8042345ec94d2de3a375711fcd7ed55356e3f60f220fc5df09fe9031b0"}, - {file = "docling_parse-2.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:5e4da33ce3857c7c83628b9dbd136932b81ad7718e7edf2fe8e44c3469ee0b9a"}, - {file = "docling_parse-2.0.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:011b4a847c71e1a1ac362dfe8c7951997fc4eef47eb29b43e7f1c13fcac983d8"}, - {file = "docling_parse-2.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e77b8f731a614c6da3beb4c59eb878c1668a6956f5de06f42a3d4f502cde46e4"}, - {file = "docling_parse-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:696d9a93e8879026ae56cb677b355778e09c146cd07c0a3f6f99a3fdc5846421"}, - {file = "docling_parse-2.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:0a8b10fed8022e9343c1be9ea9e82bcc77a5e32042a585cb816db9c5f51b906f"}, - {file = "docling_parse-2.0.0-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:8b9f8e91656bf5c8c1ea99eb29b8b41cd1cd156e33d1fec530add4ee30705da9"}, - {file = "docling_parse-2.0.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:bceb96fbbd286d71ba12bf876bc08f772a526a45e8e2dda8567b171c7a5a6c02"}, - {file = "docling_parse-2.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:729c18b42e316b3cc64af9feb8fcb020e7dabe4d8d45ded424c2508b5284631b"}, - {file = "docling_parse-2.0.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:ac2725cdbc3f7ef5ea5351f6b7e78e93c3b271a72f827d42210fbe9cb46d9644"}, - {file = "docling_parse-2.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:965a7ffd5a1b83006ab47ca76be02725439e9dd80aaadf7f3258b81c5cd6ad22"}, - {file = "docling_parse-2.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb1116f179dc2d8f13d2870dd6b0dfa443eea2994c0a75e4b7e8cae7d9f969b3"}, - {file = "docling_parse-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:6a675b4aced7abc0f4b0df25af1d80b93feba55159b830b6c2b4eafb1719d389"}, - {file = "docling_parse-2.0.0-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:f3677b2ce7bfa1119573aa8ffbf7ece6d46ed1af55094c60d1a1f8fd8477dbe8"}, - {file = "docling_parse-2.0.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:ff94ad7c2f3404904b05338e2efa5b7359078992bd293b046fceed5d98f653b0"}, - {file = "docling_parse-2.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:fd280cbfef2040f8016ae010fe06be7b610b7ff3cba2014aea9193481c938f6e"}, - {file = "docling_parse-2.0.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e5f4274fe619c4adf3d2daffca1ae14000c2498c006c971bbf77f250f6e9289d"}, - {file = "docling_parse-2.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c612320c22353e93a0c74a59cd220bdd253e5cd18d281ea918f3200e92b6b29b"}, - {file = "docling_parse-2.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05aaa4fbcccfafd3c026b802c60a2968e4fe163f9f9cd1ed95823d4a5f93099c"}, - {file = "docling_parse-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:2bad0b05512731c3c0a6b3057d7a8c407b219aafe6360644c3d64c9d0b9b1bac"}, - {file = "docling_parse-2.0.0-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:404633fe50f2537486479243b12b279dc11929cb4307700edefc882d852e91bc"}, - {file = "docling_parse-2.0.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:75f23e8cbbfc80f79974b79d613cf86e67b7adb39fac713f2192ffdcce5d3bc5"}, - {file = "docling_parse-2.0.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:247cc5044d6e9d2adc4d2ffd65f4db3776d3388dc76675f4c44b8c0e4063c85d"}, - {file = "docling_parse-2.0.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:42a2429bcad03a9e975af4e6390fc012eafc8b1adee93713d73c6dc374a38de3"}, - {file = "docling_parse-2.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b094164f0cf8dffad7ce9a220f8337e9cbc7cea1470f763b836c5cbd22703cec"}, - {file = "docling_parse-2.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7628fc03ce9ea9424a5c4304059e1377b17f1a6a6ccffcd87b3a1a16776a36f1"}, - {file = "docling_parse-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:6d805607b072d1facac69c277be52a36f71bdb6d0b2a748e8bbc3453f652ce1f"}, - {file = "docling_parse-2.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f6948878bc4930b5d344a3b8ceb7f2550513b1888cf3c157ef466096fe848dd1"}, - {file = "docling_parse-2.0.0.tar.gz", hash = "sha256:6509f4776442593e7d2d9433375618edcac619a2a992ebaaed731be7ed6d42b7"}, + {file = "docling_parse-2.0.1-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:dab77f27ad8327a9350ca69792d18e0b4297877cc64917b46c2bb7b6e8c6ea31"}, + {file = "docling_parse-2.0.1-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:ae7caa659cd6eae718668f690086949046e366042b23d6a736e5f1dc2a5d9afc"}, + {file = "docling_parse-2.0.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ffb500b5113129dfd606399b1d09a61b2fbec62596b77c20044a05ea599f05aa"}, + {file = "docling_parse-2.0.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:0e3f30eba548a37fb821db29a8c7e9dfb99b6262194bb305bee2f833d65c3134"}, + {file = "docling_parse-2.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:039d775fe017dfed5856e08a3db15dece5cb00e164c1529f37aca57e90fb5fc6"}, + {file = "docling_parse-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c6db928e1ac317db77bec87f76e47023b2fd4aac4f81babef00cf30bd526618"}, + {file = "docling_parse-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:1d94e5d7146bfc5f0cb767000f0b0bc4837df9075d046df46ef3802b08c12f2e"}, + {file = "docling_parse-2.0.1-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:033129a2535408104867682ea3e2cc9cb2690790075f30365052789ab71a017b"}, + {file = "docling_parse-2.0.1-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:a0e16fca70848195487615fc61a04f9afdbfec34a672e92ce8cb6b8a54daca55"}, + {file = "docling_parse-2.0.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:acb83350f4f5010ca18a3a891c7751132abeb1e727e52f9b44a16bf8f65a6d3a"}, + {file = "docling_parse-2.0.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:2286ec6d1dea6d0fc259fe4e375c79ace4499a6cce0b4341bdd689d54744efc4"}, + {file = "docling_parse-2.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c83117642d2494a4baa92224f5d6de0fe37beb235ff4e48642786a160c5d3f8"}, + {file = "docling_parse-2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61d8808d9b50104c982bce8390900e4060ef3fa829e9449ac1bdbc0f0be61091"}, + {file = "docling_parse-2.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:07d918a5f98e5a78f0e9d08267242d81c35a788f21403d7c2bb4a862b3073c97"}, + {file = "docling_parse-2.0.1-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:1128f330cb56ba8bd58eed7221c92ef54b544c077dc2f04b0b2bbabe2a53e688"}, + {file = "docling_parse-2.0.1-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:4a868f9057403c7dac57f8c086aa51d976500c191f21ff54d8923d1144714baa"}, + {file = "docling_parse-2.0.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:1681679041cbda708e4961f3ce4ff9acea1eeadc203a94874a860104acc2b446"}, + {file = "docling_parse-2.0.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:747638746e4a81c74fc067a1b4efb9b35f8a223f7dee02c180cf6ceb80dc128c"}, + {file = "docling_parse-2.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d9938c58a6121be456334dbf116af8fdee00c1a257c983a3236b6dfc6e37cbf"}, + {file = "docling_parse-2.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ae727fef7e107b5136f8182eb08e456881f7a7cd2a5635991c9e62b8cd0e8cc"}, + {file = "docling_parse-2.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:3b3553d2a781528b7674c4b383b6be92b5ac8c68d657d51671a10c50ac370b45"}, + {file = "docling_parse-2.0.1-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:f9af29a48f9523a7b121b0350eb84a523ae961fc569dc7fcd2d5ed484bcf278d"}, + {file = "docling_parse-2.0.1-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:76d895091553dc56fda99ff6019140819bf4a3bffb3c655b0f851e86a62b1775"}, + {file = "docling_parse-2.0.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:172605fad8ec32c4c5480152218d5b3085fffab17b958a3a44302ff5466ed0bd"}, + {file = "docling_parse-2.0.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:5a900a076797754babe64369a7b49438639898b4cf3ce438ab1f0e9ba2fa78de"}, + {file = "docling_parse-2.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb64c9e353893181b7c9177b954ec2bb8925c62f5ec362934fc699bd180e171"}, + {file = "docling_parse-2.0.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:706f39b1f417a79b13e539fd25e10ff8f2ac603ad61fa7f05586f4e067e84a18"}, + {file = "docling_parse-2.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:e8f5809a94373d1b20f834d5916471d36e6d076e49b151f2a1196719505c3e8e"}, + {file = "docling_parse-2.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a56e960cc915fb67fb97210d069dbf0cb195f4a12518edc182552fe8aaf103bd"}, + {file = "docling_parse-2.0.1.tar.gz", hash = "sha256:27c3c1f22de2afede928ed1d139d0378faf31c55b3416dfe9be01e879a18072e"}, ] [package.dependencies] @@ -1363,70 +1363,70 @@ test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", [[package]] name = "grpcio" -version = "1.67.0" +version = "1.67.1" description = "HTTP/2-based RPC framework" optional = false python-versions = ">=3.8" files = [ - {file = "grpcio-1.67.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:bd79929b3bb96b54df1296cd3bf4d2b770bd1df6c2bdf549b49bab286b925cdc"}, - {file = "grpcio-1.67.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:16724ffc956ea42967f5758c2f043faef43cb7e48a51948ab593570570d1e68b"}, - {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:2b7183c80b602b0ad816315d66f2fb7887614ead950416d60913a9a71c12560d"}, - {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:efe32b45dd6d118f5ea2e5deaed417d8a14976325c93812dd831908522b402c9"}, - {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe89295219b9c9e47780a0f1c75ca44211e706d1c598242249fe717af3385ec8"}, - {file = "grpcio-1.67.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:aa8d025fae1595a207b4e47c2e087cb88d47008494db258ac561c00877d4c8f8"}, - {file = "grpcio-1.67.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f95e15db43e75a534420e04822df91f645664bf4ad21dfaad7d51773c80e6bb4"}, - {file = "grpcio-1.67.0-cp310-cp310-win32.whl", hash = "sha256:a6b9a5c18863fd4b6624a42e2712103fb0f57799a3b29651c0e5b8119a519d65"}, - {file = "grpcio-1.67.0-cp310-cp310-win_amd64.whl", hash = "sha256:b6eb68493a05d38b426604e1dc93bfc0137c4157f7ab4fac5771fd9a104bbaa6"}, - {file = "grpcio-1.67.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:e91d154689639932305b6ea6f45c6e46bb51ecc8ea77c10ef25aa77f75443ad4"}, - {file = "grpcio-1.67.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:cb204a742997277da678611a809a8409657b1398aaeebf73b3d9563b7d154c13"}, - {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:ae6de510f670137e755eb2a74b04d1041e7210af2444103c8c95f193340d17ee"}, - {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74b900566bdf68241118f2918d312d3bf554b2ce0b12b90178091ea7d0a17b3d"}, - {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4e95e43447a02aa603abcc6b5e727d093d161a869c83b073f50b9390ecf0fa8"}, - {file = "grpcio-1.67.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0bb94e66cd8f0baf29bd3184b6aa09aeb1a660f9ec3d85da615c5003154bc2bf"}, - {file = "grpcio-1.67.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:82e5bd4b67b17c8c597273663794a6a46a45e44165b960517fe6d8a2f7f16d23"}, - {file = "grpcio-1.67.0-cp311-cp311-win32.whl", hash = "sha256:7fc1d2b9fd549264ae585026b266ac2db53735510a207381be509c315b4af4e8"}, - {file = "grpcio-1.67.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac11ecb34a86b831239cc38245403a8de25037b448464f95c3315819e7519772"}, - {file = "grpcio-1.67.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:227316b5631260e0bef8a3ce04fa7db4cc81756fea1258b007950b6efc90c05d"}, - {file = "grpcio-1.67.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:d90cfdafcf4b45a7a076e3e2a58e7bc3d59c698c4f6470b0bb13a4d869cf2273"}, - {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:77196216d5dd6f99af1c51e235af2dd339159f657280e65ce7e12c1a8feffd1d"}, - {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15c05a26a0f7047f720da41dc49406b395c1470eef44ff7e2c506a47ac2c0591"}, - {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3840994689cc8cbb73d60485c594424ad8adb56c71a30d8948d6453083624b52"}, - {file = "grpcio-1.67.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:5a1e03c3102b6451028d5dc9f8591131d6ab3c8a0e023d94c28cb930ed4b5f81"}, - {file = "grpcio-1.67.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:682968427a63d898759474e3b3178d42546e878fdce034fd7474ef75143b64e3"}, - {file = "grpcio-1.67.0-cp312-cp312-win32.whl", hash = "sha256:d01793653248f49cf47e5695e0a79805b1d9d4eacef85b310118ba1dfcd1b955"}, - {file = "grpcio-1.67.0-cp312-cp312-win_amd64.whl", hash = "sha256:985b2686f786f3e20326c4367eebdaed3e7aa65848260ff0c6644f817042cb15"}, - {file = "grpcio-1.67.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:8c9a35b8bc50db35ab8e3e02a4f2a35cfba46c8705c3911c34ce343bd777813a"}, - {file = "grpcio-1.67.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:42199e704095b62688998c2d84c89e59a26a7d5d32eed86d43dc90e7a3bd04aa"}, - {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:c4c425f440fb81f8d0237c07b9322fc0fb6ee2b29fbef5f62a322ff8fcce240d"}, - {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:323741b6699cd2b04a71cb38f502db98f90532e8a40cb675393d248126a268af"}, - {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:662c8e105c5e5cee0317d500eb186ed7a93229586e431c1bf0c9236c2407352c"}, - {file = "grpcio-1.67.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:f6bd2ab135c64a4d1e9e44679a616c9bc944547357c830fafea5c3caa3de5153"}, - {file = "grpcio-1.67.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:2f55c1e0e2ae9bdd23b3c63459ee4c06d223b68aeb1961d83c48fb63dc29bc03"}, - {file = "grpcio-1.67.0-cp313-cp313-win32.whl", hash = "sha256:fd6bc27861e460fe28e94226e3673d46e294ca4673d46b224428d197c5935e69"}, - {file = "grpcio-1.67.0-cp313-cp313-win_amd64.whl", hash = "sha256:cf51d28063338608cd8d3cd64677e922134837902b70ce00dad7f116e3998210"}, - {file = "grpcio-1.67.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:7f200aca719c1c5dc72ab68be3479b9dafccdf03df530d137632c534bb6f1ee3"}, - {file = "grpcio-1.67.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:0892dd200ece4822d72dd0952f7112c542a487fc48fe77568deaaa399c1e717d"}, - {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:f4d613fbf868b2e2444f490d18af472ccb47660ea3df52f068c9c8801e1f3e85"}, - {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c69bf11894cad9da00047f46584d5758d6ebc9b5950c0dc96fec7e0bce5cde9"}, - {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9bca3ca0c5e74dea44bf57d27e15a3a3996ce7e5780d61b7c72386356d231db"}, - {file = "grpcio-1.67.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:014dfc020e28a0d9be7e93a91f85ff9f4a87158b7df9952fe23cc42d29d31e1e"}, - {file = "grpcio-1.67.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d4ea4509d42c6797539e9ec7496c15473177ce9abc89bc5c71e7abe50fc25737"}, - {file = "grpcio-1.67.0-cp38-cp38-win32.whl", hash = "sha256:9d75641a2fca9ae1ae86454fd25d4c298ea8cc195dbc962852234d54a07060ad"}, - {file = "grpcio-1.67.0-cp38-cp38-win_amd64.whl", hash = "sha256:cff8e54d6a463883cda2fab94d2062aad2f5edd7f06ae3ed030f2a74756db365"}, - {file = "grpcio-1.67.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:62492bd534979e6d7127b8a6b29093161a742dee3875873e01964049d5250a74"}, - {file = "grpcio-1.67.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eef1dce9d1a46119fd09f9a992cf6ab9d9178b696382439446ca5f399d7b96fe"}, - {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:f623c57a5321461c84498a99dddf9d13dac0e40ee056d884d6ec4ebcab647a78"}, - {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54d16383044e681f8beb50f905249e4e7261dd169d4aaf6e52eab67b01cbbbe2"}, - {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2a44e572fb762c668e4812156b81835f7aba8a721b027e2d4bb29fb50ff4d33"}, - {file = "grpcio-1.67.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:391df8b0faac84d42f5b8dfc65f5152c48ed914e13c522fd05f2aca211f8bfad"}, - {file = "grpcio-1.67.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfd9306511fdfc623a1ba1dc3bc07fbd24e6cfbe3c28b4d1e05177baa2f99617"}, - {file = "grpcio-1.67.0-cp39-cp39-win32.whl", hash = "sha256:30d47dbacfd20cbd0c8be9bfa52fdb833b395d4ec32fe5cff7220afc05d08571"}, - {file = "grpcio-1.67.0-cp39-cp39-win_amd64.whl", hash = "sha256:f55f077685f61f0fbd06ea355142b71e47e4a26d2d678b3ba27248abfe67163a"}, - {file = "grpcio-1.67.0.tar.gz", hash = "sha256:e090b2553e0da1c875449c8e75073dd4415dd71c9bde6a406240fdf4c0ee467c"}, + {file = "grpcio-1.67.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:8b0341d66a57f8a3119b77ab32207072be60c9bf79760fa609c5609f2deb1f3f"}, + {file = "grpcio-1.67.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:f5a27dddefe0e2357d3e617b9079b4bfdc91341a91565111a21ed6ebbc51b22d"}, + {file = "grpcio-1.67.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:43112046864317498a33bdc4797ae6a268c36345a910de9b9c17159d8346602f"}, + {file = "grpcio-1.67.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9b929f13677b10f63124c1a410994a401cdd85214ad83ab67cc077fc7e480f0"}, + {file = "grpcio-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7d1797a8a3845437d327145959a2c0c47c05947c9eef5ff1a4c80e499dcc6fa"}, + {file = "grpcio-1.67.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:0489063974d1452436139501bf6b180f63d4977223ee87488fe36858c5725292"}, + {file = "grpcio-1.67.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9fd042de4a82e3e7aca44008ee2fb5da01b3e5adb316348c21980f7f58adc311"}, + {file = "grpcio-1.67.1-cp310-cp310-win32.whl", hash = "sha256:638354e698fd0c6c76b04540a850bf1db27b4d2515a19fcd5cf645c48d3eb1ed"}, + {file = "grpcio-1.67.1-cp310-cp310-win_amd64.whl", hash = "sha256:608d87d1bdabf9e2868b12338cd38a79969eaf920c89d698ead08f48de9c0f9e"}, + {file = "grpcio-1.67.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:7818c0454027ae3384235a65210bbf5464bd715450e30a3d40385453a85a70cb"}, + {file = "grpcio-1.67.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ea33986b70f83844cd00814cee4451055cd8cab36f00ac64a31f5bb09b31919e"}, + {file = "grpcio-1.67.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:c7a01337407dd89005527623a4a72c5c8e2894d22bead0895306b23c6695698f"}, + {file = "grpcio-1.67.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80b866f73224b0634f4312a4674c1be21b2b4afa73cb20953cbbb73a6b36c3cc"}, + {file = "grpcio-1.67.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9fff78ba10d4250bfc07a01bd6254a6d87dc67f9627adece85c0b2ed754fa96"}, + {file = "grpcio-1.67.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:8a23cbcc5bb11ea7dc6163078be36c065db68d915c24f5faa4f872c573bb400f"}, + {file = "grpcio-1.67.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1a65b503d008f066e994f34f456e0647e5ceb34cfcec5ad180b1b44020ad4970"}, + {file = "grpcio-1.67.1-cp311-cp311-win32.whl", hash = "sha256:e29ca27bec8e163dca0c98084040edec3bc49afd10f18b412f483cc68c712744"}, + {file = "grpcio-1.67.1-cp311-cp311-win_amd64.whl", hash = "sha256:786a5b18544622bfb1e25cc08402bd44ea83edfb04b93798d85dca4d1a0b5be5"}, + {file = "grpcio-1.67.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:267d1745894200e4c604958da5f856da6293f063327cb049a51fe67348e4f953"}, + {file = "grpcio-1.67.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:85f69fdc1d28ce7cff8de3f9c67db2b0ca9ba4449644488c1e0303c146135ddb"}, + {file = "grpcio-1.67.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:f26b0b547eb8d00e195274cdfc63ce64c8fc2d3e2d00b12bf468ece41a0423a0"}, + {file = "grpcio-1.67.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4422581cdc628f77302270ff839a44f4c24fdc57887dc2a45b7e53d8fc2376af"}, + {file = "grpcio-1.67.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d7616d2ded471231c701489190379e0c311ee0a6c756f3c03e6a62b95a7146e"}, + {file = "grpcio-1.67.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8a00efecde9d6fcc3ab00c13f816313c040a28450e5e25739c24f432fc6d3c75"}, + {file = "grpcio-1.67.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:699e964923b70f3101393710793289e42845791ea07565654ada0969522d0a38"}, + {file = "grpcio-1.67.1-cp312-cp312-win32.whl", hash = "sha256:4e7b904484a634a0fff132958dabdb10d63e0927398273917da3ee103e8d1f78"}, + {file = "grpcio-1.67.1-cp312-cp312-win_amd64.whl", hash = "sha256:5721e66a594a6c4204458004852719b38f3d5522082be9061d6510b455c90afc"}, + {file = "grpcio-1.67.1-cp313-cp313-linux_armv7l.whl", hash = "sha256:aa0162e56fd10a5547fac8774c4899fc3e18c1aa4a4759d0ce2cd00d3696ea6b"}, + {file = "grpcio-1.67.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:beee96c8c0b1a75d556fe57b92b58b4347c77a65781ee2ac749d550f2a365dc1"}, + {file = "grpcio-1.67.1-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:a93deda571a1bf94ec1f6fcda2872dad3ae538700d94dc283c672a3b508ba3af"}, + {file = "grpcio-1.67.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e6f255980afef598a9e64a24efce87b625e3e3c80a45162d111a461a9f92955"}, + {file = "grpcio-1.67.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e838cad2176ebd5d4a8bb03955138d6589ce9e2ce5d51c3ada34396dbd2dba8"}, + {file = "grpcio-1.67.1-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:a6703916c43b1d468d0756c8077b12017a9fcb6a1ef13faf49e67d20d7ebda62"}, + {file = "grpcio-1.67.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:917e8d8994eed1d86b907ba2a61b9f0aef27a2155bca6cbb322430fc7135b7bb"}, + {file = "grpcio-1.67.1-cp313-cp313-win32.whl", hash = "sha256:e279330bef1744040db8fc432becc8a727b84f456ab62b744d3fdb83f327e121"}, + {file = "grpcio-1.67.1-cp313-cp313-win_amd64.whl", hash = "sha256:fa0c739ad8b1996bd24823950e3cb5152ae91fca1c09cc791190bf1627ffefba"}, + {file = "grpcio-1.67.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:178f5db771c4f9a9facb2ab37a434c46cb9be1a75e820f187ee3d1e7805c4f65"}, + {file = "grpcio-1.67.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:0f3e49c738396e93b7ba9016e153eb09e0778e776df6090c1b8c91877cc1c426"}, + {file = "grpcio-1.67.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:24e8a26dbfc5274d7474c27759b54486b8de23c709d76695237515bc8b5baeab"}, + {file = "grpcio-1.67.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3b6c16489326d79ead41689c4b84bc40d522c9a7617219f4ad94bc7f448c5085"}, + {file = "grpcio-1.67.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:60e6a4dcf5af7bbc36fd9f81c9f372e8ae580870a9e4b6eafe948cd334b81cf3"}, + {file = "grpcio-1.67.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:95b5f2b857856ed78d72da93cd7d09b6db8ef30102e5e7fe0961fe4d9f7d48e8"}, + {file = "grpcio-1.67.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b49359977c6ec9f5d0573ea4e0071ad278ef905aa74e420acc73fd28ce39e9ce"}, + {file = "grpcio-1.67.1-cp38-cp38-win32.whl", hash = "sha256:f5b76ff64aaac53fede0cc93abf57894ab2a7362986ba22243d06218b93efe46"}, + {file = "grpcio-1.67.1-cp38-cp38-win_amd64.whl", hash = "sha256:804c6457c3cd3ec04fe6006c739579b8d35c86ae3298ffca8de57b493524b771"}, + {file = "grpcio-1.67.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:a25bdea92b13ff4d7790962190bf6bf5c4639876e01c0f3dda70fc2769616335"}, + {file = "grpcio-1.67.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cdc491ae35a13535fd9196acb5afe1af37c8237df2e54427be3eecda3653127e"}, + {file = "grpcio-1.67.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:85f862069b86a305497e74d0dc43c02de3d1d184fc2c180993aa8aa86fbd19b8"}, + {file = "grpcio-1.67.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ec74ef02010186185de82cc594058a3ccd8d86821842bbac9873fd4a2cf8be8d"}, + {file = "grpcio-1.67.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01f616a964e540638af5130469451cf580ba8c7329f45ca998ab66e0c7dcdb04"}, + {file = "grpcio-1.67.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:299b3d8c4f790c6bcca485f9963b4846dd92cf6f1b65d3697145d005c80f9fe8"}, + {file = "grpcio-1.67.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:60336bff760fbb47d7e86165408126f1dded184448e9a4c892189eb7c9d3f90f"}, + {file = "grpcio-1.67.1-cp39-cp39-win32.whl", hash = "sha256:5ed601c4c6008429e3d247ddb367fe8c7259c355757448d7c1ef7bd4a6739e8e"}, + {file = "grpcio-1.67.1-cp39-cp39-win_amd64.whl", hash = "sha256:5db70d32d6703b89912af16d6d45d78406374a8b8ef0d28140351dd0ec610e98"}, + {file = "grpcio-1.67.1.tar.gz", hash = "sha256:3dc2ed4cabea4dc14d5e708c2b426205956077cc5de419b4d4079315017e9732"}, ] [package.extras] -protobuf = ["grpcio-tools (>=1.67.0)"] +protobuf = ["grpcio-tools (>=1.67.1)"] [[package]] name = "h11" @@ -1487,13 +1487,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "0.26.1" +version = "0.26.2" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" files = [ - {file = "huggingface_hub-0.26.1-py3-none-any.whl", hash = "sha256:5927a8fc64ae68859cd954b7cc29d1c8390a5e15caba6d3d349c973be8fdacf3"}, - {file = "huggingface_hub-0.26.1.tar.gz", hash = "sha256:414c0d9b769eecc86c70f9d939d0f48bb28e8461dd1130021542eff0212db890"}, + {file = "huggingface_hub-0.26.2-py3-none-any.whl", hash = "sha256:98c2a5a8e786c7b2cb6fdeb2740893cba4d53e312572ed3d8afafda65b128c46"}, + {file = "huggingface_hub-0.26.2.tar.gz", hash = "sha256:b100d853465d965733964d123939ba287da60a547087783ddff8a323f340332b"}, ] [package.dependencies] @@ -1660,13 +1660,13 @@ test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio [[package]] name = "ipython" -version = "8.28.0" +version = "8.29.0" description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.10" files = [ - {file = "ipython-8.28.0-py3-none-any.whl", hash = "sha256:530ef1e7bb693724d3cdc37287c80b07ad9b25986c007a53aa1857272dac3f35"}, - {file = "ipython-8.28.0.tar.gz", hash = "sha256:0d0d15ca1e01faeb868ef56bc7ee5a0de5bd66885735682e8a322ae289a13d1a"}, + {file = "ipython-8.29.0-py3-none-any.whl", hash = "sha256:0188a1bd83267192123ccea7f4a8ed0a78910535dbaa3f37671dca76ebd429c8"}, + {file = "ipython-8.29.0.tar.gz", hash = "sha256:40b60e15b22591450eef73e40a027cf77bd652e757523eebc5bd7c7c498290eb"}, ] [package.dependencies] @@ -2031,13 +2031,13 @@ test-ui = ["calysto-bash"] [[package]] name = "keyring" -version = "25.4.1" +version = "25.5.0" description = "Store and access your passwords safely." optional = false python-versions = ">=3.8" files = [ - {file = "keyring-25.4.1-py3-none-any.whl", hash = "sha256:5426f817cf7f6f007ba5ec722b1bcad95a75b27d780343772ad76b17cb47b0bf"}, - {file = "keyring-25.4.1.tar.gz", hash = "sha256:b07ebc55f3e8ed86ac81dd31ef14e81ace9dd9c3d4b5d77a6e9a2016d0d71a1b"}, + {file = "keyring-25.5.0-py3-none-any.whl", hash = "sha256:e67f8ac32b04be4714b42fe84ce7dad9c40985b9ca827c592cc303e7c26d9741"}, + {file = "keyring-25.5.0.tar.gz", hash = "sha256:4c753b3ec91717fe713c4edd522d625889d8973a349b0e582622f49766de58e6"}, ] [package.dependencies] @@ -4437,13 +4437,13 @@ testutils = ["gitpython (>3)"] [[package]] name = "pymdown-extensions" -version = "10.11.2" +version = "10.12" description = "Extension pack for Python Markdown." optional = false python-versions = ">=3.8" files = [ - {file = "pymdown_extensions-10.11.2-py3-none-any.whl", hash = "sha256:41cdde0a77290e480cf53892f5c5e50921a7ee3e5cd60ba91bf19837b33badcf"}, - {file = "pymdown_extensions-10.11.2.tar.gz", hash = "sha256:bc8847ecc9e784a098efd35e20cba772bc5a1b529dfcef9dc1972db9021a1049"}, + {file = "pymdown_extensions-10.12-py3-none-any.whl", hash = "sha256:49f81412242d3527b8b4967b990df395c89563043bc51a3d2d7d500e52123b77"}, + {file = "pymdown_extensions-10.12.tar.gz", hash = "sha256:b0ee1e0b2bef1071a47891ab17003bfe5bf824a398e13f49f8ed653b699369a7"}, ] [package.dependencies] @@ -4455,13 +4455,13 @@ extra = ["pygments (>=2.12)"] [[package]] name = "pymilvus" -version = "2.4.8" +version = "2.4.9" description = "Python Sdk for Milvus" optional = false python-versions = ">=3.8" files = [ - {file = "pymilvus-2.4.8-py3-none-any.whl", hash = "sha256:5824f8ef4ecb14cfd4b205bf976aa52576c3a83c3cd848d21c8f5f9bb99b29e1"}, - {file = "pymilvus-2.4.8.tar.gz", hash = "sha256:0ddd18a060635fc8f1d1ab5635d9cc340ef29a97783b73db186df6334fa31ee2"}, + {file = "pymilvus-2.4.9-py3-none-any.whl", hash = "sha256:45313607d2c164064bdc44e0f933cb6d6afa92e9efcc7f357c5240c57db58fbe"}, + {file = "pymilvus-2.4.9.tar.gz", hash = "sha256:0937663700007c23a84cfc0656160b301f6ff9247aaec4c96d599a6b43572136"}, ] [package.dependencies] @@ -5712,23 +5712,23 @@ train = ["accelerate (>=0.20.3)", "datasets"] [[package]] name = "setuptools" -version = "75.2.0" +version = "75.3.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-75.2.0-py3-none-any.whl", hash = "sha256:a7fcb66f68b4d9e8e66b42f9876150a3371558f98fa32222ffaa5bced76406f8"}, - {file = "setuptools-75.2.0.tar.gz", hash = "sha256:753bb6ebf1f465a1912e19ed1d41f403a79173a9acf66a42e7e6aec45c3c16ec"}, + {file = "setuptools-75.3.0-py3-none-any.whl", hash = "sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd"}, + {file = "setuptools-75.3.0.tar.gz", hash = "sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686"}, ] [package.extras] check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.5.2)"] -core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] enabler = ["pytest-enabler (>=2.2)"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] -type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.11.*)", "pytest-mypy"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test (>=5.5)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.12.*)", "pytest-mypy"] [[package]] name = "shapely" @@ -5995,13 +5995,13 @@ zarr = ["fsspec", "zarr"] [[package]] name = "tinycss2" -version = "1.3.0" +version = "1.4.0" description = "A tiny CSS parser" optional = false python-versions = ">=3.8" files = [ - {file = "tinycss2-1.3.0-py3-none-any.whl", hash = "sha256:54a8dbdffb334d536851be0226030e9505965bb2f30f21a4a82c55fb2a80fae7"}, - {file = "tinycss2-1.3.0.tar.gz", hash = "sha256:152f9acabd296a8375fbca5b84c961ff95971fcfc32e79550c8df8e29118c54d"}, + {file = "tinycss2-1.4.0-py3-none-any.whl", hash = "sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289"}, + {file = "tinycss2-1.4.0.tar.gz", hash = "sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7"}, ] [package.dependencies] @@ -6373,13 +6373,13 @@ files = [ [[package]] name = "tqdm" -version = "4.66.5" +version = "4.66.6" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" files = [ - {file = "tqdm-4.66.5-py3-none-any.whl", hash = "sha256:90279a3770753eafc9194a0364852159802111925aa30eb3f9d85b0e805ac7cd"}, - {file = "tqdm-4.66.5.tar.gz", hash = "sha256:e1020aef2e5096702d8a025ac7d16b1577279c9d63f8375b63083e9a5f0fcbad"}, + {file = "tqdm-4.66.6-py3-none-any.whl", hash = "sha256:223e8b5359c2efc4b30555531f09e9f2f3589bcd7fdd389271191031b49b7a63"}, + {file = "tqdm-4.66.6.tar.gz", hash = "sha256:4bdd694238bef1485ce839d67967ab50af8f9272aab687c0d7702a01da0be090"}, ] [package.dependencies] @@ -6408,13 +6408,13 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0, [[package]] name = "transformers" -version = "4.45.2" +version = "4.46.0" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.8.0" files = [ - {file = "transformers-4.45.2-py3-none-any.whl", hash = "sha256:c551b33660cfc815bae1f9f097ecfd1e65be623f13c6ee0dda372bd881460210"}, - {file = "transformers-4.45.2.tar.gz", hash = "sha256:72bc390f6b203892561f05f86bbfaa0e234aab8e927a83e62b9d92ea7e3ae101"}, + {file = "transformers-4.46.0-py3-none-any.whl", hash = "sha256:e161268ae8bee315eb9e9b4c0b27f1bd6980f91e0fc292d75249193d339704c0"}, + {file = "transformers-4.46.0.tar.gz", hash = "sha256:3a9e2eb537094db11c3652334d281afa4766c0e5091c4dcdb454e9921bb0d2b7"}, ] [package.dependencies] @@ -6432,13 +6432,13 @@ tqdm = ">=4.27" [package.extras] accelerate = ["accelerate (>=0.26.0)"] agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"] -all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] benchmark = ["optimum-benchmark (>=0.3.0)"] codecarbon = ["codecarbon (==1.2.0)"] deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"] deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.20,<0.21)", "urllib3 (<2.0.0)"] dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "libcst", "librosa", "nltk (<=3.8.1)", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.5.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=0.9.16)", "tokenizers (>=0.20,<0.21)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] @@ -6472,7 +6472,7 @@ torch = ["accelerate (>=0.26.0)", "torch"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] torchhub = ["filelock", "huggingface-hub (>=0.23.2,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.20,<0.21)", "torch", "tqdm (>=4.27)"] -video = ["av (==9.2.0)", "decord (==0.6.0)"] +video = ["av (==9.2.0)"] vision = ["Pillow (>=10.0.1,<=15.0)"] [[package]] @@ -6713,13 +6713,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "virtualenv" -version = "20.27.0" +version = "20.27.1" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.8" files = [ - {file = "virtualenv-20.27.0-py3-none-any.whl", hash = "sha256:44a72c29cceb0ee08f300b314848c86e57bf8d1f13107a5e671fb9274138d655"}, - {file = "virtualenv-20.27.0.tar.gz", hash = "sha256:2ca56a68ed615b8fe4326d11a0dca5dfbe8fd68510fb6c6349163bed3c15f2b2"}, + {file = "virtualenv-20.27.1-py3-none-any.whl", hash = "sha256:f11f1b8a29525562925f745563bfd48b189450f61fb34c4f9cc79dd5aa32a1f4"}, + {file = "virtualenv-20.27.1.tar.gz", hash = "sha256:142c6be10212543b32c6c45d3d3893dff89112cc588b7d0879ae5a1ec03a47ba"}, ] [package.dependencies] @@ -7044,93 +7044,93 @@ files = [ [[package]] name = "yarl" -version = "1.16.0" +version = "1.17.0" description = "Yet another URL library" optional = false python-versions = ">=3.9" files = [ - {file = "yarl-1.16.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:32468f41242d72b87ab793a86d92f885355bcf35b3355aa650bfa846a5c60058"}, - {file = "yarl-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:234f3a3032b505b90e65b5bc6652c2329ea7ea8855d8de61e1642b74b4ee65d2"}, - {file = "yarl-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a0296040e5cddf074c7f5af4a60f3fc42c0237440df7bcf5183be5f6c802ed5"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de6c14dd7c7c0badba48157474ea1f03ebee991530ba742d381b28d4f314d6f3"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b140e532fe0266003c936d017c1ac301e72ee4a3fd51784574c05f53718a55d8"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:019f5d58093402aa8f6661e60fd82a28746ad6d156f6c5336a70a39bd7b162b9"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c42998fd1cbeb53cd985bff0e4bc25fbe55fd6eb3a545a724c1012d69d5ec84"}, - {file = "yarl-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c7c30fb38c300fe8140df30a046a01769105e4cf4282567a29b5cdb635b66c4"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e49e0fd86c295e743fd5be69b8b0712f70a686bc79a16e5268386c2defacaade"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:b9ca7b9147eb1365c8bab03c003baa1300599575effad765e0b07dd3501ea9af"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:27e11db3f1e6a51081a981509f75617b09810529de508a181319193d320bc5c7"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8994c42f4ca25df5380ddf59f315c518c81df6a68fed5bb0c159c6cb6b92f120"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:542fa8e09a581bcdcbb30607c7224beff3fdfb598c798ccd28a8184ffc18b7eb"}, - {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2bd6a51010c7284d191b79d3b56e51a87d8e1c03b0902362945f15c3d50ed46b"}, - {file = "yarl-1.16.0-cp310-cp310-win32.whl", hash = "sha256:178ccb856e265174a79f59721031060f885aca428983e75c06f78aa24b91d929"}, - {file = "yarl-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe8bba2545427418efc1929c5c42852bdb4143eb8d0a46b09de88d1fe99258e7"}, - {file = "yarl-1.16.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d8643975a0080f361639787415a038bfc32d29208a4bf6b783ab3075a20b1ef3"}, - {file = "yarl-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:676d96bafc8c2d0039cea0cd3fd44cee7aa88b8185551a2bb93354668e8315c2"}, - {file = "yarl-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d9525f03269e64310416dbe6c68d3b23e5d34aaa8f47193a1c45ac568cecbc49"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b37d5ec034e668b22cf0ce1074d6c21fd2a08b90d11b1b73139b750a8b0dd97"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f32c4cb7386b41936894685f6e093c8dfaf0960124d91fe0ec29fe439e201d0"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b8e265a0545637492a7e12fd7038370d66c9375a61d88c5567d0e044ded9202"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:789a3423f28a5fff46fbd04e339863c169ece97c827b44de16e1a7a42bc915d2"}, - {file = "yarl-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1d1f45e3e8d37c804dca99ab3cf4ab3ed2e7a62cd82542924b14c0a4f46d243"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:621280719c4c5dad4c1391160a9b88925bb8b0ff6a7d5af3224643024871675f"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ed097b26f18a1f5ff05f661dc36528c5f6735ba4ce8c9645e83b064665131349"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:2f1fe2b2e3ee418862f5ebc0c0083c97f6f6625781382f828f6d4e9b614eba9b"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:87dd10bc0618991c66cee0cc65fa74a45f4ecb13bceec3c62d78ad2e42b27a16"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:4199db024b58a8abb2cfcedac7b1292c3ad421684571aeb622a02f242280e8d6"}, - {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:99a9dcd4b71dd5f5f949737ab3f356cfc058c709b4f49833aeffedc2652dac56"}, - {file = "yarl-1.16.0-cp311-cp311-win32.whl", hash = "sha256:a9394c65ae0ed95679717d391c862dece9afacd8fa311683fc8b4362ce8a410c"}, - {file = "yarl-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:5b9101f528ae0f8f65ac9d64dda2bb0627de8a50344b2f582779f32fda747c1d"}, - {file = "yarl-1.16.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:4ffb7c129707dd76ced0a4a4128ff452cecf0b0e929f2668ea05a371d9e5c104"}, - {file = "yarl-1.16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1a5e9d8ce1185723419c487758d81ac2bde693711947032cce600ca7c9cda7d6"}, - {file = "yarl-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d743e3118b2640cef7768ea955378c3536482d95550222f908f392167fe62059"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26768342f256e6e3c37533bf9433f5f15f3e59e3c14b2409098291b3efaceacb"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1b0796168b953bca6600c5f97f5ed407479889a36ad7d17183366260f29a6b9"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:858728086914f3a407aa7979cab743bbda1fe2bdf39ffcd991469a370dd7414d"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5570e6d47bcb03215baf4c9ad7bf7c013e56285d9d35013541f9ac2b372593e7"}, - {file = "yarl-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66ea8311422a7ba1fc79b4c42c2baa10566469fe5a78500d4e7754d6e6db8724"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:649bddcedee692ee8a9b7b6e38582cb4062dc4253de9711568e5620d8707c2a3"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:3a91654adb7643cb21b46f04244c5a315a440dcad63213033826549fa2435f71"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b439cae82034ade094526a8f692b9a2b5ee936452de5e4c5f0f6c48df23f8604"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:571f781ae8ac463ce30bacebfaef2c6581543776d5970b2372fbe31d7bf31a07"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:aa7943f04f36d6cafc0cf53ea89824ac2c37acbdb4b316a654176ab8ffd0f968"}, - {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1a5cf32539373ff39d97723e39a9283a7277cbf1224f7aef0c56c9598b6486c3"}, - {file = "yarl-1.16.0-cp312-cp312-win32.whl", hash = "sha256:a5b6c09b9b4253d6a208b0f4a2f9206e511ec68dce9198e0fbec4f160137aa67"}, - {file = "yarl-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:1208ca14eed2fda324042adf8d6c0adf4a31522fa95e0929027cd487875f0240"}, - {file = "yarl-1.16.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5ace0177520bd4caa99295a9b6fb831d0e9a57d8e0501a22ffaa61b4c024283"}, - {file = "yarl-1.16.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7118bdb5e3ed81acaa2095cba7ec02a0fe74b52a16ab9f9ac8e28e53ee299732"}, - {file = "yarl-1.16.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38fec8a2a94c58bd47c9a50a45d321ab2285ad133adefbbadf3012c054b7e656"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8791d66d81ee45866a7bb15a517b01a2bcf583a18ebf5d72a84e6064c417e64b"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1cf936ba67bc6c734f3aa1c01391da74ab7fc046a9f8bbfa230b8393b90cf472"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1aab176dd55b59f77a63b27cffaca67d29987d91a5b615cbead41331e6b7428"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:995d0759004c08abd5d1b81300a91d18c8577c6389300bed1c7c11675105a44d"}, - {file = "yarl-1.16.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1bc22e00edeb068f71967ab99081e9406cd56dbed864fc3a8259442999d71552"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:35b4f7842154176523e0a63c9b871168c69b98065d05a4f637fce342a6a2693a"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:7ace71c4b7a0c41f317ae24be62bb61e9d80838d38acb20e70697c625e71f120"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8f639e3f5795a6568aa4f7d2ac6057c757dcd187593679f035adbf12b892bb00"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e8be3aff14f0120ad049121322b107f8a759be76a6a62138322d4c8a337a9e2c"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:122d8e7986043d0549e9eb23c7fd23be078be4b70c9eb42a20052b3d3149c6f2"}, - {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0fd9c227990f609c165f56b46107d0bc34553fe0387818c42c02f77974402c36"}, - {file = "yarl-1.16.0-cp313-cp313-win32.whl", hash = "sha256:595ca5e943baed31d56b33b34736461a371c6ea0038d3baec399949dd628560b"}, - {file = "yarl-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:921b81b8d78f0e60242fb3db615ea3f368827a76af095d5a69f1c3366db3f596"}, - {file = "yarl-1.16.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab2b2ac232110a1fdb0d3ffcd087783edd3d4a6ced432a1bf75caf7b7be70916"}, - {file = "yarl-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7f8713717a09acbfee7c47bfc5777e685539fefdd34fa72faf504c8be2f3df4e"}, - {file = "yarl-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cdcffe1dbcb4477d2b4202f63cd972d5baa155ff5a3d9e35801c46a415b7f71a"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a91217208306d82357c67daeef5162a41a28c8352dab7e16daa82e3718852a7"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ab3ed42c78275477ea8e917491365e9a9b69bb615cb46169020bd0aa5e2d6d3"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:707ae579ccb3262dfaef093e202b4c3fb23c3810e8df544b1111bd2401fd7b09"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7a852d1cd0b8d8b37fc9d7f8581152add917a98cfe2ea6e241878795f917ae"}, - {file = "yarl-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3f1cc3d3d4dc574bebc9b387f6875e228ace5748a7c24f49d8f01ac1bc6c31b"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5ff96da263740779b0893d02b718293cc03400c3a208fc8d8cd79d9b0993e532"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:3d375a19ba2bfe320b6d873f3fb165313b002cef8b7cc0a368ad8b8a57453837"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:62c7da0ad93a07da048b500514ca47b759459ec41924143e2ddb5d7e20fd3db5"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:147b0fcd0ee33b4b5f6edfea80452d80e419e51b9a3f7a96ce98eaee145c1581"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:504e1fe1cc4f170195320eb033d2b0ccf5c6114ce5bf2f617535c01699479bca"}, - {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:bdcf667a5dec12a48f669e485d70c54189f0639c2157b538a4cffd24a853624f"}, - {file = "yarl-1.16.0-cp39-cp39-win32.whl", hash = "sha256:e9951afe6557c75a71045148890052cb942689ee4c9ec29f5436240e1fcc73b7"}, - {file = "yarl-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:7d7aaa8ff95d0840e289423e7dc35696c2b058d635f945bf05b5cd633146b027"}, - {file = "yarl-1.16.0-py3-none-any.whl", hash = "sha256:e6980a558d8461230c457218bd6c92dfc1d10205548215c2c21d79dc8d0a96f3"}, - {file = "yarl-1.16.0.tar.gz", hash = "sha256:b6f687ced5510a9a2474bbae96a4352e5ace5fa34dc44a217b0537fec1db00b4"}, + {file = "yarl-1.17.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2d8715edfe12eee6f27f32a3655f38d6c7410deb482158c0b7d4b7fad5d07628"}, + {file = "yarl-1.17.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1803bf2a7a782e02db746d8bd18f2384801bc1d108723840b25e065b116ad726"}, + {file = "yarl-1.17.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e66589110e20c2951221a938fa200c7aa134a8bdf4e4dc97e6b21539ff026d4"}, + {file = "yarl-1.17.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7069d411cfccf868e812497e0ec4acb7c7bf8d684e93caa6c872f1e6f5d1664d"}, + {file = "yarl-1.17.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cbf70ba16118db3e4b0da69dcde9d4d4095d383c32a15530564c283fa38a7c52"}, + {file = "yarl-1.17.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0bc53cc349675b32ead83339a8de79eaf13b88f2669c09d4962322bb0f064cbc"}, + {file = "yarl-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6aa18a402d1c80193ce97c8729871f17fd3e822037fbd7d9b719864018df746"}, + {file = "yarl-1.17.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d89c5bc701861cfab357aa0cd039bc905fe919997b8c312b4b0c358619c38d4d"}, + {file = "yarl-1.17.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b728bdf38ca58f2da1d583e4af4ba7d4cd1a58b31a363a3137a8159395e7ecc7"}, + {file = "yarl-1.17.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:5542e57dc15d5473da5a39fbde14684b0cc4301412ee53cbab677925e8497c11"}, + {file = "yarl-1.17.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e564b57e5009fb150cb513804d7e9e9912fee2e48835638f4f47977f88b4a39c"}, + {file = "yarl-1.17.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:eb3c4cff524b4c1c1dba3a6da905edb1dfd2baf6f55f18a58914bbb2d26b59e1"}, + {file = "yarl-1.17.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:05e13f389038842da930d439fbed63bdce3f7644902714cb68cf527c971af804"}, + {file = "yarl-1.17.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:153c38ee2b4abba136385af4467459c62d50f2a3f4bde38c7b99d43a20c143ef"}, + {file = "yarl-1.17.0-cp310-cp310-win32.whl", hash = "sha256:4065b4259d1ae6f70fd9708ffd61e1c9c27516f5b4fae273c41028afcbe3a094"}, + {file = "yarl-1.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:abf366391a02a8335c5c26163b5fe6f514cc1d79e74d8bf3ffab13572282368e"}, + {file = "yarl-1.17.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:19a4fe0279626c6295c5b0c8c2bb7228319d2e985883621a6e87b344062d8135"}, + {file = "yarl-1.17.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cadd0113f4db3c6b56868d6a19ca6286f5ccfa7bc08c27982cf92e5ed31b489a"}, + {file = "yarl-1.17.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:60d6693eef43215b1ccfb1df3f6eae8db30a9ff1e7989fb6b2a6f0b468930ee8"}, + {file = "yarl-1.17.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb8bf3843e1fa8cf3fe77813c512818e57368afab7ebe9ef02446fe1a10b492"}, + {file = "yarl-1.17.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d2a5b35fd1d8d90443e061d0c8669ac7600eec5c14c4a51f619e9e105b136715"}, + {file = "yarl-1.17.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c5bf17b32f392df20ab5c3a69d37b26d10efaa018b4f4e5643c7520d8eee7ac7"}, + {file = "yarl-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48f51b529b958cd06e78158ff297a8bf57b4021243c179ee03695b5dbf9cb6e1"}, + {file = "yarl-1.17.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5fcaa06bf788e19f913d315d9c99a69e196a40277dc2c23741a1d08c93f4d430"}, + {file = "yarl-1.17.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:32f3ee19ff0f18a7a522d44e869e1ebc8218ad3ae4ebb7020445f59b4bbe5897"}, + {file = "yarl-1.17.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:a4fb69a81ae2ec2b609574ae35420cf5647d227e4d0475c16aa861dd24e840b0"}, + {file = "yarl-1.17.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7bacc8b77670322132a1b2522c50a1f62991e2f95591977455fd9a398b4e678d"}, + {file = "yarl-1.17.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:437bf6eb47a2d20baaf7f6739895cb049e56896a5ffdea61a4b25da781966e8b"}, + {file = "yarl-1.17.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:30534a03c87484092080e3b6e789140bd277e40f453358900ad1f0f2e61fc8ec"}, + {file = "yarl-1.17.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b30df4ff98703649915144be6f0df3b16fd4870ac38a09c56d5d9e54ff2d5f96"}, + {file = "yarl-1.17.0-cp311-cp311-win32.whl", hash = "sha256:263b487246858e874ab53e148e2a9a0de8465341b607678106829a81d81418c6"}, + {file = "yarl-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:07055a9e8b647a362e7d4810fe99d8f98421575e7d2eede32e008c89a65a17bd"}, + {file = "yarl-1.17.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:84095ab25ba69a8fa3fb4936e14df631b8a71193fe18bd38be7ecbe34d0f5512"}, + {file = "yarl-1.17.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:02608fb3f6df87039212fc746017455ccc2a5fc96555ee247c45d1e9f21f1d7b"}, + {file = "yarl-1.17.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13468d291fe8c12162b7cf2cdb406fe85881c53c9e03053ecb8c5d3523822cd9"}, + {file = "yarl-1.17.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8da3f8f368fb7e2f052fded06d5672260c50b5472c956a5f1bd7bf474ae504ab"}, + {file = "yarl-1.17.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ec0507ab6523980bed050137007c76883d941b519aca0e26d4c1ec1f297dd646"}, + {file = "yarl-1.17.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08fc76df7fd8360e9ff30e6ccc3ee85b8dbd6ed5d3a295e6ec62bcae7601b932"}, + {file = "yarl-1.17.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d522f390686acb6bab2b917dd9ca06740c5080cd2eaa5aef8827b97e967319d"}, + {file = "yarl-1.17.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:147c527a80bb45b3dcd6e63401af8ac574125d8d120e6afe9901049286ff64ef"}, + {file = "yarl-1.17.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:24cf43bcd17a0a1f72284e47774f9c60e0bf0d2484d5851f4ddf24ded49f33c6"}, + {file = "yarl-1.17.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c28a44b9e0fba49c3857360e7ad1473fc18bc7f6659ca08ed4f4f2b9a52c75fa"}, + {file = "yarl-1.17.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:350cacb2d589bc07d230eb995d88fcc646caad50a71ed2d86df533a465a4e6e1"}, + {file = "yarl-1.17.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:fd1ab1373274dea1c6448aee420d7b38af163b5c4732057cd7ee9f5454efc8b1"}, + {file = "yarl-1.17.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:4934e0f96dadc567edc76d9c08181633c89c908ab5a3b8f698560124167d9488"}, + {file = "yarl-1.17.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8d0a278170d75c88e435a1ce76557af6758bfebc338435b2eba959df2552163e"}, + {file = "yarl-1.17.0-cp312-cp312-win32.whl", hash = "sha256:61584f33196575a08785bb56db6b453682c88f009cd9c6f338a10f6737ce419f"}, + {file = "yarl-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:9987a439ad33a7712bd5bbd073f09ad10d38640425fa498ecc99d8aa064f8fc4"}, + {file = "yarl-1.17.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8deda7b8eb15a52db94c2014acdc7bdd14cb59ec4b82ac65d2ad16dc234a109e"}, + {file = "yarl-1.17.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56294218b348dcbd3d7fce0ffd79dd0b6c356cb2a813a1181af730b7c40de9e7"}, + {file = "yarl-1.17.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1fab91292f51c884b290ebec0b309a64a5318860ccda0c4940e740425a67b6b7"}, + {file = "yarl-1.17.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cf93fa61ff4d9c7d40482ce1a2c9916ca435e34a1b8451e17f295781ccc034f"}, + {file = "yarl-1.17.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:261be774a0d71908c8830c33bacc89eef15c198433a8cc73767c10eeeb35a7d0"}, + {file = "yarl-1.17.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:deec9693b67f6af856a733b8a3e465553ef09e5e8ead792f52c25b699b8f9e6e"}, + {file = "yarl-1.17.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c804b07622ba50a765ca7fb8145512836ab65956de01307541def869e4a456c9"}, + {file = "yarl-1.17.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d013a7c9574e98c14831a8f22d27277688ec3b2741d0188ac01a910b009987a"}, + {file = "yarl-1.17.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e2cfcba719bd494c7413dcf0caafb51772dec168c7c946e094f710d6aa70494e"}, + {file = "yarl-1.17.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:c068aba9fc5b94dfae8ea1cedcbf3041cd4c64644021362ffb750f79837e881f"}, + {file = "yarl-1.17.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:3616df510ffac0df3c9fa851a40b76087c6c89cbcea2de33a835fc80f9faac24"}, + {file = "yarl-1.17.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:755d6176b442fba9928a4df787591a6a3d62d4969f05c406cad83d296c5d4e05"}, + {file = "yarl-1.17.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:c18f6e708d1cf9ff5b1af026e697ac73bea9cb70ee26a2b045b112548579bed2"}, + {file = "yarl-1.17.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5b937c216b6dee8b858c6afea958de03c5ff28406257d22b55c24962a2baf6fd"}, + {file = "yarl-1.17.0-cp313-cp313-win32.whl", hash = "sha256:d0131b14cb545c1a7bd98f4565a3e9bdf25a1bd65c83fc156ee5d8a8499ec4a3"}, + {file = "yarl-1.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:01c96efa4313c01329e88b7e9e9e1b2fc671580270ddefdd41129fa8d0db7696"}, + {file = "yarl-1.17.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:0d44f67e193f0a7acdf552ecb4d1956a3a276c68e7952471add9f93093d1c30d"}, + {file = "yarl-1.17.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:16ea0aa5f890cdcb7ae700dffa0397ed6c280840f637cd07bffcbe4b8d68b985"}, + {file = "yarl-1.17.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cf5469dc7dcfa65edf5cc3a6add9f84c5529c6b556729b098e81a09a92e60e51"}, + {file = "yarl-1.17.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e662bf2f6e90b73cf2095f844e2bc1fda39826472a2aa1959258c3f2a8500a2f"}, + {file = "yarl-1.17.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8260e88f1446904ba20b558fa8ce5d0ab9102747238e82343e46d056d7304d7e"}, + {file = "yarl-1.17.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5dc16477a4a2c71e64c5d3d15d7ae3d3a6bb1e8b955288a9f73c60d2a391282f"}, + {file = "yarl-1.17.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46027e326cecd55e5950184ec9d86c803f4f6fe4ba6af9944a0e537d643cdbe0"}, + {file = "yarl-1.17.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc95e46c92a2b6f22e70afe07e34dbc03a4acd07d820204a6938798b16f4014f"}, + {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:16ca76c7ac9515320cd09d6cc083d8d13d1803f6ebe212b06ea2505fd66ecff8"}, + {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:eb1a5b97388f2613f9305d78a3473cdf8d80c7034e554d8199d96dcf80c62ac4"}, + {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:41fd5498975418cdc34944060b8fbeec0d48b2741068077222564bea68daf5a6"}, + {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:146ca582ed04a5664ad04b0e0603934281eaab5c0115a5a46cce0b3c061a56a1"}, + {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:6abb8c06107dbec97481b2392dafc41aac091a5d162edf6ed7d624fe7da0587a"}, + {file = "yarl-1.17.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:4d14be4613dd4f96c25feb4bd8c0d8ce0f529ab0ae555a17df5789e69d8ec0c5"}, + {file = "yarl-1.17.0-cp39-cp39-win32.whl", hash = "sha256:174d6a6cad1068f7850702aad0c7b1bca03bcac199ca6026f84531335dfc2646"}, + {file = "yarl-1.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:6af417ca2c7349b101d3fd557ad96b4cd439fdb6ab0d288e3f64a068eea394d0"}, + {file = "yarl-1.17.0-py3-none-any.whl", hash = "sha256:62dd42bb0e49423f4dd58836a04fcf09c80237836796025211bbe913f1524993"}, + {file = "yarl-1.17.0.tar.gz", hash = "sha256:d3f13583f378930377e02002b4085a3d025b00402d5a80911726d43a67911cd9"}, ] [package.dependencies] diff --git a/tests/test_backend_asciidoc.py b/tests/test_backend_asciidoc.py index ab94d58c..e4fae312 100644 --- a/tests/test_backend_asciidoc.py +++ b/tests/test_backend_asciidoc.py @@ -2,9 +2,6 @@ import os from pathlib import Path -import pytest -from docling_core.types.doc import BoundingBox - from docling.backend.asciidoc_backend import AsciiDocBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py index 4911c248..d6753b04 100644 --- a/tests/test_e2e_conversion.py +++ b/tests/test_e2e_conversion.py @@ -1,8 +1,5 @@ from pathlib import Path -import yaml -from docling_core.types.doc import DoclingDocument - from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult