Skip to content

Commit

Permalink
feat: allow to explicitly initialize the pipeline
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
  • Loading branch information
dolfim-ibm committed Oct 30, 2024
1 parent 4334986 commit 4dfa7e6
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 4 deletions.
12 changes: 8 additions & 4 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,10 @@ def __init__(

self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}

def initialize_pipeline(self, format: InputFormat) -> None:
    """Eagerly set up the conversion pipeline for one input format.

    Delegates to ``_get_pipeline`` with the given format and ignores the
    pipeline it returns, so the call is useful only for the work that the
    lookup performs.
    NOTE(review): presumably this builds and caches the pipeline ahead of
    the first conversion (and may trigger model downloads) -- confirm
    against the full body of ``_get_pipeline``.

    Args:
        format: The input format whose pipeline should be prepared.

    Raises:
        RuntimeError: Propagated from ``_get_pipeline`` when no format
            options are registered for ``format``.
    """
    # Only the side effects of the lookup matter; drop the returned pipeline.
    _ = self._get_pipeline(doc_format=format)

@validate_call(config=ConfigDict(strict=True))
def convert(
self,
Expand Down Expand Up @@ -219,13 +223,13 @@ def _convert(
else:
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")

def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
assert self.format_to_options is not None

fopt = self.format_to_options.get(doc.format)
fopt = self.format_to_options.get(doc_format)

if fopt is None:
raise RuntimeError(f"Could not get pipeline for document {doc.file}")
raise RuntimeError(f"Could not get pipeline for {doc_format}")
else:
pipeline_class = fopt.pipeline_cls
pipeline_options = fopt.pipeline_options
Expand Down Expand Up @@ -256,7 +260,7 @@ def _execute_pipeline(
self, in_doc: InputDocument, raises_on_error: bool
) -> ConversionResult:
if in_doc.valid:
pipeline = self._get_pipeline(in_doc)
pipeline = self._get_pipeline(in_doc.format)
if pipeline is None: # Can't find a default pipeline. Should this raise?
if raises_on_error:
raise RuntimeError(
Expand Down
6 changes: 6 additions & 0 deletions docs/examples/run_with_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,12 @@ def main():
)
)

# Initialize the PDF pipeline up front. This also downloads the required models.
print("Doing initialize for PDF")
doc_converter.initialize_pipeline(InputFormat.PDF)

# convert the documents
print("Converting documents")
conv_results = doc_converter.convert_all(input_paths)

for res in conv_results:
Expand Down

0 comments on commit 4dfa7e6

Please sign in to comment.