feat: allow explicit initialization of the pipeline

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
DS4SD · Oct 30, 2024 · 4dfa7e6 · 4dfa7e6
1 parent 4334986
commit 4dfa7e6
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 4 deletions.
diff --git a/docling/document_converter.py b/docling/document_converter.py
@@ -139,6 +139,10 @@ def __init__(
 
         self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
 
+    def initialize_pipeline(self, format: InputFormat) -> None:
+        """Initialize the conversion pipeline for the selected format.
+
+        Resolves the pipeline for ``format`` ahead of time by delegating to
+        ``_get_pipeline``, so callers can trigger pipeline setup explicitly
+        instead of waiting for the first conversion.
+
+        NOTE(review): presumably the resolved pipeline is kept in
+        ``self.initialized_pipelines`` and reused by later conversions, and
+        setup may download required models -- confirm against
+        ``_get_pipeline``.
+
+        Args:
+            format: The input format whose pipeline should be initialized.
+
+        Raises:
+            RuntimeError: If no format options are registered for ``format``
+                (raised by ``_get_pipeline``).
+        """
+        # The resolved pipeline is not handed back to the caller.
+        self._get_pipeline(doc_format=format)
+
     @validate_call(config=ConfigDict(strict=True))
     def convert(
         self,
@@ -219,13 +223,13 @@ def _convert(
                 else:
                     _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
 
-    def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]:
+    def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
         assert self.format_to_options is not None
 
-        fopt = self.format_to_options.get(doc.format)
+        fopt = self.format_to_options.get(doc_format)
 
         if fopt is None:
-            raise RuntimeError(f"Could not get pipeline for document {doc.file}")
+            raise RuntimeError(f"Could not get pipeline for {doc_format}")
         else:
             pipeline_class = fopt.pipeline_cls
             pipeline_options = fopt.pipeline_options
@@ -256,7 +260,7 @@ def _execute_pipeline(
         self, in_doc: InputDocument, raises_on_error: bool
     ) -> ConversionResult:
         if in_doc.valid:
-            pipeline = self._get_pipeline(in_doc)
+            pipeline = self._get_pipeline(in_doc.format)
             if pipeline is None:  # Can't find a default pipeline. Should this raise?
                 if raises_on_error:
                     raise RuntimeError(

diff --git a/docs/examples/run_with_formats.py b/docs/examples/run_with_formats.py
@@ -57,6 +57,12 @@ def main():
         )
     )
 
+    # initialize the pipeline for PDF. This will also download the required models.
+    print("Doing initialize for PDF")
+    doc_converter.initialize_pipeline(InputFormat.PDF)
+
+    # convert the documents
+    print("Converting documents")
     conv_results = doc_converter.convert_all(input_paths)
 
     for res in conv_results: