Merge branch 'main' of github.com:DS4SD/docling into release_v3

DS4SD · Dec 17, 2024 · 2c2026d · 2c2026d
2 parents dca32bf + 00dec7a
commit 2c2026d
Show file tree

Hide file tree

Showing 35 changed files with 54,815 additions and 20 deletions.
diff --git a/docling/backend/xml/__init__.py b/docling/backend/xml/__init__.py
diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
@@ -1,4 +1,4 @@
-from enum import Enum, auto
+from enum import Enum
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 from docling_core.types.doc import (
@@ -28,6 +28,8 @@ class ConversionStatus(str, Enum):
 
 
 class InputFormat(str, Enum):
+    """A document format supported by document backend parsers."""
+
     DOCX = "docx"
     PPTX = "pptx"
     HTML = "html"
@@ -36,6 +38,7 @@ class InputFormat(str, Enum):
     ASCIIDOC = "asciidoc"
     MD = "md"
     XLSX = "xlsx"
+    XML_USPTO = "xml_uspto"
 
 
 class OutputFormat(str, Enum):
@@ -55,6 +58,7 @@ class OutputFormat(str, Enum):
     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
     InputFormat.XLSX: ["xlsx"],
+    InputFormat.XML_USPTO: ["xml", "txt"],
 }
 
 FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -81,10 +85,13 @@ class OutputFormat(str, Enum):
     InputFormat.XLSX: [
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
+    InputFormat.XML_USPTO: ["application/xml", "text/plain"],
 }
 
-MimeTypeToFormat = {
-    mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
+MimeTypeToFormat: dict[str, list[InputFormat]] = {
+    mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
+    for value in FormatToMimeType.values()
+    for mime in value
 }
 
 

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
@@ -3,7 +3,17 @@
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
+from typing import (
+    TYPE_CHECKING,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Set,
+    Type,
+    Union,
+)
 
 import filetype
 from docling_core.types.doc import (
@@ -237,31 +247,31 @@ def docs(
             if isinstance(obj, Path):
                 yield InputDocument(
                     path_or_stream=obj,
-                    format=format,
+                    format=format,  # type: ignore[arg-type]
                     filename=obj.name,
                     limits=self.limits,
                     backend=backend,
                 )
             elif isinstance(obj, DocumentStream):
                 yield InputDocument(
                     path_or_stream=obj.stream,
-                    format=format,
+                    format=format,  # type: ignore[arg-type]
                     filename=obj.name,
                     limits=self.limits,
                     backend=backend,
                 )
             else:
                 raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
 
-    def _guess_format(self, obj: Union[Path, DocumentStream]):
+    def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
         content = b""  # empty binary blob
-        format = None
+        formats: list[InputFormat] = []
 
         if isinstance(obj, Path):
             mime = filetype.guess_mime(str(obj))
             if mime is None:
                 ext = obj.suffix[1:]
-                mime = self._mime_from_extension(ext)
+                mime = _DocumentConversionInput._mime_from_extension(ext)
             if mime is None:  # must guess from
                 with obj.open("rb") as f:
                     content = f.read(1024)  # Read first 1KB
@@ -276,15 +286,53 @@ def _guess_format(self, obj: Union[Path, DocumentStream]):
                     if ("." in obj.name and not obj.name.startswith("."))
                     else ""
                 )
-                mime = self._mime_from_extension(ext)
+                mime = _DocumentConversionInput._mime_from_extension(ext)
 
-        mime = mime or self._detect_html_xhtml(content)
+        mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
         mime = mime or "text/plain"
+        formats = MimeTypeToFormat.get(mime, [])
+        if formats:
+            # TODO: remove application/xml case after adding another XML parser
+            if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
+                return formats[0]
+            else:  # ambiguity in formats
+                return _DocumentConversionInput._guess_from_content(
+                    content, mime, formats
+                )
+        else:
+            return None
+
+    @staticmethod
+    def _guess_from_content(
+        content: bytes, mime: str, formats: list[InputFormat]
+    ) -> Optional[InputFormat]:
+        """Guess the input format of a document by checking part of its content."""
+        input_format: Optional[InputFormat] = None
+        content_str = content.decode("utf-8")
+
+        if mime == "application/xml":
+            match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
+            if match_doctype:
+                xml_doctype = match_doctype.group()
+                if InputFormat.XML_USPTO in formats and any(
+                    item in xml_doctype
+                    for item in (
+                        "us-patent-application-v4",
+                        "us-patent-grant-v4",
+                        "us-grant-025",
+                        "patent-application-publication",
+                    )
+                ):
+                    input_format = InputFormat.XML_USPTO
+
+        elif mime == "text/plain":
+            if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
+                input_format = InputFormat.XML_USPTO
 
-        format = MimeTypeToFormat.get(mime)
-        return format
+        return input_format
 
-    def _mime_from_extension(self, ext):
+    @staticmethod
+    def _mime_from_extension(ext):
         mime = None
         if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
             mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
@@ -295,7 +343,19 @@ def _mime_from_extension(self, ext):
 
         return mime
 
-    def _detect_html_xhtml(self, content):
+    @staticmethod
+    def _detect_html_xhtml(
+        content: bytes,
+    ) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
+        """Guess the mime type of an XHTML, HTML, or XML file from its content.
+
+        Args:
+            content: A short piece of a document from its beginning.
+
+        Returns:
+            The mime type of an XHTML, HTML, or XML file, or None if the content does
+              not match any of these formats.
+        """
         content_str = content.decode("ascii", errors="ignore").lower()
         # Remove XML comments
         content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
@@ -304,6 +364,8 @@ def _detect_html_xhtml(self, content):
         if re.match(r"<\?xml", content_str):
             if "xhtml" in content_str[:1000]:
                 return "application/xhtml+xml"
+            else:
+                return "application/xml"
 
         if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
             return "text/html"

diff --git a/docling/document_converter.py b/docling/document_converter.py
@@ -15,6 +15,7 @@
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
     DoclingComponentType,
@@ -82,12 +83,17 @@ class HTMLFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
 
 
-class PdfFormatOption(FormatOption):
+class PatentUsptoFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
+
+
+class ImageFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
     backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
 
 
-class ImageFormatOption(FormatOption):
+class PdfFormatOption(FormatOption):
     pipeline_cls: Type = StandardPdfPipeline
     backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
 
@@ -112,6 +118,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
         InputFormat.HTML: FormatOption(
             pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
         ),
+        InputFormat.XML_USPTO: FormatOption(
+            pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
+        ),
         InputFormat.IMAGE: FormatOption(
             pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
         ),