Skip to content

Commit

Permalink
Merge branch 'main' of github.com:DS4SD/docling into release_v3
Browse files Browse the repository at this point in the history
  • Loading branch information
cau-git committed Dec 17, 2024
2 parents dca32bf + 00dec7a commit 2c2026d
Show file tree
Hide file tree
Showing 35 changed files with 54,815 additions and 20 deletions.
Empty file added docling/backend/xml/__init__.py
Empty file.
1,888 changes: 1,888 additions & 0 deletions docling/backend/xml/uspto_backend.py

Large diffs are not rendered by default.

13 changes: 10 additions & 3 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from enum import Enum, auto
from enum import Enum
from typing import TYPE_CHECKING, Dict, List, Optional, Union

from docling_core.types.doc import (
Expand Down Expand Up @@ -28,6 +28,8 @@ class ConversionStatus(str, Enum):


class InputFormat(str, Enum):
"""A document format supported by document backend parsers."""

DOCX = "docx"
PPTX = "pptx"
HTML = "html"
Expand All @@ -36,6 +38,7 @@ class InputFormat(str, Enum):
ASCIIDOC = "asciidoc"
MD = "md"
XLSX = "xlsx"
XML_USPTO = "xml_uspto"


class OutputFormat(str, Enum):
Expand All @@ -55,6 +58,7 @@ class OutputFormat(str, Enum):
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.XLSX: ["xlsx"],
InputFormat.XML_USPTO: ["xml", "txt"],
}

FormatToMimeType: Dict[InputFormat, List[str]] = {
Expand All @@ -81,10 +85,13 @@ class OutputFormat(str, Enum):
InputFormat.XLSX: [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
}

MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
MimeTypeToFormat: dict[str, list[InputFormat]] = {
mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
for value in FormatToMimeType.values()
for mime in value
}


Expand Down
86 changes: 74 additions & 12 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,17 @@
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
from typing import (
TYPE_CHECKING,
Dict,
Iterable,
List,
Literal,
Optional,
Set,
Type,
Union,
)

import filetype
from docling_core.types.doc import (
Expand Down Expand Up @@ -237,31 +247,31 @@ def docs(
if isinstance(obj, Path):
yield InputDocument(
path_or_stream=obj,
format=format,
format=format, # type: ignore[arg-type]
filename=obj.name,
limits=self.limits,
backend=backend,
)
elif isinstance(obj, DocumentStream):
yield InputDocument(
path_or_stream=obj.stream,
format=format,
format=format, # type: ignore[arg-type]
filename=obj.name,
limits=self.limits,
backend=backend,
)
else:
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")

def _guess_format(self, obj: Union[Path, DocumentStream]):
def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
content = b"" # empty binary blob
format = None
formats: list[InputFormat] = []

if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj))
if mime is None:
ext = obj.suffix[1:]
mime = self._mime_from_extension(ext)
mime = _DocumentConversionInput._mime_from_extension(ext)
if mime is None: # must guess from
with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB
Expand All @@ -276,15 +286,53 @@ def _guess_format(self, obj: Union[Path, DocumentStream]):
if ("." in obj.name and not obj.name.startswith("."))
else ""
)
mime = self._mime_from_extension(ext)
mime = _DocumentConversionInput._mime_from_extension(ext)

mime = mime or self._detect_html_xhtml(content)
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, [])
if formats:
# TODO: remove application/xml case after adding another XML parser
if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
return formats[0]
else: # ambiguity in formats
return _DocumentConversionInput._guess_from_content(
content, mime, formats
)
else:
return None

@staticmethod
def _guess_from_content(
    content: bytes, mime: str, formats: list[InputFormat]
) -> Optional[InputFormat]:
    """Guess the input format of a document by checking part of its content.

    Args:
        content: A short piece of the document from its beginning. It may be
            cut in the middle of a multi-byte character.
        mime: The mime type already guessed for the document.
        formats: The candidate input formats associated with ``mime``.

    Returns:
        The input format identified from the content, or None if the content
        does not match any of the candidate formats.
    """
    input_format: Optional[InputFormat] = None
    # Only a prefix of the document is available, so it can end inside a
    # multi-byte sequence (or not be UTF-8 at all). A strict decode would raise
    # UnicodeDecodeError; decode leniently so detection falls through to None.
    content_str = content.decode("utf-8", errors="replace")

    if mime == "application/xml":
        # USPTO XML flavors are recognized by the DTD named in the DOCTYPE.
        match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
        if match_doctype:
            xml_doctype = match_doctype.group()
            if InputFormat.XML_USPTO in formats and any(
                item in xml_doctype
                for item in (
                    "us-patent-application-v4",
                    "us-patent-grant-v4",
                    "us-grant-025",
                    "patent-application-publication",
                )
            ):
                input_format = InputFormat.XML_USPTO

    elif mime == "text/plain":
        # Plain-text candidates are recognized by a leading "PATN" record line.
        if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
            input_format = InputFormat.XML_USPTO

    return input_format

def _mime_from_extension(self, ext):
@staticmethod
def _mime_from_extension(ext):
mime = None
if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
Expand All @@ -295,7 +343,19 @@ def _mime_from_extension(self, ext):

return mime

def _detect_html_xhtml(self, content):
@staticmethod
def _detect_html_xhtml(
content: bytes,
) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
"""Guess the mime type of an XHTML, HTML, or XML file from its content.
Args:
content: A short piece of a document from its beginning.
Returns:
The mime type of an XHTML, HTML, or XML file, or None if the content does
not match any of these formats.
"""
content_str = content.decode("ascii", errors="ignore").lower()
# Remove XML comments
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
Expand All @@ -304,6 +364,8 @@ def _detect_html_xhtml(self, content):
if re.match(r"<\?xml", content_str):
if "xhtml" in content_str[:1000]:
return "application/xhtml+xml"
else:
return "application/xml"

if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
return "text/html"
Expand Down
13 changes: 11 additions & 2 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
Expand Down Expand Up @@ -82,12 +83,17 @@ class HTMLFormatOption(FormatOption):
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend


class PdfFormatOption(FormatOption):
# Format option whose defaults select SimplePipeline as the pipeline class and
# PatentUsptoDocumentBackend as the document backend.
class PatentUsptoFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend


# Format option whose defaults select StandardPdfPipeline as the pipeline class
# and DoclingParseV2DocumentBackend as the document backend.
class ImageFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend


class ImageFormatOption(FormatOption):
class PdfFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend

Expand All @@ -112,6 +118,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.HTML: FormatOption(
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
),
InputFormat.XML_USPTO: FormatOption(
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
Expand Down
Loading

0 comments on commit 2c2026d

Please sign in to comment.