You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
from docling.document_converter import DocumentConverter
doc_converter = DocumentConverter()
doc_converter.convert('image.pptx')
Results in:
File ~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:409, in MsPowerpointDocumentBackend.walk_linear(self, pptx_obj, doc)
[407](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:407) for shape in slide.shapes:
[408](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:408) print(f"Processing shape {shape.shape_type}...")
--> [409](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:409) handle_shapes(shape, parent_slide, slide_ind, doc)
[411](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:411) return doc
File ~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:385, in MsPowerpointDocumentBackend.walk_linear.<locals>.handle_shapes(shape, parent_slide, slide_ind, doc)
[382](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:382) self.handle_tables(shape, parent_slide, slide_ind, doc)
[383](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:383) if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
[384](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:384) # Handle Pictures
--> [385](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:385) self.handle_pictures(shape, parent_slide, slide_ind, doc)
[386](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:386) # If shape doesn't have any text, move on to the next shape
[387](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:387) if not hasattr(shape, "text"):
File ~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:276, in MsPowerpointDocumentBackend.handle_pictures(self, shape, parent_slide, slide_ind, doc)
[274](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:274) image = shape.image
[275](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:275) image_bytes = image.blob
--> [276](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:276) im_dpi, _ = image.dpi
[278](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:278) # Open it with PIL
[279](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/docling/backend/mspowerpoint_backend.py:279) pil_image = Image.open(BytesIO(image_bytes))
File ~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:191, in lazyproperty.__get__(self, obj, type)
[186](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:186) value = obj.__dict__.get(self._name)
[187](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:187) if value is None:
[188](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:188) # --- on first access, the __dict__ item will be absent. Evaluate fget()
[189](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:189) # --- and store that value in the (otherwise unused) host-object
[190](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:190) # --- __dict__ value of same name ('fget' nominally)
--> [191](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:191) value = self._fget(obj)
[192](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:192) obj.__dict__[self._name] = value
[193](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:193) return cast(_T, value)
File ~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/parts/image.py:217, in Image.dpi(self)
[214](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/parts/image.py:214) return (int_dpi(pil_dpi[0]), int_dpi(pil_dpi[1]))
[215](https://file+.vscode-resource.vscode-cdn.net//dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/parts/image.py:215) return (72, 72)
--> [217](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/parts/image.py:217) return normalize_pil_dpi(self._pil_props[2])
File ~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:191, in lazyproperty.__get__(self, obj, type)
[186](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:186) value = obj.__dict__.get(self._name)
[187](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:187) if value is None:
[188](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:188) # --- on first access, the __dict__ item will be absent. Evaluate fget()
[189](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:189) # --- and store that value in the (otherwise unused) host-object
[190](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:190) # --- __dict__ value of same name ('fget' nominally)
--> [191](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:191) value = self._fget(obj)
[192](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:192) obj.__dict__[self._name] = value
[193](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/util.py:193) return cast(_T, value)
File ~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/parts/image.py:267, in Image._pil_props(self)
[265](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/parts/image.py:265) """tuple of image properties extracted from this image using Pillow."""
[266](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/parts/image.py:266) stream = io.BytesIO(self._blob)
--> [267](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/parts/image.py:267) pil_image = PIL_Image.open(stream) # pyright: ignore[reportUnknownMemberType]
[268](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/parts/image.py:268) format = pil_image.format
[269](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/pptx/parts/image.py:269) width_px, height_px = pil_image.size
File ~/dev/docling-test/.venv/lib/python3.12/site-packages/PIL/Image.py:3498, in open(fp, mode, formats)
[3496](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/PIL/Image.py:3496) warnings.warn(message)
[3497](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/PIL/Image.py:3497) msg = "cannot identify image file %r" % (filename if filename else fp)
-> [3498](https://file+.vscode-resource.vscode-cdn.net/dev/docling-test/~/dev/docling-test/.venv/lib/python3.12/site-packages/PIL/Image.py:3498) raise UnidentifiedImageError(msg)
UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x177e0c450>
Bug
PowerPoint files containing images in formats unsupported by Pillow (e.g. WMF/EMF) cause an unhandled `UnidentifiedImageError` exception.
Steps to reproduce
Sample file: image.pptx
Results in:
We may want to wrap `Image.open()` in a `try` block.
Docling version
2.11.0
Python version
Python 3.12.0
The text was updated successfully, but these errors were encountered: