diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 3680e057..9a5fc7b3 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -214,7 +214,7 @@ def extract_text_recursively(self, item):
except:
pass
except:
- _log.warn("item has no children")
+ _log.warning("item has no children")
pass
return "".join(result) + " "
@@ -352,14 +352,14 @@ def handle_listitem(self, element, idx, doc):
parent=self.parents[self.level],
)
else:
- _log.warn("list-item has no text: ", element)
+ _log.debug("list-item has no text: ", element)
def handle_table(self, element, idx, doc):
"""Handles table tags."""
nested_tables = element.find("table")
if nested_tables is not None:
- _log.warn("detected nested tables: skipping for now")
+ _log.warning("detected nested tables: skipping for now")
return
# Count the number of rows (number of <tr> elements)
@@ -398,10 +398,7 @@ def handle_table(self, element, idx, doc):
try:
text = self.extract_table_cell_text(html_cell)
except Exception as exc:
- _log.warn("exception: ", exc)
- exit(-1)
-
- # label = html_cell.name
+ _log.warning("exception: ", exc)
col_span = int(html_cell.get("colspan", 1))
row_span = int(html_cell.get("rowspan", 1))
@@ -469,49 +466,49 @@ def _get_imageref(self, element):
fig_ref = None
img = element.find(["img"])
- _log.info(img)
if img is not None and img.has_attr("src"):
fig_uri = img["src"]
- _log.info(fig_uri)
- dpi = 128
+ if fig_uri.startswith("//"):
+ fig_uri = "https:" + fig_uri
+
+ dpi: int = 128
try:
dpi = int(img["dpi"])
except:
_log.debug("could not identify `dpi` of image")
- width = 128
+ width: float = 128.0
try:
- width = int(img["width"])
+ width = float(img["width"])
except:
_log.debug("could not identify `width` of image")
- height = 128
+ height: float = 128.0
try:
- height = int(img["height"])
+ height = float(img["height"])
except:
_log.debug("could not identify `height` of image")
- if fig_uri.endswith(".jpg"):
- fig_ref = ImageRef(
- mimetype="image/jpg", dpi=dpi, size=Size(width, height), uri=fig_uri
- )
+ size = Size(width=width, height=height)
- elif fig_uri.endswith(".jpeg"):
+ if fig_uri.endswith(".jpg") or fig_uri.endswith(".jpeg"):
fig_ref = ImageRef(
- mimetype="image/jpg", dpi=dpi, size=Size(width, height), uri=fig_uri
+ mimetype="image/jpeg", dpi=dpi, size=size, uri=fig_uri
)
elif fig_uri.endswith(".png"):
fig_ref = ImageRef(
- mimetype="image/png", dpi=dpi, size=Size(width, height), uri=fig_uri
+ mimetype="image/png", dpi=dpi, size=size, uri=fig_uri
)
elif fig_uri.endswith(".svg"):
fig_ref = ImageRef(
- mimetype="image/svg", dpi=dpi, size=Size(width, height), uri=fig_uri
+ mimetype="image/svg", dpi=dpi, size=size, uri=fig_uri
)
+ else:
+ _log.debug(f"We do not yet support uri of type: {fig_uri}")
return fig_ref
@@ -537,8 +534,6 @@ def handle_figure(self, element, idx, doc):
fig_ref = self._get_imageref(element)
fig_caption = self._get_figcaption(element, doc)
- _log.warn(fig_ref)
-
doc.add_picture(
parent=self.parents[self.level], image=fig_ref, caption=fig_caption
)
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
index 8c345c34..08c45007 100644
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
@@ -5906,6 +5906,15 @@
],
"references": [],
"footnotes": [],
+ "image": {
+ "mimetype": "image/jpeg",
+ "dpi": 128,
+ "size": {
+ "width": 220.0,
+ "height": 205.0
+ },
+ "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/8/82/Pacific_Black_Ducks_on_pond_ducking.jpg/220px-Pacific_Black_Ducks_on_pond_ducking.jpg"
+ },
"annotations": []
},
{
@@ -5923,6 +5932,15 @@
],
"references": [],
"footnotes": [],
+ "image": {
+ "mimetype": "image/jpeg",
+ "dpi": 128,
+ "size": {
+ "width": 220.0,
+ "height": 214.0
+ },
+ "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Mallard-drake-chicago-march-2024.jpg/220px-Mallard-drake-chicago-march-2024.jpg"
+ },
"annotations": []
},
{
@@ -5940,6 +5958,15 @@
],
"references": [],
"footnotes": [],
+ "image": {
+ "mimetype": "image/jpeg",
+ "dpi": 128,
+ "size": {
+ "width": 220.0,
+ "height": 128.0
+ },
+ "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7d/Wood-ducks-male-female-chicago-march-2024.jpg/220px-Wood-ducks-male-female-chicago-march-2024.jpg"
+ },
"annotations": []
},
{
@@ -5957,6 +5984,15 @@
],
"references": [],
"footnotes": [],
+ "image": {
+ "mimetype": "image/jpeg",
+ "dpi": 128,
+ "size": {
+ "width": 220.0,
+ "height": 147.0
+ },
+ "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/5/54/Mallard_drake_.02.jpg/220px-Mallard_drake_.02.jpg"
+ },
"annotations": []
},
{
@@ -5974,6 +6010,15 @@
],
"references": [],
"footnotes": [],
+ "image": {
+ "mimetype": "image/jpeg",
+ "dpi": 128,
+ "size": {
+ "width": 220.0,
+ "height": 157.0
+ },
+ "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/5/51/Mandarin.duck.arp.jpg/220px-Mandarin.duck.arp.jpg"
+ },
"annotations": []
},
{
@@ -5991,6 +6036,15 @@
],
"references": [],
"footnotes": [],
+ "image": {
+ "mimetype": "image/jpeg",
+ "dpi": 128,
+ "size": {
+ "width": 220.0,
+ "height": 147.0
+ },
+ "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d0/Last_day_in_Ushuaia%2C_Argentina.Flying_Steamer-Ducks_%28Tachyeres_patachonicus%29_in_various_artistic_settings.Harbour_silhouettes._%2825921897721%29.jpg/220px-Last_day_in_Ushuaia%2C_Argentina.Flying_Steamer-Ducks_%28Tachyeres_patachonicus%29_in_various_artistic_settings.Harbour_silhouettes._%2825921897721%29.jpg"
+ },
"annotations": []
},
{
@@ -6008,6 +6062,15 @@
],
"references": [],
"footnotes": [],
+ "image": {
+ "mimetype": "image/jpeg",
+ "dpi": 128,
+ "size": {
+ "width": 220.0,
+ "height": 165.0
+ },
+ "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Female_Mallard_at_Menacuddle_Well.jpg/220px-Female_Mallard_at_Menacuddle_Well.jpg"
+ },
"annotations": []
},
{
@@ -6025,6 +6088,15 @@
],
"references": [],
"footnotes": [],
+ "image": {
+ "mimetype": "image/jpeg",
+ "dpi": 128,
+ "size": {
+ "width": 220.0,
+ "height": 106.0
+ },
+ "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/6/65/Duck_1_filter_teeth_edit.jpg/220px-Duck_1_filter_teeth_edit.jpg"
+ },
"annotations": []
},
{
@@ -6059,6 +6131,15 @@
],
"references": [],
"footnotes": [],
+ "image": {
+ "mimetype": "image/jpeg",
+ "dpi": 128,
+ "size": {
+ "width": 220.0,
+ "height": 165.0
+ },
+ "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a6/Parrulo_-Muscovy_duckling.jpg/220px-Parrulo_-Muscovy_duckling.jpg"
+ },
"annotations": []
},
{
@@ -6093,6 +6174,15 @@
],
"references": [],
"footnotes": [],
+ "image": {
+ "mimetype": "image/jpeg",
+ "dpi": 128,
+ "size": {
+ "width": 220.0,
+ "height": 220.0
+ },
+ "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d8/Tunnel_of_ducks.jpg/220px-Tunnel_of_ducks.jpg"
+ },
"annotations": []
},
{
@@ -6110,6 +6200,15 @@
],
"references": [],
"footnotes": [],
+ "image": {
+ "mimetype": "image/png",
+ "dpi": 128,
+ "size": {
+ "width": 130.0,
+ "height": 149.0
+ },
+ "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/04/Maaninka.vaakuna.svg/130px-Maaninka.vaakuna.svg.png"
+ },
"annotations": []
},
{