Skip to content

Commit

Permalink
another fix to the tests
Browse files (browse the repository at this point in the history)
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
  • Loading branch information
PeterStaar-IBM committed Nov 8, 2024
1 parent 311640f commit 9e54a74
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 24 deletions.
43 changes: 19 additions & 24 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ def extract_text_recursively(self, item):
except:
pass
except:
_log.warn("item has no children")
_log.warning("item has no children")
pass

return "".join(result) + " "
Expand Down Expand Up @@ -352,14 +352,14 @@ def handle_listitem(self, element, idx, doc):
parent=self.parents[self.level],
)
else:
_log.warn("list-item has no text: ", element)
_log.debug("list-item has no text: ", element)

def handle_table(self, element, idx, doc):
"""Handles table tags."""

nested_tables = element.find("table")
if nested_tables is not None:
_log.warn("detected nested tables: skipping for now")
_log.warning("detected nested tables: skipping for now")
return

# Count the number of rows (number of <tr> elements)
Expand Down Expand Up @@ -398,10 +398,7 @@ def handle_table(self, element, idx, doc):
try:
text = self.extract_table_cell_text(html_cell)
except Exception as exc:
_log.warn("exception: ", exc)
exit(-1)

# label = html_cell.name
_log.warning("exception: ", exc)

col_span = int(html_cell.get("colspan", 1))
row_span = int(html_cell.get("rowspan", 1))
Expand Down Expand Up @@ -469,49 +466,49 @@ def _get_imageref(self, element):
fig_ref = None

img = element.find(["img"])
_log.info(img)

if img is not None and img.has_attr("src"):
fig_uri = img["src"]
_log.info(fig_uri)

dpi = 128
if fig_uri.startswith("//"):
fig_uri = "https:" + fig_uri

dpi: int = 128
try:
dpi = int(img["dpi"])
except:
_log.debug("could not identify `dpi` of image")

width = 128
width: float = 128.0
try:
width = int(img["width"])
width = float(img["width"])
except:
_log.debug("could not identify `width` of image")

height = 128
height: float = 128.0
try:
height = int(img["height"])
height = float(img["height"])
except:
_log.debug("could not identify `height` of image")

if fig_uri.endswith(".jpg"):
fig_ref = ImageRef(
mimetype="image/jpg", dpi=dpi, size=Size(width, height), uri=fig_uri
)
size = Size(width=width, height=height)

elif fig_uri.endswith(".jpeg"):
if fig_uri.endswith(".jpg") or fig_uri.endswith(".jpeg"):
fig_ref = ImageRef(
mimetype="image/jpg", dpi=dpi, size=Size(width, height), uri=fig_uri
mimetype="image/jpeg", dpi=dpi, size=size, uri=fig_uri
)

elif fig_uri.endswith(".png"):
fig_ref = ImageRef(
mimetype="image/png", dpi=dpi, size=Size(width, height), uri=fig_uri
mimetype="image/png", dpi=dpi, size=size, uri=fig_uri
)

elif fig_uri.endswith(".svg"):
fig_ref = ImageRef(
mimetype="image/svg", dpi=dpi, size=Size(width, height), uri=fig_uri
mimetype="image/svg", dpi=dpi, size=size, uri=fig_uri
)
else:
_log.debug(f"We do not yet support uri of type: {fig_uri}")

return fig_ref

Expand All @@ -537,8 +534,6 @@ def handle_figure(self, element, idx, doc):
fig_ref = self._get_imageref(element)
fig_caption = self._get_figcaption(element, doc)

_log.warn(fig_ref)

doc.add_picture(
parent=self.parents[self.level], image=fig_ref, caption=fig_caption
)
Expand Down
99 changes: 99 additions & 0 deletions tests/data/groundtruth/docling_v2/wiki_duck.html.json
Original file line number Diff line number Diff line change
Expand Up @@ -5906,6 +5906,15 @@
],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/jpeg",
"dpi": 128,
"size": {
"width": 220.0,
"height": 205.0
},
"uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/8/82/Pacific_Black_Ducks_on_pond_ducking.jpg/220px-Pacific_Black_Ducks_on_pond_ducking.jpg"
},
"annotations": []
},
{
Expand All @@ -5923,6 +5932,15 @@
],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/jpeg",
"dpi": 128,
"size": {
"width": 220.0,
"height": 214.0
},
"uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Mallard-drake-chicago-march-2024.jpg/220px-Mallard-drake-chicago-march-2024.jpg"
},
"annotations": []
},
{
Expand All @@ -5940,6 +5958,15 @@
],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/jpeg",
"dpi": 128,
"size": {
"width": 220.0,
"height": 128.0
},
"uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7d/Wood-ducks-male-female-chicago-march-2024.jpg/220px-Wood-ducks-male-female-chicago-march-2024.jpg"
},
"annotations": []
},
{
Expand All @@ -5957,6 +5984,15 @@
],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/jpeg",
"dpi": 128,
"size": {
"width": 220.0,
"height": 147.0
},
"uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/5/54/Mallard_drake_.02.jpg/220px-Mallard_drake_.02.jpg"
},
"annotations": []
},
{
Expand All @@ -5974,6 +6010,15 @@
],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/jpeg",
"dpi": 128,
"size": {
"width": 220.0,
"height": 157.0
},
"uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/5/51/Mandarin.duck.arp.jpg/220px-Mandarin.duck.arp.jpg"
},
"annotations": []
},
{
Expand All @@ -5991,6 +6036,15 @@
],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/jpeg",
"dpi": 128,
"size": {
"width": 220.0,
"height": 147.0
},
"uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d0/Last_day_in_Ushuaia%2C_Argentina.Flying_Steamer-Ducks_%28Tachyeres_patachonicus%29_in_various_artistic_settings.Harbour_silhouettes._%2825921897721%29.jpg/220px-Last_day_in_Ushuaia%2C_Argentina.Flying_Steamer-Ducks_%28Tachyeres_patachonicus%29_in_various_artistic_settings.Harbour_silhouettes._%2825921897721%29.jpg"
},
"annotations": []
},
{
Expand All @@ -6008,6 +6062,15 @@
],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/jpeg",
"dpi": 128,
"size": {
"width": 220.0,
"height": 165.0
},
"uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Female_Mallard_at_Menacuddle_Well.jpg/220px-Female_Mallard_at_Menacuddle_Well.jpg"
},
"annotations": []
},
{
Expand All @@ -6025,6 +6088,15 @@
],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/jpeg",
"dpi": 128,
"size": {
"width": 220.0,
"height": 106.0
},
"uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/6/65/Duck_1_filter_teeth_edit.jpg/220px-Duck_1_filter_teeth_edit.jpg"
},
"annotations": []
},
{
Expand Down Expand Up @@ -6059,6 +6131,15 @@
],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/jpeg",
"dpi": 128,
"size": {
"width": 220.0,
"height": 165.0
},
"uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a6/Parrulo_-Muscovy_duckling.jpg/220px-Parrulo_-Muscovy_duckling.jpg"
},
"annotations": []
},
{
Expand Down Expand Up @@ -6093,6 +6174,15 @@
],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/jpeg",
"dpi": 128,
"size": {
"width": 220.0,
"height": 220.0
},
"uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d8/Tunnel_of_ducks.jpg/220px-Tunnel_of_ducks.jpg"
},
"annotations": []
},
{
Expand All @@ -6110,6 +6200,15 @@
],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/png",
"dpi": 128,
"size": {
"width": 130.0,
"height": 149.0
},
"uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/04/Maaninka.vaakuna.svg/130px-Maaninka.vaakuna.svg.png"
},
"annotations": []
},
{
Expand Down

0 comments on commit 9e54a74

Please sign in to comment.