diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 3680e057..9a5fc7b3 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -214,7 +214,7 @@ def extract_text_recursively(self, item): except: pass except: - _log.warn("item has no children") + _log.warning("item has no children") pass return "".join(result) + " " @@ -352,14 +352,14 @@ def handle_listitem(self, element, idx, doc): parent=self.parents[self.level], ) else: - _log.warn("list-item has no text: ", element) + _log.debug("list-item has no text: ", element) def handle_table(self, element, idx, doc): """Handles table tags.""" nested_tables = element.find("table") if nested_tables is not None: - _log.warn("detected nested tables: skipping for now") + _log.warning("detected nested tables: skipping for now") return # Count the number of rows (number of elements) @@ -398,10 +398,7 @@ def handle_table(self, element, idx, doc): try: text = self.extract_table_cell_text(html_cell) except Exception as exc: - _log.warn("exception: ", exc) - exit(-1) - - # label = html_cell.name + _log.warning("exception: ", exc) col_span = int(html_cell.get("colspan", 1)) row_span = int(html_cell.get("rowspan", 1)) @@ -469,49 +466,49 @@ def _get_imageref(self, element): fig_ref = None img = element.find(["img"]) - _log.info(img) if img is not None and img.has_attr("src"): fig_uri = img["src"] - _log.info(fig_uri) - dpi = 128 + if fig_uri.startswith("//"): + fig_uri = "https:" + fig_uri + + dpi: int = 128 try: dpi = int(img["dpi"]) except: _log.debug("could not identify `dpi` of image") - width = 128 + width: float = 128.0 try: - width = int(img["width"]) + width = float(img["width"]) except: _log.debug("could not identify `width` of image") - height = 128 + height: float = 128.0 try: - height = int(img["height"]) + height = float(img["height"]) except: _log.debug("could not identify `height` of image") - if fig_uri.endswith(".jpg"): - fig_ref = ImageRef( - mimetype="image/jpg", dpi=dpi, size=Size(width, height), uri=fig_uri - ) + size = Size(width=width, height=height) - elif fig_uri.endswith(".jpeg"): + if fig_uri.endswith(".jpg") or fig_uri.endswith(".jpeg"): fig_ref = ImageRef( - mimetype="image/jpg", dpi=dpi, size=Size(width, height), uri=fig_uri + mimetype="image/jpeg", dpi=dpi, size=size, uri=fig_uri ) elif fig_uri.endswith(".png"): fig_ref = ImageRef( - mimetype="image/png", dpi=dpi, size=Size(width, height), uri=fig_uri + mimetype="image/png", dpi=dpi, size=size, uri=fig_uri ) elif fig_uri.endswith(".svg"): fig_ref = ImageRef( - mimetype="image/svg", dpi=dpi, size=Size(width, height), uri=fig_uri + mimetype="image/svg", dpi=dpi, size=size, uri=fig_uri ) + else: + _log.debug(f"We do not yet support uri of type: {fig_uri}") return fig_ref @@ -537,8 +534,6 @@ def handle_figure(self, element, idx, doc): fig_ref = self._get_imageref(element) fig_caption = self._get_figcaption(element, doc) - _log.warn(fig_ref) - doc.add_picture( parent=self.parents[self.level], image=fig_ref, caption=fig_caption ) diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json index 8c345c34..08c45007 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json @@ -5906,6 +5906,15 @@ ], "references": [], "footnotes": [], + "image": { + "mimetype": "image/jpeg", + "dpi": 128, + "size": { + "width": 220.0, + "height": 205.0 + }, + "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/8/82/Pacific_Black_Ducks_on_pond_ducking.jpg/220px-Pacific_Black_Ducks_on_pond_ducking.jpg" + }, "annotations": [] }, { @@ -5923,6 +5932,15 @@ ], "references": [], "footnotes": [], + "image": { + "mimetype": "image/jpeg", + "dpi": 128, + "size": { + "width": 220.0, + "height": 214.0 + }, + "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Mallard-drake-chicago-march-2024.jpg/220px-Mallard-drake-chicago-march-2024.jpg" + }, "annotations": [] }, { @@ -5940,6 +5958,15 @@ ], "references": [], "footnotes": [], + "image": { + "mimetype": "image/jpeg", + "dpi": 128, + "size": { + "width": 220.0, + "height": 128.0 + }, + "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7d/Wood-ducks-male-female-chicago-march-2024.jpg/220px-Wood-ducks-male-female-chicago-march-2024.jpg" + }, "annotations": [] }, { @@ -5957,6 +5984,15 @@ ], "references": [], "footnotes": [], + "image": { + "mimetype": "image/jpeg", + "dpi": 128, + "size": { + "width": 220.0, + "height": 147.0 + }, + "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/5/54/Mallard_drake_.02.jpg/220px-Mallard_drake_.02.jpg" + }, "annotations": [] }, { @@ -5974,6 +6010,15 @@ ], "references": [], "footnotes": [], + "image": { + "mimetype": "image/jpeg", + "dpi": 128, + "size": { + "width": 220.0, + "height": 157.0 + }, + "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/5/51/Mandarin.duck.arp.jpg/220px-Mandarin.duck.arp.jpg" + }, "annotations": [] }, { @@ -5991,6 +6036,15 @@ ], "references": [], "footnotes": [], + "image": { + "mimetype": "image/jpeg", + "dpi": 128, + "size": { + "width": 220.0, + "height": 147.0 + }, + "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d0/Last_day_in_Ushuaia%2C_Argentina.Flying_Steamer-Ducks_%28Tachyeres_patachonicus%29_in_various_artistic_settings.Harbour_silhouettes._%2825921897721%29.jpg/220px-Last_day_in_Ushuaia%2C_Argentina.Flying_Steamer-Ducks_%28Tachyeres_patachonicus%29_in_various_artistic_settings.Harbour_silhouettes._%2825921897721%29.jpg" + }, "annotations": [] }, { @@ -6008,6 +6062,15 @@ ], "references": [], "footnotes": [], + "image": { + "mimetype": "image/jpeg", + "dpi": 128, + "size": { + "width": 220.0, + "height": 165.0 + }, + "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Female_Mallard_at_Menacuddle_Well.jpg/220px-Female_Mallard_at_Menacuddle_Well.jpg" + }, "annotations": [] }, { @@ -6025,6 +6088,15 @@ ], "references": [], "footnotes": [], + "image": { + "mimetype": "image/jpeg", + "dpi": 128, + "size": { + "width": 220.0, + "height": 106.0 + }, + "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/6/65/Duck_1_filter_teeth_edit.jpg/220px-Duck_1_filter_teeth_edit.jpg" + }, "annotations": [] }, { @@ -6059,6 +6131,15 @@ ], "references": [], "footnotes": [], + "image": { + "mimetype": "image/jpeg", + "dpi": 128, + "size": { + "width": 220.0, + "height": 165.0 + }, + "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/a/a6/Parrulo_-Muscovy_duckling.jpg/220px-Parrulo_-Muscovy_duckling.jpg" + }, "annotations": [] }, { @@ -6093,6 +6174,15 @@ ], "references": [], "footnotes": [], + "image": { + "mimetype": "image/jpeg", + "dpi": 128, + "size": { + "width": 220.0, + "height": 220.0 + }, + "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d8/Tunnel_of_ducks.jpg/220px-Tunnel_of_ducks.jpg" + }, "annotations": [] }, { @@ -6110,6 +6200,15 @@ ], "references": [], "footnotes": [], + "image": { + "mimetype": "image/png", + "dpi": 128, + "size": { + "width": 130.0, + "height": 149.0 + }, + "uri": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/04/Maaninka.vaakuna.svg/130px-Maaninka.vaakuna.svg.png" + }, "annotations": [] }, {