Skip to content

Commit

Permalink
add the skip_furniture parameter
Browse files (browse the repository at this point in the history)
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
  • Loading branch information
PeterStaar-IBM committed Nov 1, 2024
1 parent ebe0b20 commit 473ad9a
Showing 1 changed file with 10 additions and 5 deletions.
15 changes: 10 additions & 5 deletions docling/backend/html_backend.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,7 +21,12 @@


class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path], skip_furniture:bool=False):
def __init__(
self,
in_doc: "InputDocument",
path_or_stream: Union[BytesIO, Path],
skip_furniture: bool = False,
):
super().__init__(in_doc, path_or_stream)
_log.debug("About to init HTML backend...")
self.soup = None
Expand All @@ -36,7 +41,7 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
self.labels = {} # type: ignore

self.skip_furniture = skip_furniture

try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
Expand Down Expand Up @@ -84,9 +89,9 @@ def convert(self) -> DoclingDocument:
for br in self.soup.body.find_all("br"):
br.replace_with("\n")

self.contains_h1 = bool(soup.find('h1')) and self.skip_furniture
self.contains_h1 = bool(self.soup.find("h1")) and self.skip_furniture
self.detected_h1 = False

doc = self.walk(self.soup.body, doc)
else:
raise RuntimeError(
Expand Down Expand Up @@ -124,7 +129,7 @@ def analyse_element(self, element, idx, doc):

if element.name in ["h1"]:
self.detected_h1 = True

if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_header(element, idx, doc)
Expand Down

0 comments on commit 473ad9a

Please sign in to comment.