Skip to content

Commit

Permalink
feat: add ability to detect the first h1 and only process elements from there on
Browse files: browse the repository at this point in the history
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
  • Loading branch information
PeterStaar-IBM committed Oct 31, 2024
1 parent 9d88658 commit c52e68c
Showing 1 changed file with 21 additions and 7 deletions.
28 changes: 21 additions & 7 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ def convert(self) -> DoclingDocument:
# Replace <br> tags with newline characters
for br in self.soup.body.find_all("br"):
br.replace_with("\n")

self.contains_h1 = True
self.detected_h1 = False

doc = self.walk(self.soup.body, doc)
else:
raise RuntimeError(
Expand Down Expand Up @@ -116,20 +120,30 @@ def analyse_element(self, element, idx, doc):
else:
self.labels[element.name] = 1

if element.name in ["h1"]:
self.detected_h1 = True

if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
self.handle_header(element, idx, doc)
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_header(element, idx, doc)
elif element.name in ["p"]:
self.handle_paragraph(element, idx, doc)
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_paragraph(element, idx, doc)
elif element.name in ["ul", "ol"]:
self.handle_list(element, idx, doc)
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_list(element, idx, doc)
elif element.name in ["li"]:
self.handle_listitem(element, idx, doc)
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_listitem(element, idx, doc)
elif element.name == "table":
self.handle_table(element, idx, doc)
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_table(element, idx, doc)
elif element.name == "figure":
self.handle_figure(element, idx, doc)
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_figure(element, idx, doc)
elif element.name == "img":
self.handle_image(element, idx, doc)
if (not self.contains_h1) or (self.contains_h1 and self.detected_h1):
self.handle_image(element, idx, doc)
else:
self.walk(element, doc)

Expand Down

0 comments on commit c52e68c

Please sign in to comment.