update master (#433)

* TLDR-659 added references to nodes into HTML return format (#427) * TLDR-660 fixes in article type (#428) * TLDR-642 FinTOC benchmarks (#426) * TLDR-657 remove other_fields from LineMetadata and DocumentMetadata (#430) * Readme fixes (#431) * new version 2.2.1 (#432)
ispras · May 3, 2024 · 0acfb3d · 0acfb3d
1 parent 16747b0
commit 0acfb3d
Show file tree

Hide file tree

Showing 74 changed files with 5,067 additions and 4,142 deletions.
diff --git a/.flake8 b/.flake8
@@ -16,12 +16,15 @@ exclude =
     resources,
     venv,
     build,
-    dedoc.egg-info
-    docs/_build
+    dedoc.egg-info,
+    docs/_build,
+    scripts/fintoc2022/metric.py
 
 # ANN101 - type annotations for self
+# T201 - prints found
+# JS101 - Multi-line container not broken after opening character
 ignore =
     ANN101
 per-file-ignores =
     scripts/*:T201
-    scripts/benchmark_pdf_performance*:JS101,T201
+    scripts/benchmark_pdf_performance*:JS101
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
     rev: 5.0.4
     hooks:
     -   id: flake8
-        exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info
+        exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
         args:
             - "--config=.flake8"
         additional_dependencies: [

diff --git a/README.md b/README.md
@@ -1,6 +1,10 @@
 # Dedoc
 
+[![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html)
 [![Documentation Status](https://readthedocs.org/projects/dedoc/badge/?version=latest)](https://dedoc.readthedocs.io/en/latest/?badge=latest)
+[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/)
+[![Demo dedoc-readme.hf.space](https://img.shields.io/website-up-down-green-red/https/huggingface.co/spaces/dedoc/README.svg)](https://dedoc-readme.hf.space)
+[![Docker Hub](https://img.shields.io/docker/pulls/dedocproject/dedoc.svg)](https://hub.docker.com/r/dedocproject/dedoc/ "Docker Pulls")
 
 ![Dedoc](https://github.com/ispras/dedoc/raw/master/dedoc_logo.png)
 
@@ -39,52 +43,53 @@ In 2022, the system won a grant to support the development of promising AI proje
 ## Document format description
 The system processes different document formats. The main formats are listed below:
 
-| Format group          | Description                                                                                                                                                                                                                                                                                                                                                                     |
-|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Office formats        | DOCX, XLSX, PPTX and formats that canbe converted to them. Handling of these for-mats is held by analysis of format inner rep-resentation and using specialized libraries ([python-docx](https://python-docx.readthedocs.io/en/latest/), [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/))                                                               |
-| HTML, EML, MHTML      | HTML  documents  are  parsed  using  tagsanalysis,  HTML  handler  is  used  for  han-dling  documents  of  other  formats  in  thisgroup                                                                                                                                                                                                                                       |
-| TXT                   | Only raw textual content is analyzed                                                                                                                                                                                                                                                                                                                                            |
-| Archives              | Attachments of the archive are analyzed                                                                                                                                                                                                                                                                                                                                         |                                                                                                                                                                                                                                                                                                                  |
-| PDF,document images   | Copyable PDF documents (with a textual layer) can be handled using [pdfminer-six](https://pdfminersix.readthedocs.io/en/latest/) library or [tabby](https://github.com/sunveil/ispras_tbl_extr) software. Non-copyable PDF documents or imagesare handled using [Tesseract-OCR](https://github.com/tesseract-ocr/tesseract), machine learning methods (including neural network methods) and [image processing methods](https://opencv.org/) |
+| Format group         | Description                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Office formats       | DOCX, XLSX, PPTX and formats that can be converted to them. Handling of these formats is held by analysis of format inner representation and using specialized libraries ([python-docx](https://python-docx.readthedocs.io/en/latest/), [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/))                                                                                                                              |
+| HTML, EML, MHTML     | HTML  documents  are  parsed  using  tags analysis,  HTML  handler  is  used  for  handling  documents  of  other  formats  in  this group                                                                                                                                                                                                                                                                                                    |
+| TXT                  | Only raw textual content is analyzed                                                                                                                                                                                                                                                                                                                                                                                                          |
+| Archives             | Attachments of the archive are analyzed                                                                                                                                                                                                                                                                                                                                                                                                       |                                                                                                                                                                                                                                                                                                                  |
+| PDF, document images | Copyable PDF documents (with a textual layer) can be handled using [pdfminer-six](https://pdfminersix.readthedocs.io/en/latest/) library or [tabby](https://github.com/sunveil/ispras_tbl_extr) software. Non-copyable PDF documents or images are handled using [Tesseract-OCR](https://github.com/tesseract-ocr/tesseract), machine learning methods (including neural network methods) and [image processing methods](https://opencv.org/) |
 
 ## Examples of processed scanned documents
 * Dedoc can only process scanned black and white documents, such as technical specifications, regulations, articles, etc.
-<img src="docs/source/_static/doc_examples.png" alt="Document examples" style="width:800px;"/>
-<!--![Document examples](docs/source/_static/doc_examples.png){:height="150px"}-->
+<img src="https://github.com/ispras/dedoc/raw/master/docs/source/_static/doc_examples.png" alt="Document examples" style="width:800px;"/>
+
 * In particular, dedoc recognizes tabular information only from tables with explicit boundaries. Here are examples of documents that can be processed by dedoc's image handler:
-<img src="docs/source/_static/example_table.jpg" alt="Table parsing example" style="width:600px;"/>
-<!--![Table Example](docs/source/_static/example_table.jpg)-->
+<img src="https://github.com/ispras/dedoc/raw/master/docs/source/_static/example_table.jpg" alt="Table parsing example" style="width:600px;"/>
+
 * The system also automatically detects and corrects the orientation of scanned documents
 
-## Example of structure extractor
-<img src="docs/source/_static/str_ext_example_law.png" alt="Law structure example"/>
-<img src="docs/source/_static/str_ext_example_tz.png" alt="Tz structure example"/>
+## Examples of structure extractors
+<img src="https://github.com/ispras/dedoc/raw/master/docs/source/_static/str_ext_example_law.png" alt="Law structure example"/>
+<img src="https://github.com/ispras/dedoc/raw/master/docs/source/_static/str_ext_example_tz.png" alt="Tz structure example"/>
 
 
 ## Impact
 This project may be useful as a first step of automatic document analysis pipeline (e.g. before the NLP part).
 Dedoc is in demand for information analytic systems, information leak monitoring systems, as well as for natural language processing systems.
 The library is intended for application use by developers of systems for automatic analysis and structuring of electronic documents, including for further search in electronic documents. 
 
-# Online-Documentation
-Relevant documentation of the dedoc is available [here](https://dedoc.readthedocs.io/en/latest/)
+# Documentation
+Relevant documentation of dedoc is available [here](https://dedoc.readthedocs.io/en/latest/)
 
 # Demo
-You can try dedoc's demo: https://dedoc-readme.hf.space.
 
-We have a video to demonstrate how to use the system: https://www.youtube.com/watch?v=ZUnPYV8rd9A.
+* You can try the [dedoc demo](https://dedoc-readme.hf.space)
+* You can watch a [video about dedoc](https://www.youtube.com/watch?v=ZUnPYV8rd9A)
 
-![Web_interface](docs/source/_static/web_interface.png)
+![](https://github.com/ispras/dedoc/raw/master/docs/source/_static/web_interface.png)
 
-![dedoc_demo](docs/source/_static/dedoc_short.gif)
+![](https://github.com/ispras/dedoc/raw/master/docs/source/_static/dedoc_short.gif)
 
-# Some our publications
+# Publications related to dedoc
 
-* Article on [Habr](https://habr.com/ru/companies/isp_ras/articles/779390/), where we describe our system in detail
-* [Our article](https://aclanthology.org/2022.fnp-1.13.pdf) from the FINTOC 2022 competition. We are the winners :smiley: :trophy:!
+* Article [ISPRAS@FinTOC-2022 shared task: Two-stage TOC generation model](https://aclanthology.org/2022.fnp-1.13.pdf) for the [FinTOC 2022 Shared Task](https://wp.lancs.ac.uk/cfie/fintoc2022/). We are the winners :smiley: :trophy:!
+* Article on habr.com [Dedoc: как автоматически извлечь из текстового документа всё и даже немного больше](https://habr.com/ru/companies/isp_ras/articles/779390/) in Russian (2023)
+* Article [Dedoc: A Universal System for Extracting Content and Logical Structure From Textual Documents](https://ieeexplore.ieee.org/abstract/document/10508151/) in English (2023)
 
 # Installation instructions
-****************************************
+
 This project has a REST API and you can run it in a Docker container.
 Also, dedoc can be installed as a library via `pip`.
 There are two ways to install and run dedoc, as a web application or as a library; both are described below.

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.2
+2.2.1
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
@@ -7,7 +7,7 @@
 @dataclass
 class QueryParameters:
     # type of document structure parsing
-    document_type: str = Form("", enum=["", "law", "tz", "diploma"], description="Document domain")
+    document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain")
     structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
     return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
                               description="Response representation, most types (except json) are used for debug purposes only")
@@ -29,7 +29,7 @@ class QueryParameters:
     # pdf handling
     pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
                                     description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
-    language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng"], description="Recognition language")
+    language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng", "fra", "spa"], description="Recognition language")
     pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
     is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
                                        description='One or multiple column document, "auto" - predict number of page columns automatically')

diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py
@@ -3,6 +3,7 @@
 from dedoc.data_structures import LineMetadata
 from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
 from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
+from dedoc.data_structures.concrete_annotations.reference_annotation import ReferenceAnnotation
 from dedoc.data_structures.concrete_annotations.strike_annotation import StrikeAnnotation
 from dedoc.data_structures.concrete_annotations.subscript_annotation import SubscriptAnnotation
 from dedoc.data_structures.concrete_annotations.superscript_annotation import SuperscriptAnnotation
@@ -116,7 +117,7 @@ def json2html(text: str, paragraph: TreeNode, tables: Optional[List[Table]], tab
     if table2id is None:
         table2id = {table.metadata.uid: table_id for table_id, table in enumerate(tables)}
 
-    ptext = __annotations2html(paragraph, table2id)
+    ptext = __annotations2html(paragraph, table2id, tabs=tabs)
 
     if paragraph.metadata.hierarchy_level.line_type in [HierarchyLevel.header, HierarchyLevel.root]:
         ptext = f"<strong>{ptext.strip()}</strong>"
@@ -125,7 +126,10 @@ def json2html(text: str, paragraph: TreeNode, tables: Optional[List[Table]], tab
     else:
         ptext = ptext.strip()
 
-    text += f'<p> {"&nbsp;" * tabs} {ptext}     <sub> id = {paragraph.node_id} ; type = {paragraph.metadata.hierarchy_level.line_type} </sub></p>'
+    ptext = f'<p> {"&nbsp;" * tabs} {ptext}     <sub> id = {paragraph.node_id} ; type = {paragraph.metadata.hierarchy_level.line_type} </sub></p>'
+    if hasattr(paragraph.metadata, "uid"):
+        ptext = f'<div id="{paragraph.metadata.uid}">{ptext}</div>'
+    text += ptext
 
     for subparagraph in paragraph.subparagraphs:
         text = json2html(text=text, paragraph=subparagraph, tables=None, tabs=tabs + 4, table2id=table2id)
@@ -157,14 +161,17 @@ def __value2tag(name: str, value: str) -> str:
     if name == UnderlinedAnnotation.name:
         return "u"
 
+    if name == ReferenceAnnotation.name:
+        return "a"
+
     if value.startswith("heading "):
         level = value[len("heading "):]
         return "h" + level if level.isdigit() and int(level) < 7 else "strong"
 
     return value
 
 
-def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str:
+def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int], tabs: int = 0) -> str:
     indexes = dict()
 
     for annotation in paragraph.annotations:
@@ -177,7 +184,7 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str:
                             SubscriptAnnotation.name,
                             SuperscriptAnnotation.name,
                             UnderlinedAnnotation.name]
-        check_annotations = bool_annotations + ["table"]
+        check_annotations = bool_annotations + ["table", "reference"]
         if name not in check_annotations and not value.startswith("heading "):
             continue
         elif name in bool_annotations and annotation.value == "False":
@@ -187,23 +194,27 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int]) -> str:
         indexes.setdefault(annotation.start, "")
         indexes.setdefault(annotation.end, "")
         if name == "table":
-            indexes[annotation.start] += f'<p> <sub> <a href="#{tag}"> table#{table2id[tag]} </a> </sub> </p>'
+            indexes[annotation.end] += f' (<a href="#{tag}">table {table2id[tag]}</a>)'
+        elif name == "reference":
+            indexes[annotation.start] += f'<{tag} href="#{value}">'
+            indexes[annotation.end] = f"</{tag}>" + indexes[annotation.end]
         else:
-            indexes[annotation.start] += "<" + tag + ">"
-            indexes[annotation.end] = "</" + tag + ">" + indexes[annotation.end]
+            indexes[annotation.start] += f"<{tag}>"
+            indexes[annotation.end] = f"</{tag}>" + indexes[annotation.end]
 
     insert_tags = sorted([(index, tag) for index, tag in indexes.items()], reverse=True)
     text = paragraph.text
 
     for index, tag in insert_tags:
         text = text[:index] + tag + text[index:]
 
-    return text.replace("\n", "<br>")
+    return text.replace("\n", f'<br>{"&nbsp;" * tabs}')
 
 
 def table2html(table: Table, table2id: Dict[str, int]) -> str:
     uid = table.metadata.uid
-    text = f"<h4> table {table2id[uid]}:</h4>"
+    table_title = f" {table.metadata.title}" if table.metadata.title else ""
+    text = f"<h4> table {table2id[uid]}:{table_title}</h4>"
     text += f'<table border="1" id={uid} style="border-collapse: collapse; width: 100%;">\n<tbody>\n'
     for row in table.cells:
         text += "<tr>\n"

diff --git a/dedoc/api/schema/document_metadata.py b/dedoc/api/schema/document_metadata.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 from pydantic import BaseModel, Extra, Field
 
 
@@ -18,4 +16,3 @@ class Config:
     created_time: int = Field(description="Creation time of the document in the UnixTime format", example=1590579805)
     access_time: int = Field(description="File access time in the UnixTime format", example=1590579805)
     file_type: str = Field(description="Mime type of the file", example="application/vnd.oasis.opendocument.text")
-    other_fields: Optional[dict] = Field(description="Other optional fields")
diff --git a/dedoc/api/schema/line_metadata.py b/dedoc/api/schema/line_metadata.py
@@ -13,4 +13,3 @@ class Config:
     paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text")
     page_id: int = Field(description="Page number of the line/paragraph beginning", example=0)
     line_id: Optional[int] = Field(description="Line number", example=1)
-    other_fields: Optional[dict] = Field(description="Some other fields")