new version 2.3 (#498)

Co-authored-by: Andrey Mikhailov <mikhailov@icc.ru> Co-authored-by: Alexander Golodkov <55749660+alexander1999-hub@users.noreply.github.com> Co-authored-by: Alexander Golodkov <golodkov@ispras.ru>
ispras · Sep 19, 2024 · 724e2d2 · 724e2d2
1 parent 765aae2
commit 724e2d2
Show file tree

Hide file tree

Showing 111 changed files with 2,683 additions and 316 deletions.
diff --git a/.flake8 b/.flake8
@@ -28,6 +28,7 @@ exclude =
     *__init__.py,
     resources,
     venv,
+    .venv,
     build,
     dedoc.egg-info,
     docs/_build,
@@ -48,5 +49,5 @@ per-file-ignores =
     scripts/*:T201
     scripts/benchmark_pdf_performance*:JS101
     tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
-    docs/source/_static/code_examples/*:I251
+    docs/source/_static/code_examples/*:I251,T201
     docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -33,3 +33,4 @@ jobs:
         python dedoc_usage_tutorial.py
         python dedoc_add_new_doc_type_tutorial.py
         python dedoc_add_new_structure_type_tutorial.py
+        python dedoc_using_patterns_tutorial.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@ repos:
     rev: 5.0.4
     hooks:
     -   id: flake8
-        exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
+        exclude: \.github|.*__init__\.py|resources|docs|venv|\.venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
         args:
             - "--config=.flake8"
         additional_dependencies: [

diff --git a/README.md b/README.md
@@ -1,5 +1,6 @@
 # Dedoc
 
+[![Telegram](https://img.shields.io/badge/chat-on%20Telegram-2ba2d9.svg)](https://t.me/dedoc_chat)
 [![image](https://img.shields.io/pypi/pyversions/dedoc.svg)](https://pypi.python.org/pypi/dedoc)
 [![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/)
 [![PyPI version](https://badge.fury.io/py/dedoc.svg)](https://badge.fury.io/py/dedoc)
@@ -94,6 +95,12 @@ Relevant documentation of dedoc is available [here](https://dedoc.readthedocs.io
 * Article on habr.com [Dedoc: как автоматически извлечь из текстового документа всё и даже немного больше](https://habr.com/ru/companies/isp_ras/articles/779390/) in Russian (2023)
 * Article [Dedoc: A Universal System for Extracting Content and Logical Structure From Textual Documents](https://ieeexplore.ieee.org/abstract/document/10508151/) in English (2023)
 
+# Join Our Community
+
+Have questions or want to discuss Dedoc? Join our [Telegram chat](https://t.me/dedoc_chat) and connect with the community and the developers.
+
+Join our [Telegram channel](https://t.me/dedoc_channel) to get notifications about the most recent updates.
+
 # Installation instructions
 
 This project has a REST api and you can run it in Docker container.

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.2.7
+2.3
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
@@ -8,6 +8,7 @@
 class QueryParameters:
     # type of document structure parsing
     document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain")
+    patterns: str = Form("", description='Patterns for default document type (when document_type="")')
     structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
     return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
                               description="Response representation, most types (except json) are used for debug purposes only")
@@ -39,6 +40,7 @@ class QueryParameters:
                                                  '"no_change" - set vertical orientation of the document without using an orientation classifier')
     need_header_footer_analysis: str = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result")
     need_binarization: str = Form("false", enum=["true", "false"], description="Binarize document pages (for images or PDF without a textual layer)")
+    need_gost_frame_analysis: str = Form("false", enum=["true", "false"], description="Parameter for detecting and ignoring GOST frame of the document")
 
     # other formats handling
     delimiter: Optional[str] = Form(None, description="Column separator for CSV files")

diff --git a/dedoc/api/schema/annotation.py b/dedoc/api/schema/annotation.py
@@ -5,6 +5,16 @@ class Annotation(BaseModel):
     """
     The piece of information about the text line: its appearance or links to another document object.
     For example Annotation(1, 13, "italic", "True") says that text between 1st and 13th symbol was written in italic.
+
+    :ivar start: start of the annotated text
+    :ivar end: end of the annotated text (end isn't included)
+    :ivar name: annotation's name, specific for each type of annotation
+    :ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc.
+
+    :vartype start: int
+    :vartype end: int
+    :vartype name: str
+    :vartype value: str
     """
     start: int = Field(description="Start of the annotated text", example=0)
     end: int = Field(description="End of the annotated text (end isn't included)", example=5)

diff --git a/dedoc/api/schema/cell_with_meta.py b/dedoc/api/schema/cell_with_meta.py
@@ -8,6 +8,16 @@
 class CellWithMeta(BaseModel):
     """
     Holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible).
+
+    :ivar lines: list of textual lines of the cell
+    :ivar colspan: number of columns to span (for cells merged horizontally)
+    :ivar rowspan: number of rows to span (for cells merged vertically)
+    :ivar invisible: indicator for displaying or hiding cell text - cells that are merged with others are hidden (for HTML display)
+
+    :vartype lines: List[LineWithMeta]
+    :vartype colspan: int
+    :vartype rowspan: int
+    :vartype invisible: bool
     """
     lines: List[LineWithMeta] = Field(description="Textual lines of the cell with annotations")
     rowspan: int = Field(description="Number of rows to span like in HTML format", example=1)

diff --git a/dedoc/api/schema/document_content.py b/dedoc/api/schema/document_content.py
@@ -9,6 +9,12 @@
 class DocumentContent(BaseModel):
     """
     Content of the document - structured text and tables.
+
+    :ivar tables: list of document tables
+    :ivar structure: tree structure of the document nodes with text and additional metadata
+
+    :vartype tables: List[Table]
+    :vartype structure: TreeNode
     """
     structure: TreeNode = Field(description="Tree structure where content of the document is organized")
     tables: List[Table] = Field(description="List of document tables")
diff --git a/dedoc/api/schema/document_metadata.py b/dedoc/api/schema/document_metadata.py
@@ -4,6 +4,26 @@
 class DocumentMetadata(BaseModel):
     """
     Document metadata like its name, size, author, etc.
+
+    :ivar file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on)
+    :ivar temporary_file_name: file name during parsing (unique name after rename and conversion)
+    :ivar size: size of the original file in bytes
+    :ivar modified_time: time of the last modification in unix time format (seconds since the epoch)
+    :ivar created_time: time of the creation in unix time format (seconds since the epoch)
+    :ivar access_time: time of the last access to the file in unix time format (seconds since the epoch)
+    :ivar file_type: mime type of the file
+    :ivar uid: document unique identifier (useful for attached files)
+
+    :vartype file_name: str
+    :vartype temporary_file_name: str
+    :vartype size: int
+    :vartype modified_time: int
+    :vartype created_time: int
+    :vartype access_time: int
+    :vartype file_type: str
+    :vartype uid: str
+
+    Additional variables may be added with other file metadata.
     """
     class Config:
         extra = Extra.allow

diff --git a/dedoc/api/schema/line_metadata.py b/dedoc/api/schema/line_metadata.py
@@ -6,10 +6,20 @@
 class LineMetadata(BaseModel):
     """
     Holds information about document node/line metadata, such as page number or line type.
+
+    :ivar paragraph_type: type of the document line/paragraph (header, list_item, list, etc.)
+    :ivar page_id: page number where paragraph starts, the numeration starts from page 0
+    :ivar line_id: line number inside the entire document, the numeration starts from line 0
+
+    :vartype paragraph_type: str
+    :vartype page_id: int
+    :vartype line_id: Optional[int]
+
+    Additional variables may be added with other line metadata.
     """
     class Config:
         extra = Extra.allow
 
-    paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text")
+    paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list, etc.)", example="raw_text")
     page_id: int = Field(description="Page number of the line/paragraph beginning", example=0)
     line_id: Optional[int] = Field(description="Line number", example=1)
diff --git a/dedoc/api/schema/line_with_meta.py b/dedoc/api/schema/line_with_meta.py
@@ -8,6 +8,12 @@
 class LineWithMeta(BaseModel):
     """
     Textual line with text annotations.
+
+    :ivar text: text of the line
+    :ivar annotations: text annotations (font, size, bold, italic, etc.)
+
+    :vartype text: str
+    :vartype annotations: List[Annotation]
     """
     text: str = Field(description="Text of the line", example="Some text")
-    annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic and etc)")
+    annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic, etc.)")
diff --git a/dedoc/api/schema/parsed_document.py b/dedoc/api/schema/parsed_document.py
@@ -9,6 +9,18 @@
 class ParsedDocument(BaseModel):
     """
     Holds information about the document content, metadata and attachments.
+
+    :ivar content: document text (hierarchy of nodes) and tables
+    :ivar attachments: result of analysis of attached files (empty if with_attachments=False)
+    :ivar metadata: document metadata such as size, creation date and so on.
+    :ivar warnings: list of warnings and possible errors, arising in the process of document parsing
+    :ivar version: version of the program that parsed this document
+
+    :vartype content: DocumentContent
+    :vartype attachments: List[ParsedDocument]
+    :vartype metadata: DocumentMetadata
+    :vartype warnings: List[str]
+    :vartype version: str
     """
     content: DocumentContent = Field(description="Document text and tables")
     metadata: DocumentMetadata = Field(description="Document metadata such as size, creation date and so on")

diff --git a/dedoc/api/schema/table.py b/dedoc/api/schema/table.py
@@ -11,6 +11,12 @@ class Table(BaseModel):
     Holds information about tables in the document.
     We assume that a table has rectangle form (has the same number of columns in each row).
     Table representation is row-based i.e. external list contains list of rows.
+
+    :ivar metadata: table metadata such as location, title and so on
+    :ivar cells: a list of lists of table cells (cell has text lines, colspan and rowspan attributes)
+
+    :vartype metadata: TableMetadata
+    :vartype cells: List[List[CellWithMeta]]
     """
     cells: List[List[CellWithMeta]] = Field(description="List of lists of table cells (cell has text, colspan and rowspan attributes)")
     metadata: TableMetadata = Field(description="Table meta information")
diff --git a/dedoc/api/schema/table_metadata.py b/dedoc/api/schema/table_metadata.py
@@ -6,6 +6,16 @@
 class TableMetadata(BaseModel):
     """
     Holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
+
+    :ivar page_id: number of the page where table starts
+    :ivar uid: unique identifier of the table (used for linking table to text)
+    :ivar rotated_angle: value of the rotation angle by which the table was rotated during recognition
+    :ivar title: table's title
+
+    :vartype page_id: Optional[int]
+    :vartype uid: str
+    :vartype rotated_angle: float
+    :vartype title: str
     """
     page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0)
     uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f")

diff --git a/dedoc/api/schema/tree_node.py b/dedoc/api/schema/tree_node.py
@@ -10,6 +10,18 @@ class TreeNode(BaseModel):
     """
     Helps to represent document as recursive tree structure.
     It has list of children `TreeNode` nodes (empty list for a leaf node).
+
+    :ivar node_id: unique node identifier
+    :ivar text: text of the node (may contain several lines)
+    :ivar annotations: some metadata related to a part of the text (such as font size)
+    :ivar metadata: metadata that refers to the entire node (such as node type)
+    :ivar subparagraphs: list of children of this node
+
+    :vartype node_id: str
+    :vartype text: str
+    :vartype annotations: List[Annotation]
+    :vartype metadata: LineMetadata
+    :vartype subparagraphs: List[TreeNode]
     """
     node_id: str = Field(description="Document element identifier. It is unique within a document content tree. "
                                      "The identifier consists of numbers separated by dots where each number "

diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
@@ -28,7 +28,7 @@ <h3>Parameters configuration</h3>
 
         <div class="parameters">
             <h4>Type of document structure parsing</h4>
-            <details><summary>document_type, structure_type, return_format</summary>
+            <details><summary>document_type, patterns, structure_type, return_format</summary>
                 <br>
                 <p>
                     <label>
@@ -43,6 +43,14 @@ <h4>Type of document structure parsing</h4>
                     </label>
                 </p>
 
+                <p>
+                    <div>
+                        Patterns for default structure extractor (document_type="other")<br>
+                        <label><textarea id="patterns" name="patterns" style="width:450px;height:75px;"></textarea></label><br>
+                        <button type="button" onclick="Format()">Format</button>
+                    </div>
+                </p>
+
                 <p>
                     <label>
                         <select name="structure_type">
@@ -114,7 +122,7 @@ <h4>Tables handling </h4>
 
         <div class="parameters">
             <h4>PDF handling</h4>
-            <details><summary>pdf_with_text_layer, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization</summary>
+            <details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
                 <br>
                 <p>
                     <label>
@@ -175,6 +183,9 @@ <h4>PDF handling</h4>
                 <p>
                     <label><input name="need_binarization" type="checkbox" value="true"> need_binarization</label>
                 </p>
+                <p>
+                    <label><input name="need_gost_frame_analysis" type="checkbox" value="true"> need_gost_frame_analysis</label>
+                </p>
             </details>
         </div>
 
@@ -213,4 +224,18 @@ <h3>Useful links</h3>
     </ul>
 
 </body>
+
+<script>
+    function Format() {  // Re-indent the JSON typed into the "patterns" textarea, in place; alerts if it cannot be parsed.
+        try {
+            let input = document.getElementById("patterns")
+            let data = JSON.parse(input.value.replaceAll("\\", "\\\\"))  // double every backslash before parsing, so a single typed backslash is read as a literal backslash rather than a JSON escape
+            input.value = JSON.stringify(data, null, 2).replaceAll("\\\\", "\\")  // 2-space indent; collapse the doubled backslashes that stringify emits back to single ones
+        }
+        catch (error) {  // any exception from the try body ends up here and is reported with the same message
+            alert("Incorrect JSON syntax")
+        }
+    }
+</script>
+
 </html>
diff --git a/dedoc/data_structures/annotation.py b/dedoc/data_structures/annotation.py
@@ -7,6 +7,18 @@ class Annotation(Serializable):
     Base class for text annotations of all kinds.
     Annotation is the piece of information about the text line: its appearance or links to another document object.
     Look at the concrete kinds of annotations to get more examples.
+
+    :ivar start: start of the annotated text
+    :ivar end: end of the annotated text (end isn't included)
+    :ivar name: annotation's name, specific for each type of annotation
+    :ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc.
+    :ivar is_mergeable: is it possible to merge annotations with the same value
+
+    :vartype start: int
+    :vartype end: int
+    :vartype name: str
+    :vartype value: str
+    :vartype is_mergeable: bool
     """
 
     def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bool = True) -> None:
@@ -20,11 +32,11 @@ def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bo
         :param value: information about annotated text
         :param is_mergeable: is it possible to merge annotations with the same value
         """
-        self.start = start
-        self.end = end
-        self.name = name
-        self.value = value
-        self.is_mergeable = is_mergeable
+        self.start: int = start
+        self.end: int = end
+        self.name: str = name
+        self.value: str = value
+        self.is_mergeable: bool = is_mergeable
 
     def __eq__(self, o: object) -> bool:
         if not isinstance(o, Annotation):
@@ -35,7 +47,7 @@ def __str__(self) -> str:
         return f"{self.name.capitalize()}({self.start}:{self.end}, {self.value})"
 
     def __repr__(self) -> str:
-        return f"{self.name.capitalize()}(...)"
+        return self.__str__()
 
     def to_api_schema(self) -> ApiAnnotation:
         return ApiAnnotation(start=self.start, end=self.end, name=self.name, value=self.value)
diff --git a/dedoc/data_structures/attached_file.py b/dedoc/data_structures/attached_file.py
@@ -1,18 +1,28 @@
 class AttachedFile:
     """
     Holds information about files, attached to the parsed document.
+
+    :ivar original_name: original name of the attached file if it was possible to extract it
+    :ivar tmp_file_path: path to the attached file on disk - its name is different from original_name
+    :ivar need_content_analysis: does the attached file need parsing (enable recursive parsing in :class:`~dedoc.DedocManager`)
+    :ivar uid: unique identifier of the attached file
+
+    :vartype original_name: str
+    :vartype tmp_file_path: str
+    :vartype need_content_analysis: bool
+    :vartype uid: str
     """
     def __init__(self, original_name: str, tmp_file_path: str, need_content_analysis: bool, uid: str) -> None:
         """
-        :param original_name: Name of the file from which the attachments are extracted
-        :param tmp_file_path: path to the attachment file.
+        :param original_name: original name of the attached file
+        :param tmp_file_path: path to the attachment file
         :param need_content_analysis: indicates whether the attachment's content should be parsed or simply saved without parsing
         :param uid: unique identifier of the attachment
         """
-        self.original_name = original_name
-        self.tmp_file_path = tmp_file_path
-        self.need_content_analysis = need_content_analysis
-        self.uid = uid
+        self.original_name: str = original_name
+        self.tmp_file_path: str = tmp_file_path
+        self.need_content_analysis: bool = need_content_analysis
+        self.uid: str = uid
 
     def get_filename_in_path(self) -> str:
         return self.tmp_file_path