Skip to content

Commit f4930f0

Browse files
committed
TLDR-918 added table_type, TableRecognition info into docs; added table_type on web
1 parent fbd4c05 commit f4930f0

File tree

7 files changed

+114
-3
lines changed

7 files changed

+114
-3
lines changed

dedoc/api/web/index.html

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
details > summary {font-style: italic; cursor: pointer; display: list-item;}
1414
.child.max {padding-left: 5px; flex: 1}
1515
.parent {display: flex}
16+
details { padding-left: 24px;}
1617
</style>
1718
</head>
1819

@@ -142,15 +143,18 @@ <h4>PDF handling</h4>
142143
</label>
143144
</p>
144145

145-
<details><summary>need_pdf_table_analysis</summary>
146+
<details><summary>need_pdf_table_analysis, table_type</summary>
146147
<br>
147148
<p>
148149
<label>
149150
<input type="hidden" name="need_pdf_table_analysis" value="false">
150151
<input type="checkbox" name="need_pdf_table_analysis" value="true" checked> need_pdf_table_analysis</label>
151152
</p>
153+
<p>
154+
<label>table_type <input name="table_type" type="text" size="20" value=""></label>
155+
</p>
152156
</details>
153-
157+
<br>
154158
<p>
155159
<label>pages <input name="pages" type="text" size="8" value=":"></label>
156160
</p>

dedoc/readers/pdf_reader/data_classes/tables/scantable.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010

1111

1212
class ScanTable(Table):
13+
"""
14+
Utility class for storing recognized tables from document images. The class
15+
:class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer` works with this class.
16+
"""
1317
def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None:
1418

1519
super().__init__(cells, TableMetadata(page_id=page_number))

dedoc/readers/pdf_reader/data_classes/tables/table_type.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,61 @@
11
class TableTypeAdditionalOptions:
2+
"""
3+
Setting up the table recognizer. The value of the parameter specifies the type of tables recognized when processed by
4+
class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer`.
5+
6+
* Parameter `table_type=wo_external_bounds` - recognize tables without external bounds;
7+
8+
Example of a table of type `wo_external_bounds`::
9+
10+
text | text | text
11+
--------+------+------
12+
text | text | text
13+
--------+------+------
14+
text | text | text
15+
--------+------+------
16+
text | text | text
17+
18+
19+
* Parameter `table_type=one_cell_table` - if a document contains a bounding box with text, it will be considered a table;
20+
21+
Example of a page with a table of type `one_cell_table`::
22+
23+
_________________________
24+
Header of document
25+
text text text +------+
26+
text | text | <--- it is a table
27+
+------+
28+
________________________
29+
30+
* Parameter `table_type=split_last_column` - split the merged cells of the last column of the table into separate cells;
31+
32+
Example of a table of type `split_last_column`::
33+
34+
+--------+------+-------+
35+
| text | text | text1 |
36+
+--------+------+ |
37+
| text0 | text | text2 |
38+
| | -----| |
39+
| | text | text3 |
40+
+--------+------+ |
41+
| text | text | text4 |
42+
+--------+------+-------+
43+
|
44+
Recognition
45+
|
46+
V
47+
+--------+------+-------+
48+
| text | text | text1 |
49+
+--------+------+-------|
50+
| text0 | text | text2 |
51+
|--------+ -----+------ |
52+
| text0 | text | text3 |
53+
+--------+------+------ |
54+
| text | text | text4 |
55+
+--------+------+-------+
56+
57+
"""
58+
259
def __init__(self) -> None:
360
self.table_wo_external_bounds = "wo_external_bounds"
461
self.detect_one_cell_table = "one_cell_table"

dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,18 @@
1919
"""-------------------------------------entry class of Table Recognizer Module---------------------------------------"""
2020

2121

22-
class TableRecognizer(object):
22+
class TableRecognizer:
23+
"""
24+
The class recognizes tables from document images. This class is internal to the system. It is called from readers such as :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader`.
25+
26+
* The class recognizes tables with borders from the document image and returns them as a list of :class:`~dedoc.readers.pdf_reader.data_classes.tables.scantable.ScanTable` objects
27+
(function :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.recognize_tables_from_image`);
28+
29+
30+
* The class also analyzes recognized single-page tables and combines them into multi-page ones
31+
(function :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.convert_to_multipages_tables`);
32+
33+
"""
2334

2435
def __init__(self, *, config: dict = None) -> None:
2536
self.logger = config.get("logger", logging.getLogger())
@@ -29,10 +40,20 @@ def __init__(self, *, config: dict = None) -> None:
2940
self.table_type = TableTypeAdditionalOptions()
3041

3142
def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines_with_meta: List[LineWithMeta]) -> List[ScanTable]:
43+
"""
44+
The function analyzes recognized tables from the entire document (all pages) to see if they are multi-page.
45+
If single-page tables are parts of one multi-page table, they are combined into one multi-page table.
46+
"""
3247
multipage_tables = self.multipage_tables_extractor.extract_multipage_tables(single_tables=all_single_tables, lines_with_meta=lines_with_meta)
3348
return multipage_tables
3449

3550
def recognize_tables_from_image(self, image: np.ndarray, page_number: int, language: str, table_type: str = "") -> Tuple[np.ndarray, List[ScanTable]]:
51+
"""
52+
The function recognizes tables with borders from a scanned document image.
53+
Here, the contour analysis method is used to determine the boundaries of table cells.
54+
Then, a set of heuristics is used to detect tables, and finally,
55+
the detected table cells are converted to a matrix form (merged cells are detected and separated).
56+
"""
3657
self.logger.debug(f"Page {page_number}")
3758
try:
3859
cleaned_image, scan_tables = self.__rec_tables_from_img(image, page_num=page_number, language=language, table_type=table_type)

docs/source/modules/data_structures.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@ Helper classes
7676
.. autoclass:: dedoc.data_structures.AttachedFile
7777
:members:
7878

79+
80+
.. autoclass:: dedoc.readers.pdf_reader.data_classes.tables.scantable.ScanTable
81+
:show-inheritance:
82+
:members:
83+
7984
.. _annotations:
8085

8186
Annotations of the text lines

docs/source/modules/manager.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,11 @@ Dedoc pipeline
1010
.. autoclass:: dedoc.attachments_handler.AttachmentsHandler
1111
:special-members: __init__
1212
:members:
13+
14+
.. autoclass:: dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer
15+
:show-inheritance:
16+
:members:
17+
18+
.. autoclass:: dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions
19+
:show-inheritance:
20+
:members:

docs/source/parameters/pdf_handling.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,18 @@ PDF and images handling
169169
If the document has a textual layer, it is recommended to use :class:`dedoc.readers.PdfTabbyReader`,
170170
in this case tables will be parsed much easier and faster.
171171

172+
* - table_type
173+
- "", wo_external_bounds, one_cell_table, split_last_column and their combinations
174+
- ""
175+
- * :meth:`dedoc.DedocManager.parse`
176+
* :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read`
177+
* :meth:`dedoc.readers.ReaderComposition.read`
178+
- Setting up the table recognition method. The table recognition method is used in :class:`dedoc.readers.PdfImageReader` and
179+
:class:`dedoc.readers.PdfTxtlayerReader`. The value of the parameter specifies the type of tables recognized when processed by
180+
class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer`. More details about each parameter value
181+
are disclosed in the class :class:`dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions` description.
182+
You can use a combination of values (for example, `wo_external_bounds+one_cell_table`).
183+
172184
* - need_gost_frame_analysis
173185
- True, False
174186
- False

0 commit comments

Comments
 (0)