diff --git a/dedoc/readers/pdf_reader/data_classes/tables/table_type.py b/dedoc/readers/pdf_reader/data_classes/tables/table_type.py index 99c64b27..345b46ee 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/table_type.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/table_type.py @@ -1,9 +1,10 @@ class TableTypeAdditionalOptions: """ - Setting up the table recognizer. The value of the parameter specifies the type of tables recognized when processed by + Enum of table types for the table recognizer. + The value of the parameter specifies the type of tables recognized when processed by class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer`. - * Parameter `table_type=wo_external_bounds` - recognize tables without external bounds; + * Parameter `table_type=wo_external_bounds` - recognize tables without external bounds. Example of a table of type `wo_external_bounds`:: @@ -16,7 +17,7 @@ class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_ text | text | text - * Parameter `table_type=one_cell_table` - if a document contains a bounding box with text, it will be considered a table; + * Parameter `table_type=one_cell_table` - if a document contains a bounding box with text, it will be considered a table. Example of a page with a table of type `one_cell_table`:: @@ -27,7 +28,7 @@ class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_ +------+ ________________________ - * Parameter `table_type=split_last_column` - specified parameter for the merged last column of the table; + * Parameter `table_type=split_last_column` - specified parameter for the merged last column of the table. 
Example of a table of type `split_last_column`:: diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 9e00ae36..d59c114f 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -12,7 +12,7 @@ from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable -from dedoc.readers.pdf_reader.utils.header_footers_analysis import HeaderFooterDetector + ParametersForParseDoc = namedtuple("ParametersForParseDoc", [ "is_one_column_document", @@ -44,6 +44,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer + from dedoc.readers.pdf_reader.utils.header_footers_analysis import HeaderFooterDetector from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py index 7bcc7cc6..26f153df 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py @@ -21,15 +21,13 @@ class TableRecognizer: """ - The class recognizes tables from document images. This class is internal to the system. 
It is called from readers such as . - - * The class recognizes tables with borders from the document image and returns the class - (function :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.recognize_tables_from_image`); - - - * The class also analyzes recognized single-page tables and combines them into multi-page ones - (function :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.convert_to_multipages_tables`); + The class recognizes tables from document images. This class is internal to the system. + It is called from readers such as :class:`dedoc.readers.PdfTxtlayerReader` or :class:`dedoc.readers.PdfImageReader`. + * The class recognizes tables with borders from the document image using + :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.recognize_tables_from_image`; + * The class also analyzes recognized single-page tables and combines them into multi-page ones using + :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.convert_to_multipages_tables` """ def __init__(self, *, config: dict = None) -> None: diff --git a/dedoc/readers/pdf_reader/utils/header_footers_analysis.py b/dedoc/readers/pdf_reader/utils/header_footers_analysis.py index 61a8b22c..0ff73653 100644 --- a/dedoc/readers/pdf_reader/utils/header_footers_analysis.py +++ b/dedoc/readers/pdf_reader/utils/header_footers_analysis.py @@ -16,18 +16,18 @@ class HeaderFooterDetector: `Lin X. Header and footer extraction by page association //Document Recognition and Retrieval X. – SPIE, 2003. – Т. 5010. – С. 164-171.` Algorithm's notes: - 1) For documents of 6 pages or more, lines on even and odd pages of the document are compared to detect alternating footers-headers. - For documents of less than 6 pages, lines between adjacent pages (between even or odd pages) are compared. 
- Therefore, alternating footers-headers will not be detected on documents of less than 6 pages. - 2) The algorithm analyzes the first 4 and last 4 lines on each page of the document and, - by comparing lines across pages, identifies common footer-header patterns using Levenshtein similarity. + 1. For documents of 6 pages or more, lines on even and odd pages of the document are compared to detect alternating footers-headers. + For documents of less than 6 pages, lines between adjacent pages (between even or odd pages) are compared. + Therefore, alternating footers-headers will not be detected on documents of less than 6 pages. - 3) For the algorithm to work, the document must have at least two pages of text. - It is not an ML algorithm it cannot work with just one page. + 2. The algorithm analyzes the first 4 and last 4 lines on each page of the document and, + by comparing lines across pages, identifies common footer-header patterns using Levenshtein similarity. - 4) The more pages the better. Remember the parameter `pages` limits the number of pages in a document. + 3. For the algorithm to work, the document must have at least two pages of text. + It is not an ML algorithm, so it cannot work with just one page. + 4. The more pages, the better. Remember that the parameter `pages` limits the number of pages in a document. """ def __init__(self) -> None: diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index c24c570d..07e0a243 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,18 +1,17 @@ Changelog ========= + v2.6 (2025-09-19) ----------------- Release note: `v2.6 `_ -* improve table merge algorithm (added check on table layout) `MultiPageTableExtractor`. -* refactoring table merge `MultiPageTableExtractor`. -* improve header footer analysis `HeaderFooterDetector`. -* added header footer analysis support in Tabby. -* added header footer analysis info (parameter `need_header_footer_analysis`) in documentation (readthedocs). 
-* update to python3.10. -* update to ubuntu22.04. -* added `Contributing Information` (project rules, how build, how develop) in documentation (readthedocs). - +* Improved table merge algorithm (added check on table layout) `MultiPageTableExtractor`. +* Improved header footer analysis `HeaderFooterDetector`. +* Added header footer analysis support in `PdfTabbyReader`. +* Added header footer analysis info (parameter `need_header_footer_analysis`) in documentation. +* Updated to python3.10. +* Updated to ubuntu22.04. +* Added `Support and Contributing` (project rules, how to build, how to develop) in documentation. v2.5 (2025-09-05) ----------------- @@ -20,9 +19,8 @@ Release note: `v2.5 `_ * Added simple multilingual textual layer correctness classification based on letter percentage calculation (`textual_layer_classifier=letter`). * Added a new parameter `textual_layer_classifier = [simple, ml (default), letter]`. -* Remove parameter `fast_textual_layer_detection`. Now it is a `textual_layer_classifier=simple`. -* Fix bug with `table_type=table_wo_external_bounds` (fixed cv2.BoundingRect). -* Some refactoring `TableRecognition`. +* Removed parameter `fast_textual_layer_detection`. Now it is a `textual_layer_classifier=simple`. +* Fixed bug with `table_type=table_wo_external_bounds` (fixed cv2.BoundingRect). * Added parameter `table_type` and `TableRecognition` info into documentation. v2.4 (2025-07-28) diff --git a/docs/source/contributing/check_documentation.rst b/docs/source/contributing/check_documentation.rst index d94d5e55..a8f273a2 100644 --- a/docs/source/contributing/check_documentation.rst +++ b/docs/source/contributing/check_documentation.rst @@ -9,12 +9,11 @@ Check documentation pip install .[docs] -2. Documentation files should be located in the `docs/ `_ directory, - which must contain the `docs/source/conf.py `_ (build settings) - and `docs/source/index.rst `_ (documentation main page) files. - -3. 
Build documentation into HTML pages is done as follows: +2. Documentation files should be located in the `docs/ `_ directory. + Building the documentation into HTML pages is done as follows: .. code-block:: bash python -m sphinx -T -E -W -b html -d docs/_build/doctrees -D language=en docs/source docs/_build + +3. After building, the documentation can be checked locally: open the main built page ``docs/_build/index.html`` in a browser. diff --git a/docs/source/contributing/contributing.rst b/docs/source/contributing/contributing.rst index fe349d3b..b36fa097 100644 --- a/docs/source/contributing/contributing.rst +++ b/docs/source/contributing/contributing.rst @@ -5,12 +5,12 @@ Support and Contributing Support ------- -If you are stuck with a problem using Dedoc, please do get in touch at our `Issues `_ (recommend) +If you are stuck with a problem using Dedoc, please use our `Issues `_ (recommended) or `Dedoc Chat `_. The developers are willing to help. You can save time by following this procedure when reporting a problem: - * Do try to solve the problem on your own first. Read the documentation, including using the search feature, index and reference documentation. + * Try to solve the problem on your own first. Read the documentation, including using the search feature, index and reference documentation. * Search the issue archives to see if someone else already had the same problem. @@ -23,7 +23,9 @@ Contributing Rules * To add new features to the project repository yourself, you should follow the `general contributing rules of github `_. - In your Pull Request, set `develop` as the target branch. + + .. note:: + In your Pull Request, set `develop` as the target branch. * We recommend using `Pycharm IDE` and `virtualenv` package for development. @@ -34,16 +36,17 @@ Contributing Rules * We strongly recommend using the already used ML library `torch` in development. 
For example, using `tensorflow` library instead of `torch` is justified only in case of extreme necessity. - * If you add new functionality to dedoc, be sure to add python `unitests` to test the added functionality - (you can add api tests in `tests/api_tests `_, - you can add unit tests in `tests/unit_tests `_). + * If you add new functionality to dedoc, be sure to add python `unittest` to test the added functionality + (you can add api tests in `tests/api_tests `_ + or unit tests in `tests/unit_tests `_). These tests are run automatically in the Continuous Integration pipeline. + To run tests locally, you can use docker as described in the `README `_. * Before each commit, check the code style using the automatic checker using the `flake8` library. - Instructions for using flake8 are provided here :ref:using_flake8`. + Instructions for using flake8 are provided in :ref:`using_flake8`. * We recommend setting up pre-commit for convenience and speeding up development according to the instructions :ref:`using_precommit` . - This will run a style check of the changed code with each commit. + This will run a style check of the changed code before each commit. * In case of any change in the online documentation of the project (for example, when adding a new api parameter), be sure to check locally that the changed documentation is successfully built and looks as expected. @@ -51,11 +54,8 @@ Contributing Rules .. toctree:: :maxdepth: 1 + :hidden: using_flake8 using_precommit check_documentation - - - - diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index 72c09551..d6b2ab2f 100644 --- a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -279,6 +279,16 @@ Api parameters description - false - This option is used to **remove** headers and footers of PDF documents from the output result. 
If ``need_header_footer_analysis=false``, header and footer lines will present in the output as well as all other document lines. + The algorithm is implemented and described in the class :class:`~dedoc.readers.pdf_reader.utils.header_footers_analysis.HeaderFooterDetector`. + + * - table_type + - "", wo_external_bounds, one_cell_table, split_last_column and their combinations + - "" + - Setting up the table recognition method. This option is used for PDF documents which are images with text (PDF without a textual layer). + It is also used for PDF documents when ``pdf_with_text_layer`` is ``true``, ``false``, ``auto`` or ``auto_tabby``. + The value of the parameter specifies the type of tables for recognition; + supported table types are described in :class:`~dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions`. + You can use a combination of values (for example, ``wo_external_bounds+one_cell_table``). * - need_binarization - true, false diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 09c70b81..0112a135 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -182,4 +182,4 @@ For ``python3.9``: .. code-block:: bash pip install https://github.com/ispras/dedockerfiles/raw/master/wheels/torch-1.11.0a0+git137096a-cp39-cp39-linux_x86_64.whl - pip install https://github.com/ispras/dedockerfiles/raw/master/wheels/torchvision-0.12.0a0%2B9b5a3fe-cp39-cp39-linux_x86_64.whl \ No newline at end of file + pip install https://github.com/ispras/dedockerfiles/raw/master/wheels/torchvision-0.12.0a0%2B9b5a3fe-cp39-cp39-linux_x86_64.whl diff --git a/docs/source/index.rst b/docs/source/index.rst index d9391635..c36c8c9e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -278,6 +278,7 @@ This type of structure is configurable (see :ref:`using_patterns`). 
modules/metadata_extractors modules/structure_extractors modules/structure_constructors + modules/pdf_parsing .. toctree:: diff --git a/docs/source/modules/manager.rst b/docs/source/modules/manager.rst index 8f185b06..53a5cb26 100644 --- a/docs/source/modules/manager.rst +++ b/docs/source/modules/manager.rst @@ -10,15 +10,3 @@ Dedoc pipeline .. autoclass:: dedoc.attachments_handler.AttachmentsHandler :special-members: __init__ :members: - -.. autoclass:: dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer - :show-inheritance: - :members: - -.. autoclass:: dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions - :show-inheritance: - :members: - -.. autoclass:: dedoc.readers.pdf_reader.utils.header_footers_analysis.HeaderFooterDetector - :show-inheritance: - :members: diff --git a/docs/source/modules/pdf_parsing.rst b/docs/source/modules/pdf_parsing.rst new file mode 100644 index 00000000..2bb25b5a --- /dev/null +++ b/docs/source/modules/pdf_parsing.rst @@ -0,0 +1,14 @@ +.. _pdf_parsing: + +Auxiliary data structures for PDF and images parsing +==================================================== + + +.. autoclass:: dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer + :members: + +.. autoclass:: dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions + :members: + +.. autoclass:: dedoc.readers.pdf_reader.utils.header_footers_analysis.HeaderFooterDetector + :members: diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst index 2c6019ae..e2e23408 100644 --- a/docs/source/parameters/pdf_handling.rst +++ b/docs/source/parameters/pdf_handling.rst @@ -148,19 +148,7 @@ PDF and images handling * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to **remove** headers and footers of PDF documents from the output result. 
If ``need_header_footer_analysis=False``, header and footer lines will present in the output as well as all other document lines. - The algorithm is implemented in the class :class:`~dedoc.readers.pdf_reader.utils.header_footers_analysis.HeaderFooterDetector`. - A link to the article describing the algorithm is provided in the class description. - - * For documents of 6 pages or more, lines on even and odd pages of the document are compared to detect alternating footers-headers. - For documents of less than 6 pages, lines between adjacent pages (between even or odd pages) are compared. - Therefore, alternating footers-headers will not be detected on documents of less than 6 pages. - - * The algorithm analyzes the first 4 and last 4 lines on each page of the document and, - by comparing lines across pages, identifies common footer-header patterns using Levenshtein similarity. - - * For the algorithm to work, the document must have at least two pages of text. - - * The more pages the better. Remember the parameter `pages` limits the number of pages in a document. + The algorithm is implemented and described in the class :class:`~dedoc.readers.pdf_reader.utils.header_footers_analysis.HeaderFooterDetector`. * - need_binarization - True, False @@ -192,8 +180,8 @@ PDF and images handling - Setting up the table recognition method. The table recognition method is used in :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader`. The value of the parameter specifies the type of tables recognized when processed by class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer`. More details about each parameter value - are disclosed in the class :class:`dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions` description. - You can use combination of values (for example, `wo_external_bounds+one_cell_table`). 
+ are disclosed in the class :class:`~dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions` description. + You can use a combination of values (for example, ``wo_external_bounds+one_cell_table``). * - need_gost_frame_analysis - True, False diff --git a/labeling/Dockerfile b/labeling/Dockerfile index 25970c9f..c601dbe8 100644 --- a/labeling/Dockerfile +++ b/labeling/Dockerfile @@ -1,5 +1,5 @@ ARG REPOSITORY="docker.io" -FROM dedocproject/dedoc_p3.9_base:version_2023_08_28 +FROM dedocproject/dedoc_jammy_p3.10_base:version_2025_09_11 ENV PYTHONPATH "${PYTHONPATH}:/labeling_root:/labeling_root/labeling" ENV RESOURCES_PATH "/labeling_root/resources" diff --git a/tests/Dockerfile b/tests/Dockerfile index 38264bc4..4d47c8db 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -1,5 +1,5 @@ ARG REPOSITORY="docker.io" -FROM dedocproject/dedoc_p3.9_base:version_2023_08_28 +FROM dedocproject/dedoc_jammy_p3.10_base:version_2025_09_11 ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root:/dedoc_root/tests:/dedoc_root/langchain"