TALP-447, TALP-448: extract notes from DOCX (footnotes, endnotes, comments) and from XLSX (cell notes)

NastyBoget · NastyBoget · commit 5febc7d369ec · 2026-02-27T20:11:01.000+03:00
diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py
@@ -12,7 +12,7 @@
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.readers.docx_reader.data_structures.table import DocxTable
 from dedoc.readers.docx_reader.data_structures.utils import Counter, ParagraphMaker
-from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor
+from dedoc.readers.docx_reader.footnote_extractor import NoteExtractor
 from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter
 from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
 from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
@@ -47,8 +47,9 @@ def __get_paragraph_maker(self) -> ParagraphMaker:
             path_hash=calculate_file_hash(path=self.path),
             styles_extractor=styles_extractor,
             numbering_extractor=numbering_extractor,
-            footnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
-            endnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote")
+            footnote_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
+            endnote_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote"),
+            comment_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/comments.xml"), key="comment")
         )
 
     def __get_lines(self) -> List[LineWithMeta]:
diff --git a/dedoc/readers/docx_reader/data_structures/paragraph.py b/dedoc/readers/docx_reader/data_structures/paragraph.py
@@ -4,7 +4,7 @@
 
 from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties
 from dedoc.readers.docx_reader.data_structures.run import Run
-from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor
+from dedoc.readers.docx_reader.footnote_extractor import NoteExtractor
 from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
 from dedoc.readers.docx_reader.properties_extractor import change_paragraph_properties, change_run_properties
 from dedoc.readers.docx_reader.styles_extractor import StyleType, StylesExtractor
@@ -16,8 +16,9 @@ def __init__(self,
                  xml: Tag,
                  styles_extractor: StylesExtractor,
                  numbering_extractor: NumberingExtractor,
-                 footnote_extractor: FootnoteExtractor,
-                 endnote_extractor: FootnoteExtractor,
+                 footnote_extractor: NoteExtractor,
+                 endnote_extractor: NoteExtractor,
+                 comment_extractor: NoteExtractor,
                  uid: str) -> None:
         """
         Contains information about paragraph properties.
@@ -30,9 +31,10 @@ def __init__(self,
         self.xml = xml
         self.footnote_extractor = footnote_extractor
         self.endnote_extractor = endnote_extractor
+        self.comment_extractor = comment_extractor
         self.numbering_extractor = numbering_extractor
         self.styles_extractor = styles_extractor
-        self.footnotes = []
+        self.notes = []
         self.runs = []
         self.runs_ids = []  # list of (start, end) inside the paragraph text
         self.text = ""
@@ -85,12 +87,8 @@ def __parse(self) -> None:
         if hasattr(self, "caps") and self.caps:
             self.text = self.text.upper()
 
-        for key, extractor in [("w:footnoteReference", self.footnote_extractor), ("w:endnoteReference", self.endnote_extractor)]:
-            notes = self.xml.find_all(key)
-            for footnote in notes:
-                note_id = footnote.get("w:id")
-                if note_id in extractor.id2footnote:
-                    self.footnotes.append(extractor.id2footnote[note_id])
+        for extractor in [self.footnote_extractor, self.endnote_extractor]:
+            self.notes.extend(extractor.get_notes(self.xml))
 
     def __get_numbering_formatting(self) -> Optional[Run]:
         """
@@ -99,7 +97,7 @@ def __get_numbering_formatting(self) -> Optional[Run]:
         :returns: numbering run if there is the text in numbering else None
         """
         if self.xml.numPr and self.numbering_extractor:
-            numbering_run = Run(self, self.styles_extractor)
+            numbering_run = Run(self, self.styles_extractor, self.comment_extractor)
             self.numbering_extractor.parse(self.xml.numPr, self, numbering_run)
 
             if numbering_run.text:
@@ -115,7 +113,7 @@ def __make_run_list(self) -> None:
         run_list = self.xml.find_all("w:r")
 
         for run_tree in run_list:
-            new_run = Run(self, self.styles_extractor)
+            new_run = Run(self, self.styles_extractor, self.comment_extractor)
 
             if run_tree.rStyle:
                 self.styles_extractor.parse(run_tree.rStyle["w:val"], new_run, StyleType.CHARACTER)
@@ -126,6 +124,9 @@ def __make_run_list(self) -> None:
                 change_run_properties(new_run, run_tree.rPr)
             new_run.get_text(run_tree)
             if not new_run.text:
+                if new_run.linked_text and self.runs:
+                    prev_linked_text = self.runs[-1].linked_text
+                    self.runs[-1].linked_text = new_run.linked_text if not prev_linked_text else f"{prev_linked_text}; {new_run.linked_text}"
                 continue
 
             if self.runs and self.runs[-1] == new_run:
diff --git a/dedoc/readers/docx_reader/data_structures/run.py b/dedoc/readers/docx_reader/data_structures/run.py
@@ -3,28 +3,36 @@
 from bs4 import Tag
 
 from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties
+from dedoc.readers.docx_reader.footnote_extractor import NoteExtractor
 from dedoc.readers.docx_reader.properties_extractor import change_caps
 
 
 class Run(BaseProperties):
 
-    def __init__(self, properties: Optional[BaseProperties], styles_extractor: "StylesExtractor") -> None:  # noqa
+    def __init__(self, properties: Optional[BaseProperties], styles_extractor: "StylesExtractor", comment_extractor: Optional[NoteExtractor] = None) -> None:  # noqa
         """
         Contains information about run properties.
         :param properties: Paragraph or Run for copying its properties
         :param styles_extractor: StylesExtractor
+        :param comment_extractor: NoteExtractor for comments
         """
 
         self.name2char = dict(tab="\t", br="\n", cr="\r")
         self.text = ""
+        self.linked_text = ""
         self.styles_extractor = styles_extractor
+        self.comment_extractor = comment_extractor
         super().__init__(properties)
 
     def get_text(self, xml: Tag) -> None:
         """
         Makes the text of run.
         :param xml: BeautifulSoup tree with run properties
         """
+        notes = self.comment_extractor.get_notes(xml) if self.comment_extractor else None
+        if notes:
+            self.linked_text = "; ".join(notes)
+
         for tag in xml:
             tag_name = tag.name
 
@@ -56,4 +64,5 @@ def __eq__(self, other: "Run") -> bool:
         size_eq = self.size == other.size
         font_eq = self.bold == other.bold and self.italic == other.italic and self.underlined == other.underlined
         script_eq = self.superscript == other.superscript and self.subscript == other.subscript
-        return size_eq and font_eq and script_eq
+        linked_text_eq = self.linked_text == other.linked_text
+        return size_eq and font_eq and script_eq and linked_text_eq
diff --git a/dedoc/readers/docx_reader/data_structures/utils.py b/dedoc/readers/docx_reader/data_structures/utils.py
@@ -6,7 +6,7 @@
 from bs4 import Tag
 
 from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph
-from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor
+from dedoc.readers.docx_reader.footnote_extractor import NoteExtractor
 from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
 from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
 
@@ -35,14 +35,16 @@ def __init__(self,
                  counter: Counter,
                  styles_extractor: StylesExtractor,
                  numbering_extractor: NumberingExtractor,
-                 footnote_extractor: FootnoteExtractor,
-                 endnote_extractor: FootnoteExtractor) -> None:
+                 footnote_extractor: NoteExtractor,
+                 endnote_extractor: NoteExtractor,
+                 comment_extractor: NoteExtractor) -> None:
         self.counter = counter
         self.path_hash = path_hash
         self.styles_extractor = styles_extractor
         self.numbering_extractor = numbering_extractor
         self.footnote_extractor = footnote_extractor
         self.endnote_extractor = endnote_extractor
+        self.comment_extractor = comment_extractor
         self.uids_set = set()
 
     def make_paragraph(self, paragraph_xml: Tag, paragraph_list: List[Paragraph]) -> Paragraph:
@@ -52,6 +54,7 @@ def make_paragraph(self, paragraph_xml: Tag, paragraph_list: List[Paragraph]) ->
                               numbering_extractor=self.numbering_extractor,
                               footnote_extractor=self.footnote_extractor,
                               endnote_extractor=self.endnote_extractor,
+                              comment_extractor=self.comment_extractor,
                               uid=uid)
         prev_paragraph = None if len(paragraph_list) == 0 else paragraph_list[-1]
         paragraph.spacing = paragraph.spacing_before if prev_paragraph is None else max(prev_paragraph.spacing_after, paragraph.spacing_before)
diff --git a/dedoc/readers/docx_reader/footnote_extractor.py b/dedoc/readers/docx_reader/footnote_extractor.py
@@ -1,21 +1,33 @@
-from typing import Optional
+from typing import Dict, List, Optional
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 
 
-class FootnoteExtractor:
+class NoteExtractor:
 
     def __init__(self, xml: Optional[BeautifulSoup], key: str = "footnote") -> None:
         """
         :param xml: BeautifulSoup tree with styles
-        :param key: footnote or endnote
+        :param key: footnote, endnote or comment
         """
-        self.id2footnote = {}
+        self.key = key
+        self.id2note: Dict[str, str] = {}
         if not xml:
             return
 
         for footnote in xml.find_all(f"w:{key}"):
             footnote_id = footnote.get("w:id")
             footnote_text = " ".join(t.text for t in footnote.find_all("w:t") if t.text)
+            author = footnote.get("w:author")
+            footnote_text = f"{author}: {footnote_text}" if author else footnote_text
             if footnote_id and footnote_text:
-                self.id2footnote[footnote_id] = footnote_text
+                self.id2note[footnote_id] = footnote_text
+
+    def get_notes(self, xml: Tag) -> List[str]:
+        notes_xml = xml.find_all(f"w:{self.key}Reference")
+        notes = []
+        for note in notes_xml:
+            note_id = note.get("w:id")
+            if note_id in self.id2note:
+                notes.append(self.id2note[note_id])
+        return notes
diff --git a/dedoc/readers/docx_reader/line_with_meta_converter.py b/dedoc/readers/docx_reader/line_with_meta_converter.py
@@ -24,7 +24,9 @@ def __init__(self, paragraph: Paragraph, paragraph_id: int) -> None:
         Converts custom DOCX Paragraph to LineWithMeta class.
         :param paragraph: Paragraph for converting its properties to the unified representation.
         """
-        annotations = [BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation]
+        annotations = [
+            BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation, LinkedTextAnnotation
+        ]
         self.dict2annotation = {annotation.name: annotation for annotation in annotations}
         self.annotation_merger = AnnotationMerger()
 
@@ -37,8 +39,8 @@ def __parse(self, paragraph: Paragraph, paragraph_id: int) -> LineWithMeta:
             AlignmentAnnotation(start=0, end=len(paragraph.text), value=paragraph.jc),
             SpacingAnnotation(start=0, end=len(paragraph.text), value=str(paragraph.spacing))
         ]
-        for footnote in paragraph.footnotes:
-            annotations.append(LinkedTextAnnotation(start=0, end=len(paragraph.text), value=footnote))
+        for note in paragraph.notes:
+            annotations.append(LinkedTextAnnotation(start=0, end=len(paragraph.text), value=note))
 
         if paragraph.style_name is not None:
             annotations.append(StyleAnnotation(start=0, end=len(paragraph.text), value=paragraph.style_name))
@@ -47,7 +49,7 @@ def __parse(self, paragraph: Paragraph, paragraph_id: int) -> LineWithMeta:
 
         for run, (start, end) in zip(paragraph.runs, paragraph.runs_ids):
             annotations.append(SizeAnnotation(start=start, end=end, value=str(run.size / 2)))
-            for property_name in ["bold", "italic", "underlined", "strike", "superscript", "subscript"]:
+            for property_name in self.dict2annotation:
                 property_value = getattr(run, property_name)
                 if property_value:
                     annotations.append(self.dict2annotation[property_name](start=start, end=end, value=str(property_value)))
diff --git a/dedoc/readers/excel_reader/excel_reader.py b/dedoc/readers/excel_reader/excel_reader.py
@@ -2,6 +2,7 @@
 
 from xlrd.sheet import Sheet
 
+from dedoc.data_structures.concrete_annotations.linked_text_annotation import LinkedTextAnnotation
 from dedoc.data_structures.table import Table
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.readers.base_reader import BaseReader
@@ -54,8 +55,13 @@ def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table:
         for row_id in range(n_rows):
             row = []
             for col_id in range(n_cols):
-                value = str(sheet.cell_value(rowx=row_id, colx=col_id))
-                row.append(CellWithMeta(lines=[LineWithMeta(line=value, metadata=LineMetadata(page_id=sheet_id, line_id=0))]))
+                cell_text = str(sheet.cell_value(rowx=row_id, colx=col_id))
+                if (row_id, col_id) in sheet.cell_note_map:
+                    note_text = sheet.cell_note_map[(row_id, col_id)].text.replace("\n", " ")
+                    annotations = [LinkedTextAnnotation(start=0, end=len(cell_text), value=note_text)]
+                else:
+                    annotations = []
+                row.append(CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=sheet_id, line_id=0), annotations=annotations)]))
             res.append(row)
         metadata = TableMetadata(page_id=sheet_id)
         return Table(cells=res, metadata=metadata)