Skip to content

Commit 5febc7d

Browse files
committed
TALP-447, TALP-448: notes extraction from DOCX; XLSX
1 parent e48acb1 commit 5febc7d

File tree

7 files changed

+66
-32
lines changed

7 files changed

+66
-32
lines changed

dedoc/readers/docx_reader/data_structures/docx_document.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from dedoc.data_structures.line_with_meta import LineWithMeta
1313
from dedoc.readers.docx_reader.data_structures.table import DocxTable
1414
from dedoc.readers.docx_reader.data_structures.utils import Counter, ParagraphMaker
15-
from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor
15+
from dedoc.readers.docx_reader.footnote_extractor import NoteExtractor
1616
from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter
1717
from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
1818
from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
@@ -47,8 +47,9 @@ def __get_paragraph_maker(self) -> ParagraphMaker:
4747
path_hash=calculate_file_hash(path=self.path),
4848
styles_extractor=styles_extractor,
4949
numbering_extractor=numbering_extractor,
50-
footnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
51-
endnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote")
50+
footnote_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
51+
endnote_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote"),
52+
comment_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/comments.xml"), key="comment")
5253
)
5354

5455
def __get_lines(self) -> List[LineWithMeta]:

dedoc/readers/docx_reader/data_structures/paragraph.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties
66
from dedoc.readers.docx_reader.data_structures.run import Run
7-
from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor
7+
from dedoc.readers.docx_reader.footnote_extractor import NoteExtractor
88
from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
99
from dedoc.readers.docx_reader.properties_extractor import change_paragraph_properties, change_run_properties
1010
from dedoc.readers.docx_reader.styles_extractor import StyleType, StylesExtractor
@@ -16,8 +16,9 @@ def __init__(self,
1616
xml: Tag,
1717
styles_extractor: StylesExtractor,
1818
numbering_extractor: NumberingExtractor,
19-
footnote_extractor: FootnoteExtractor,
20-
endnote_extractor: FootnoteExtractor,
19+
footnote_extractor: NoteExtractor,
20+
endnote_extractor: NoteExtractor,
21+
comment_extractor: NoteExtractor,
2122
uid: str) -> None:
2223
"""
2324
Contains information about paragraph properties.
@@ -30,9 +31,10 @@ def __init__(self,
3031
self.xml = xml
3132
self.footnote_extractor = footnote_extractor
3233
self.endnote_extractor = endnote_extractor
34+
self.comment_extractor = comment_extractor
3335
self.numbering_extractor = numbering_extractor
3436
self.styles_extractor = styles_extractor
35-
self.footnotes = []
37+
self.notes = []
3638
self.runs = []
3739
self.runs_ids = [] # list of (start, end) inside the paragraph text
3840
self.text = ""
@@ -85,12 +87,8 @@ def __parse(self) -> None:
8587
if hasattr(self, "caps") and self.caps:
8688
self.text = self.text.upper()
8789

88-
for key, extractor in [("w:footnoteReference", self.footnote_extractor), ("w:endnoteReference", self.endnote_extractor)]:
89-
notes = self.xml.find_all(key)
90-
for footnote in notes:
91-
note_id = footnote.get("w:id")
92-
if note_id in extractor.id2footnote:
93-
self.footnotes.append(extractor.id2footnote[note_id])
90+
for extractor in [self.footnote_extractor, self.endnote_extractor]:
91+
self.notes.extend(extractor.get_notes(self.xml))
9492

9593
def __get_numbering_formatting(self) -> Optional[Run]:
9694
"""
@@ -99,7 +97,7 @@ def __get_numbering_formatting(self) -> Optional[Run]:
9997
:returns: numbering run if there is the text in numbering else None
10098
"""
10199
if self.xml.numPr and self.numbering_extractor:
102-
numbering_run = Run(self, self.styles_extractor)
100+
numbering_run = Run(self, self.styles_extractor, self.comment_extractor)
103101
self.numbering_extractor.parse(self.xml.numPr, self, numbering_run)
104102

105103
if numbering_run.text:
@@ -115,7 +113,7 @@ def __make_run_list(self) -> None:
115113
run_list = self.xml.find_all("w:r")
116114

117115
for run_tree in run_list:
118-
new_run = Run(self, self.styles_extractor)
116+
new_run = Run(self, self.styles_extractor, self.comment_extractor)
119117

120118
if run_tree.rStyle:
121119
self.styles_extractor.parse(run_tree.rStyle["w:val"], new_run, StyleType.CHARACTER)
@@ -126,6 +124,9 @@ def __make_run_list(self) -> None:
126124
change_run_properties(new_run, run_tree.rPr)
127125
new_run.get_text(run_tree)
128126
if not new_run.text:
127+
if new_run.linked_text and self.runs:
128+
prev_linked_text = self.runs[-1].linked_text
129+
self.runs[-1].linked_text = new_run.linked_text if not prev_linked_text else f"{prev_linked_text}; {new_run.linked_text}"
129130
continue
130131

131132
if self.runs and self.runs[-1] == new_run:

dedoc/readers/docx_reader/data_structures/run.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,28 +3,36 @@
33
from bs4 import Tag
44

55
from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties
6+
from dedoc.readers.docx_reader.footnote_extractor import NoteExtractor
67
from dedoc.readers.docx_reader.properties_extractor import change_caps
78

89

910
class Run(BaseProperties):
1011

11-
def __init__(self, properties: Optional[BaseProperties], styles_extractor: "StylesExtractor") -> None: # noqa
12+
def __init__(self, properties: Optional[BaseProperties], styles_extractor: "StylesExtractor", comment_extractor: Optional[NoteExtractor] = None) -> None: # noqa
1213
"""
1314
Contains information about run properties.
1415
:param properties: Paragraph or Run for copying its properties
1516
:param styles_extractor: StylesExtractor
17+
:param comment_extractor: NoteExtractor for comments
1618
"""
1719

1820
self.name2char = dict(tab="\t", br="\n", cr="\r")
1921
self.text = ""
22+
self.linked_text = ""
2023
self.styles_extractor = styles_extractor
24+
self.comment_extractor = comment_extractor
2125
super().__init__(properties)
2226

2327
def get_text(self, xml: Tag) -> None:
2428
"""
2529
Makes the text of run.
2630
:param xml: BeautifulSoup tree with run properties
2731
"""
32+
notes = self.comment_extractor.get_notes(xml) if self.comment_extractor else None
33+
if notes:
34+
self.linked_text = "; ".join(notes)
35+
2836
for tag in xml:
2937
tag_name = tag.name
3038

@@ -56,4 +64,5 @@ def __eq__(self, other: "Run") -> bool:
5664
size_eq = self.size == other.size
5765
font_eq = self.bold == other.bold and self.italic == other.italic and self.underlined == other.underlined
5866
script_eq = self.superscript == other.superscript and self.subscript == other.subscript
59-
return size_eq and font_eq and script_eq
67+
linked_text_eq = self.linked_text == other.linked_text
68+
return size_eq and font_eq and script_eq and linked_text_eq

dedoc/readers/docx_reader/data_structures/utils.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from bs4 import Tag
77

88
from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph
9-
from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor
9+
from dedoc.readers.docx_reader.footnote_extractor import NoteExtractor
1010
from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
1111
from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
1212

@@ -35,14 +35,16 @@ def __init__(self,
3535
counter: Counter,
3636
styles_extractor: StylesExtractor,
3737
numbering_extractor: NumberingExtractor,
38-
footnote_extractor: FootnoteExtractor,
39-
endnote_extractor: FootnoteExtractor) -> None:
38+
footnote_extractor: NoteExtractor,
39+
endnote_extractor: NoteExtractor,
40+
comment_extractor: NoteExtractor) -> None:
4041
self.counter = counter
4142
self.path_hash = path_hash
4243
self.styles_extractor = styles_extractor
4344
self.numbering_extractor = numbering_extractor
4445
self.footnote_extractor = footnote_extractor
4546
self.endnote_extractor = endnote_extractor
47+
self.comment_extractor = comment_extractor
4648
self.uids_set = set()
4749

4850
def make_paragraph(self, paragraph_xml: Tag, paragraph_list: List[Paragraph]) -> Paragraph:
@@ -52,6 +54,7 @@ def make_paragraph(self, paragraph_xml: Tag, paragraph_list: List[Paragraph]) ->
5254
numbering_extractor=self.numbering_extractor,
5355
footnote_extractor=self.footnote_extractor,
5456
endnote_extractor=self.endnote_extractor,
57+
comment_extractor=self.comment_extractor,
5558
uid=uid)
5659
prev_paragraph = None if len(paragraph_list) == 0 else paragraph_list[-1]
5760
paragraph.spacing = paragraph.spacing_before if prev_paragraph is None else max(prev_paragraph.spacing_after, paragraph.spacing_before)
dedoc/readers/docx_reader/footnote_extractor.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,33 @@
1-
from typing import Optional
1+
from typing import Dict, List, Optional
22

3-
from bs4 import BeautifulSoup
3+
from bs4 import BeautifulSoup, Tag
44

55

6-
class FootnoteExtractor:
6+
class NoteExtractor:
77

88
def __init__(self, xml: Optional[BeautifulSoup], key: str = "footnote") -> None:
99
"""
1010
:param xml: BeautifulSoup tree with styles
11-
:param key: footnote or endnote
11+
:param key: footnote, endnote or comment
1212
"""
13-
self.id2footnote = {}
13+
self.key = key
14+
self.id2note: Dict[str, str] = {}
1415
if not xml:
1516
return
1617

1718
for footnote in xml.find_all(f"w:{key}"):
1819
footnote_id = footnote.get("w:id")
1920
footnote_text = " ".join(t.text for t in footnote.find_all("w:t") if t.text)
21+
author = footnote.get("w:author")
22+
footnote_text = f"{author}: {footnote_text}" if author else footnote_text
2023
if footnote_id and footnote_text:
21-
self.id2footnote[footnote_id] = footnote_text
24+
self.id2note[footnote_id] = footnote_text
25+
26+
def get_notes(self, xml: Tag) -> List[str]:
27+
notes_xml = xml.find_all(f"w:{self.key}Reference")
28+
notes = []
29+
for note in notes_xml:
30+
note_id = note.get("w:id")
31+
if note_id in self.id2note:
32+
notes.append(self.id2note[note_id])
33+
return notes

dedoc/readers/docx_reader/line_with_meta_converter.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ def __init__(self, paragraph: Paragraph, paragraph_id: int) -> None:
2424
Converts custom DOCX Paragraph to LineWithMeta class.
2525
:param paragraph: Paragraph for converting its properties to the unified representation.
2626
"""
27-
annotations = [BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation]
27+
annotations = [
28+
BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation, LinkedTextAnnotation
29+
]
2830
self.dict2annotation = {annotation.name: annotation for annotation in annotations}
2931
self.annotation_merger = AnnotationMerger()
3032

@@ -37,8 +39,8 @@ def __parse(self, paragraph: Paragraph, paragraph_id: int) -> LineWithMeta:
3739
AlignmentAnnotation(start=0, end=len(paragraph.text), value=paragraph.jc),
3840
SpacingAnnotation(start=0, end=len(paragraph.text), value=str(paragraph.spacing))
3941
]
40-
for footnote in paragraph.footnotes:
41-
annotations.append(LinkedTextAnnotation(start=0, end=len(paragraph.text), value=footnote))
42+
for note in paragraph.notes:
43+
annotations.append(LinkedTextAnnotation(start=0, end=len(paragraph.text), value=note))
4244

4345
if paragraph.style_name is not None:
4446
annotations.append(StyleAnnotation(start=0, end=len(paragraph.text), value=paragraph.style_name))
@@ -47,7 +49,7 @@ def __parse(self, paragraph: Paragraph, paragraph_id: int) -> LineWithMeta:
4749

4850
for run, (start, end) in zip(paragraph.runs, paragraph.runs_ids):
4951
annotations.append(SizeAnnotation(start=start, end=end, value=str(run.size / 2)))
50-
for property_name in ["bold", "italic", "underlined", "strike", "superscript", "subscript"]:
52+
for property_name in self.dict2annotation:
5153
property_value = getattr(run, property_name)
5254
if property_value:
5355
annotations.append(self.dict2annotation[property_name](start=start, end=end, value=str(property_value)))

dedoc/readers/excel_reader/excel_reader.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from xlrd.sheet import Sheet
44

5+
from dedoc.data_structures.concrete_annotations.linked_text_annotation import LinkedTextAnnotation
56
from dedoc.data_structures.table import Table
67
from dedoc.data_structures.unstructured_document import UnstructuredDocument
78
from dedoc.readers.base_reader import BaseReader
@@ -54,8 +55,13 @@ def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table:
5455
for row_id in range(n_rows):
5556
row = []
5657
for col_id in range(n_cols):
57-
value = str(sheet.cell_value(rowx=row_id, colx=col_id))
58-
row.append(CellWithMeta(lines=[LineWithMeta(line=value, metadata=LineMetadata(page_id=sheet_id, line_id=0))]))
58+
cell_text = str(sheet.cell_value(rowx=row_id, colx=col_id))
59+
if (row_id, col_id) in sheet.cell_note_map:
60+
note_text = sheet.cell_note_map[(row_id, col_id)].text.replace("\n", " ")
61+
annotations = [LinkedTextAnnotation(start=0, end=len(cell_text), value=note_text)]
62+
else:
63+
annotations = []
64+
row.append(CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=sheet_id, line_id=0), annotations=annotations)]))
5965
res.append(row)
6066
metadata = TableMetadata(page_id=sheet_id)
6167
return Table(cells=res, metadata=metadata)

0 commit comments

Comments
 (0)