Skip to content

Commit 386a560

Browse files
authored
* TIKA-4345 -- extract metadata before writing the PSTMailItem body so that more metadata is written to the XHTML
* TIKA-4345 -- make the injection of select headers into the body content of MSG files configurable
1 parent bc74d11 commit 386a560

File tree

9 files changed

+147
-56
lines changed

9 files changed

+147
-56
lines changed

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,4 +180,18 @@ public void setIncludeHeadersAndFooters(boolean includeHeadersAndFooters) {
180180
public boolean isIncludeHeadersAndFooters() {
181181
return defaultOfficeParserConfig.isIncludeHeadersAndFooters();
182182
}
183+
184+
/**
185+
* If set to <code>true</code>, this will write the subject and select headers (from/to/cc/bcc/recipients) into the body content of MSG files
186+
*
187+
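* @param val <code>true</code> to write the select headers into the body content, <code>false</code> to leave them out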
188+
*/
189+
@Field
190+
public void setWriteSelectHeadersInBody(boolean val) {
191+
defaultOfficeParserConfig.setWriteSelectHeadersInBody(val);
192+
}
193+
194+
public boolean isWriteSelectHeadersInBody() {
195+
return defaultOfficeParserConfig.isWriteSelectHeadersInBody();
196+
}
183197
}

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,6 @@ protected void parse(DirectoryNode root, ParseContext context, Metadata metadata
248248
break;
249249
case OUTLOOK:
250250
OutlookExtractor extractor = new OutlookExtractor(root, metadata, context);
251-
252251
extractor.parse(xhtml);
253252
break;
254253
case ENCRYPTED:

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,9 @@ public class OfficeParserConfig implements Serializable {
3434

3535
private boolean useSAXDocxExtractor = false;
3636
private boolean useSAXPptxExtractor = false;
37-
private boolean extractAllAlternativesFromMSG;
3837

38+
private boolean extractAllAlternativesFromMSG = false;
39+
private boolean writeSelectHeadersInBody = false;
3940
private String dateOverrideFormat = null;
4041
private int maxOverride = 0;//ignore
4142

@@ -201,6 +202,20 @@ public void setExtractAllAlternativesFromMSG(boolean extractAllAlternativesFromM
201202
this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG;
202203
}
203204

205+
public boolean isWriteSelectHeadersInBody() {
206+
return writeSelectHeadersInBody;
207+
}
208+
209+
/**
210+
* If set to <code>true</code>, this will add the subject and the from/to/cc/bcc/recipients headers into the
211+
* body content.
212+
*
213+
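* @param val <code>true</code> to write the select headers into the body content; the default is <code>false</code>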
214+
*/
215+
public void setWriteSelectHeadersInBody(boolean val) {
216+
this.writeSelectHeadersInBody = val;
217+
}
218+
204219
public boolean isIncludeMissingRows() {
205220
return includeMissingRows;
206221
}

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java

Lines changed: 40 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -227,24 +227,7 @@ public void parse(XHTMLContentHandler xhtml)
227227
}
228228
}
229229

230-
231-
xhtml.element("h1", subject);
232-
233-
// Output the from and to details in text, as you
234-
// often want them in text form for searching
235-
xhtml.startElement("dl");
236-
if (from != null) {
237-
header(xhtml, "From", from);
238-
}
239-
header(xhtml, "To", msg.getDisplayTo());
240-
header(xhtml, "Cc", msg.getDisplayCC());
241-
header(xhtml, "Bcc", msg.getDisplayBCC());
242-
try {
243-
header(xhtml, "Recipients", msg.getRecipientEmailAddress());
244-
} catch (ChunkNotFoundException e) {
245-
//swallow
246-
}
247-
xhtml.endElement("dl");
230+
writeSelectHeadersInBody(subject, from, msg, xhtml);
248231

249232
// Get the message body. Preference order is: html, rtf, text
250233
Chunk htmlChunk = null;
@@ -265,17 +248,13 @@ public void parse(XHTMLContentHandler xhtml)
265248

266249
// Process the attachments
267250
for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
268-
xhtml.startElement("div", "class", "attachment-entry");
269251

270252
String filename = null;
271253
if (attachment.getAttachLongFileName() != null) {
272254
filename = attachment.getAttachLongFileName().getValue();
273255
} else if (attachment.getAttachFileName() != null) {
274256
filename = attachment.getAttachFileName().getValue();
275257
}
276-
if (filename != null && filename.length() > 0) {
277-
xhtml.element("h1", filename);
278-
}
279258

280259
if (attachment.getAttachData() != null) {
281260
handleEmbeddedResource(
@@ -286,8 +265,6 @@ public void parse(XHTMLContentHandler xhtml)
286265
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), filename,
287266
xhtml, true);
288267
}
289-
290-
xhtml.endElement("div");
291268
}
292269
} catch (ChunkNotFoundException e) {
293270
throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk",
@@ -302,6 +279,31 @@ public void parse(XHTMLContentHandler xhtml)
302279
}
303280
}
304281

282+
private void writeSelectHeadersInBody(String subject, String from, MAPIMessage msg, XHTMLContentHandler xhtml)
283+
throws SAXException, ChunkNotFoundException {
284+
if (! officeParserConfig.isWriteSelectHeadersInBody()) {
285+
return;
286+
}
287+
xhtml.element("h1", subject);
288+
289+
// Output the from and to details in text, as you
290+
// often want them in text form for searching
291+
xhtml.startElement("dl");
292+
if (from != null) {
293+
header(xhtml, "From", from);
294+
}
295+
header(xhtml, "To", msg.getDisplayTo());
296+
header(xhtml, "Cc", msg.getDisplayCC());
297+
header(xhtml, "Bcc", msg.getDisplayBCC());
298+
try {
299+
header(xhtml, "Recipients", msg.getRecipientEmailAddress());
300+
} catch (ChunkNotFoundException e) {
301+
//swallow -- if the recipient address chunk is missing, there is simply no "Recipients" header to write
302+
}
303+
xhtml.endElement("dl");
304+
305+
}
306+
305307
private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk,
306308
XHTMLContentHandler xhtml)
307309
throws SAXException, IOException, TikaException {
@@ -310,9 +312,18 @@ private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk,
310312
extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml);
311313
return;
312314
}
313-
315+
if (officeParserConfig.isWriteSelectHeadersInBody()) {
316+
xhtml.startElement("div", "class", "message-body");
317+
_handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
318+
xhtml.endElement("div");
319+
} else {
320+
_handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
321+
}
322+
}
323+
private void _handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk,
324+
XHTMLContentHandler xhtml)
325+
throws SAXException, IOException, TikaException {
314326
boolean doneBody = false;
315-
xhtml.startElement("div", "class", "message-body");
316327
if (htmlChunk != null) {
317328
byte[] data = null;
318329
if (htmlChunk instanceof ByteChunk) {
@@ -341,21 +352,19 @@ private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk,
341352
MAPIRtfAttribute rtf =
342353
new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(),
343354
chunk.getValue());
344-
Parser rtfParser = EmbeddedDocumentUtil
355+
RTFParser rtfParser = (RTFParser) EmbeddedDocumentUtil
345356
.tryToFindExistingLeafParser(RTFParser.class, parseContext);
346357
if (rtfParser == null) {
347358
rtfParser = new RTFParser();
348359
}
349-
rtfParser.parse(UnsynchronizedByteArrayInputStream.builder().setByteArray(rtf.getData()).get(),
350-
new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(),
351-
parseContext);
360+
rtfParser.parseInline(UnsynchronizedByteArrayInputStream.builder().setByteArray(rtf.getData()).get(),
361+
xhtml, new Metadata(), parseContext);
352362
doneBody = true;
353363
}
354364
}
355365
if (textChunk != null && (extractAllAlternatives || !doneBody)) {
356366
xhtml.element("p", ((StringChunk) textChunk).getValue());
357367
}
358-
xhtml.endElement("div");
359368

360369
}
361370

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,11 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
8787
private void parseMailAndAttachments(PSTMessage pstMsg, XHTMLContentHandler handler, Metadata metadata, ParseContext context,
8888
EmbeddedDocumentExtractor embeddedExtractor)
8989
throws SAXException, IOException, TikaException {
90+
extractMetadata(pstMsg, metadata);
9091
AttributesImpl attributes = new AttributesImpl();
9192
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
9293
attributes.addAttribute("", "id", "id", "CDATA", pstMsg.getInternetMessageId());
9394
handler.startElement("div", attributes);
94-
handler.element("h1", pstMsg.getSubject());
9595

9696
parseMailItem(pstMsg, handler, metadata, context);
9797
parseMailAttachments(pstMsg, handler, metadata, context, embeddedExtractor);
@@ -100,7 +100,7 @@ private void parseMailAndAttachments(PSTMessage pstMsg, XHTMLContentHandler hand
100100

101101
private void parseMailItem(PSTMessage pstMail, XHTMLContentHandler xhtml,
102102
Metadata metadata, ParseContext context) throws SAXException, IOException, TikaException {
103-
extractMetadata(pstMail, metadata);
103+
104104
//try the html first. It preserves logical paragraph markers
105105
String htmlChunk = pstMail.getBodyHTML();
106106
if (! StringUtils.isBlank(htmlChunk)) {

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -64,19 +64,38 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
6464
ParseContext context) throws IOException, SAXException, TikaException {
6565
metadata.set(Metadata.CONTENT_TYPE, "application/rtf");
6666
TaggedInputStream tagged = new TaggedInputStream(stream);
67+
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
68+
xhtml.startDocument();
6769
try {
68-
XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
69-
RTFEmbObjHandler embObjHandler =
70-
new RTFEmbObjHandler(xhtmlHandler, metadata, context, getMemoryLimitInKb());
71-
final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
72-
ert.setIgnoreListMarkup(ignoreListMarkup);
73-
ert.extract(stream);
70+
parseInline(stream, xhtml, metadata, context);
7471
} catch (IOException e) {
7572
tagged.throwIfCauseOf(e);
7673
throw new TikaException("Error parsing an RTF document", e);
74+
} finally {
75+
xhtml.endDocument();
7776
}
7877
}
7978

79+
/**
80+
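* Parses the RTF stream and writes the events directly to the given handler: unlike parse(), this does not wrap the handler in a new XHTMLContentHandler and does not call startDocument()/endDocument(). Intended for inline parsing (in at least the OutlookExtractor).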
81+
*
82+
* @param is the RTF input stream to extract from
83+
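* @param handler the handler that receives the extracted content directly; the caller is responsible for starting and ending the document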
84+
* @param metadata
85+
* @param context
86+
* @throws TikaException
87+
* @throws IOException
88+
* @throws SAXException
89+
*/
90+
public void parseInline(InputStream is, ContentHandler handler, Metadata metadata, ParseContext context)
91+
throws TikaException, IOException, SAXException {
92+
RTFEmbObjHandler embObjHandler =
93+
new RTFEmbObjHandler(handler, metadata, context, getMemoryLimitInKb());
94+
final TextExtractor ert = new TextExtractor(handler, metadata, embObjHandler);
95+
ert.setIgnoreListMarkup(ignoreListMarkup);
96+
ert.extract(is);
97+
}
98+
8099
public int getMemoryLimitInKb() {
81100
//there's a race condition here, but it shouldn't matter.
82101
if (USE_STATIC) {

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
package org.apache.tika.parser.microsoft.rtf;
1919

20+
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
21+
2022
import java.io.IOException;
2123
import java.io.InputStream;
2224
import java.io.PushbackInputStream;
@@ -36,7 +38,9 @@
3638
import java.util.TimeZone;
3739

3840
import org.apache.commons.io.IOUtils;
41+
import org.xml.sax.ContentHandler;
3942
import org.xml.sax.SAXException;
43+
import org.xml.sax.helpers.AttributesImpl;
4044

4145
import org.apache.tika.exception.TikaException;
4246
import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -47,7 +51,6 @@
4751
import org.apache.tika.metadata.OfficeOpenXMLExtended;
4852
import org.apache.tika.metadata.Property;
4953
import org.apache.tika.metadata.TikaCoreProperties;
50-
import org.apache.tika.sax.XHTMLContentHandler;
5154
import org.apache.tika.utils.CharsetUtils;
5255

5356
/* Tokenizes and performs a "shallow" parse of the RTF
@@ -256,7 +259,7 @@ final class TextExtractor {
256259
// close the group, we restore it
257260
private final LinkedList<GroupState> groupStates = new LinkedList<>();
258261
private final StringBuilder pendingBuffer = new StringBuilder();
259-
private final XHTMLContentHandler out;
262+
private final ContentHandler out;
260263
private final Metadata metadata;
261264
private final RTFEmbObjHandler embObjHandler;
262265
// How many next ansi chars we should skip; this
@@ -330,7 +333,7 @@ final class TextExtractor {
330333
//to defend against DoS with memory consumption
331334
private int maxStackSize = 1000;
332335

333-
public TextExtractor(XHTMLContentHandler out, Metadata metadata,
336+
public TextExtractor(ContentHandler out, Metadata metadata,
334337
RTFEmbObjHandler embObjHandler) {
335338
this.metadata = metadata;
336339
this.out = out;
@@ -464,7 +467,6 @@ public void extract(InputStream in) throws IOException, SAXException, TikaExcept
464467
}
465468

466469
private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException {
467-
out.startDocument();
468470

469471
while (true) {
470472
final int b = in.read();
@@ -503,7 +505,6 @@ private void extract(PushbackInputStream in) throws IOException, SAXException, T
503505
while (paragraphStack.size() > 0) {
504506
end(paragraphStack.pop());
505507
}
506-
out.endDocument();
507508
}
508509

509510
private void parseControlToken(PushbackInputStream in)
@@ -1084,11 +1085,11 @@ private boolean isUnorderedList(int listID) {
10841085
}
10851086

10861087
private void end(String tag) throws IOException, SAXException, TikaException {
1087-
out.endElement(tag);
1088+
out.endElement(XHTML, tag, tag);
10881089
}
10891090

10901091
private void start(String tag) throws IOException, SAXException, TikaException {
1091-
out.startElement(tag);
1092+
out.startElement(XHTML, tag, tag, new AttributesImpl());
10921093
}
10931094

10941095
// Handle non-parameter control word:
@@ -1357,7 +1358,9 @@ private void processControlWord() throws IOException, SAXException, TikaExceptio
13571358
} else if (equals("fldrslt") && fieldState == 2) {
13581359
assert pendingURL != null;
13591360
lazyStartParagraph();
1360-
out.startElement("a", "href", pendingURL);
1361+
AttributesImpl attrs = new AttributesImpl();
1362+
attrs.addAttribute(XHTML, "href", "href", "CDATA", pendingURL);
1363+
out.startElement("", "a", "a", attrs);
13611364
pendingURL = null;
13621365
fieldState = 3;
13631366
groupState.ignore = false;

0 commit comments

Comments
 (0)