Skip to content

Commit 55d3d78

Browse files
authored
TIKA-4345 -- stop injecting headers into the body of msg files (#2042)
* TIKA-4345 -- stop injecting headers into the body for msg files
1 parent 41302d5 commit 55d3d78

File tree

5 files changed

+18
-86
lines changed

5 files changed

+18
-86
lines changed

CHANGES.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,15 @@
11
Release 4.0.0-BETA1 - ???
22
BREAKING CHANGES
33

4+
* Headers are no longer injected into the body/content of MSG files (TIKA-4345). Please open
5+
a ticket if you need this behavior across email formats.
6+
7+
48
Release 3.1.0 - ??
59

10+
* Allow users to turn off the injection of some headers into the content stream of MSG
11+
files (TIKA-4345).
12+
613
* Add a wrapper for Google's magika detector (TIKA-4344).
714

815
* Add support for MachO via Alexey Pelykh (TIKA-4309).

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -181,17 +181,4 @@ public boolean isIncludeHeadersAndFooters() {
181181
return defaultOfficeParserConfig.isIncludeHeadersAndFooters();
182182
}
183183

184-
/**
185-
* If set to <code>true</code>, this will write the to/from/cc into the body content
186-
*
187-
* @param val
188-
*/
189-
@Field
190-
public void setWriteSelectHeadersInBody(boolean val) {
191-
defaultOfficeParserConfig.setWriteSelectHeadersInBody(val);
192-
}
193-
194-
public boolean isWriteSelectHeadersInBody() {
195-
return defaultOfficeParserConfig.isWriteSelectHeadersInBody();
196-
}
197184
}

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ public class OfficeParserConfig implements Serializable {
3636
private boolean useSAXPptxExtractor = false;
3737

3838
private boolean extractAllAlternativesFromMSG = false;
39-
private boolean writeSelectHeadersInBody = false;
4039
private String dateOverrideFormat = null;
4140
private int maxOverride = 0;//ignore
4241

@@ -202,20 +201,6 @@ public void setExtractAllAlternativesFromMSG(boolean extractAllAlternativesFromM
202201
this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG;
203202
}
204203

205-
public boolean isWriteSelectHeadersInBody() {
206-
return writeSelectHeadersInBody;
207-
}
208-
209-
/**
210-
* If set to <code>true</code>, this will add to/from/cc into the
211-
* body content.
212-
*
213-
* @param val
214-
*/
215-
public void setWriteSelectHeadersInBody(boolean val) {
216-
this.writeSelectHeadersInBody = val;
217-
}
218-
219204
public boolean isIncludeMissingRows() {
220205
return includeMissingRows;
221206
}

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java

Lines changed: 2 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -227,8 +227,6 @@ public void parse(XHTMLContentHandler xhtml)
227227
}
228228
}
229229

230-
writeSelectHeadersInBody(subject, from, msg, xhtml);
231-
232230
// Get the message body. Preference order is: html, rtf, text
233231
Chunk htmlChunk = null;
234232
Chunk rtfChunk = null;
@@ -279,31 +277,6 @@ public void parse(XHTMLContentHandler xhtml)
279277
}
280278
}
281279

282-
private void writeSelectHeadersInBody(String subject, String from, MAPIMessage msg, XHTMLContentHandler xhtml)
283-
throws SAXException, ChunkNotFoundException {
284-
if (! officeParserConfig.isWriteSelectHeadersInBody()) {
285-
return;
286-
}
287-
xhtml.element("h1", subject);
288-
289-
// Output the from and to details in text, as you
290-
// often want them in text form for searching
291-
xhtml.startElement("dl");
292-
if (from != null) {
293-
header(xhtml, "From", from);
294-
}
295-
header(xhtml, "To", msg.getDisplayTo());
296-
header(xhtml, "Cc", msg.getDisplayCC());
297-
header(xhtml, "Bcc", msg.getDisplayBCC());
298-
try {
299-
header(xhtml, "Recipients", msg.getRecipientEmailAddress());
300-
} catch (ChunkNotFoundException e) {
301-
//swallow
302-
}
303-
xhtml.endElement("dl");
304-
305-
}
306-
307280
private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk,
308281
XHTMLContentHandler xhtml)
309282
throws SAXException, IOException, TikaException {
@@ -312,13 +285,8 @@ private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk,
312285
extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml);
313286
return;
314287
}
315-
if (officeParserConfig.isWriteSelectHeadersInBody()) {
316-
xhtml.startElement("div", "class", "message-body");
317-
_handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
318-
xhtml.endElement("div");
319-
} else {
320-
_handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
321-
}
288+
_handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
289+
322290
}
323291
private void _handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk,
324292
XHTMLContentHandler xhtml)

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java

Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,12 @@ public class OutlookParserTest extends TikaTest {
5454
@Test
5555
public void testOutlookParsing() throws Exception {
5656

57-
//test default behavior
58-
List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg", AUTO_DETECT_PARSER,
59-
BasicContentHandlerFactory.HANDLER_TYPE.BODY);
60-
assertNotContained("Microsoft Outlook Express 6", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
6157

62-
63-
//test legacy behavior
6458
ContentHandler handler = new BodyContentHandler();
6559
Metadata metadata = new Metadata();
6660

6761
try (InputStream stream = getResourceAsStream("/test-documents/test-outlook.msg")) {
68-
AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders());
62+
AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext());
6963
}
7064
assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE));
7165
assertEquals("Microsoft Outlook Express 6", metadata.get(TikaCoreProperties.TITLE));
@@ -90,9 +84,9 @@ public void testOutlookParsing() throws Exception {
9084
assertEquals("2007-04-05T16:26:06Z", metadata.get(TikaCoreProperties.CREATED));
9185

9286
String content = handler.toString();
93-
assertContains("Microsoft Outlook Express 6", content);
94-
assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
95-
assertContains("Nouvel utilisateur de Outlook Express", content);
87+
assertNotContained("Microsoft Outlook Express 6", content);
88+
assertNotContained("L'\u00C9quipe Microsoft Outlook Express", content);
89+
assertNotContained("Nouvel utilisateur de Outlook Express", content);
9690
assertContains("Messagerie et groupes de discussion", content);
9791
}
9892

@@ -107,15 +101,14 @@ public void testMultipleCopies() throws Exception {
107101
Metadata metadata = new Metadata();
108102

109103
try (InputStream stream = getResourceAsStream("/test-documents/testMSG.msg")) {
110-
AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders());
104+
AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext());
111105
}
112106

113107
assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE));
114108

115109
String content = handler.toString();
116110
Pattern pattern = Pattern.compile("From");
117111
Matcher matcher = pattern.matcher(content);
118-
assertTrue(matcher.find());
119112
assertFalse(matcher.find());
120113

121114
//test that last header is added
@@ -185,13 +178,13 @@ public void testOutlookHTMLVersion() throws Exception {
185178
handler.setResult(new StreamResult(sw));
186179

187180
try (InputStream stream = getResourceAsStream("/test-documents/testMSG_chinese.msg")) {
188-
AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders());
181+
AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext());
189182
}
190183

191184
// As the HTML version should have been processed, ensure
192185
// we got some of the links
193186
String content = sw.toString();
194-
assertContains("<dd>tests.chang@fengttt.com</dd>", content);
187+
assertNotContained("<dd>tests.chang@fengttt.com</dd>", content);
195188
assertContains("<p>Alfresco MSG format testing", content);
196189
assertContains("<li>1", content);
197190
assertContains("<li>2", content);
@@ -259,13 +252,13 @@ public void testOutlookHTMLfromRTF() throws Exception {
259252
handler.setResult(new StreamResult(sw));
260253

261254
try (InputStream stream = getResourceAsStream("/test-documents/test-outlook2003.msg")) {
262-
AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders());
255+
AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext());
263256
}
264257

265258
// As the HTML version should have been processed, ensure
266259
// we got some of the links
267260
String content = sw.toString().replaceAll("[\\r\\n\\t]+", " ").replaceAll(" +", " ");
268-
assertContains("<dd>New Outlook User</dd>", content);
261+
assertNotContained("<dd>New Outlook User</dd>", content);
269262
assertContains("designed <i>to help you", content);
270263
assertContains(
271264
"<p> <a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>",
@@ -282,14 +275,6 @@ public void testOutlookHTMLfromRTF() throws Exception {
282275
assertEquals(2, content.split("<\\/body>").length);
283276
}
284277

285-
private ParseContext configureInjectHeaders() {
286-
ParseContext parseContext = new ParseContext();
287-
OfficeParserConfig officeParserConfig = new OfficeParserConfig();
288-
officeParserConfig.setWriteSelectHeadersInBody(true);
289-
parseContext.set(OfficeParserConfig.class, officeParserConfig);
290-
return parseContext;
291-
}
292-
293278
@Test
294279
public void testMAPIMessageClasses() throws Exception {
295280

0 commit comments

Comments
 (0)