Skip to content

Commit bd878d3

Browse files
TIKA-4350 HTML snippet containing <iframe> as root element erroneously recognized as application/xml (#2045)
1 parent 90d854f commit bd878d3

File tree

3 files changed

+6
-0
lines changed

3 files changed

+6
-0
lines changed

tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7680,6 +7680,8 @@
7680 7680
<root-XML localName="SCRIPT"/>
7681 7681
<root-XML localName="frameset"/>
7682 7682
<root-XML localName="FRAMESET"/>
7683+
<root-XML localName="iframe"/>
7684+
<root-XML localName="IFRAME"/>
7683 7685
<magic priority="60">
7684 7686
<match value="(?i)&lt;(html|head|body|title|div)[ >]" type="regex" offset="0"/>
7685 7687
<match value="(?i)&lt;h[123][ >]" type="regex" offset="0"/>

tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ public void testDetection() throws Exception {
74 74
testFile("text/html", "testlargerbuffer.html");
75 75
// test fragment of HTML with <div> (TIKA-1102)
76 76
testFile("text/html", "htmlfragment");
77+
// test fragment of HTML with <iframe> and potentially misleading file suffix
78+
testFile("text/html", "test-html-snippet-iframe.jsp");
77 79
// test binary CGM detection (TIKA-1170)
78 80
testFile("image/cgm", "plotutils-bin-cgm-v3.cgm");
79 81
// test HTML detection of malformed file, previously identified as image/cgm (TIKA-1170)

test-html-snippet-iframe.jsp (new test resource; full path not shown)

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
<!-- this is a comment: https://www.example.org/path/file.pdf -->
2+
<iframe src='/path/file.pdf' width='100%' height='100%' target='_blank'>

0 commit comments

Comments (0)