Skip to content

Commit 5a3a7d2

Browse files
authored
TIKA-4352 -- add an exclusion list in the StandardWriteFilter (#2046)
1 parent 3a8990d commit 5a3a7d2

File tree

4 files changed

+88
-13
lines changed

4 files changed

+88
-13
lines changed

tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
113113

114114

115115
private final Set<String> includeFields;
116+
private final Set<String> excludeFields;
116117

117118
private Map<String, Integer> fieldSizes = new HashMap<>();
118119

@@ -125,19 +126,22 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
125126
* @param maxEstimatedSize
126127
* @param includeFields if null or empty, all fields are included; otherwise, which fields
127128
* to add to the metadata object.
129+
* @param excludeFields these fields will not be included (unless they're in {@link StandardWriteFilter#ALWAYS_SET_FIELDS})
128130
* @param includeEmpty if <code>true</code>, this will set or add an empty value to the
129131
* metadata object.
130132
*/
131133
protected StandardWriteFilter(int maxKeySize, int maxFieldSize, int maxEstimatedSize,
132134
int maxValuesPerField,
133135
Set<String> includeFields,
136+
Set<String> excludeFields,
134137
boolean includeEmpty) {
135138

136139
this.maxKeySize = maxKeySize;
137140
this.maxFieldSize = maxFieldSize;
138141
this.maxTotalEstimatedSize = maxEstimatedSize;
139142
this.maxValuesPerField = maxValuesPerField;
140143
this.includeFields = includeFields;
144+
this.excludeFields = excludeFields;
141145
this.includeEmpty = includeEmpty;
142146
}
143147

@@ -176,6 +180,7 @@ public void set(String field, String value, Map<String, String[]> data) {
176180
setAlwaysInclude(field, value, data);
177181
return;
178182
}
183+
179184
StringSizePair filterKey = filterKey(field, value, data);
180185
setFilterKey(filterKey, value, data);
181186
}
@@ -433,11 +438,10 @@ private boolean includeField(String name) {
433438
if (ALWAYS_SET_FIELDS.contains(name)) {
434439
return true;
435440
}
436-
if (includeFields == null ||
437-
includeFields.contains(name)) {
438-
return true;
441+
// excludeFields may be null when the protected constructor is called directly;
// treat null as "exclude nothing" rather than throwing a NullPointerException.
if (excludeFields != null && excludeFields.contains(name)) {
442+
return false;
439443
}
440-
return false;
444+
// Per the constructor contract, a null or empty includeFields means every
// field is included; otherwise only the listed fields are.
return includeFields == null || includeFields.isEmpty() || includeFields.contains(name);
441445
}
442446

443447
private static int estimateSize(String s) {

tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
3333
public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024;
3434
public static int DEFAULT_MAX_VALUES_PER_FIELD = 10;
3535

36-
private Set<String> includeFields = null;
36+
// Empty by default. Collections.emptySet() is the type-safe form of the raw EMPTY_SET constant.
private Set<String> includeFields = Collections.emptySet();
37+
// Empty by default (nothing excluded). Collections.emptySet() avoids the raw-type EMPTY_SET constant.
private Set<String> excludeFields = Collections.emptySet();
3738
private int maxKeySize = DEFAULT_MAX_KEY_SIZE;
3839
private int maxFieldSize = DEFAULT_MAX_FIELD_SIZE;
3940
private int maxTotalEstimatedBytes = DEFAULT_TOTAL_ESTIMATED_BYTES;
@@ -55,7 +56,8 @@ public MetadataWriteFilter newInstance() {
5556
}
5657

5758
return new StandardWriteFilter(maxKeySize, maxFieldSize,
58-
maxTotalEstimatedBytes, maxValuesPerField, includeFields, includeEmpty);
59+
maxTotalEstimatedBytes, maxValuesPerField, includeFields,
60+
excludeFields, includeEmpty);
5961
}
6062

6163
public void setIncludeFields(List<String> includeFields) {
@@ -64,6 +66,12 @@ public void setIncludeFields(List<String> includeFields) {
6466
this.includeFields = Collections.unmodifiableSet(keys);
6567
}
6668

69+
/**
 * Sets the metadata field names that this factory passes to the filters it creates
 * as their exclusion list.
 * <p>
 * The names are copied into a new set that is then wrapped as unmodifiable, so later
 * changes to the supplied list do not affect this factory.
 *
 * @param excludeFields field names to exclude; must not be null, because its size is
 *                      read without a null check
 */
public void setExcludeFields(List<String> excludeFields) {
70+
Set<String> keys = ConcurrentHashMap.newKeySet(excludeFields.size());
71+
keys.addAll(excludeFields);
72+
this.excludeFields = Collections.unmodifiableSet(keys);
73+
}
74+
6775
public void setMaxTotalEstimatedBytes(int maxTotalEstimatedBytes) {
6876
this.maxTotalEstimatedBytes = maxTotalEstimatedBytes;
6977
}

tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
import java.io.ByteArrayInputStream;
2323
import java.nio.charset.StandardCharsets;
24+
import java.util.Collections;
2425
import java.util.List;
2526
import java.util.Set;
2627

@@ -116,7 +117,7 @@ public void testMetadataFactoryFieldsConfig() throws Exception {
116117
@Test
117118
public void testKeySizeFilter() throws Exception {
118119
Metadata metadata = filter(10, 1000, 10000, 100,
119-
null, true);
120+
Collections.EMPTY_SET, Collections.EMPTY_SET, true);
120121
//test that must add keys are not truncated
121122
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser1");
122123
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser2");
@@ -138,13 +139,13 @@ public void testAfterMaxHit() throws Exception {
138139
String k = "dc:creator";//20 bytes
139140
//key is > maxTotalBytes, so the value isn't even added
140141
Metadata metadata = filter(100, 10000, 10,
141-
100, null, false);
142+
100, Collections.EMPTY_SET, Collections.EMPTY_SET, false);
142143
metadata.set(k, "ab");
143144
assertEquals(1, metadata.names().length);
144145
assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
145146

146147
metadata = filter(100, 10000, 50, 100,
147-
null, false);
148+
Collections.EMPTY_SET, Collections.EMPTY_SET, false);
148149
for (int i = 0; i < 10; i++) {
149150
metadata.set(k, "abcde");
150151
}
@@ -178,7 +179,8 @@ public void testAfterMaxHit() throws Exception {
178179
@Test
179180
public void testMinSizeForAlwaysInclude() throws Exception {
180181
//test that mimes don't get truncated
181-
Metadata metadata = filter(100, 10, 10000, 100, null, true);
182+
Metadata metadata = filter(100, 10, 10000, 100,
183+
Collections.EMPTY_SET, Collections.EMPTY_SET, true);
182184

183185
String mime = getLongestMime().toString();
184186
metadata.set(Metadata.CONTENT_TYPE, mime);
@@ -192,21 +194,47 @@ public void testMinSizeForAlwaysInclude() throws Exception {
192194

193195
@Test
194196
public void testMaxFieldValues() throws Exception {
195-
Metadata metadata = filter(100, 10000, 10000, 3, null, true);
197+
Metadata metadata = filter(100, 10000, 10000, 3,
198+
Collections.EMPTY_SET, Collections.EMPTY_SET, true);
196199
for (int i = 0; i < 10; i++) {
197200
metadata.add(TikaCoreProperties.SUBJECT, "ab");
198201
}
199202
assertEquals(3, metadata.getValues(TikaCoreProperties.SUBJECT).length);
200203
}
201204

205+
@Test
206+
public void testExclude() throws Exception {
207+
TikaConfig tikaConfig =
208+
new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-exclude.xml"));
209+
AutoDetectParser parser = new AutoDetectParser(tikaConfig);
210+
String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
211+
"<mock>";
212+
mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
213+
mock += "<metadata action=\"add\" name=\"subject\">01234567890123456789</metadata>";
214+
mock += "<metadata action=\"add\" name=\"subjectB\">01234567890123456789</metadata>";
215+
mock += "<write element=\"p\" times=\"1\"> hello </write>\n";
216+
mock += "</mock>";
217+
Metadata metadata = new Metadata();
218+
List<Metadata> metadataList =
219+
getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
220+
parser, metadata, new ParseContext(), true);
221+
assertEquals(1, metadataList.size());
222+
metadata = metadataList.get(0);
223+
assertEquals(9, metadata.names().length);
224+
assertEquals("01234567890123456789", metadata.get("dc:creator"));
225+
assertEquals("01234567890123456789", metadata.get("subjectB"));
226+
assertNull(metadata.get("subject"));
227+
}
228+
229+
202230
private void assertTruncated(Metadata metadata) {
203231
assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
204232
}
205233
private Metadata filter(int maxKeySize, int maxFieldSize, int maxTotalBytes,
206234
int maxValuesPerField,
207-
Set<String> includeFields, boolean includeEmpty) {
235+
Set<String> includeFields, Set<String> excludeFields, boolean includeEmpty) {
208236
MetadataWriteFilter filter = new StandardWriteFilter(maxKeySize, maxFieldSize,
209-
maxTotalBytes, maxValuesPerField, includeFields, includeEmpty);
237+
maxTotalBytes, maxValuesPerField, includeFields, excludeFields, includeEmpty);
210238
Metadata metadata = new Metadata();
211239
metadata.setMetadataWriteFilter(filter);
212240
return metadata;
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!--
3+
Licensed to the Apache Software Foundation (ASF) under one or more
4+
contributor license agreements. See the NOTICE file distributed with
5+
this work for additional information regarding copyright ownership.
6+
The ASF licenses this file to You under the Apache License, Version 2.0
7+
(the "License"); you may not use this file except in compliance with
8+
the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
-->
18+
<properties>
19+
<parsers>
20+
<parser class="org.apache.tika.parser.DefaultParser"/>
21+
</parsers>
22+
<autoDetectParserConfig>
23+
<params>
24+
<spoolToDisk>12345</spoolToDisk>
25+
<outputThreshold>6789</outputThreshold>
26+
</params>
27+
<metadataWriteFilterFactory class="org.apache.tika.metadata.writefilter.StandardWriteFilterFactory">
28+
<params>
29+
<excludeFields>
30+
<field>subject</field>
31+
</excludeFields>
32+
</params>
33+
</metadataWriteFilterFactory>
34+
</autoDetectParserConfig>
35+
</properties>

0 commit comments

Comments
 (0)