Skip to content

Commit d09383a

Browse files
authored
TIKA-4577 -- improve metadata filter serialization via ParseContext. (#2461)
1 parent e113b93 commit d09383a

File tree

4 files changed

+234
-3
lines changed

4 files changed

+234
-3
lines changed

tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/pipes/core/PipesClientTest.java

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.junit.jupiter.api.Test;
2929
import org.junit.jupiter.api.io.TempDir;
3030

31+
import org.apache.tika.config.ConfigContainer;
3132
import org.apache.tika.config.TikaTaskTimeout;
3233
import org.apache.tika.config.loader.TikaJsonConfig;
3334
import org.apache.tika.metadata.Metadata;
@@ -41,6 +42,7 @@
4142
import org.apache.tika.pipes.api.PipesResult;
4243
import org.apache.tika.pipes.api.emitter.EmitKey;
4344
import org.apache.tika.pipes.api.fetcher.FetchKey;
45+
import org.apache.tika.serialization.ParseContextUtils;
4446

4547
public class PipesClientTest {
4648
String fetcherName = "fsf";
@@ -104,6 +106,74 @@ public void testMetadataListFilter(@TempDir Path tmp) throws Exception {
104106
assertEquals(4, Integer.parseInt(metadata.get("X-TIKA:attachment_count")));
105107
}
106108

109+
@Test
110+
public void testMetadataFilterFromJsonConfig(@TempDir Path tmp) throws Exception {
111+
// Test that metadata filters specified as JSON array in ConfigContainer
112+
// are properly resolved and applied during pipe processing.
113+
// This tests the full serialization/deserialization flow.
114+
ParseContext parseContext = new ParseContext();
115+
ConfigContainer configContainer = new ConfigContainer();
116+
configContainer.set("metadata-filters", """
117+
[
118+
"mock-upper-case-filter"
119+
]
120+
""");
121+
parseContext.set(ConfigContainer.class, configContainer);
122+
123+
// Resolve the config to actual MetadataFilter instances
124+
ParseContextUtils.resolveAll(parseContext, PipesClientTest.class.getClassLoader());
125+
126+
// Verify the filter was resolved
127+
MetadataFilter resolvedFilter = parseContext.get(MetadataFilter.class);
128+
Assertions.assertNotNull(resolvedFilter, "MetadataFilter should be resolved from ConfigContainer");
129+
assertEquals(CompositeMetadataFilter.class, resolvedFilter.getClass());
130+
131+
PipesClient pipesClient = init(tmp, testDoc);
132+
PipesResult pipesResult = pipesClient.process(
133+
new FetchEmitTuple(testDoc, new FetchKey(fetcherName, testDoc),
134+
new EmitKey(), new Metadata(), parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP));
135+
136+
Assertions.assertNotNull(pipesResult.emitData().getMetadataList());
137+
assertEquals(1, pipesResult.emitData().getMetadataList().size());
138+
Metadata metadata = pipesResult.emitData().getMetadataList().get(0);
139+
// MockUpperCaseFilter uppercases all metadata values
140+
assertEquals("TESTOVERLAPPINGTEXT.PDF", metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
141+
}
142+
143+
@Test
144+
public void testMultipleMetadataFiltersFromJsonConfig(@TempDir Path tmp) throws Exception {
145+
// Test multiple filters specified as JSON array
146+
ParseContext parseContext = new ParseContext();
147+
ConfigContainer configContainer = new ConfigContainer();
148+
configContainer.set("metadata-filters", """
149+
[
150+
"attachment-counting-list-filter",
151+
"mock-upper-case-filter"
152+
]
153+
""");
154+
parseContext.set(ConfigContainer.class, configContainer);
155+
156+
// Resolve the config to actual MetadataFilter instances
157+
ParseContextUtils.resolveAll(parseContext, PipesClientTest.class.getClassLoader());
158+
159+
String testFile = "mock-embedded.xml";
160+
PipesClient pipesClient = init(tmp, testFile);
161+
162+
PipesResult pipesResult = pipesClient.process(
163+
new FetchEmitTuple(testFile, new FetchKey(fetcherName, testFile),
164+
new EmitKey(), new Metadata(), parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP));
165+
166+
Assertions.assertNotNull(pipesResult.emitData().getMetadataList());
167+
assertEquals(5, pipesResult.emitData().getMetadataList().size());
168+
Metadata metadata = pipesResult.emitData().getMetadataList().get(0);
169+
170+
// AttachmentCountingListFilter should have added the count
171+
assertEquals(4, Integer.parseInt(metadata.get("X-TIKA:attachment_count")));
172+
173+
// MockUpperCaseFilter should have uppercased the resource name
174+
assertEquals("MOCK-EMBEDDED.XML", metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
175+
}
176+
107177
@Test
108178
public void testTimeout(@TempDir Path tmp) throws Exception {
109179
//TODO -- figure out how to test pipes server timeout alone

tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,9 @@ public static ParseContext readParseContext(JsonNode jsonNode) throws IOExceptio
200200
parseContext.set(ConfigContainer.class, configContainer);
201201
}
202202

203+
// Resolve array configs (e.g., "metadata-filters") and non-SelfConfiguring components
204+
ParseContextUtils.resolveAll(parseContext, ParseContextDeserializer.class.getClassLoader());
205+
203206
return parseContext;
204207
}
205208
}

tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java

Lines changed: 135 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,21 @@
1919
import java.io.IOException;
2020
import java.util.ArrayList;
2121
import java.util.List;
22+
import java.util.Map;
2223

24+
import com.fasterxml.jackson.databind.JsonNode;
2325
import com.fasterxml.jackson.databind.ObjectMapper;
2426
import org.slf4j.Logger;
2527
import org.slf4j.LoggerFactory;
2628

2729
import org.apache.tika.config.ConfigContainer;
2830
import org.apache.tika.config.JsonConfig;
2931
import org.apache.tika.config.loader.ComponentInfo;
32+
import org.apache.tika.config.loader.ComponentInstantiator;
3033
import org.apache.tika.config.loader.ComponentRegistry;
3134
import org.apache.tika.config.loader.TikaObjectMapperFactory;
3235
import org.apache.tika.exception.TikaConfigException;
36+
import org.apache.tika.metadata.filter.CompositeMetadataFilter;
3337
import org.apache.tika.metadata.filter.MetadataFilter;
3438
import org.apache.tika.parser.ParseContext;
3539

@@ -68,6 +72,20 @@ public class ParseContextUtils {
6872
// Add other known interfaces as needed
6973
);
7074

75+
/**
76+
* Mapping of array config keys to their context keys and composite wrapper factories.
77+
* Key: config name (e.g., "metadata-filters")
78+
* Value: (contextKey, componentInterface)
79+
*/
80+
private static final Map<String, ArrayConfigInfo> ARRAY_CONFIGS = Map.of(
81+
"metadata-filters", new ArrayConfigInfo(MetadataFilter.class, MetadataFilter.class)
82+
);
83+
84+
/**
85+
* Holds information about how to process array configs.
86+
*/
87+
private record ArrayConfigInfo(Class<?> contextKey, Class<?> componentInterface) {}
88+
7189
/**
7290
* Resolves all friendly-named components from ConfigContainer and adds them to ParseContext.
7391
* <p>
@@ -102,12 +120,27 @@ public static void resolveAll(ParseContext context, ClassLoader classLoader) {
102120

103121
List<String> resolvedKeys = new ArrayList<>();
104122

123+
// First, process known array configs (e.g., "metadata-filters")
124+
// These don't depend on the other-configs registry
125+
for (String friendlyName : new ArrayList<>(container.getKeys())) {
126+
if (ARRAY_CONFIGS.containsKey(friendlyName)) {
127+
JsonConfig jsonConfig = container.get(friendlyName, null);
128+
if (jsonConfig != null && resolveArrayConfig(friendlyName, jsonConfig, context, classLoader)) {
129+
resolvedKeys.add(friendlyName);
130+
}
131+
}
132+
}
133+
134+
// Then, try to load the "other-configs" registry for single component configs
105135
try {
106-
// Load the "other-configs" registry which includes parse-context components
107136
ComponentRegistry registry = new ComponentRegistry("other-configs", classLoader);
108137

109-
// Iterate through all configs in the container
110138
for (String friendlyName : container.getKeys()) {
139+
// Skip already resolved array configs
140+
if (resolvedKeys.contains(friendlyName)) {
141+
continue;
142+
}
143+
111144
JsonConfig jsonConfig = container.get(friendlyName, null);
112145
if (jsonConfig == null) {
113146
continue;
@@ -143,7 +176,8 @@ public static void resolveAll(ParseContext context, ClassLoader classLoader) {
143176
}
144177
}
145178
} catch (TikaConfigException e) {
146-
LOG.warn("Failed to load other-configs registry for parse-context resolution", e);
179+
// other-configs registry not available - that's okay, array configs were still processed
180+
LOG.debug("other-configs registry not available: {}", e.getMessage());
147181
}
148182

149183
// Remove resolved configs from the container
@@ -191,4 +225,102 @@ private static Class<?> determineContextKey(ComponentInfo info, String friendlyN
191225
// Use the single matched interface, or fall back to the component class
192226
return matches.isEmpty() ? info.componentClass() : matches.get(0);
193227
}
228+
229+
/**
230+
* Resolves an array config entry (e.g., "metadata-filters") to a composite component.
231+
* <p>
232+
* The array can contain either strings (friendly names) or objects:
233+
* <pre>
234+
* ["filter-name-1", "filter-name-2"] // String shorthand
235+
* [{"filter-name-1": {}}, {"filter-name-2": {}}] // Object format
236+
* </pre>
237+
*
238+
* @param configName the config name (e.g., "metadata-filters")
239+
* @param jsonConfig the JSON configuration (should be an array)
240+
* @param context the ParseContext to add the resolved component to
241+
* @param classLoader the ClassLoader to use for loading component classes
242+
* @return true if resolution was successful
243+
*/
244+
@SuppressWarnings("unchecked")
245+
private static boolean resolveArrayConfig(String configName, JsonConfig jsonConfig,
246+
ParseContext context, ClassLoader classLoader) {
247+
ArrayConfigInfo configInfo = ARRAY_CONFIGS.get(configName);
248+
if (configInfo == null) {
249+
return false;
250+
}
251+
252+
try {
253+
JsonNode arrayNode = MAPPER.readTree(jsonConfig.json());
254+
if (!arrayNode.isArray()) {
255+
LOG.warn("Expected array for '{}', got: {}", configName, arrayNode.getNodeType());
256+
return false;
257+
}
258+
259+
List<Object> components = new ArrayList<>();
260+
261+
for (JsonNode item : arrayNode) {
262+
String typeName;
263+
JsonNode configNode;
264+
265+
if (item.isTextual()) {
266+
// String shorthand: "component-name"
267+
typeName = item.asText();
268+
configNode = MAPPER.createObjectNode();
269+
} else if (item.isObject() && item.size() == 1) {
270+
// Object format: {"component-name": {...}}
271+
typeName = item.fieldNames().next();
272+
configNode = item.get(typeName);
273+
} else {
274+
LOG.warn("Unexpected item format in '{}': {}", configName, item);
275+
continue;
276+
}
277+
278+
try {
279+
Object component = ComponentInstantiator.instantiate(
280+
typeName, configNode, MAPPER, classLoader);
281+
components.add(component);
282+
LOG.debug("Instantiated '{}' for '{}'", typeName, configName);
283+
} catch (TikaConfigException e) {
284+
LOG.warn("Failed to instantiate '{}' for '{}': {}", typeName, configName, e.getMessage());
285+
}
286+
}
287+
288+
// Create the composite and add to ParseContext
289+
if (!components.isEmpty()) {
290+
Object composite = createComposite(configName, components, configInfo);
291+
if (composite != null) {
292+
context.set((Class) configInfo.contextKey(), composite);
293+
LOG.debug("Resolved '{}' -> {} with {} components",
294+
configName, composite.getClass().getSimpleName(), components.size());
295+
return true;
296+
}
297+
}
298+
} catch (IOException e) {
299+
LOG.warn("Failed to parse array config '{}': {}", configName, e.getMessage());
300+
}
301+
302+
return false;
303+
}
304+
305+
/**
306+
* Creates a composite component from a list of individual components.
307+
*
308+
* @param configName the config name (for error messages)
309+
* @param components the list of components
310+
* @param configInfo the array config info
311+
* @return the composite component, or null if creation failed
312+
*/
313+
@SuppressWarnings("unchecked")
314+
private static Object createComposite(String configName, List<Object> components,
315+
ArrayConfigInfo configInfo) {
316+
// Handle known composite types
317+
if (configInfo.componentInterface() == MetadataFilter.class) {
318+
List<MetadataFilter> filters = (List<MetadataFilter>) (List<?>) components;
319+
return new CompositeMetadataFilter(filters);
320+
}
321+
322+
// Add more composite types as needed
323+
LOG.warn("No composite factory for '{}'", configName);
324+
return null;
325+
}
194326
}

tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@
3535
import org.apache.tika.config.loader.TikaObjectMapperFactory;
3636
import org.apache.tika.extractor.DocumentSelector;
3737
import org.apache.tika.extractor.SkipEmbeddedDocumentSelector;
38+
import org.apache.tika.metadata.filter.AttachmentCountingListFilter;
39+
import org.apache.tika.metadata.filter.CompositeMetadataFilter;
40+
import org.apache.tika.metadata.filter.MetadataFilter;
3841
import org.apache.tika.parser.ParseContext;
3942

4043
/**
@@ -296,6 +299,29 @@ public void testProgrammaticObjectsWithoutFriendlyName() throws Exception {
296299
assertEquals(0, root.size(), "Objects without friendly names should not be serialized");
297300
}
298301

302+
@Test
303+
public void testMetadataList() throws Exception {
304+
ConfigContainer configContainer = new ConfigContainer();
305+
configContainer.set("metadata-filters", """
306+
[
307+
"attachment-counting-list-filter",
308+
"mock-upper-case-filter"
309+
]
310+
""");
311+
ParseContext parseContext = new ParseContext();
312+
parseContext.set(ConfigContainer.class, configContainer);
313+
314+
ObjectMapper mapper = createMapper();
315+
String json = mapper.writeValueAsString(parseContext);
316+
317+
ParseContext deser = mapper.readValue(json, ParseContext.class);
318+
MetadataFilter resolvedFilter = deser.get(MetadataFilter.class);
319+
assertNotNull(resolvedFilter, "MetadataFilter should be resolved");
320+
assertEquals(CompositeMetadataFilter.class, resolvedFilter.getClass());
321+
CompositeMetadataFilter deserFilter = (CompositeMetadataFilter) resolvedFilter;
322+
assertEquals(AttachmentCountingListFilter.class, deserFilter.getFilters().get(0).getClass());
323+
}
324+
299325
@Test
300326
public void testContextKeyDeserialization() throws Exception {
301327
// Test that components with @TikaComponent(contextKey=...) are stored

0 commit comments

Comments
 (0)