Skip to content

Commit 9ccf59e

Browse files
authored
TIKA-4585 -- simplify serialization (#2471)
* TIKA-4585 -- simplify ParseContext serialization -WIP
* TIKA-4585 -- further progress - WIP
* TIKA-4585 -- further progress - WIP
* TIKA-4585 -- further progress - WIP
* TIKA-4585 -- further progress
* TIKA-4585 -- fix unit test, add comment
* TIKA-4585 -- rm unused mixins
* TIKA-4585 -- bring back in some design features from main
1 parent 34b60d6 commit 9ccf59e

File tree

110 files changed

+3990
-2417
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

110 files changed

+3990
-2417
lines changed

tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,6 @@
3838
import java.util.List;
3939
import java.util.Set;
4040

41-
import org.jetbrains.annotations.NotNull;
42-
import org.jetbrains.annotations.Nullable;
4341
import org.junit.jupiter.api.AfterEach;
4442
import org.junit.jupiter.api.BeforeEach;
4543
import org.junit.jupiter.api.Disabled;
@@ -237,7 +235,7 @@ public void testJsonMetadataOutput() throws Exception {
237235
public void testJsonMetadataPrettyPrintOutput() throws Exception {
238236
String json = getParamOutContent("--json", "-r", resourcePrefix + "testJsonMultipleInts.html");
239237

240-
assertTrue(json.contains("org.apache.tika.parser.CompositeParser\", \"org.apache.tika.parser.html.JSoupParser"));
238+
assertTrue(json.contains("org.apache.tika.parser.DefaultParser\", \"org.apache.tika.parser.html.JSoupParser"));
241239
//test pretty-print alphabetic sort of keys
242240
int enc = json.indexOf("\"Content-Encoding\"");
243241
int fb = json.indexOf("fb:admins");
@@ -413,22 +411,23 @@ private Set<String> getFileNames(Path extractDir) throws IOException {
413411
final Set<String> names = new HashSet<>();
414412
Files.walkFileTree(extractDir, new FileVisitor<Path>() {
415413
@Override
416-
public @NotNull FileVisitResult preVisitDirectory(Path path, @NotNull BasicFileAttributes basicFileAttributes) throws IOException {
414+
public FileVisitResult preVisitDirectory(Path path, BasicFileAttributes basicFileAttributes) throws IOException {
417415
return FileVisitResult.CONTINUE;
418416
}
419417

420-
public @NotNull FileVisitResult visitFile(Path path, @NotNull BasicFileAttributes basicFileAttributes) throws IOException {
418+
@Override
419+
public FileVisitResult visitFile(Path path, BasicFileAttributes basicFileAttributes) throws IOException {
421420
names.add(extractDir.relativize(path).toString().replace('\\', '/'));
422421
return FileVisitResult.CONTINUE;
423422
}
424423

425424
@Override
426-
public @NotNull FileVisitResult visitFileFailed(Path path, @NotNull IOException e) throws IOException {
425+
public FileVisitResult visitFileFailed(Path path, IOException e) throws IOException {
427426
return FileVisitResult.CONTINUE;
428427
}
429428

430429
@Override
431-
public @NotNull FileVisitResult postVisitDirectory(Path path, @Nullable IOException e) throws IOException {
430+
public FileVisitResult postVisitDirectory(Path path, IOException e) throws IOException {
432431
return FileVisitResult.CONTINUE;
433432
}
434433
});

tika-core/src/main/java/org/apache/tika/config/ConfigContainer.java

Lines changed: 0 additions & 99 deletions
This file was deleted.

tika-core/src/main/java/org/apache/tika/config/ParseContextConfig.java

Lines changed: 16 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,13 @@
2424
import org.apache.tika.parser.ParseContext;
2525

2626
/**
27-
* Facade for accessing runtime configuration from ParseContext's ConfigContainer.
27+
* Facade for accessing runtime configuration from ParseContext's jsonConfigs.
2828
* <p>
2929
* This wrapper provides a safe way for parsers to access runtime configuration
3030
* without directly depending on tika-serialization. It performs these critical checks:
3131
* <ul>
32-
* <li>If ConfigContainer has config for the requested key but ConfigDeserializer
33-
* is not on the classpath, throws IllegalStateException with a clear error message</li>
32+
* <li>If ParseContext has JSON config for the requested key but ConfigDeserializer
33+
* is not on the classpath, throws TikaConfigException with a clear error message</li>
3434
* <li>If ConfigDeserializer is available, delegates to it for deserialization</li>
3535
* <li>If no config is present, returns the default config</li>
3636
* </ul>
@@ -55,7 +55,7 @@ public class ParseContextConfig {
5555
Method hasMethod = null;
5656
try {
5757
clazz = Class.forName("org.apache.tika.serialization.ConfigDeserializer");
58-
getMethod = clazz.getMethod("getConfig",
58+
getMethod = clazz.getMethod("getConfig",
5959
ParseContext.class, String.class, Class.class, Object.class);
6060
hasMethod = clazz.getMethod("hasConfig", ParseContext.class, String.class);
6161
} catch (ClassNotFoundException | NoSuchMethodException e) {
@@ -74,10 +74,11 @@ public class ParseContextConfig {
7474
* re-deserializing. This is efficient for embedded documents where the config
7575
* was already deserialized for the parent document.
7676
* <p>
77-
* If not found, it checks ConfigContainer for the config key and deserializes
78-
* the JSON. The deserialized config is also set in ParseContext for future lookups.
77+
* If not found, it checks jsonConfigs for the config key and deserializes
78+
* the JSON. The deserialized config is cached in resolvedConfigs and also
79+
* set in the main ParseContext for future lookups.
7980
* <p>
80-
* This method performs defensive checking: if the ConfigContainer has configuration
81+
* This method performs defensive checking: if the ParseContext has JSON configuration
8182
* for the requested key but the ConfigDeserializer is not available on the classpath,
8283
* it throws TikaConfigException to prevent silent failures.
8384
*
@@ -87,7 +88,7 @@ public class ParseContextConfig {
8788
* @param defaultConfig the default configuration to use if no runtime config exists
8889
* @param <T> the configuration type
8990
* @return the runtime config merged with defaults, or the default config if no runtime config exists
90-
* @throws TikaConfigException if ConfigContainer has config but ConfigDeserializer is not on classpath
91+
* @throws TikaConfigException if ParseContext has JSON config but ConfigDeserializer is not on classpath
9192
* @throws IOException if deserialization fails
9293
*/
9394
public static <T> T getConfig(ParseContext context, String configKey,
@@ -104,25 +105,18 @@ public static <T> T getConfig(ParseContext context, String configKey,
104105
return existingConfig;
105106
}
106107

107-
ConfigContainer configContainer = context.get(ConfigContainer.class);
108-
if (configContainer == null) {
108+
// Check for JSON config
109+
if (!context.hasJsonConfig(configKey)) {
109110
return defaultConfig;
110111
}
111112

112-
// Check if there's config for this specific key
113-
boolean hasConfigForKey = configContainer.get(configKey).isPresent();
114-
if (!hasConfigForKey) {
115-
return defaultConfig;
116-
}
117-
118-
// Config exists for this key - ConfigDeserializer MUST be available
113+
// JSON config exists for this key - ConfigDeserializer MUST be available
119114
if (CONFIG_DESERIALIZER_CLASS == null) {
120115
throw new TikaConfigException(String.format(Locale.ROOT,
121-
"ParseContext contains ConfigContainer with configuration for '%s' " +
116+
"ParseContext contains JSON configuration for '%s' " +
122117
"but org.apache.tika.serialization.ConfigDeserializer is not on the classpath. " +
123118
"This means your runtime configuration will be ignored. " +
124-
"To fix: add tika-serialization as a dependency, or remove the ConfigContainer " +
125-
"from ParseContext if runtime configuration via ConfigContainer is not needed.",
119+
"To fix: add tika-serialization as a dependency.",
126120
configKey));
127121
}
128122

@@ -150,19 +144,13 @@ public static <T> T getConfig(ParseContext context, String configKey,
150144
*
151145
* @param context the parse context
152146
* @param configKey the configuration key
153-
* @return true if config exists for this key
147+
* @return true if JSON config exists for this key
154148
*/
155149
public static boolean hasConfig(ParseContext context, String configKey) {
156150
if (context == null) {
157151
return false;
158152
}
159-
160-
ConfigContainer configContainer = context.get(ConfigContainer.class);
161-
if (configContainer == null) {
162-
return false;
163-
}
164-
165-
return configContainer.get(configKey).isPresent();
153+
return context.hasJsonConfig(configKey);
166154
}
167155

168156
/**

tika-core/src/main/java/org/apache/tika/config/SelfConfiguring.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@
1818

1919
/**
2020
* Marker interface indicating that a component reads its own configuration
21-
* from {@link ConfigContainer} inside the {@link org.apache.tika.parser.ParseContext} at runtime.
21+
* from {@link org.apache.tika.parser.ParseContext}'s jsonConfigs at runtime.
2222
* <p>
2323
* Components implementing this interface will NOT be automatically resolved
2424
* by ParseContextUtils. Instead, the JSON configuration will remain in
25-
* ConfigContainer, and the component is responsible for reading and applying
26-
* its own configuration during execution.
25+
* ParseContext's jsonConfigs, and the component is responsible for reading
26+
* and applying its own configuration during execution.
2727
* <p>
2828
* This is typically used by parsers and other components that need fine-grained
2929
* control over how their configuration is loaded and merged with defaults.
@@ -36,7 +36,7 @@
3636
* private final PDFParserConfig defaultConfig;
3737
*
3838
* public void parse(..., ParseContext context) {
39-
* // Component reads its own config from ConfigContainer
39+
* // Component reads its own config from ParseContext
4040
* PDFParserConfig config = ParseContextConfig.getConfig(
4141
* context, "pdf-parser", PDFParserConfig.class, defaultConfig);
4242
* // use config...
@@ -48,7 +48,6 @@
4848
* automatically deserialized and added to ParseContext by ParseContextUtils.
4949
*
5050
* @since Apache Tika 4.0
51-
* @see ConfigContainer
5251
* @see ParseContextConfig
5352
*/
5453
public interface SelfConfiguring {

tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@
2222

2323
/**
2424
* Configuration class for specifying parse task timeout.
25+
* <p>
26+
* This is a config POJO (not a component like Parser/Detector), so it uses
27+
* standard Jackson format rather than compact component format:
2528
* <pre>
2629
* {
2730
* "parse-context": {

tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,14 @@
1616
*/
1717
package org.apache.tika.detect;
1818

19+
import java.util.ArrayList;
1920
import java.util.Collection;
2021
import java.util.Collections;
2122
import java.util.List;
2223
import javax.imageio.spi.ServiceRegistry;
2324

2425
import org.apache.tika.config.ServiceLoader;
26+
import org.apache.tika.config.TikaComponent;
2527
import org.apache.tika.mime.MimeTypes;
2628
import org.apache.tika.utils.ServiceLoaderUtils;
2729

@@ -37,18 +39,23 @@
3739
*
3840
* @since Apache Tika 0.9
3941
*/
42+
@TikaComponent(spi = false)
4043
public class DefaultDetector extends CompositeDetector {
4144

4245
/**
4346
* Serial version UID
4447
*/
4548
private static final long serialVersionUID = -8170114575326908027L;
4649
private transient final ServiceLoader loader;
50+
private final Collection<Class<? extends Detector>> excludedClasses;
4751

4852
public DefaultDetector(MimeTypes types, ServiceLoader loader,
4953
Collection<Class<? extends Detector>> excludeDetectors) {
5054
super(types.getMediaTypeRegistry(), getDefaultDetectors(types, loader, excludeDetectors));
5155
this.loader = loader;
56+
this.excludedClasses = excludeDetectors != null ?
57+
Collections.unmodifiableCollection(new ArrayList<>(excludeDetectors)) :
58+
Collections.emptySet();
5259
}
5360

5461
public DefaultDetector(MimeTypes types, ServiceLoader loader) {
@@ -124,4 +131,13 @@ public List<Detector> getDetectors() {
124131
}
125132
}
126133

134+
/**
135+
* Returns the classes that were explicitly excluded when constructing this detector.
136+
* Used for round-trip serialization to preserve exclusion configuration.
137+
*
138+
* @return unmodifiable collection of excluded detector classes, never null
139+
*/
140+
public Collection<Class<? extends Detector>> getExcludedClasses() {
141+
return excludedClasses;
142+
}
127143
}

tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,12 @@
3333
* when parsing data that is extremely compressible and resembles a ZIP
3434
* bomb. Null values will be ignored and will not affect the default values
3535
* in SecureContentHandler.
36+
* <p>
37+
* This is a config POJO. It uses standard Jackson deserialization for its
38+
* primitive fields, but component fields (like embeddedDocumentExtractorFactory)
39+
* use compact format.
3640
*/
37-
@TikaComponent
41+
@TikaComponent(spi = false)
3842
public class AutoDetectParserConfig implements Serializable {
3943

4044
private static ContentHandlerDecoratorFactory NOOP_CONTENT_HANDLER_DECORATOR_FACTORY =

0 commit comments

Comments
 (0)