Skip to content

Commit 488867f

Browse files
authored
TIKA-4545 -- add translators and refactor loaders (#2420)
* TIKA-4545 -- add translators and refactor loaders
1 parent 5bf9f71 commit 488867f

File tree

30 files changed

+366
-246
lines changed

30 files changed

+366
-246
lines changed

tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,14 @@
1616
*/
1717
package org.apache.tika.language.translate;
1818

19+
import org.apache.tika.config.TikaComponent;
20+
1921
/**
2022
* Dummy translator that always declines to give any text. Useful as a
2123
* sentinel translator for when no other translators are available.
2224
* for unknown document types.
2325
*/
26+
@TikaComponent
2427
public class EmptyTranslator implements Translator {
2528
public String translate(String text, String sourceLanguage, String targetLanguage) {
2629
return null;

tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/emitter/EmitterManager.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ public class EmitterManager {
4646
private final Map<String, Emitter> emitterMap = new ConcurrentHashMap<>();
4747

4848
public static EmitterManager load(PluginManager pluginManager, TikaConfigs tikaConfigs) throws IOException, TikaConfigException {
49-
JsonNode fetchersNode = tikaConfigs.getRoot().get(CONFIG_KEY);
49+
JsonNode fetchersNode = tikaConfigs.getTikaJsonConfig()
50+
.getRootNode().get(CONFIG_KEY);
5051
Map<String, Emitter> fetchers =
5152
PluginComponentLoader.loadInstances(pluginManager, EmitterFactory.class, fetchersNode);
5253
return new EmitterManager(fetchers);

tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/fetcher/FetcherManager.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ public class FetcherManager {
4545

4646

4747
public static FetcherManager load(PluginManager pluginManager, TikaConfigs tikaConfigs) throws TikaConfigException, IOException {
48-
JsonNode fetchersNode = tikaConfigs.getRoot().get(CONFIG_KEY);
48+
JsonNode fetchersNode = tikaConfigs.getTikaJsonConfig()
49+
.getRootNode().get(CONFIG_KEY);
4950
Map<String, Fetcher> fetchers =
5051
PluginComponentLoader.loadInstances(pluginManager, FetcherFactory.class, fetchersNode);
5152
return new FetcherManager(fetchers);

tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/pipesiterator/PipesIteratorManager.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ public class PipesIteratorManager {
3939

4040
public static Optional<PipesIterator> load(PluginManager pluginManager, TikaConfigs tikaConfigs) throws IOException, TikaConfigException {
4141

42-
JsonNode node = tikaConfigs.getRoot().get(CONFIG_KEY);
42+
JsonNode node = tikaConfigs.getTikaJsonConfig()
43+
.getRootNode().get(CONFIG_KEY);
4344

4445
return PluginComponentLoader.loadSingleton(pluginManager, PipesIteratorFactory.class, node);
4546
}

tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/reporter/ReporterManager.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ public class ReporterManager {
3939

4040
public static PipesReporter load(PluginManager pluginManager, TikaConfigs tikaConfigs) throws IOException, TikaConfigException {
4141

42-
JsonNode node = tikaConfigs.getRoot().get(CONFIG_KEY);
42+
JsonNode node = tikaConfigs.getTikaJsonConfig()
43+
.getRootNode().get(CONFIG_KEY);
4344

4445
List<PipesReporter> reporters = PluginComponentLoader.loadUnnamedInstances(pluginManager, PipesReporterFactory.class, node);
4546
if (reporters.isEmpty()) {

tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaConfigs.java

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@
2121
import java.util.Iterator;
2222
import java.util.Set;
2323

24-
import com.fasterxml.jackson.databind.DeserializationFeature;
2524
import com.fasterxml.jackson.databind.JsonNode;
26-
import com.fasterxml.jackson.databind.ObjectMapper;
2725

2826
import org.apache.tika.config.loader.TikaJsonConfig;
2927
import org.apache.tika.exception.TikaConfigException;
@@ -66,9 +64,6 @@ public class TikaConfigs {
6664
"server"
6765
);
6866

69-
static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
70-
.configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true);
71-
7267
private final TikaJsonConfig tikaJsonConfig;
7368

7469
/**
@@ -113,17 +108,6 @@ public TikaJsonConfig getTikaJsonConfig() {
113108
return tikaJsonConfig;
114109
}
115110

116-
/**
117-
* Gets the root JSON node.
118-
* Deprecated - use {@link #getTikaJsonConfig()} instead.
119-
*
120-
* @return the root JSON node
121-
*/
122-
@Deprecated
123-
public JsonNode getRoot() {
124-
return tikaJsonConfig.getRootNode();
125-
}
126-
127111
/**
128112
* Deserializes a configuration value for the given key.
129113
*

tika-plugins-core/src/main/java/org/apache/tika/plugins/TikaPluginManager.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@
2323
import java.util.List;
2424

2525
import com.fasterxml.jackson.core.type.TypeReference;
26+
import com.fasterxml.jackson.databind.DeserializationFeature;
2627
import com.fasterxml.jackson.databind.JsonNode;
28+
import com.fasterxml.jackson.databind.ObjectMapper;
2729
import org.pf4j.DefaultExtensionFinder;
2830
import org.pf4j.DefaultPluginManager;
2931
import org.pf4j.ExtensionFinder;
@@ -43,6 +45,14 @@ public class TikaPluginManager extends DefaultPluginManager {
4345

4446
private static final Logger LOG = LoggerFactory.getLogger(TikaPluginManager.class);
4547

48+
// We only use this mapper to convert a single path, or a list of paths, into a list.
49+
// We don't need the full functionality of the polymorphic ObjectMapper in tika-serialization.
50+
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
51+
52+
static {
53+
OBJECT_MAPPER.configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true);
54+
}
55+
4656
/**
4757
* Loads plugin manager from a pre-parsed TikaJsonConfig.
4858
* This is the preferred method when sharing configuration across
@@ -83,12 +93,12 @@ public static TikaPluginManager load(Path configPath) throws TikaConfigException
8393
*/
8494
public static TikaPluginManager load(TikaConfigs tikaConfigs)
8595
throws TikaConfigException, IOException {
86-
JsonNode root = tikaConfigs.getRoot();
96+
JsonNode root = tikaConfigs.getTikaJsonConfig().getRootNode();
8797
JsonNode pluginRoots = root.get("plugin-roots");
8898
if (pluginRoots == null) {
8999
throw new TikaConfigException("plugin-roots must be specified");
90100
}
91-
List<Path> roots = TikaConfigs.OBJECT_MAPPER.convertValue(pluginRoots,
101+
List<Path> roots = OBJECT_MAPPER.convertValue(pluginRoots,
92102
new TypeReference<List<Path>>() {});
93103
if (roots.isEmpty()) {
94104
throw new TikaConfigException("plugin-roots must not be empty");

tika-plugins-core/src/test/java/org/apache/tika/plugins/TikaConfigsTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,8 @@ public void testGetRootReturnsJsonNode() throws Exception {
137137
""";
138138

139139
TikaConfigs configs = loadFromString(json);
140-
assertNotNull(configs.getRoot());
141-
assertNotNull(configs.getRoot().get("fetchers"));
140+
assertNotNull(configs.getTikaJsonConfig().getRootNode());
141+
assertNotNull(configs.getTikaJsonConfig().getRootNode().get("fetchers"));
142142
}
143143

144144
private TikaConfigs loadFromString(String json) throws Exception {
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.tika.config.loader;
18+
19+
import java.lang.reflect.Constructor;
20+
import java.lang.reflect.InvocationTargetException;
21+
22+
import com.fasterxml.jackson.databind.JsonNode;
23+
import com.fasterxml.jackson.databind.ObjectMapper;
24+
25+
import org.apache.tika.config.JsonConfig;
26+
import org.apache.tika.exception.TikaConfigException;
27+
import org.apache.tika.utils.ServiceLoaderUtils;
28+
29+
/**
30+
* Utility class for instantiating Tika components from JSON configuration.
31+
* Provides common logic for all component loaders to avoid code duplication.
32+
*/
33+
public class ComponentInstantiator {
34+
35+
/**
36+
* Instantiates a component with JsonConfig constructor or falls back to zero-arg constructor.
37+
* <p>
38+
* Instantiation strategy:
39+
* <ol>
40+
* <li>Try constructor with JsonConfig parameter</li>
41+
* <li>If not found and JSON config has actual configuration, throw error</li>
42+
* <li>Otherwise fall back to zero-arg constructor via ServiceLoader</li>
43+
* </ol>
44+
*
45+
* @param componentClass the component class to instantiate
46+
* @param jsonConfig the JSON configuration for the component
47+
* @param classLoader the class loader to use
48+
* @param componentTypeName the component type name (e.g., "Detector", "Parser") for error messages
49+
* @param objectMapper the Jackson ObjectMapper for parsing JSON
50+
* @param <T> the component type
51+
* @return the instantiated component
52+
* @throws TikaConfigException if instantiation fails
53+
*/
54+
@SuppressWarnings("unchecked")
55+
public static <T> T instantiate(Class<?> componentClass,
56+
JsonConfig jsonConfig,
57+
ClassLoader classLoader,
58+
String componentTypeName,
59+
ObjectMapper objectMapper)
60+
throws TikaConfigException {
61+
try {
62+
T component;
63+
64+
// Try constructor with JsonConfig parameter
65+
try {
66+
Constructor<?> constructor = componentClass.getConstructor(JsonConfig.class);
67+
component = (T) constructor.newInstance(jsonConfig);
68+
} catch (NoSuchMethodException e) {
69+
// Check if JSON config has actual configuration
70+
if (hasConfiguration(jsonConfig, objectMapper)) {
71+
throw new TikaConfigException(
72+
componentTypeName + " '" + componentClass.getName() + "' has configuration in JSON, " +
73+
"but does not have a constructor that accepts JsonConfig. " +
74+
"Please add a constructor: public " + componentClass.getSimpleName() + "(JsonConfig jsonConfig)");
75+
}
76+
// Fall back to zero-arg constructor if no configuration provided
77+
component = (T) ServiceLoaderUtils.newInstance(componentClass,
78+
new org.apache.tika.config.ServiceLoader(classLoader));
79+
}
80+
81+
return component;
82+
} catch (InstantiationException | IllegalAccessException | InvocationTargetException e) {
83+
throw new TikaConfigException("Failed to instantiate " + componentTypeName + ": " +
84+
componentClass.getName(), e);
85+
}
86+
}
87+
88+
/**
89+
* Checks if the JsonConfig contains actual configuration (non-empty JSON object with fields).
90+
*
91+
* @param jsonConfig the JSON configuration
92+
* @param objectMapper the Jackson ObjectMapper for parsing JSON
93+
* @return true if there's meaningful configuration, false if empty or just "{}"
94+
*/
95+
public static boolean hasConfiguration(JsonConfig jsonConfig, ObjectMapper objectMapper) {
96+
if (jsonConfig == null) {
97+
return false;
98+
}
99+
String json = jsonConfig.json();
100+
if (json == null || json.trim().isEmpty()) {
101+
return false;
102+
}
103+
// Parse to check if it's an empty object or has actual fields
104+
try {
105+
JsonNode node = objectMapper.readTree(json);
106+
// Check if it's an object and has at least one field
107+
if (node.isObject() && node.size() > 0) {
108+
return true;
109+
}
110+
return false;
111+
} catch (Exception e) {
112+
// If we can't parse it, assume it has configuration to be safe
113+
return true;
114+
}
115+
}
116+
}

tika-serialization/src/main/java/org/apache/tika/config/loader/CompositeComponentLoader.java

Lines changed: 2 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@
1616
*/
1717
package org.apache.tika.config.loader;
1818

19-
import java.lang.reflect.Constructor;
20-
import java.lang.reflect.InvocationTargetException;
2119
import java.util.ArrayList;
2220
import java.util.Collections;
2321
import java.util.Iterator;
@@ -32,7 +30,6 @@
3230

3331
import org.apache.tika.config.JsonConfig;
3432
import org.apache.tika.exception.TikaConfigException;
35-
import org.apache.tika.utils.ServiceLoaderUtils;
3633

3734
/**
3835
* Generic loader for Tika components (detectors, encoding detectors, filters, etc.).
@@ -172,58 +169,10 @@ private T loadConfiguredComponent(String name, JsonNode configNode,
172169
}
173170
}
174171

175-
@SuppressWarnings("unchecked")
176172
private T instantiateComponent(Class<?> componentClass, JsonConfig configJson)
177173
throws TikaConfigException {
178-
try {
179-
// Try constructor with JsonConfig parameter
180-
try {
181-
Constructor<?> constructor = componentClass.getConstructor(JsonConfig.class);
182-
return (T) constructor.newInstance(configJson);
183-
} catch (NoSuchMethodException e) {
184-
// Check if JSON config has actual configuration
185-
if (hasConfiguration(configJson)) {
186-
throw new TikaConfigException(
187-
"Component '" + componentClass.getName() + "' has configuration in JSON, " +
188-
"but does not have a constructor that accepts JsonConfig. " +
189-
"Please add a constructor: public " + componentClass.getSimpleName() + "(JsonConfig jsonConfig)");
190-
}
191-
// Fall back to zero-arg constructor if no configuration provided
192-
return (T) ServiceLoaderUtils.newInstance(componentClass,
193-
new org.apache.tika.config.ServiceLoader(classLoader));
194-
}
195-
} catch (InstantiationException | IllegalAccessException | InvocationTargetException e) {
196-
throw new TikaConfigException("Failed to instantiate component: " +
197-
componentClass.getName(), e);
198-
}
199-
}
200-
201-
/**
202-
* Checks if the JsonConfig contains actual configuration (non-empty JSON object with fields).
203-
*
204-
* @param jsonConfig the JSON configuration
205-
* @return true if there's meaningful configuration, false if empty or just "{}"
206-
*/
207-
private boolean hasConfiguration(JsonConfig jsonConfig) {
208-
if (jsonConfig == null) {
209-
return false;
210-
}
211-
String json = jsonConfig.json();
212-
if (json == null || json.trim().isEmpty()) {
213-
return false;
214-
}
215-
// Parse to check if it's an empty object or has actual fields
216-
try {
217-
JsonNode node = objectMapper.readTree(json);
218-
// Check if it's an object and has at least one field
219-
if (node.isObject() && node.size() > 0) {
220-
return true;
221-
}
222-
return false;
223-
} catch (Exception e) {
224-
// If we can't parse it, assume it has configuration to be safe
225-
return true;
226-
}
174+
return ComponentInstantiator.instantiate(componentClass, configJson, classLoader,
175+
componentTypeName, objectMapper);
227176
}
228177

229178
private List<T> loadSpiComponents() {

0 commit comments

Comments
 (0)