Skip to content

Commit 25e6856

Browse files
authored
TIKA-4571 -- add a replacement for ForkParser (#2451)
(and fix a rat test in tika-serialization :/) Generated-by: Claude Opus 4.5 (model ID: claude-opus-4-5-20251101)
1 parent 77f5593 commit 25e6856

File tree

17 files changed

+2261
-312
lines changed

17 files changed

+2261
-312
lines changed

tika-example/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@
6161
<artifactId>tika-transcribe-aws</artifactId>
6262
<version>${project.version}</version>
6363
</dependency>
64+
<dependency>
65+
<groupId>org.apache.tika</groupId>
66+
<artifactId>tika-pipes-fork-parser</artifactId>
67+
<version>${project.version}</version>
68+
</dependency>
6469
<dependency>
6570
<groupId>org.apache.tika</groupId>
6671
<artifactId>tika-core</artifactId>

tika-example/src/main/java/org/apache/tika/example/PipesForkParserExample.java

Lines changed: 491 additions & 0 deletions
Large diffs are not rendered by default.

tika-pipes/pom.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
<module>tika-pipes-reporter-commons</module>
3737
<module>tika-pipes-iterator-commons</module>
3838
<module>tika-pipes-plugins</module>
39+
<module>tika-pipes-fork-parser</module>
3940
<module>tika-async-cli</module>
4041
<module>tika-pipes-integration-tests</module>
4142
</modules>

tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/fetcher/Fetcher.java

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,21 @@
3535
*/
3636
public interface Fetcher extends TikaExtension, ExtensionPoint {
3737

38-
TikaInputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException;
38+
/**
39+
* Fetches a resource and returns it as a TikaInputStream.
40+
*
41+
* @param fetchKey the key identifying the resource to fetch (interpretation
42+
* depends on the implementation, e.g., file path, URL, S3 key)
43+
* @param metadata metadata object to be updated with resource information
44+
* @param parseContext the parse context
45+
* @return a TikaInputStream for reading the resource content
46+
* @throws TikaException if a Tika-specific error occurs during fetching
47+
* @throws IOException if an I/O error occurs during fetching
48+
* @throws SecurityException if the fetchKey attempts to access a resource
49+
* outside permitted boundaries (e.g., path traversal attack)
50+
* @throws IllegalArgumentException if the fetchKey contains invalid characters
51+
* (e.g., null bytes)
52+
*/
53+
TikaInputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext)
54+
throws TikaException, IOException;
3955
}

tika-pipes/tika-pipes-fork-parser/pom.xml

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!--
3+
Licensed to the Apache Software Foundation (ASF) under one
4+
or more contributor license agreements. See the NOTICE file
5+
distributed with this work for additional information
6+
regarding copyright ownership. The ASF licenses this file
7+
to you under the Apache License, Version 2.0 (the
8+
"License"); you may not use this file except in compliance
9+
with the License. You may obtain a copy of the License at
10+
11+
http://www.apache.org/licenses/LICENSE-2.0
12+
13+
Unless required by applicable law or agreed to in writing,
14+
software distributed under the License is distributed on an
15+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
KIND, either express or implied. See the License for the
17+
specific language governing permissions and limitations
18+
under the License.
19+
-->
20+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
21+
<parent>
22+
<groupId>org.apache.tika</groupId>
23+
<artifactId>tika-pipes</artifactId>
24+
<version>4.0.0-SNAPSHOT</version>
25+
<relativePath>../pom.xml</relativePath>
26+
</parent>
27+
<modelVersion>4.0.0</modelVersion>
28+
29+
<artifactId>tika-pipes-fork-parser</artifactId>
30+
31+
<name>Apache Tika pipes fork parser</name>
32+
<description>A ForkParser implementation backed by PipesClient for parsing in forked JVM processes</description>
33+
<url>https://tika.apache.org/</url>
34+
35+
<dependencies>
36+
<dependency>
37+
<groupId>${project.groupId}</groupId>
38+
<artifactId>tika-core</artifactId>
39+
<version>${project.version}</version>
40+
</dependency>
41+
<dependency>
42+
<groupId>${project.groupId}</groupId>
43+
<artifactId>tika-pipes-api</artifactId>
44+
<version>${project.version}</version>
45+
</dependency>
46+
<dependency>
47+
<groupId>${project.groupId}</groupId>
48+
<artifactId>tika-pipes-core</artifactId>
49+
<version>${project.version}</version>
50+
</dependency>
51+
<dependency>
52+
<groupId>${project.groupId}</groupId>
53+
<artifactId>tika-pipes-file-system</artifactId>
54+
<version>${project.version}</version>
55+
</dependency>
56+
<dependency>
57+
<groupId>${project.groupId}</groupId>
58+
<artifactId>tika-parsers-standard-package</artifactId>
59+
<version>${project.version}</version>
60+
</dependency>
61+
<dependency>
62+
<groupId>${project.groupId}</groupId>
63+
<artifactId>tika-core</artifactId>
64+
<version>${project.version}</version>
65+
<type>test-jar</type>
66+
<scope>test</scope>
67+
</dependency>
68+
<dependency>
69+
<groupId>${project.groupId}</groupId>
70+
<artifactId>tika-pipes-file-system</artifactId>
71+
<version>${project.version}</version>
72+
<scope>test</scope>
73+
<type>zip</type>
74+
</dependency>
75+
<dependency>
76+
<groupId>org.junit.jupiter</groupId>
77+
<artifactId>junit-jupiter-api</artifactId>
78+
<scope>test</scope>
79+
</dependency>
80+
<dependency>
81+
<groupId>org.junit.jupiter</groupId>
82+
<artifactId>junit-jupiter-engine</artifactId>
83+
<scope>test</scope>
84+
</dependency>
85+
</dependencies>
86+
<build>
87+
<plugins>
88+
<plugin>
89+
<groupId>org.apache.maven.plugins</groupId>
90+
<artifactId>maven-jar-plugin</artifactId>
91+
<configuration>
92+
<archive>
93+
<manifestEntries>
94+
<Automatic-Module-Name>org.apache.tika.pipes.fork</Automatic-Module-Name>
95+
</manifestEntries>
96+
</archive>
97+
</configuration>
98+
</plugin>
99+
<plugin>
100+
<groupId>org.apache.maven.plugins</groupId>
101+
<artifactId>maven-dependency-plugin</artifactId>
102+
<executions>
103+
<execution>
104+
<id>copy-plugins</id>
105+
<phase>process-test-resources</phase>
106+
<goals>
107+
<goal>copy</goal>
108+
</goals>
109+
<configuration>
110+
<outputDirectory>${project.build.directory}/plugins</outputDirectory>
111+
<artifactItems>
112+
<artifactItem>
113+
<groupId>org.apache.tika</groupId>
114+
<artifactId>tika-pipes-file-system</artifactId>
115+
<version>${project.version}</version>
116+
<type>zip</type>
117+
<overWrite>true</overWrite>
118+
</artifactItem>
119+
</artifactItems>
120+
</configuration>
121+
</execution>
122+
<execution>
123+
<id>copy-dependencies</id>
124+
<phase>package</phase>
125+
<goals>
126+
<goal>copy-dependencies</goal>
127+
</goals>
128+
<configuration>
129+
<outputDirectory>${project.build.directory}/lib</outputDirectory>
130+
<includeScope>runtime</includeScope>
131+
<stripVersion>false</stripVersion>
132+
<overWriteReleases>false</overWriteReleases>
133+
<overWriteSnapshots>false</overWriteSnapshots>
134+
</configuration>
135+
</execution>
136+
</executions>
137+
</plugin>
138+
<plugin>
139+
<artifactId>maven-assembly-plugin</artifactId>
140+
<configuration>
141+
<descriptors>
142+
<descriptor>src/main/assembly/assembly.xml</descriptor>
143+
</descriptors>
144+
<appendAssemblyId>false</appendAssemblyId>
145+
</configuration>
146+
<executions>
147+
<execution>
148+
<id>make-assembly</id>
149+
<phase>package</phase>
150+
<goals>
151+
<goal>single</goal>
152+
</goals>
153+
</execution>
154+
</executions>
155+
</plugin>
156+
</plugins>
157+
</build>
158+
</project>

tika-pipes/tika-pipes-fork-parser/src/main/assembly/assembly.xml

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
<!--
2+
Licensed to the Apache Software Foundation (ASF) under one or more
3+
contributor license agreements. See the NOTICE file distributed with
4+
this work for additional information regarding copyright ownership.
5+
The ASF licenses this file to You under the Apache License, Version 2.0
6+
(the "License"); you may not use this file except in compliance with
7+
the License. You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
-->
17+
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.1.1"
18+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
19+
xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.1.1 http://maven.apache.org/xsd/assembly-2.1.1.xsd">
20+
<id>bin</id>
21+
<formats>
22+
<format>zip</format>
23+
</formats>
24+
<includeBaseDirectory>false</includeBaseDirectory>
25+
26+
<dependencySets>
27+
<dependencySet>
28+
<outputDirectory>lib</outputDirectory>
29+
<useProjectArtifact>false</useProjectArtifact>
30+
<unpack>false</unpack>
31+
<scope>runtime</scope>
32+
</dependencySet>
33+
</dependencySets>
34+
<fileSets>
35+
<fileSet>
36+
<directory>${project.build.directory}</directory>
37+
<outputDirectory>/</outputDirectory>
38+
<includes>
39+
<include>*.jar</include>
40+
</includes>
41+
<excludes>
42+
<exclude>*-sources.jar</exclude>
43+
<exclude>*-javadoc.jar</exclude>
44+
</excludes>
45+
</fileSet>
46+
<fileSet>
47+
<directory>${project.build.directory}/plugins</directory>
48+
<outputDirectory>plugins</outputDirectory>
49+
</fileSet>
50+
</fileSets>
51+
</assembly>

0 commit comments

Comments
 (0)