Skip to content

Commit ff9d722

Browse files
authored
TIKA-4354 -- make incremental update metadata and parsing default in tika-cli (#2059)
* TIKA-4354 -- make incremental update metadata and parsing default in tika-cli
1 parent 347d58c commit ff9d722

File tree

4 files changed: 15 additions (+15) and 7 deletions (-7)

tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -200,7 +200,6 @@ public void process(InputStream stream, OutputStream output, Metadata metadata)
200200
*/
201201
private String password = System.getenv("TIKA_PASSWORD");
202202
private DigestingParser.Digester digester = null;
203-
private boolean asyncMode = false;
204203
private boolean pipeMode = true;
205204
private boolean fork = false;
206205
private boolean prettyPrint;
@@ -340,9 +339,12 @@ private void configurePDFExtractSettings() {
340339
if (configFilePath == null && context.get(PDFParserConfig.class) == null) {
341340
PDFParserConfig pdfParserConfig = new PDFParserConfig();
342341
pdfParserConfig.setExtractInlineImages(true);
342+
pdfParserConfig.setExtractIncrementalUpdateInfo(true);
343343
pdfParserConfig.setParseIncrementalUpdates(true);
344-
String warn = "As a convenience, TikaCLI has turned on extraction of\n" + "inline images and incremental updates for the PDFParser (TIKA-2374 and " + "TIKA-4017).\n" +
345-
"Aside from the -z option, this is not the default behavior\n" + "in Tika generally or in tika-server.";
344+
String warn = "As a convenience, TikaCLI has turned on extraction of\n" +
345+
"inline images and incremental updates for the PDFParser (TIKA-2374, " +
346+
"TIKA-4017 and TIKA-4354).\n" +
347+
"This is not the default behavior in Tika generally or in tika-server.";
346348
LOG.info(warn);
347349
context.set(PDFParserConfig.class, pdfParserConfig);
348350
}
@@ -401,8 +403,6 @@ public void process(String arg) throws Exception {
401403
// ignore, as container-aware detectors are now always used
402404
} else if (arg.equals("-f") || arg.equals("--fork")) {
403405
fork = true;
404-
} else if (arg.equals("-a") || arg.equals("--async")) {
405-
asyncMode = true;
406406
} else if (arg.startsWith("--config=")) {
407407
configFilePath = arg.substring("--config=".length());
408408
} else if (arg.startsWith("--digest=")) {
@@ -446,7 +446,6 @@ public void process(String arg) throws Exception {
446446
}
447447
extractDir = new File(dirPath);
448448
} else if (arg.equals("-z") || arg.equals("--extract")) {
449-
configurePDFExtractSettings();
450449
type = NO_OUTPUT;
451450
context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
452451
} else if (arg.equals("-r") || arg.equals("--pretty-print")) {
@@ -475,6 +474,7 @@ public void process(String arg) throws Exception {
475474
} else {
476475
url = new URL(arg);
477476
}
477+
configurePDFExtractSettings();
478478
if (recursiveJSON) {
479479
handleRecursiveJson(url, System.out);
480480
} else {

tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -120,7 +120,7 @@ public void testAsync() throws Exception {
120120
json++;
121121
}
122122
}
123-
assertEquals(17, json);
123+
assertEquals(18, json);
124124
}
125125

126126
private void checkForPrettyPrint(File f) throws IOException {

tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -241,6 +241,14 @@ public void testJsonMetadataPrettyPrintOutput() throws Exception {
241241
assertTrue(fb > -1 && title > -1 && fb > title);
242242
}
243243

244+
@Test
245+
public void testDefaultPDFIncrementalUpdateSettings() throws Exception {
246+
String json = getParamOutContent("-J",
247+
resourcePrefix + "testPDF_incrementalUpdates.pdf");
248+
assertTrue(json.contains("pdf:incrementalUpdateCount\":\"2\""));
249+
assertTrue(json.contains("embeddedResourceType\":\"VERSION\""));
250+
}
251+
244252
/**
245253
* Tests -l option of the cli
246254
*
Binary file not shown.

0 commit comments

Comments
 (0)