Skip to content

Commit 90d854f

Browse files
authored
TIKA-4349 -- allow uppercasing of hex encoded digests in the CommonsDigester (#2044)
* TIKA-4349 -- allow uppercasing of hex encoded digests in the CommonsDigester
1 parent 406ca53 commit 90d854f

File tree

3 files changed

+51
-23
lines changed
  • tika-app/src/test/java/org/apache/tika/cli
  • tika-parsers/tika-parsers-standard

3 files changed

+51
-23
lines changed

tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ public void testXMLOutput() throws Exception {
139139
String content = getParamOutContent("-x", resourcePrefix + "alice.cli.test");
140140
assertTrue(content.contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
141141

142-
content = getParamOutContent("-x", "--digest=SHA256", resourcePrefix + "alice.cli.test");
142+
content = getParamOutContent("-x", "--digest=sha256", resourcePrefix + "alice.cli.test");
143143
assertTrue(content.contains("<meta name=\"X-TIKA:digest:SHA256\" content=\"e90779adbac09c4ee"));
144144

145145
}
@@ -155,7 +155,7 @@ public void testHTMLOutput() throws Exception {
155155
assertTrue(content.contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
156156
assertTrue(content.contains("<title></title>"), "Expanded <title></title> element should be present");
157157

158-
content = getParamOutContent("-h", "--digest=SHA384", resourcePrefix + "alice.cli.test");
158+
content = getParamOutContent("-h", "--digest=sha384", resourcePrefix + "alice.cli.test");
159159
assertTrue(content.contains("<meta name=\"X-TIKA:digest:SHA384\" content=\"c69ea023f5da95a026"));
160160
}
161161

@@ -207,7 +207,7 @@ public void testMetadataOutput() throws Exception {
207207

208208
content = getParamOutContent("-m", "--digest=SHA512", resourcePrefix + "alice.cli.test");
209209
assertTrue(content.contains("text/plain"));
210-
assertTrue(content.contains("X-TIKA:digest:SHA512: dd459d99bc19ff78fd31fbae46e0"));
210+
assertTrue(content.contains("X-TIKA:digest:SHA512: DD459D99BC19FF78FD31FBAE46E0"));
211211
}
212212

213213
/**
@@ -459,7 +459,7 @@ public void testJsonRecursiveMetadataParserText() throws Exception {
459459

460460
@Test
461461
public void testDigestInJson() throws Exception {
462-
String content = getParamOutContent("-J", "-r", "-t", "--digest=MD5", resourcePrefix + "test_recursive_embedded.docx");
462+
String content = getParamOutContent("-J", "-r", "-t", "--digest=md5", resourcePrefix + "test_recursive_embedded.docx");
463463
assertTrue(content.contains("\"X-TIKA:digest:MD5\" : \"59f626e09a8c16ab6dbc2800c685f772\","));
464464
assertTrue(content.contains("\"X-TIKA:digest:MD5\" : \"f9627095ef86c482e61d99f0cc1cf87d\""));
465465
}

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-digest-commons/src/main/java/org/apache/tika/parser/digestutils/CommonsDigester.java

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ public class CommonsDigester extends CompositeDigester {
4343
/**
4444
* Include a string representing the comma-separated algorithms to run: e.g. "md5,sha1".
4545
* If you want base 32 encoding instead of hexadecimal, add ":32" to the algorithm, e.g.
46-
* "md5,sha1:32"
46+
* "md5,sha1:32". If you want uppercase digests for the hexadecimal encoder,
47+
* use uppercase in the algorithm name, e.g. "MD5".
4748
* <p>
4849
* Will throw an IllegalArgumentException if an algorithm isn't supported
4950
*
@@ -70,7 +71,7 @@ private static DigestingParser.Digester[] buildDigesters(int markLimit,
7071
for (DigestAlgorithm algorithm : algorithms) {
7172
digesters[i++] =
7273
new InputStreamDigester(markLimit, algorithm.getJavaName(), algorithm.name(),
73-
new HexEncoder());
74+
new HexEncoder(false));
7475
}
7576
return digesters;
7677
}
@@ -129,27 +130,33 @@ private static DigestingParser.Digester[] buildDigesters(int markLimit, String d
129130
int i = 0;
130131
for (String digest : digests) {
131132
String[] parts = digest.split(":");
132-
DigestingParser.Encoder encoder = null;
133-
if (parts.length > 1) {
134-
if (parts[1].equals("16")) {
135-
encoder = new HexEncoder();
136-
} else if (parts[1].equals("32")) {
137-
encoder = new Base32Encoder();
138-
} else if (parts[1].equals("64")) {
139-
encoder = new Base64Encoder();
140-
} else {
141-
throw new IllegalArgumentException("Value must be '16', '32' or '64'");
142-
}
143-
} else {
144-
encoder = new HexEncoder();
145-
}
133+
DigestingParser.Encoder encoder = getEncoder(parts);
146134
DigestAlgorithm digestAlgorithm = getDigestAlgorithm(parts[0]);
147135
digesters[i++] = new InputStreamDigester(markLimit, digestAlgorithm.getJavaName(),
148136
digestAlgorithm.name(), encoder);
149137
}
150138
return digesters;
151139
}
152140

141+
private static DigestingParser.Encoder getEncoder(String[] parts) {
142+
DigestingParser.Encoder encoder = null;
143+
boolean uc = parts[0].matches("[A-Z0-9]{1,20}");
144+
if (parts.length > 1) {
145+
if (parts[1].equals("16")) {
146+
encoder = new HexEncoder(uc);
147+
} else if (parts[1].equals("32")) {
148+
encoder = new Base32Encoder();
149+
} else if (parts[1].equals("64")) {
150+
encoder = new Base64Encoder();
151+
} else {
152+
throw new IllegalArgumentException("Value must be '16', '32' or '64'");
153+
}
154+
} else {
155+
encoder = new HexEncoder(uc);
156+
}
157+
return encoder;
158+
}
159+
153160
public enum DigestAlgorithm {
154161
//those currently available in commons.digest
155162
MD2("MD2"), MD5("MD5"), SHA1("SHA-1"), SHA256("SHA-256"), SHA384("SHA-384"),
@@ -171,10 +178,31 @@ String getMetadataKey() {
171178
}
172179
}
173180

181+
private static abstract class CasingEncoderBase implements DigestingParser.Encoder {
182+
private final boolean upperCase;
183+
private CasingEncoderBase(boolean upperCase) {
184+
this.upperCase = upperCase;
185+
}
186+
187+
}
174188
private static class HexEncoder implements DigestingParser.Encoder {
189+
private final boolean upperCase;
190+
private HexEncoder(boolean upperCase) {
191+
this.upperCase = upperCase;
192+
}
193+
175194
@Override
176195
public String encode(byte[] bytes) {
177-
return Hex.encodeHexString(bytes);
196+
return toCase(Hex.encodeHexString(bytes));
197+
}
198+
199+
String toCase(String digest) {
200+
if (upperCase) {
201+
return digest.toUpperCase(Locale.ROOT);
202+
} else {
203+
//this is redundant, but useful for future proofing?
204+
return digest.toLowerCase(Locale.ROOT);
205+
}
178206
}
179207
}
180208

tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/DigestingParserTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ public void testCommaSeparated() throws Exception {
9090

9191

9292
expected.put(CommonsDigester.DigestAlgorithm.MD2, "d768c8e27b0b52c6eaabfaa7122d1d4f");
93-
expected.put(CommonsDigester.DigestAlgorithm.MD5, "59f626e09a8c16ab6dbc2800c685f772");
93+
expected.put(CommonsDigester.DigestAlgorithm.MD5, "59F626E09A8C16AB6DBC2800C685F772");
9494
expected.put(CommonsDigester.DigestAlgorithm.SHA1, "PIPQAHIWHLEQ3DVFJQCQ7L22HADZPCFG");
9595
expected.put(CommonsDigester.DigestAlgorithm.SHA256,
9696
"c4b7fab030a8b6a9d6691f6699ac8e6f" + "82bc53764a0f1430d134ae3b70c32654");
@@ -105,7 +105,7 @@ public void testCommaSeparated() throws Exception {
105105
Metadata m = new Metadata();
106106
XMLResult xml = getXML("test_recursive_embedded.docx",
107107
new DigestingParser(AUTO_DETECT_PARSER,
108-
new CommonsDigester(UNLIMITED, "md5,sha256,sha384,sha512,sha1:32"), false)
108+
new CommonsDigester(UNLIMITED, "MD5,sha256,sha384,sha512,sha1:32"), false)
109109
, m);
110110
for (CommonsDigester.DigestAlgorithm algo : new CommonsDigester.DigestAlgorithm[]{
111111
CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA1,

0 commit comments

Comments
 (0)