Skip to content

Commit a987e03

Browse files
eddumelendez authored and markpollack committed
Add new FactCheckingEvaluator for grounded factuality checking
- Introduce FactCheckingEvaluator class for LLM response validation
- Implement evaluation logic using ChatClient for fact-checking
- Add comprehensive JavaDoc explaining the evaluator's purpose and usage
- Reference Bespoke-Minicheck model for efficient implementation options
- Include links to Ollama blog post and MiniCheck research paper
- Distinguish from 'closed book' scenario testing in documentation

This new evaluator enables detection and reduction of hallucinations in LLM outputs by checking claims against provided context. It provides a foundation for implementing advanced fact-checking methodologies in Spring AI applications.

See https://ollama.com/blog/reduce-hallucinations-with-bespoke-minicheck
1 parent 78073c6 commit a987e03

File tree

6 files changed

+187
-16
lines changed

6 files changed

+187
-16
lines changed

npe.txt

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
diff --git a/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/OllamaChatModel.java b/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/OllamaChatModel.java
2+
index c6d689e6..6168ea55 100644
3+
--- a/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/OllamaChatModel.java
4+
+++ b/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/OllamaChatModel.java
5+
@@ -211,13 +211,18 @@ public class OllamaChatModel extends AbstractToolCallSupport implements ChatMode
6+
7+
Flux<ChatResponse> chatResponse = ollamaResponse.map(chunk -> {
8+
String content = (chunk.message() != null) ? chunk.message().content() : "";
9+
- List<AssistantMessage.ToolCall> toolCalls = chunk.message().toolCalls() == null ? List.of()
10+
- : chunk.message()
11+
- .toolCalls()
12+
- .stream()
13+
- .map(toolCall -> new AssistantMessage.ToolCall("", "function", toolCall.function().name(),
14+
- ModelOptionsUtils.toJsonString(toolCall.function().arguments())))
15+
- .toList();
16+
+
17+
+ List<AssistantMessage.ToolCall> toolCalls = List.of();
18+
+
19+
+ // Added null checks to prevent NPE when accessing tool calls
20+
+ if (chunk.message() != null && chunk.message().toolCalls() != null) {
21+
+ toolCalls = chunk.message()
22+
+ .toolCalls()
23+
+ .stream()
24+
+ .map(toolCall -> new AssistantMessage.ToolCall("", "function", toolCall.function().name(),
25+
+ ModelOptionsUtils.toJsonString(toolCall.function().arguments())))
26+
+ .toList();
27+
+ }
28+
29+
var assistantMessage = new AssistantMessage(content, Map.of(), toolCalls);
30+
31+
diff --git a/models/spring-ai-ollama/src/test/java/org/springframework/ai/ollama/OllamaChatModelMultimodalIT.java b/models/spring-ai-ollama/src/test/java/org/springframework/ai/ollama/OllamaChatModelMultimodalIT.java
32+
index f58552f8..4dffc7d2 100644
33+
--- a/models/spring-ai-ollama/src/test/java/org/springframework/ai/ollama/OllamaChatModelMultimodalIT.java
34+
+++ b/models/spring-ai-ollama/src/test/java/org/springframework/ai/ollama/OllamaChatModelMultimodalIT.java
35+
@@ -40,6 +40,7 @@ import java.io.IOException;
36+
import java.util.List;
37+
38+
import static org.assertj.core.api.Assertions.assertThat;
39+
+import static org.junit.Assert.assertThrows;
40+
41+
@SpringBootTest
42+
@Testcontainers
43+
@@ -67,6 +68,18 @@ class OllamaChatModelMultimodalIT extends BaseOllamaIT {
44+
@Autowired
45+
private OllamaChatModel chatModel;
46+
47+
+ @Test
48+
+ void unsupportedMediaType() throws IOException {
49+
+
50+
+ var imageData = new ClassPathResource("/norway.webp");
51+
+
52+
+ var userMessage = new UserMessage("Explain what do you see on this picture?",
53+
+ List.of(new Media(MimeTypeUtils.IMAGE_PNG, imageData)));
54+
+
55+
+ assertThrows(RuntimeException.class, () -> chatModel.call(new Prompt(List.of(userMessage))));
56+
+
57+
+ }
58+
+
59+
@Test
60+
void multiModalityTest() throws IOException {
61+
62+
diff --git a/models/spring-ai-ollama/src/test/resources/norway.webp b/models/spring-ai-ollama/src/test/resources/norway.webp
63+
new file mode 100644
64+
index 00000000..0da983e2
65+
Binary files /dev/null and b/models/spring-ai-ollama/src/test/resources/norway.webp differ

spring-ai-core/src/main/java/org/springframework/ai/evaluation/EvaluationRequest.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* correctness of the chat response based on the context.
1313
*
1414
* @author Mark Pollack
15+
* @author Eddú Meléndez
1516
* @since 1.0.0 M1
1617
*/
1718
public class EvaluationRequest {
@@ -26,6 +27,10 @@ public EvaluationRequest(String userText, String responseContent) {
2627
this(userText, Collections.emptyList(), responseContent);
2728
}
2829

30+
public EvaluationRequest(List<Content> dataList, String responseContent) {
31+
this("", dataList, responseContent);
32+
}
33+
2934
public EvaluationRequest(String userText, List<Content> dataList, String responseContent) {
3035
this.userText = userText;
3136
this.dataList = dataList;

spring-ai-core/src/main/java/org/springframework/ai/evaluation/EvaluationResponse.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,29 @@
55

66
public class EvaluationResponse {
77

8-
private boolean pass;
8+
private final boolean pass;
99

10-
private float score;
10+
private final float score;
1111

12-
private String feedback;
12+
private final String feedback;
1313

14-
Map<String, Object> metadata;
14+
private final Map<String, Object> metadata;
1515

16+
@Deprecated
1617
public EvaluationResponse(boolean pass, float score, String feedback, Map<String, Object> metadata) {
1718
this.pass = pass;
1819
this.score = score;
1920
this.feedback = feedback;
2021
this.metadata = metadata;
2122
}
2223

24+
public EvaluationResponse(boolean pass, String feedback, Map<String, Object> metadata) {
25+
this.pass = pass;
26+
this.score = 0;
27+
this.feedback = feedback;
28+
this.metadata = metadata;
29+
}
30+
2331
public boolean isPass() {
2432
return pass;
2533
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,22 @@
11
package org.springframework.ai.evaluation;
22

3+
import org.springframework.ai.model.Content;
4+
import org.springframework.util.StringUtils;
5+
6+
import java.util.List;
7+
import java.util.stream.Collectors;
8+
39
@FunctionalInterface
410
public interface Evaluator {
511

612
EvaluationResponse evaluate(EvaluationRequest evaluationRequest);
713

14+
default String doGetSupportingData(EvaluationRequest evaluationRequest) {
15+
List<Content> data = evaluationRequest.getDataList();
16+
return data.stream()
17+
.map(Content::getContent)
18+
.filter(StringUtils::hasText)
19+
.collect(Collectors.joining(System.lineSeparator()));
20+
}
21+
822
}
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
package org.springframework.ai.evaluation;
2+
3+
import org.springframework.ai.chat.client.ChatClient;
4+
5+
import java.util.Collections;
6+
7+
/**
8+
* The FactCheckingEvaluator class implements a method for evaluating the factual accuracy
9+
* of Large Language Model (LLM) responses against provided context.
10+
*
11+
* This evaluator addresses a specific type of potential error in LLM outputs known as
12+
* "hallucination" in the context of grounded factuality. It verifies whether a given
13+
* statement (the "claim") is logically supported by a provided context (the "document").
14+
*
15+
* Key concepts: - Document: The context or grounding information against which the claim
16+
* is checked. - Claim: The statement to be verified against the document.
17+
*
18+
* The evaluator uses a prompt-based approach with a separate, typically smaller and more
19+
* efficient LLM to perform the fact-checking. This design choice allows for
20+
* cost-effective and rapid verification, which is crucial when evaluating longer LLM
21+
* outputs that may require multiple verification steps.
22+
*
23+
* Implementation note: For efficient and accurate fact-checking, consider using
24+
* specialized models like Bespoke-Minicheck, a grounded factuality checking model
25+
* developed by Bespoke Labs and available in Ollama. Such models are specifically
26+
* designed to fact-check responses generated by other models, helping to detect and
27+
* reduce hallucinations. For more information, see:
28+
* <a href="https://ollama.com/blog/reduce-hallucinations-with-bespoke-minicheck">Reduce
29+
* Hallucinations with Bespoke-Minicheck</a> and the research paper:
30+
* <a href="https://arxiv.org/pdf/2404.10774v1">MiniCheck: An Efficient Method for LLM
31+
* Hallucination Detection</a>
32+
*
33+
* Note: This evaluator is specifically designed to fact-check statements against given
34+
* information. It's not meant for other types of accuracy tests, like quizzing an AI on
35+
* obscure facts without giving it any reference material to work with (so-called 'closed
36+
* book' scenarios).
37+
*
38+
* The evaluation process aims to determine if the claim is supported by the document,
39+
* returning a boolean result indicating whether the fact-check passed or failed.
40+
*
41+
* @author Eddú Meléndez
42+
* @author Mark Pollack
43+
* @see Evaluator
44+
* @see EvaluationRequest
45+
* @see EvaluationResponse
46+
* @since 1.0.0
47+
*/
48+
public class FactCheckingEvaluator implements Evaluator {
49+
50+
private static final String DEFAULT_EVALUATION_PROMPT_TEXT = """
51+
Document: \\n {document}\\n
52+
Claim: \\n {claim}
53+
""";
54+
55+
private final ChatClient.Builder chatClientBuilder;
56+
57+
/**
58+
* Constructs a new FactCheckingEvaluator with the provided ChatClient.Builder.
59+
* @param chatClientBuilder The builder for the ChatClient used to perform the
60+
* evaluation
61+
*/
62+
public FactCheckingEvaluator(ChatClient.Builder chatClientBuilder) {
63+
this.chatClientBuilder = chatClientBuilder;
64+
}
65+
66+
@Override
67+
/**
68+
* Evaluates whether the response content in the EvaluationRequest is factually
69+
* supported by the context provided in the same request.
70+
* @param evaluationRequest The request containing the response to be evaluated and
71+
* the supporting context
72+
* @return An EvaluationResponse indicating whether the claim is supported by the
73+
* document
74+
*/
75+
public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
76+
var response = evaluationRequest.getResponseContent();
77+
var context = doGetSupportingData(evaluationRequest);
78+
79+
String evaluationResponse = this.chatClientBuilder.build()
80+
.prompt()
81+
.user(userSpec -> userSpec.text(DEFAULT_EVALUATION_PROMPT_TEXT)
82+
.param("document", context)
83+
.param("claim", response))
84+
.call()
85+
.content();
86+
87+
boolean passing = evaluationResponse.equalsIgnoreCase("yes");
88+
return new EvaluationResponse(passing, "", Collections.emptyMap());
89+
}
90+
91+
}

spring-ai-core/src/main/java/org/springframework/ai/evaluation/RelevancyEvaluator.java

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,8 @@
11
package org.springframework.ai.evaluation;
22

33
import org.springframework.ai.chat.client.ChatClient;
4-
import org.springframework.ai.model.Content;
54

65
import java.util.Collections;
7-
import java.util.List;
8-
import java.util.stream.Collectors;
9-
import org.springframework.util.StringUtils;
106

117
public class RelevancyEvaluator implements Evaluator {
128

@@ -53,12 +49,4 @@ public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
5349
return new EvaluationResponse(passing, score, "", Collections.emptyMap());
5450
}
5551

56-
protected String doGetSupportingData(EvaluationRequest evaluationRequest) {
57-
List<Content> data = evaluationRequest.getDataList();
58-
return data.stream()
59-
.map(Content::getContent)
60-
.filter(StringUtils::hasText)
61-
.collect(Collectors.joining(System.lineSeparator()));
62-
}
63-
6452
}

0 commit comments

Comments
 (0)