Add new FactCheckingEvaluator for grounded factuality checking

eddumelendez · markpollack · commit a987e0305a01 · 2024-10-07T14:07:12.000+02:00
- Introduce FactCheckingEvaluator class for LLM response validation
- Implement evaluation logic using ChatClient for fact-checking
- Add comprehensive JavaDoc explaining the evaluator's purpose and usage
- Reference Bespoke-Minicheck model for efficient implementation options
- Include links to Ollama blog post and MiniCheck research paper
- Distinguish from 'closed book' scenario testing in documentation

This new evaluator enables detection and reduction of hallucinations in LLM outputs by checking claims against provided context. It provides a foundation for implementing advanced fact-checking methodologies in Spring AI applications.

See https://ollama.com/blog/reduce-hallucinations-with-bespoke-minicheck
diff --git a/npe.txt b/npe.txt
@@ -0,0 +1,65 @@
+diff --git a/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/OllamaChatModel.java b/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/OllamaChatModel.java
+index c6d689e6..6168ea55 100644
+--- a/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/OllamaChatModel.java
++++ b/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/OllamaChatModel.java
+@@ -211,13 +211,18 @@ public class OllamaChatModel extends AbstractToolCallSupport implements ChatMode
+ 
+ 			Flux<ChatResponse> chatResponse = ollamaResponse.map(chunk -> {
+ 				String content = (chunk.message() != null) ? chunk.message().content() : "";
+-				List<AssistantMessage.ToolCall> toolCalls = chunk.message().toolCalls() == null ? List.of()
+-						: chunk.message()
+-							.toolCalls()
+-							.stream()
+-							.map(toolCall -> new AssistantMessage.ToolCall("", "function", toolCall.function().name(),
+-									ModelOptionsUtils.toJsonString(toolCall.function().arguments())))
+-							.toList();
++
++				List<AssistantMessage.ToolCall> toolCalls = List.of();
++
++				// Added null checks to prevent NPE when accessing tool calls
++				if (chunk.message() != null && chunk.message().toolCalls() != null) {
++					toolCalls = chunk.message()
++						.toolCalls()
++						.stream()
++						.map(toolCall -> new AssistantMessage.ToolCall("", "function", toolCall.function().name(),
++								ModelOptionsUtils.toJsonString(toolCall.function().arguments())))
++						.toList();
++				}
+ 
+ 				var assistantMessage = new AssistantMessage(content, Map.of(), toolCalls);
+ 
+diff --git a/models/spring-ai-ollama/src/test/java/org/springframework/ai/ollama/OllamaChatModelMultimodalIT.java b/models/spring-ai-ollama/src/test/java/org/springframework/ai/ollama/OllamaChatModelMultimodalIT.java
+index f58552f8..4dffc7d2 100644
+--- a/models/spring-ai-ollama/src/test/java/org/springframework/ai/ollama/OllamaChatModelMultimodalIT.java
++++ b/models/spring-ai-ollama/src/test/java/org/springframework/ai/ollama/OllamaChatModelMultimodalIT.java
+@@ -40,6 +40,7 @@ import java.io.IOException;
+ import java.util.List;
+ 
+ import static org.assertj.core.api.Assertions.assertThat;
++import static org.junit.Assert.assertThrows;
+ 
+ @SpringBootTest
+ @Testcontainers
+@@ -67,6 +68,18 @@ class OllamaChatModelMultimodalIT extends BaseOllamaIT {
+ 	@Autowired
+ 	private OllamaChatModel chatModel;
+ 
++	@Test
++	void unsupportedMediaType() throws IOException {
++
++		var imageData = new ClassPathResource("/norway.webp");
++
++		var userMessage = new UserMessage("Explain what do you see on this picture?",
++				List.of(new Media(MimeTypeUtils.IMAGE_PNG, imageData)));
++
++		assertThrows(RuntimeException.class, () -> chatModel.call(new Prompt(List.of(userMessage))));
++
++	}
++
+ 	@Test
+ 	void multiModalityTest() throws IOException {
+ 
+diff --git a/models/spring-ai-ollama/src/test/resources/norway.webp b/models/spring-ai-ollama/src/test/resources/norway.webp
+new file mode 100644
+index 00000000..0da983e2
+Binary files /dev/null and b/models/spring-ai-ollama/src/test/resources/norway.webp differ
diff --git a/spring-ai-core/src/main/java/org/springframework/ai/evaluation/EvaluationRequest.java b/spring-ai-core/src/main/java/org/springframework/ai/evaluation/EvaluationRequest.java
@@ -12,6 +12,7 @@
  * correctness of the chat response based on the context.
  *
  * @author Mark Pollack
+ * @author Eddú Meléndez
  * @since 1.0.0 M1
  */
 public class EvaluationRequest {
@@ -26,6 +27,10 @@ public EvaluationRequest(String userText, String responseContent) {
 		this(userText, Collections.emptyList(), responseContent);
 	}
 
+	public EvaluationRequest(List<Content> dataList, String responseContent) {
+		this("", dataList, responseContent);
+	}
+
 	public EvaluationRequest(String userText, List<Content> dataList, String responseContent) {
 		this.userText = userText;
 		this.dataList = dataList;
diff --git a/spring-ai-core/src/main/java/org/springframework/ai/evaluation/EvaluationResponse.java b/spring-ai-core/src/main/java/org/springframework/ai/evaluation/EvaluationResponse.java
@@ -5,21 +5,29 @@
 
 public class EvaluationResponse {
 
-	private boolean pass;
+	private final boolean pass;
 
-	private float score;
+	private final float score;
 
-	private String feedback;
+	private final String feedback;
 
-	Map<String, Object> metadata;
+	private final Map<String, Object> metadata;
 
+	@Deprecated
 	public EvaluationResponse(boolean pass, float score, String feedback, Map<String, Object> metadata) {
 		this.pass = pass;
 		this.score = score;
 		this.feedback = feedback;
 		this.metadata = metadata;
 	}
 
+	public EvaluationResponse(boolean pass, String feedback, Map<String, Object> metadata) {
+		this.pass = pass;
+		this.score = 0;
+		this.feedback = feedback;
+		this.metadata = metadata;
+	}
+
 	public boolean isPass() {
 		return pass;
 	}
diff --git a/spring-ai-core/src/main/java/org/springframework/ai/evaluation/Evaluator.java b/spring-ai-core/src/main/java/org/springframework/ai/evaluation/Evaluator.java
@@ -1,8 +1,22 @@
 package org.springframework.ai.evaluation;
 
+import org.springframework.ai.model.Content;
+import org.springframework.util.StringUtils;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
 @FunctionalInterface
 public interface Evaluator {
 
 	EvaluationResponse evaluate(EvaluationRequest evaluationRequest);
 
+	default String doGetSupportingData(EvaluationRequest evaluationRequest) {
+		List<Content> data = evaluationRequest.getDataList();
+		return data.stream()
+			.map(Content::getContent)
+			.filter(StringUtils::hasText)
+			.collect(Collectors.joining(System.lineSeparator()));
+	}
+
 }
diff --git a/spring-ai-core/src/main/java/org/springframework/ai/evaluation/FactCheckingEvaluator.java b/spring-ai-core/src/main/java/org/springframework/ai/evaluation/FactCheckingEvaluator.java
@@ -0,0 +1,91 @@
+package org.springframework.ai.evaluation;
+
+import org.springframework.ai.chat.client.ChatClient;
+
+import java.util.Collections;
+
+/**
+ * The FactCheckingEvaluator class implements a method for evaluating the factual accuracy
+ * of Large Language Model (LLM) responses against provided context.
+ *
+ * This evaluator addresses a specific type of potential error in LLM outputs known as
+ * "hallucination" in the context of grounded factuality. It verifies whether a given
+ * statement (the "claim") is logically supported by a provided context (the "document").
+ *
+ * Key concepts: - Document: The context or grounding information against which the claim
+ * is checked. - Claim: The statement to be verified against the document.
+ *
+ * The evaluator uses a prompt-based approach with a separate, typically smaller and more
+ * efficient LLM to perform the fact-checking. This design choice allows for
+ * cost-effective and rapid verification, which is crucial when evaluating longer LLM
+ * outputs that may require multiple verification steps.
+ *
+ * Implementation note: For efficient and accurate fact-checking, consider using
+ * specialized models like Bespoke-Minicheck, a grounded factuality checking model
+ * developed by Bespoke Labs and available in Ollama. Such models are specifically
+ * designed to fact-check responses generated by other models, helping to detect and
+ * reduce hallucinations. For more information, see:
+ * <a href="https://ollama.com/blog/reduce-hallucinations-with-bespoke-minicheck">Reduce
+ * Hallucinations with Bespoke-Minicheck</a> and the research paper:
+ * <a href="https://arxiv.org/pdf/2404.10774v1">MiniCheck: An Efficient Method for LLM
+ * Hallucination Detection</a>
+ *
+ * Note: This evaluator is specifically designed to fact-check statements against given
+ * information. It's not meant for other types of accuracy tests, like quizzing an AI on
+ * obscure facts without giving it any reference material to work with (so-called 'closed
+ * book' scenarios).
+ *
+ * The evaluation process aims to determine if the claim is supported by the document,
+ * returning a boolean result indicating whether the fact-check passed or failed.
+ *
+ * @author Eddú Meléndez
+ * @author Mark Pollack
+ * @see Evaluator
+ * @see EvaluationRequest
+ * @see EvaluationResponse
+ * @since 1.0.0
+ */
+public class FactCheckingEvaluator implements Evaluator {
+
+	private static final String DEFAULT_EVALUATION_PROMPT_TEXT = """
+			    Document: \\n {document}\\n
+			    Claim: \\n {claim}
+			""";
+
+	private final ChatClient.Builder chatClientBuilder;
+
+	/**
+	 * Constructs a new FactCheckingEvaluator with the provided ChatClient.Builder.
+	 * @param chatClientBuilder The builder for the ChatClient used to perform the
+	 * evaluation
+	 */
+	public FactCheckingEvaluator(ChatClient.Builder chatClientBuilder) {
+		this.chatClientBuilder = chatClientBuilder;
+	}
+
+	/**
+	 * Evaluates whether the response content in the EvaluationRequest is factually
+	 * supported by the context provided in the same request.
+	 * @param evaluationRequest The request containing the response to be evaluated
+	 * (the claim) and the supporting context (the document)
+	 * @return An EvaluationResponse that passes only when the model answers "yes"
+	 * (case-insensitive, surrounding whitespace ignored)
+	 */
+	@Override
+	public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
+		var response = evaluationRequest.getResponseContent();
+		var context = doGetSupportingData(evaluationRequest);
+
+		String evaluationResponse = this.chatClientBuilder.build()
+			.prompt()
+			.user(userSpec -> userSpec.text(DEFAULT_EVALUATION_PROMPT_TEXT)
+				.param("document", context)
+				.param("claim", response))
+			.call()
+			.content();
+		// A missing reply counts as a failed check; padding around the verdict is ignored.
+		boolean passing = evaluationResponse != null && "yes".equalsIgnoreCase(evaluationResponse.strip());
+		return new EvaluationResponse(passing, "", Collections.emptyMap());
+	}
+
+}
diff --git a/spring-ai-core/src/main/java/org/springframework/ai/evaluation/RelevancyEvaluator.java b/spring-ai-core/src/main/java/org/springframework/ai/evaluation/RelevancyEvaluator.java
@@ -1,12 +1,8 @@
 package org.springframework.ai.evaluation;
 
 import org.springframework.ai.chat.client.ChatClient;
-import org.springframework.ai.model.Content;
 
 import java.util.Collections;
-import java.util.List;
-import java.util.stream.Collectors;
-import org.springframework.util.StringUtils;
 
 public class RelevancyEvaluator implements Evaluator {
 
@@ -53,12 +49,4 @@ public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
 		return new EvaluationResponse(passing, score, "", Collections.emptyMap());
 	}
 
-	protected String doGetSupportingData(EvaluationRequest evaluationRequest) {
-		List<Content> data = evaluationRequest.getDataList();
-		return data.stream()
-			.map(Content::getContent)
-			.filter(StringUtils::hasText)
-			.collect(Collectors.joining(System.lineSeparator()));
-	}
-
 }