@@ -7,29 +7,19 @@ import {
} from "mongodb-rag-core/braintrust";
import {
  Conversation,
-  generateResponse,
-  GenerateResponseParams,
+  GenerateResponse,
  logger,
  Message,
} from "mongodb-chatbot-server";
import { ObjectId } from "mongodb-rag-core/mongodb";

-import {
-  AnswerRelevancy,
-  ContextRelevancy,
-  Faithfulness,
-  Factuality,
-} from "autoevals";
+import { ContextRelevancy, Faithfulness, Factuality } from "autoevals";
import { strict as assert } from "assert";
import { MongoDbTag } from "mongodb-rag-core/mongoDbMetadata";
import { fuzzyLinkMatch } from "./fuzzyLinkMatch";
import { binaryNdcgAtK } from "./scorers/binaryNdcgAtK";
import { ConversationEvalCase as ConversationEvalCaseSource } from "mongodb-rag-core/eval";
-import {
-  getLastUserMessageFromMessages,
-  getLastAssistantMessageFromMessages,
-  getContextsFromUserMessage,
-} from "./evalHelpers";
+import { extractTracingData } from "../tracing/extractTracingData";

interface ConversationEvalCaseInput {
  previousConversation: Conversation;
@@ -40,6 +30,7 @@ type ConversationEvalCaseExpected = {
  links?: string[];
  reference?: string;
  expectation?: string;
+  reject?: boolean;
};

interface ConversationEvalCase
@@ -69,10 +60,16 @@ type ConversationEvalScorer = EvalScorer<

// -- Evaluation metrics --
const RetrievedContext: ConversationEvalScorer = async (args) => {
-  args.output.context;
+  const name = "RetrievedContext";
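+  // No context on the output at all: return a null score rather than 0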
+  if (!args.output.context) {
+    return {
+      name,
+      score: null,
+    };
+  }
  return {
-    name: "RetrievedContext",
-    score: args.output.context?.length ? 1 : 0,
+    name,
+    score: args.output.context.length ? 1 : 0,
  };
};

@@ -83,6 +80,22 @@ const AllowedQuery: ConversationEvalScorer = async (args) => {
  };
};

+const InputGuardrailExpected: ConversationEvalScorer = async (args) => {
+  const name = "InputGuardrail";
+  // Skip running eval if no expected reject
+  if (!args.expected.reject) {
+    return {
+      name,
+      score: null,
+    };
+  }
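+  // An expected rejection should correspond to the query being disallowed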
+  const match = args.expected.reject === !args.output.allowedQuery;
+  return {
+    name,
+    score: match ? 1 : 0,
+  };
+};
+
const BinaryNdcgAt5: ConversationEvalScorer = async (args) => {
  const name = "BinaryNdcgAt5";
  const k = 5;
@@ -141,14 +154,15 @@ type ConversationEvalScorerConstructor = (

const makeConversationFaithfulness: ConversationEvalScorerConstructor =
  (judgeModelConfig) => async (args) => {
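+    // Faithfulness judges the answer against retrieved context; with no context, return a null score instead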
+    if (args.output.context?.length === 0) {
+      return {
+        name: "Faithfulness",
+        score: null,
+      };
+    }
    return Faithfulness(getConversationRagasConfig(args, judgeModelConfig));
  };

-const makeConversationAnswerRelevancy: ConversationEvalScorerConstructor =
-  (judgeModelConfig) => async (args) => {
-    return AnswerRelevancy(getConversationRagasConfig(args, judgeModelConfig));
-  };
-
const makeConversationContextRelevancy: ConversationEvalScorerConstructor =
  (judgeModelConfig) => async (args) => {
    return ContextRelevancy(getConversationRagasConfig(args, judgeModelConfig));
@@ -176,32 +190,19 @@ export interface MakeConversationEvalParams {
  experimentName: string;
  metadata?: Record<string, unknown>;
  maxConcurrency?: number;
-  generate: Pick<
-    GenerateResponseParams,
-    | "filterPreviousMessages"
-    | "generateUserPrompt"
-    | "llmNotWorkingMessage"
-    | "llm"
-    | "noRelevantContentMessage"
-  > & {
-    systemPrompt: {
-      content: string;
-      role: "system";
-    };
-  };
+  generateResponse: GenerateResponse;
}
-export function makeConversationEval({
+export async function makeConversationEval({
  conversationEvalCases,
  judgeModelConfig,
  projectName,
  experimentName,
  metadata,
  maxConcurrency,
-  generate,
+  generateResponse,
}: MakeConversationEvalParams) {
  const Factuality = makeFactuality(judgeModelConfig);
  const Faithfullness = makeConversationFaithfulness(judgeModelConfig);
-  const AnswerRelevancy = makeConversationAnswerRelevancy(judgeModelConfig);
  const ContextRelevancy = makeConversationContextRelevancy(judgeModelConfig);

  return Eval(projectName, {
@@ -216,11 +217,6 @@ export function makeConversationEval({
          createdAt: new Date(),
        } satisfies Message)
      );
-      prevConversationMessages.unshift({
-        ...generate.systemPrompt,
-        id: new ObjectId(),
-        createdAt: new Date(),
-      } satisfies Message);
      const latestMessageText = evalCase.messages.at(-1)?.content;
      assert(latestMessageText, "No latest message text found");
      return {
@@ -238,6 +234,7 @@ export function makeConversationEval({
          expectation: evalCase.expectation,
          reference: evalCase.reference,
          links: evalCase.expectedLinks,
+          reject: evalCase.reject,
        },
        metadata: null,
      } satisfies ConversationEvalCase;
@@ -248,33 +245,34 @@ export function makeConversationEval({
    maxConcurrency,
    async task(input): Promise<ConversationTaskOutput> {
      try {
-        const generated = await traced(
+        const id = new ObjectId();
+        const { messages } = await traced(
          async () =>
            generateResponse({
              conversation: input.previousConversation,
              latestMessageText: input.latestMessageText,
-              llm: generate.llm,
-              llmNotWorkingMessage: generate.llmNotWorkingMessage,
-              noRelevantContentMessage: generate.noRelevantContentMessage,
-              reqId: input.latestMessageText,
+              reqId: id.toHexString(),
              shouldStream: false,
-              generateUserPrompt: generate.generateUserPrompt,
-              filterPreviousMessages: generate.filterPreviousMessages,
            }),
          {
            name: "generateResponse",
          }
        );
-        const userMessage = getLastUserMessageFromMessages(generated.messages);
-        const finalAssistantMessage = getLastAssistantMessageFromMessages(
-          generated.messages
-        );
-        const contextInfo = getContextsFromUserMessage(userMessage);
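+        // Stamp each message with a DB-style id and createdAt; the last message reuses the ObjectId that is also passed to extractTracingData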
+        const mockDbMessages = messages.map((m, i) => {
+          const msgId = i === messages.length - 1 ? id : new ObjectId();
+          return { ...m, id: msgId, createdAt: new Date() };
+        });
+
+        const { rejectQuery, userMessage, contextContent, assistantMessage } =
+          extractTracingData(mockDbMessages, id);
+        assert(assistantMessage, "No assistant message found");
+        assert(contextContent, "No context content found");
+        assert(userMessage, "No user message found");
        return {
-          assistantMessageContent: finalAssistantMessage.content,
-          context: contextInfo?.contexts,
-          urls: contextInfo?.urls,
-          allowedQuery: !userMessage.rejectQuery,
+          assistantMessageContent: assistantMessage.content,
+          context: contextContent.map((c) => c.text),
+          urls: assistantMessage.references?.map((r) => r.url),
+          allowedQuery: !rejectQuery,
        };
      } catch (error) {
        logger.error(`Error evaluating input: ${input.latestMessageText}`);
@@ -288,7 +286,7 @@ export function makeConversationEval({
      BinaryNdcgAt5,
      Factuality,
      Faithfullness,
-      AnswerRelevancy,
+      InputGuardrailExpected,
      ContextRelevancy,
    ],
  });
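
With the old `generate` options object replaced by a single `GenerateResponse` function, a caller would wire the eval up roughly like the sketch below. This is a minimal sketch only: the module path, project and experiment names, and the `declare const` placeholders are assumptions for illustration; only the option names shown in this diff come from the code.

```ts
import type { GenerateResponse } from "mongodb-chatbot-server";
// Assumed module path for the file changed in this diff.
import {
  makeConversationEval,
  type MakeConversationEvalParams,
} from "./ConversationEval";

// Placeholders for values the app already has; shapes come from MakeConversationEvalParams.
declare const generateResponse: GenerateResponse;
declare const conversationEvalCases: MakeConversationEvalParams["conversationEvalCases"];
declare const judgeModelConfig: MakeConversationEvalParams["judgeModelConfig"];

// makeConversationEval is now async, so the caller awaits it.
await makeConversationEval({
  projectName: "mongodb-chatbot-conversations", // assumed
  experimentName: "generate-response-refactor", // assumed
  conversationEvalCases,
  judgeModelConfig,
  maxConcurrency: 5,
  generateResponse, // single function replaces the old `generate` options object
});
```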