        )
```

## Advanced Example: LLM-as-a-Judge Conversation Metric

For more sophisticated evaluation, you can use an LLM to judge conversation quality. This pattern is particularly useful when you need nuanced assessment of conversation attributes like helpfulness, coherence, or tone.

Here's an example that evaluates the quality of assistant responses:

### Step 1: Define the Output Schema

```python
import pydantic

class ConversationQualityScore(pydantic.BaseModel):
    """Schema for LLM judge output."""
    score_value: float  # Score between 0.0 and 1.0
    reason: str  # Explanation for the score

    __hash__ = object.__hash__
```
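
As a quick sanity check, you can validate a hand-written judge response against this schema before involving a real model. The dictionary below is invented purely for illustration:

```python
# A made-up judge response, validated against the schema defined above
raw_judge_output = {"score_value": 0.85, "reason": "Clear, relevant answers across all turns."}

parsed = ConversationQualityScore.model_validate(raw_judge_output)
print(parsed.score_value)  # 0.85
print(parsed.reason)       # "Clear, relevant answers across all turns."
```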

### Step 2: Create the Evaluation Prompt

```python
def create_evaluation_prompt(conversation: list) -> str:
    """
    Create a prompt that asks the LLM to evaluate conversation quality.
    """
    return f"""Evaluate the quality of the assistant's responses in this conversation.
Consider the following criteria:
1. Helpfulness: Does the assistant provide useful, relevant information?
2. Clarity: Are the responses clear and easy to understand?
3. Consistency: Does the assistant maintain context across turns?
4. Professionalism: Is the tone appropriate and respectful?

Return a JSON object with:
- score_value: A number between 0.0 (poor) and 1.0 (excellent)
- reason: A brief explanation of your assessment

Conversation:
{conversation}

Your evaluation (JSON only):
"""
```
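
To see exactly what the judge receives, you can render the prompt for a small sample conversation. The role/content message shape below is only an assumption for illustration; the list is interpolated into the prompt as-is, so whatever structure your threads use will appear verbatim:

```python
# An invented two-turn conversation, used only to preview the rendered prompt
sample_conversation = [
    {"role": "user", "content": "How do I reset my password?"},
    {"role": "assistant", "content": "Go to Settings > Security and choose 'Reset password'."},
]

print(create_evaluation_prompt(sample_conversation))
```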

### Step 3: Implement the Metric

```python
import logging
from typing import Optional, Union, Any

import pydantic

from opik import exceptions
from opik.evaluation.metrics import score_result
from opik.evaluation.metrics.conversation import (
    ConversationThreadMetric,
    types as conversation_types,
)
from opik.evaluation.metrics.llm_judges import parsing_helpers
from opik.evaluation.models import base_model, models_factory

LOGGER = logging.getLogger(__name__)


class ConversationQualityMetric(ConversationThreadMetric):
    """
    An LLM-as-judge metric that evaluates conversation quality.

    Args:
        model: The LLM to use as a judge (e.g., "gpt-4", "claude-3-5-sonnet-20241022").
            If None, uses the default model.
        name: The name of this metric.
        track: Whether to track the metric in Opik.
        project_name: Optional project name for tracking.
    """

    def __init__(
        self,
        model: Optional[Union[str, base_model.OpikBaseModel]] = None,
        name: str = "conversation_quality_score",
        track: bool = True,
        project_name: Optional[str] = None,
    ):
        super().__init__(name=name, track=track, project_name=project_name)
        self._init_model(model)

    def _init_model(
        self, model: Optional[Union[str, base_model.OpikBaseModel]]
    ) -> None:
        """Initialize the LLM model for judging."""
        if isinstance(model, base_model.OpikBaseModel):
            self._model = model
        else:
            # Get model from factory (supports various providers via LiteLLM)
            self._model = models_factory.get(model_name=model)

    def score(
        self,
        conversation: conversation_types.Conversation,
        **kwargs: Any,
    ) -> score_result.ScoreResult:
        """
        Evaluate the conversation quality using an LLM judge.

        Args:
            conversation: List of conversation messages.
            **kwargs: Additional arguments (ignored).

        Returns:
            ScoreResult with value between 0.0 and 1.0.
        """
        try:
            # Create the evaluation prompt
            llm_query = create_evaluation_prompt(conversation)

            # Call the LLM with structured output
            model_output = self._model.generate_string(
                input=llm_query,
                response_format=ConversationQualityScore,
            )

            # Parse the LLM response
            score_data = self._parse_llm_output(model_output)

            # Ensure score is within valid range [0.0, 1.0]
            validated_score = max(0.0, min(1.0, score_data.score_value))

            return score_result.ScoreResult(
                name=self.name,
                value=validated_score,
                reason=score_data.reason,
            )

        except Exception as e:
            LOGGER.error(f"Failed to calculate conversation quality: {e}")
            raise exceptions.MetricComputationError(
                f"Failed to calculate conversation quality: {e}"
            ) from e

    def _parse_llm_output(self, model_output: str) -> ConversationQualityScore:
        """Parse and validate the LLM's output."""
        try:
            # Extract JSON from the model output
            dict_content = parsing_helpers.extract_json_content_or_raise(
                model_output
            )

            # Validate against schema
            return ConversationQualityScore.model_validate(dict_content)

        except pydantic.ValidationError as e:
            LOGGER.warning(
                f"Failed to parse LLM output: {model_output}, error: {e}",
                exc_info=True,
            )
            raise
```
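
Before wiring the metric into a full thread evaluation, you can call `score` directly on a small conversation. This is only a sketch: it assumes the role/content message format, a configured provider key (e.g. `OPENAI_API_KEY`) for the judge model, and invented sample messages:

```python
# Direct call to the metric, outside evaluate_threads
metric = ConversationQualityMetric(model="gpt-4o", track=False)

conversation = [
    {"role": "user", "content": "Can you summarize your refund policy?"},
    {"role": "assistant", "content": "Refunds are available within 30 days of purchase with proof of payment."},
]

result = metric.score(conversation=conversation)
print(result.value, result.reason)
```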

### Step 4: Use the Metric

```python
from opik.evaluation import evaluate_threads

# Initialize the metric with your preferred judge model
quality_metric = ConversationQualityMetric(
    model="gpt-4o",  # or "claude-3-5-sonnet-20241022", etc.
    name="conversation_quality",
)

# Evaluate threads in your project
results = evaluate_threads(
    project_name="my_chatbot_project",
    eval_project_name="quality_evaluation",
    metrics=[quality_metric],
)
```

### Key Patterns in LLM-as-Judge Metrics

When building LLM-as-judge metrics, follow these best practices:

1. **Structured Output**: Use Pydantic models to ensure consistent LLM responses (see the sketch after this list)
2. **Clear Prompts**: Provide specific evaluation criteria to the judge
3. **Error Handling**: Wrap LLM calls in try-except blocks with proper logging
4. **Model Flexibility**: Allow users to specify their preferred judge model
5. **Reason Field**: Always include an explanation for transparency

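As a variant of the structured-output pattern above, you can push the range check into the schema itself so that out-of-range judge output fails validation instead of being silently clamped. This is a sketch; `StrictConversationQualityScore` is a hypothetical name, not part of the SDK:

```python
import pydantic

class StrictConversationQualityScore(pydantic.BaseModel):
    """Judge output schema that enforces constraints at validation time."""
    # ge/le reject scores outside [0.0, 1.0] with a pydantic.ValidationError
    score_value: float = pydantic.Field(ge=0.0, le=1.0)
    # require a non-empty explanation
    reason: str = pydantic.Field(min_length=1)

    __hash__ = object.__hash__
```

With constraints on the schema, `_parse_llm_output` raises on a bad score (which the metric already logs and re-raises), and the `max`/`min` clamp in `score` becomes a secondary guard rather than the only one.
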
## Using Custom Conversation Metrics

You can use custom metrics with `evaluate_threads`:

```python
from opik.evaluation import evaluate_threads

# Initialize your metrics
conversation_length_metric = ConversationLengthMetric()
quality_metric = ConversationQualityMetric(model="gpt-4o")

# Evaluate threads in your project
results = evaluate_threads(
    project_name="my_chatbot_project",
    filter_string='status = "inactive"',
    eval_project_name="chatbot_evaluation",
    metrics=[conversation_length_metric, quality_metric],
    trace_input_transform=lambda x: x["input"],
    trace_output_transform=lambda x: x["output"],
)
```