Skip to content

Commit 8c73037

Browse files
authored
Simplify eval arg names (#6944)
It will be easier to switch between these evaluators if the names of the prediction arguments are consistent.
1 parent 8f5eca2 commit 8c73037

File tree

8 files changed

+52
-51
lines changed

8 files changed

+52
-51
lines changed

docs/extras/guides/evaluation/comparisons.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -243,8 +243,8 @@
243243
" pred_a, pred_b = res_b, res_a\n",
244244
" a, b = \"b\", \"a\"\n",
245245
" eval_res = eval_chain.evaluate_string_pairs(\n",
246-
" output_a=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
247-
" output_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
246+
" prediction=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
247+
" prediction_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
248248
" input=input_\n",
249249
" )\n",
250250
" if eval_res[\"value\"] == \"A\":\n",

langchain/evaluation/agents/trajectory_eval_chain.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def geography_answers(country: str, question: str) -> str:
105105
result = eval_chain.evaluate_agent_trajectory(
106106
input=question,
107107
agent_trajectory=response["intermediate_steps"],
108-
output=response["output"],
108+
prediction=response["output"],
109109
reference="Paris",
110110
)
111111
print(result["score"])
@@ -325,9 +325,9 @@ async def _acall(
325325
def evaluate_agent_trajectory(
326326
self,
327327
*,
328+
prediction: str,
328329
input: str,
329330
agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
330-
output: str,
331331
reference: Optional[str] = None,
332332
callbacks: Callbacks = None,
333333
**kwargs: Any,
@@ -338,7 +338,7 @@ def evaluate_agent_trajectory(
338338
input (str): The input question.
339339
agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
340340
The intermediate steps forming the agent trajectory.
341-
output (str): The expected output.
341+
prediction (str): The expected prediction.
342342
reference (Optional[str]): The reference answer.
343343
344344
Returns:
@@ -347,17 +347,17 @@ def evaluate_agent_trajectory(
347347
inputs = {
348348
"question": input,
349349
"agent_trajectory": self.get_agent_trajectory(agent_trajectory),
350-
"answer": output,
350+
"answer": prediction,
351351
"reference": self._format_reference(reference),
352352
}
353353
return self(inputs=inputs, callbacks=callbacks, **kwargs)
354354

355355
async def aevaluate_agent_trajectory(
356356
self,
357357
*,
358+
prediction: str,
358359
input: str,
359360
agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
360-
output: str,
361361
reference: Optional[str] = None,
362362
callbacks: Callbacks = None,
363363
**kwargs: Any,
@@ -368,7 +368,7 @@ async def aevaluate_agent_trajectory(
368368
input (str): The input question.
369369
agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
370370
The intermediate steps forming the agent trajectory.
371-
output (str): The expected output.
371+
prediction (str): The expected prediction.
372372
reference (Optional[str]): The reference answer.
373373
374374
Returns:
@@ -377,7 +377,7 @@ async def aevaluate_agent_trajectory(
377377
inputs = {
378378
"question": input,
379379
"agent_trajectory": self.get_agent_trajectory(agent_trajectory),
380-
"answer": output,
380+
"answer": prediction,
381381
"reference": self._format_reference(reference),
382382
}
383383
return await self.acall(

langchain/evaluation/comparison/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
1313
>>> result = chain.evaluate_string_pairs(
1414
... input = "What is the chemical formula for water?",
15-
... output_a = "H2O",
16-
... output_b = (
15+
... prediction = "H2O",
16+
... prediction_b = (
1717
... "The chemical formula for water is H2O, which means"
1818
... " there are two hydrogen atoms and one oxygen atom."
1919
... referenc = "The chemical formula for water is H2O.",

langchain/evaluation/comparison/eval_chain.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,8 @@ class PairwiseStringEvalChain(LLMChain):
6060
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
6161
>>> result = chain.evaluate_string_pairs(
6262
... input = "What is the chemical formula for water?",
63-
... output_a = "H2O",
64-
... output_b = (
63+
... prediction = "H2O",
64+
... prediction_b = (
6565
... "The chemical formula for water is H2O, which means"
6666
... " there are two hydrogen atoms and one oxygen atom."
6767
... referenc = "The chemical formula for water is H2O.",
@@ -101,7 +101,7 @@ def from_llm(
101101
Returns:
102102
PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.
103103
"""
104-
expected_input_vars = {"output_a", "output_b", "input"}
104+
expected_input_vars = {"prediction", "prediction_b", "input"}
105105
if prompt is None:
106106
if require_reference:
107107
expected_input_vars.add("reference")
@@ -121,11 +121,11 @@ def from_llm(
121121
return cls(llm=llm, prompt=prompt_, **kwargs)
122122

123123
def _prepare_input(
124-
self, output_a: str, output_b: str, input: str, reference: Optional[str]
124+
self, prediction: str, prediction_b: str, input: str, reference: Optional[str]
125125
) -> dict:
126126
input_ = {
127-
"output_a": output_a,
128-
"output_b": output_b,
127+
"prediction": prediction,
128+
"prediction_b": prediction_b,
129129
"input": input,
130130
}
131131
if reference is not None and "reference" in self.prompt.input_variables:
@@ -135,8 +135,8 @@ def _prepare_input(
135135
def evaluate_string_pairs(
136136
self,
137137
*,
138-
output_a: str,
139-
output_b: str,
138+
prediction: str,
139+
prediction_b: str,
140140
input: str,
141141
reference: Optional[str] = None,
142142
callbacks: Callbacks = None,
@@ -145,8 +145,8 @@ def evaluate_string_pairs(
145145
"""Evaluate whether output A is preferred to output B.
146146
147147
Args:
148-
output_a (str): The output string from the first model.
149-
output_b (str): The output string from the second model.
148+
prediction (str): The output string from the first model.
149+
prediction_b (str): The output string from the second model.
150150
input (str): The input or task string.
151151
callbacks (Callbacks, optional): The callbacks to use.
152152
reference (str, optional): The reference string, if any.
@@ -160,7 +160,7 @@ def evaluate_string_pairs(
160160
- score: The preference score, which is 1 for 'A', 0 for 'B',
161161
and 0.5 for None.
162162
"""
163-
input_ = self._prepare_input(output_a, output_b, input, reference)
163+
input_ = self._prepare_input(prediction, prediction_b, input, reference)
164164
result = self(
165165
inputs=input_,
166166
callbacks=callbacks,
@@ -171,8 +171,8 @@ def evaluate_string_pairs(
171171
async def aevaluate_string_pairs(
172172
self,
173173
*,
174-
output_a: str,
175-
output_b: str,
174+
prediction: str,
175+
prediction_b: str,
176176
input: str,
177177
reference: Optional[str] = None,
178178
callbacks: Callbacks = None,
@@ -181,8 +181,8 @@ async def aevaluate_string_pairs(
181181
"""Asynchronously evaluate whether output A is preferred to output B.
182182
183183
Args:
184-
output_a (str): The output string from the first model.
185-
output_b (str): The output string from the second model.
184+
prediction (str): The output string from the first model.
185+
prediction_b (str): The output string from the second model.
186186
input (str): The input or task string.
187187
callbacks (Callbacks, optional): The callbacks to use.
188188
reference (str, optional): The reference string, if any.
@@ -196,7 +196,7 @@ async def aevaluate_string_pairs(
196196
- score: The preference score, which is 1 for 'A', 0 for 'B',
197197
and 0.5 for None.
198198
"""
199-
input_ = self._prepare_input(output_a, output_b, input, reference)
199+
input_ = self._prepare_input(prediction, prediction_b, input, reference)
200200
result = await self.acall(
201201
inputs=input_,
202202
callbacks=callbacks,

langchain/evaluation/comparison/prompt.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,14 @@
2121
[/QUESTION]
2222
2323
[RESPONSE A]
24-
{output_a}
24+
{prediction}
2525
[/RESPONSE A]
2626
2727
[RESPONSE B]
28-
{output_b}
28+
{prediction_b}
2929
[/RESPONSE B]"""
3030
PROMPT = PromptTemplate(
31-
input_variables=["input", "output_a", "output_b"], template=template
31+
input_variables=["input", "prediction", "prediction_b"], template=template
3232
)
3333

3434
template = """Act as a fair judge and rate the two responses to the question below.\
@@ -52,13 +52,14 @@
5252
[/QUESTION]
5353
5454
[RESPONSE A]
55-
{output_a}
55+
{prediction}
5656
[/RESPONSE A]
5757
5858
[RESPONSE B]
59-
{output_b}
59+
{prediction_b}
6060
[/RESPONSE B]"""
6161

6262
PROMPT_WITH_REFERENCE = PromptTemplate(
63-
input_variables=["input", "output_a", "output_b", "reference"], template=template
63+
input_variables=["input", "prediction", "prediction_b", "reference"],
64+
template=template,
6465
)

langchain/evaluation/schema.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,17 +62,17 @@ class PairwiseStringEvaluator(Protocol):
6262
def evaluate_string_pairs(
6363
self,
6464
*,
65-
output_a: str,
66-
output_b: str,
65+
prediction: str,
66+
prediction_b: str,
6767
reference: Optional[str] = None,
6868
input: Optional[str] = None,
6969
**kwargs: Any,
7070
) -> dict:
7171
"""Evaluate the output string pairs.
7272
7373
Args:
74-
output_a (str): The output string from the first model.
75-
output_b (str): The output string from the second model.
74+
prediction (str): The output string from the first model.
75+
prediction_b (str): The output string from the second model.
7676
reference (str, optional): The expected output / reference
7777
string. Defaults to None.
7878
input (str, optional): The input string. Defaults to None.
@@ -86,17 +86,17 @@ def evaluate_string_pairs(
8686

8787
async def aevaluate_string_pairs(
8888
self,
89-
output_a: str,
90-
output_b: str,
89+
prediction: str,
90+
prediction_b: str,
9191
reference: Optional[str] = None,
9292
input: Optional[str] = None,
9393
**kwargs: Any,
9494
) -> dict:
9595
"""Evaluate the output string pairs.
9696
9797
Args:
98-
output_a (str): The output string from the first model.
99-
output_b (str): The output string from the second model.
98+
prediction (str): The output string from the first model.
99+
prediction_b (str): The output string from the second model.
100100
reference (str, optional): The expected output / reference
101101
string. Defaults to None.
102102
input (str, optional): The input string. Defaults to None.

tests/unit_tests/evaluation/agents/test_eval_chain.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,14 @@ def test_trajectory_eval_chain(
4545
res = chain.evaluate_agent_trajectory(
4646
input="What is your favorite food?",
4747
agent_trajectory=intermediate_steps,
48-
output="I like pie.",
48+
prediction="I like pie.",
4949
)
5050
assert res["score"] == 5
5151
# Test when ref is provided
5252
res = chain.evaluate_agent_trajectory(
5353
input="What is your favorite food?",
5454
agent_trajectory=intermediate_steps,
55-
output="I like pie.",
55+
prediction="I like pie.",
5656
reference="Paris",
5757
)
5858
assert res["score"] == 1
@@ -72,13 +72,13 @@ def test_trajectory_eval_chain_no_tools(
7272
res = chain.evaluate_agent_trajectory(
7373
input="What is your favorite food?",
7474
agent_trajectory=intermediate_steps,
75-
output="I like pie.",
75+
prediction="I like pie.",
7676
)
7777
assert res["score"] == 5
7878
res = chain.evaluate_agent_trajectory(
7979
input="What is your favorite food?",
8080
agent_trajectory=intermediate_steps,
81-
output="I like pie.",
81+
prediction="I like pie.",
8282
reference="Paris",
8383
)
8484
assert res["score"] == 1

tests/unit_tests/evaluation/comparison/test_eval_chain.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,23 +16,23 @@ def test_pairwise_string_comparison_chain() -> None:
1616
)
1717
chain = PairwiseStringEvalChain.from_llm(llm=llm)
1818
res = chain.evaluate_string_pairs(
19-
output_a="I like pie.",
20-
output_b="I love pie.",
19+
prediction="I like pie.",
20+
prediction_b="I love pie.",
2121
input="What is your favorite food?",
2222
)
2323
assert res["value"] is None
2424
assert res["score"] == 0.5
2525
assert res["reasoning"] == "The values are the same."
2626
res = chain.evaluate_string_pairs(
27-
output_a="I like pie.",
28-
output_b="I like pie.",
27+
prediction="I like pie.",
28+
prediction_b="I like pie.",
2929
input="What is your favorite food?",
3030
)
3131
assert res["value"] == "A"
3232
assert res["score"] == 1
3333
res = chain.evaluate_string_pairs(
34-
output_a="I like pie.",
35-
output_b="I hate pie.",
34+
prediction="I like pie.",
35+
prediction_b="I hate pie.",
3636
input="What is your favorite food?",
3737
)
3838
assert res["value"] == "B"

0 commit comments

Comments (0)