Adding Graders¶
Create custom graders to evaluate agent behavior.
Grader Interface¶
All graders inherit from BaseGrader:
from abc import ABC, abstractmethod

from evaldeck.trace import Trace
from evaldeck.test_case import EvalCase
from evaldeck.results import GradeResult


class BaseGrader(ABC):
    """Base class for all graders."""

    name: str = "base"

    @abstractmethod
    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        """Evaluate the trace and return a grade result."""
        pass
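The contract is small: implement grade() and return a GradeResult. For reference, a minimal grader that only checks whether the trace produced any output could look like the sketch below (NonEmptyOutputGrader is an illustrative name, not part of evaldeck):

from evaldeck.graders.base import BaseGrader
from evaldeck.results import GradeResult
from evaldeck.test_case import EvalCase
from evaldeck.trace import Trace


class NonEmptyOutputGrader(BaseGrader):
    """Illustrative grader: pass whenever the trace has any output."""

    name = "non_empty_output"

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        if trace.output:
            return GradeResult.passed_result(self.name, "Trace produced output")
        return GradeResult.failed_result(
            self.name,
            "Trace produced no output",
            expected="non-empty output",
            actual=trace.output,
        )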
Creating a Grader¶
Step 1: Define the Class¶
# src/evaldeck/graders/sentiment.py
from evaldeck.graders.base import BaseGrader
from evaldeck.results import GradeResult, GradeStatus
from evaldeck.test_case import EvalCase  # used by grade() in Step 2
from evaldeck.trace import Trace         # used by grade() in Step 2


class SentimentGrader(BaseGrader):
    """Check if output has positive sentiment."""

    name = "sentiment"

    def __init__(self, require_positive: bool = True):
        """Initialize sentiment grader.

        Args:
            require_positive: If True, output must be positive.
                If False, output must not be negative.
        """
        self.require_positive = require_positive
Step 2: Implement grade()¶
def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
    """Check sentiment of the output."""
    output = trace.output or ""

    # Simple sentiment check (replace with real implementation)
    positive_words = ["great", "success", "happy", "thank", "perfect"]
    negative_words = ["error", "fail", "sorry", "unfortunately", "problem"]

    output_lower = output.lower()
    has_positive = any(word in output_lower for word in positive_words)
    has_negative = any(word in output_lower for word in negative_words)

    if self.require_positive:
        if has_positive and not has_negative:
            return GradeResult.passed_result(
                self.name,
                "Output has positive sentiment"
            )
        return GradeResult.failed_result(
            self.name,
            "Output does not have positive sentiment",
            expected="positive sentiment",
            actual=f"positive={has_positive}, negative={has_negative}"
        )
    else:
        if not has_negative:
            return GradeResult.passed_result(
                self.name,
                "Output is not negative"
            )
        return GradeResult.failed_result(
            self.name,
            "Output has negative sentiment",
            expected="no negative sentiment",
            actual=output[:100]
        )
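Before wiring the grader into a suite, it can be handy to exercise it directly against a hand-built trace. SentimentGrader never reads the test case, so a spec'd mock stands in for the EvalCase here; this is a quick local sketch, not part of the evaldeck API:

from unittest.mock import MagicMock

from evaldeck import Trace
from evaldeck.graders.sentiment import SentimentGrader
from evaldeck.test_case import EvalCase

trace = Trace(input="How did the run go?")
trace.complete(output="Great! Everything finished successfully.")

# SentimentGrader ignores the test case, so a spec'd mock is sufficient here.
result = SentimentGrader(require_positive=True).grade(trace, MagicMock(spec=EvalCase))
print(result.passed)  # True: "great"/"success" present, no negative words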
Step 3: Export the Grader¶
# src/evaldeck/graders/__init__.py
from evaldeck.graders.sentiment import SentimentGrader

__all__ = [
    # ... existing exports ...
    "SentimentGrader",
]
Step 4: Add Tests¶
# tests/test_graders.py
from unittest.mock import MagicMock

import pytest

from evaldeck import Trace
from evaldeck.graders import SentimentGrader
from evaldeck.test_case import EvalCase


@pytest.fixture
def mock_test_case():
    # SentimentGrader never reads the test case, so a spec'd mock is enough here.
    return MagicMock(spec=EvalCase)


class TestSentimentGrader:
    def test_passes_on_positive_output(self, mock_test_case):
        trace = Trace(input="test")
        trace.complete(output="Great! Your request was successful.")

        grader = SentimentGrader(require_positive=True)
        result = grader.grade(trace, mock_test_case)

        assert result.passed
        assert "positive" in result.message.lower()

    def test_fails_on_negative_output(self, mock_test_case):
        trace = Trace(input="test")
        trace.complete(output="Sorry, there was an error processing your request.")

        grader = SentimentGrader(require_positive=True)
        result = grader.grade(trace, mock_test_case)

        assert not result.passed

    def test_handles_empty_output(self, mock_test_case):
        trace = Trace(input="test")
        trace.complete(output="")

        grader = SentimentGrader(require_positive=True)
        result = grader.grade(trace, mock_test_case)

        assert not result.passed
GradeResult¶
GradeResult provides three factory methods; return the one that matches the outcome:
# Success
GradeResult.passed_result(
    grader_name="my_grader",
    message="Check passed",
    score=1.0,                    # Optional
    details={"key": "value"}      # Optional
)

# Failure
GradeResult.failed_result(
    grader_name="my_grader",
    message="Check failed because...",
    expected="what was expected",
    actual="what was found"
)

# Error (grader couldn't run)
GradeResult.error_result(
    grader_name="my_grader",
    message="Failed to connect to API"
)
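Use error_result when the grader itself cannot run, as opposed to the agent failing the check. A common pattern is to wrap any external dependency in try/except and convert exceptions into an error result. The sketch below assumes a hypothetical _call_sentiment_api helper and a 0.5 score threshold; both are illustrative:

from evaldeck.graders.base import BaseGrader
from evaldeck.results import GradeResult
from evaldeck.test_case import EvalCase
from evaldeck.trace import Trace


class APISentimentGrader(BaseGrader):
    """Illustrative grader that scores sentiment via an external service."""

    name = "api_sentiment"

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        try:
            # _call_sentiment_api is a placeholder for your own client code.
            score = self._call_sentiment_api(trace.output or "")
        except Exception as exc:
            # The grader itself could not run, so report an error rather than a failure.
            return GradeResult.error_result(self.name, f"Sentiment API call failed: {exc}")

        if score >= 0.5:
            return GradeResult.passed_result(self.name, "Positive sentiment", score=score)
        return GradeResult.failed_result(
            self.name,
            "Negative sentiment",
            expected="score >= 0.5",
            actual=f"score={score:.2f}",
        )

    def _call_sentiment_api(self, text: str) -> float:
        raise NotImplementedError("Wire up your sentiment client here.")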
Configuration Support¶
To support YAML configuration:
1. Update GraderConfig handling¶
# src/evaldeck/evaluator.py
def _create_grader_from_config(self, config: GraderConfig) -> BaseGrader | None:
    grader_type = config.type.lower()

    if grader_type == "sentiment":
        return SentimentGrader(**config.params)

    # ... existing handlers ...
2. Document YAML usage¶
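With the handler in place, test suites can reference the grader by its type name in YAML. The surrounding suite layout depends on your configuration format, but the grader entry mirrors the GraderConfig shape above (type plus params); the snippet below is a sketch:

graders:
  - type: sentiment
    params:
      require_positive: true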
Best Practices¶
1. Single Responsibility¶
Each grader should check one thing:
# Good: focused grader
class ToolCalledGrader(BaseGrader):
    """Check that required tools were called."""


# Bad: multi-purpose grader
class EverythingGrader(BaseGrader):
    """Check tools, output, sentiment, and format."""
2. Clear Error Messages¶
Help users understand failures:
# Good
return GradeResult.failed_result(
    self.name,
    f"Required tool '{tool}' was not called. "
    f"Called tools: {sorted(called_tools)}",
    expected=required_tools,
    actual=called_tools
)

# Bad
return GradeResult.failed_result(self.name, "Failed")
3. Handle Edge Cases¶
def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
    # Handle missing output
    if not trace.output:
        return GradeResult.failed_result(
            self.name,
            "No output to evaluate"
        )

    # Handle missing expectations
    expected = test_case.expected.custom_field
    if expected is None:
        return GradeResult.passed_result(
            self.name,
            "No expectation defined, skipping"
        )
4. Use Type Hints¶
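Annotate constructor parameters and the grade() signature as in the examples above; type hints make grader options self-documenting and let mypy or your IDE flag misconfigured graders before an eval run. A minimal sketch (LengthGrader is an illustrative name, not part of evaldeck):

from evaldeck.graders.base import BaseGrader
from evaldeck.results import GradeResult
from evaldeck.test_case import EvalCase
from evaldeck.trace import Trace


class LengthGrader(BaseGrader):
    """Illustrative grader: check that the output stays under a length budget."""

    name = "length"

    def __init__(self, max_chars: int = 500) -> None:
        self.max_chars = max_chars

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        length = len(trace.output or "")
        if length <= self.max_chars:
            return GradeResult.passed_result(
                self.name,
                f"Output length {length} is within the {self.max_chars}-character budget"
            )
        return GradeResult.failed_result(
            self.name,
            f"Output is too long: {length} > {self.max_chars} characters",
            expected=f"<= {self.max_chars} characters",
            actual=f"{length} characters"
        )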
Example: API Response Grader¶
A more complex example checking API response format:
from evaldeck.graders.base import BaseGrader
from evaldeck.results import GradeResult
from evaldeck.test_case import EvalCase
from evaldeck.trace import Trace


class APIResponseGrader(BaseGrader):
    """Validate that tool results match expected API response format."""

    name = "api_response"

    def __init__(
        self,
        tool_name: str,
        required_fields: list[str],
        field_types: dict[str, type] | None = None
    ):
        self.tool_name = tool_name
        self.required_fields = required_fields
        self.field_types = field_types or {}

    def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
        # Find the tool call
        tool_call = None
        for step in trace.tool_calls:
            if step.tool_name == self.tool_name:
                tool_call = step
                break

        if tool_call is None:
            return GradeResult.failed_result(
                self.name,
                f"Tool '{self.tool_name}' was not called"
            )

        result = tool_call.tool_result
        if not isinstance(result, dict):
            return GradeResult.failed_result(
                self.name,
                f"Tool result is not a dict: {type(result)}",
                expected="dict",
                actual=type(result).__name__
            )

        # Check required fields
        missing = [f for f in self.required_fields if f not in result]
        if missing:
            return GradeResult.failed_result(
                self.name,
                f"Missing required fields: {missing}",
                expected=self.required_fields,
                actual=list(result.keys())
            )

        # Check field types
        type_errors = []
        for field, expected_type in self.field_types.items():
            if field in result and not isinstance(result[field], expected_type):
                type_errors.append(
                    f"{field}: expected {expected_type.__name__}, "
                    f"got {type(result[field]).__name__}"
                )

        if type_errors:
            return GradeResult.failed_result(
                self.name,
                f"Type errors: {type_errors}"
            )

        return GradeResult.passed_result(
            self.name,
            f"API response from '{self.tool_name}' is valid"
        )
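Constructing this grader for a suite might look like the following; the tool name and field lists are illustrative and depend on your agent's tools. During a run, grade() receives each agent trace along with its EvalCase.

# Illustrative configuration; tool and field names depend on your agent.
grader = APIResponseGrader(
    tool_name="get_weather",
    required_fields=["temperature", "conditions"],
    field_types={"temperature": float, "conditions": str},
)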