Skip to content

Base Grader

evaldeck.graders.BaseGrader

Bases: ABC

Base class for all graders.

Graders evaluate a trace against expected behavior and return a grade result. They support both sync and async evaluation.

Async behavior
  • Default grade_async() runs sync grade() in a thread pool
  • Override grade_async() for true async I/O (e.g., LLMGrader)
  • When using Evaluator.evaluate_async(), all graders run concurrently

Creating a custom async grader:

class MyAPIGrader(BaseGrader):
    """Example grader that scores a trace by calling a remote HTTP API.

    Provides both a blocking ``grade`` and a non-blocking ``grade_async``;
    both wrap the API payload in a ``GradeResult``.
    """

    name = "my_api"

    def grade(self, trace, test_case):
        # Sync fallback (blocking). Wrap the JSON payload so this path
        # returns a GradeResult, exactly like the async path below.
        return GradeResult.from_api(requests.post(...).json())

    async def grade_async(self, trace, test_case):
        # Async implementation (non-blocking)
        async with httpx.AsyncClient() as client:
            response = await client.post(...)
            return GradeResult.from_api(response.json())

grade abstractmethod

grade(trace, test_case)

Evaluate the trace and return a grade result.

Parameters:

Name Type Description Default
trace Trace

The execution trace to evaluate.

required
test_case EvalCase

The test case with expected behavior.

required

Returns:

Type Description
GradeResult

GradeResult indicating pass/fail and details.

Source code in src/evaldeck/graders/base.py
@abstractmethod
def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
    """Grade a single trace against its test case.

    Declared abstract: concrete graders supply the implementation, and
    the body here does nothing.

    Args:
        trace: Execution trace being evaluated.
        test_case: Test case describing the expected behavior.

    Returns:
        A GradeResult carrying the pass/fail verdict and its details.
    """
    ...

grade_async async

grade_async(trace, test_case)

Async version of grade.

Default implementation runs sync grade() in a thread pool. Override this method for true async behavior (e.g., async API calls).

Parameters:

Name Type Description Default
trace Trace

The execution trace to evaluate.

required
test_case EvalCase

The test case with expected behavior.

required

Returns:

Type Description
GradeResult

GradeResult indicating pass/fail and details.

Source code in src/evaldeck/graders/base.py
async def grade_async(self, trace: Trace, test_case: EvalCase) -> GradeResult:
    """Awaitable counterpart of ``grade``.

    Unless overridden, this hands the synchronous ``grade`` call to a
    worker thread via ``asyncio.to_thread``. Graders that perform real
    async I/O (e.g., async API calls) should override it.

    Args:
        trace: Execution trace being evaluated.
        test_case: Test case describing the expected behavior.

    Returns:
        A GradeResult carrying the pass/fail verdict and its details.
    """
    # Run the blocking grade() off the event loop and wait for its result.
    outcome = await asyncio.to_thread(self.grade, trace, test_case)
    return outcome

evaldeck.graders.CompositeGrader

CompositeGrader(graders, require_all=True)

Bases: BaseGrader

A grader that combines multiple graders.

By default, all graders must pass for the composite to pass.

Initialize composite grader.

Parameters:

Name Type Description Default
graders list[BaseGrader]

List of graders to run.

required
require_all bool

If True, all graders must pass. If False, a pass from any one grader is enough.

True
Source code in src/evaldeck/graders/base.py
def __init__(
    self,
    graders: list[BaseGrader],
    require_all: bool = True,
) -> None:
    """Set up the composite with its child graders and pass policy.

    Args:
        graders: The graders this composite will run.
        require_all: When True, every grader has to pass; when False,
            any grader passing is sufficient.
    """
    # Both values are stored as given; the list is kept by reference.
    self.graders, self.require_all = graders, require_all

grade

grade(trace, test_case)

Run all graders and combine results.

Source code in src/evaldeck/graders/base.py
def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
    """Grade with each child grader in turn and merge the outcomes."""
    # Children are invoked sequentially, in the order they were supplied.
    outcomes: list[GradeResult] = [
        child.grade(trace, test_case) for child in self.graders
    ]
    return self._combine_results(outcomes)

grade_async async

grade_async(trace, test_case)

Run all graders concurrently and combine results.

Source code in src/evaldeck/graders/base.py
async def grade_async(self, trace: Trace, test_case: EvalCase) -> GradeResult:
    """Grade with every child grader concurrently, then merge the outcomes.

    A child that raises does not abort the others: its exception is
    converted into an error result labelled with that grader's name.
    """
    outcomes = await asyncio.gather(
        *(child.grade_async(trace, test_case) for child in self.graders),
        return_exceptions=True,
    )

    # With return_exceptions=True a failure comes back as the exception
    # object, in the same position as the grader that raised it.
    normalized: list[GradeResult] = [
        GradeResult.error_result(child.name, f"Grader error: {outcome}")
        if isinstance(outcome, BaseException)
        else outcome
        for child, outcome in zip(self.graders, outcomes)
    ]
    return self._combine_results(normalized)