Skip to content

LLM Graders

Model-as-judge graders using LLM APIs.

evaldeck.graders.LLMGrader

LLMGrader(prompt=None, model='gpt-4o-mini', provider=None, api_key=None, threshold=None, temperature=0.0, task=None)

Bases: BaseGrader

Use an LLM to grade agent output.

This grader sends the trace/output to an LLM with a grading prompt and parses the response to determine pass/fail.

Supports OpenAI and Anthropic APIs (user provides their own API key).

Initialize LLM grader.

Parameters:

Name Type Description Default
prompt str | None

Custom grading prompt. Use {input}, {output}, {trace} placeholders.

None
model str

Model to use (e.g., "gpt-4o-mini", "claude-3-haiku-20240307").

'gpt-4o-mini'
provider str | None

API provider ("openai" or "anthropic"). Auto-detected from model.

None
api_key str | None

API key. If None, uses environment variable.

None
threshold float | None

Score threshold for pass (if using scored evaluation).

None
temperature float

Model temperature.

0.0
task str | None

Task description for the default prompt.

None
Source code in src/evaldeck/graders/llm.py
def __init__(
    self,
    prompt: str | None = None,
    model: str = "gpt-4o-mini",
    provider: str | None = None,
    api_key: str | None = None,
    threshold: float | None = None,
    temperature: float = 0.0,
    task: str | None = None,
) -> None:
    """Initialize LLM grader.

    Args:
        prompt: Custom grading prompt. Use {input}, {output}, {trace} placeholders.
        model: Model to use (e.g., "gpt-4o-mini", "claude-3-haiku-20240307").
        provider: API provider ("openai" or "anthropic"). Auto-detected from model.
        api_key: API key. If None, uses environment variable.
        threshold: Score threshold for pass (if using scored evaluation).
        temperature: Model temperature.
        task: Task description for the default prompt.
    """
    # Straight pass-through configuration.
    self.model = model
    self.api_key = api_key
    self.threshold = threshold
    self.temperature = temperature
    # Falsy values (None or empty string) fall back to the defaults,
    # matching the truthiness semantics of `x or default`.
    self.prompt_template = prompt if prompt else self.DEFAULT_PROMPT
    self.provider = provider if provider else self._detect_provider(model)
    self.task = task if task else "Determine if the agent completed the task correctly."

grade

grade(trace, test_case)

Grade the trace using an LLM (sync).

Source code in src/evaldeck/graders/llm.py
def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
    """Grade the trace using an LLM (sync).

    Renders the grading prompt, dispatches to the configured provider's
    API client, and parses the raw response into a GradeResult. Any
    exception raised along the way is converted into an error result
    rather than propagated to the caller.
    """
    try:
        rendered = self._format_prompt(trace, test_case)
        # Select the provider-specific client call; anything other than
        # "anthropic" takes the OpenAI code path.
        caller = (
            self._call_anthropic
            if self.provider == "anthropic"
            else self._call_openai
        )
        return self._build_result(caller(rendered))
    except Exception as e:
        return GradeResult.error_result(self.name, f"LLM grader error: {e}")

grade_async async

grade_async(trace, test_case)

Grade the trace using an LLM (async).

Uses async API clients for better performance in concurrent evaluation.

Source code in src/evaldeck/graders/llm.py
async def grade_async(self, trace: Trace, test_case: EvalCase) -> GradeResult:
    """Grade the trace using an LLM (async).

    Async counterpart of ``grade``: identical prompt rendering and
    provider dispatch, but awaits non-blocking API clients so many
    gradings can run concurrently. Exceptions are converted into an
    error result rather than propagated.
    """
    try:
        rendered = self._format_prompt(trace, test_case)
        # Build the provider-specific coroutine, then await it once.
        pending = (
            self._call_anthropic_async(rendered)
            if self.provider == "anthropic"
            else self._call_openai_async(rendered)
        )
        return self._build_result(await pending)
    except Exception as e:
        return GradeResult.error_result(self.name, f"LLM grader error: {e}")

evaldeck.graders.LLMRubricGrader

LLMRubricGrader(rubric, pass_threshold=0.7, **kwargs)

Bases: LLMGrader

LLM grader with a detailed scoring rubric.

Initialize rubric grader.

Parameters:

Name Type Description Default
rubric dict[str, str]

Dict mapping criterion names to descriptions.

required
pass_threshold float

Minimum score ratio to pass (0-1).

0.7
**kwargs Any

Passed to LLMGrader.

{}
Source code in src/evaldeck/graders/llm.py
def __init__(
    self,
    rubric: dict[str, str],
    pass_threshold: float = 0.7,
    **kwargs: Any,
) -> None:
    """Initialize rubric grader.

    Args:
        rubric: Dict mapping criterion names to descriptions.
        pass_threshold: Minimum score ratio to pass (0-1).
        **kwargs: Passed to LLMGrader.

    Note:
        The prompt template is unconditionally replaced with
        ``RUBRIC_PROMPT`` after the base initializer runs, so a
        ``prompt=`` keyword passed through ``**kwargs`` is silently
        overridden.
    """
    self.rubric = rubric
    self.pass_threshold = pass_threshold
    # Base init sets prompt_template (among other attributes); it is
    # overridden immediately below so rubric grading always uses the
    # rubric-specific template.
    super().__init__(**kwargs)
    self.prompt_template = self.RUBRIC_PROMPT