Skip to content

Test Case Models

evaldeck.test_case.EvalCase

Bases: BaseModel

A test case for evaluating an agent.

Test cases define conversation turns to send to the agent and the expected behavior/output to validate against for each turn.

Example

Single turn:

    turns:
      - user: "Book a flight to NYC"
        expected:
          tools_called: [search_flights, book_flight]

Multi-turn:

    turns:
      - user: "I want to book a flight"
      - user: "NYC to LA, March 15"
        expected:
          tools_called: [search_flights]
      - user: "Book the cheapest one"
        expected:
          tools_called: [book_flight]

is_multi_turn property

is_multi_turn

Check if this is a multi-turn conversation.

expected property

expected

Get the expected behavior from the first turn (kept for backward compatibility with graders).

graders property

graders

Get the graders from the first turn (kept for backward compatibility).

input property

input

Get the input from the first turn (kept for backward compatibility).

from_yaml classmethod

from_yaml(path)

Load a test case from a YAML file.

Source code in src/evaldeck/test_case.py
@classmethod
def from_yaml(cls, path: str | Path) -> EvalCase:
    """Load a test case from a YAML file.

    Args:
        path: Location of the YAML file to read.

    Returns:
        The test case built by ``cls._from_dict`` from the parsed document.

    Raises:
        OSError: If the file cannot be opened.
        Exception: Whatever ``yaml.safe_load`` or ``cls._from_dict`` raise for
            malformed content is propagated unchanged.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding (which is not UTF-8 on every system).
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    return cls._from_dict(data)

from_yaml_string classmethod

from_yaml_string(content)

Load a test case from a YAML string.

Source code in src/evaldeck/test_case.py
@classmethod
def from_yaml_string(cls, content: str) -> EvalCase:
    """Build a test case from YAML text held in memory.

    Args:
        content: YAML document as a string.

    Returns:
        The test case produced by ``cls._from_dict`` from the parsed document.
    """
    return cls._from_dict(yaml.safe_load(content))

to_yaml

to_yaml()

Convert test case to YAML string.

Source code in src/evaldeck/test_case.py
def to_yaml(self) -> str:
    """Serialize this test case to a YAML string.

    Fields whose value is ``None`` are left out of the dump, and the YAML is
    emitted in block style rather than flow style.

    Returns:
        The YAML text.
    """
    payload = self.model_dump(exclude_none=True)
    # Annotated local pins the return type of the untyped yaml.dump call.
    rendered: str = yaml.dump(payload, default_flow_style=False)
    return rendered

evaldeck.test_case.ExpectedBehavior

Bases: BaseModel

Expected behavior for an agent test case.


evaldeck.test_case.EvalSuite

Bases: BaseModel

A collection of test cases.

from_directory classmethod

from_directory(path, name=None)

Load all test cases from a directory.

Source code in src/evaldeck/test_case.py
@classmethod
def from_directory(cls, path: str | Path, name: str | None = None) -> EvalSuite:
    """Load all test cases from a directory.

    Every ``*.yaml`` file is loaded first (sorted by path), followed by every
    ``*.yml`` file (also sorted). Files whose name starts with ``_`` are
    skipped. Only the directory itself is scanned; subdirectories are not
    searched.

    Args:
        path: Directory containing the test case files.
        name: Suite name; falls back to the directory's own name when omitted
            or empty.

    Returns:
        A suite holding the loaded test cases.

    Raises:
        ValueError: If ``path`` is not a directory, or if any file fails to
            load (the original exception is chained as the cause).
    """
    path = Path(path)
    if not path.is_dir():
        raise ValueError(f"Path is not a directory: {path}")

    test_cases = []
    # One shared loop for both extensions; the pattern order keeps every
    # .yaml file ahead of every .yml file.
    for pattern in ("*.yaml", "*.yml"):
        for file in sorted(path.glob(pattern)):
            if file.name.startswith("_"):
                continue
            try:
                test_cases.append(EvalCase.from_yaml(file))
            except Exception as e:
                # Re-raise with the offending file in the message.
                raise ValueError(f"Failed to load {file}: {e}") from e

    return cls(
        name=name or path.name,
        test_cases=test_cases,
    )

filter_by_tags

filter_by_tags(tags)

Return a new suite with only test cases matching the given tags.

Source code in src/evaldeck/test_case.py
def filter_by_tags(self, tags: list[str]) -> EvalSuite:
    """Build a new suite restricted to test cases carrying any of ``tags``.

    Args:
        tags: Tags to match; a test case is kept when at least one of them
            appears in its own ``tags``.

    Returns:
        A new suite with the matching test cases and this suite's name,
        description, defaults and tags.
    """
    matching = []
    for case in self.test_cases:
        # A single shared tag is enough to keep the case.
        if any(wanted in case.tags for wanted in tags):
            matching.append(case)

    return EvalSuite(
        name=self.name,
        description=self.description,
        test_cases=matching,
        defaults=self.defaults,
        tags=self.tags,
    )

evaldeck.test_case.GraderConfig

Bases: BaseModel

Configuration for a grader.