
Adding Metrics

Create custom metrics to measure agent behavior.

Metric Interface

All metrics inherit from BaseMetric:

from abc import ABC, abstractmethod
from evaldeck.trace import Trace
from evaldeck.test_case import EvalCase
from evaldeck.results import MetricResult

class BaseMetric(ABC):
    """Base class for all metrics."""

    name: str = "base"

    @abstractmethod
    def calculate(self, trace: Trace, test_case: EvalCase) -> MetricResult:
        """Calculate the metric value."""
        pass

Creating a Metric

Step 1: Define the Class

# src/evaldeck/metrics/custom.py
from evaldeck.metrics.base import BaseMetric
from evaldeck.results import MetricResult
from evaldeck.test_case import EvalCase
from evaldeck.trace import Trace


class AverageStepDurationMetric(BaseMetric):
    """Calculate average duration per step."""

    name = "avg_step_duration"

    def calculate(self, trace: Trace, test_case: EvalCase) -> MetricResult:
        if not trace.steps:
            return MetricResult(
                metric_name=self.name,
                value=0.0,
                unit="ms"
            )

        total_duration = sum(
            step.duration_ms or 0
            for step in trace.steps
        )
        avg = total_duration / len(trace.steps)

        return MetricResult(
            metric_name=self.name,
            value=round(avg, 2),
            unit="ms",
            details={
                "total_duration_ms": total_duration,
                "step_count": len(trace.steps),
            }
        )

Step 2: Export the Metric

# src/evaldeck/metrics/__init__.py
from evaldeck.metrics.custom import AverageStepDurationMetric

__all__ = [
    # ... existing exports ...
    "AverageStepDurationMetric",
]

Step 3: Add Tests

# tests/test_metrics.py
import pytest
from evaldeck import Trace, Step
from evaldeck.metrics import AverageStepDurationMetric

# `mock_test_case` below stands in for an EvalCase; define it as a fixture
# or stub built to match your EvalCase schema.


class TestAverageStepDurationMetric:
    def test_calculates_average(self):
        trace = Trace(input="test")
        trace.add_step(Step.tool_call("a", {}, {}, duration_ms=100))
        trace.add_step(Step.tool_call("b", {}, {}, duration_ms=200))
        trace.add_step(Step.tool_call("c", {}, {}, duration_ms=300))

        metric = AverageStepDurationMetric()
        result = metric.calculate(trace, mock_test_case)

        assert result.value == 200.0
        assert result.unit == "ms"

    def test_handles_empty_trace(self):
        trace = Trace(input="test")

        metric = AverageStepDurationMetric()
        result = metric.calculate(trace, mock_test_case)

        assert result.value == 0.0

    def test_handles_missing_duration(self):
        trace = Trace(input="test")
        trace.add_step(Step.tool_call("a", {}, {}))  # No duration

        metric = AverageStepDurationMetric()
        result = metric.calculate(trace, mock_test_case)

        assert result.value == 0.0
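
Run the suite with pytest. The command below assumes the standard layout used in this guide:

pytest tests/test_metrics.py -v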

MetricResult

Every calculate() implementation returns a MetricResult:

MetricResult(
    metric_name="my_metric",
    value=42.5,                    # The measurement
    unit="ms",                     # Optional unit
    details={"extra": "info"}      # Optional details
)

More Examples

Token Cost Metric

class TokenCostMetric(BaseMetric):
    """Estimate cost based on token usage."""

    name = "token_cost"

    # Prices per 1M tokens (example rates)
    PRICES = {
        "gpt-4o": {"input": 5.0, "output": 15.0},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "claude-3-opus": {"input": 15.0, "output": 75.0},
        "claude-3-sonnet": {"input": 3.0, "output": 15.0},
    }

    def calculate(self, trace: Trace, test_case: EvalCase) -> MetricResult:
        total_cost = 0.0
        details = {}

        for step in trace.llm_calls:
            if step.tokens and step.model:
                model = step.model.lower()

                # Find matching price
                prices = None
                for model_key, model_prices in self.PRICES.items():
                    if model_key in model:
                        prices = model_prices
                        break

                if prices:
                    input_cost = (step.tokens.prompt_tokens / 1_000_000) * prices["input"]
                    output_cost = (step.tokens.completion_tokens / 1_000_000) * prices["output"]
                    step_cost = input_cost + output_cost
                    total_cost += step_cost

                    details[step.id] = {
                        "model": step.model,
                        "input_tokens": step.tokens.prompt_tokens,
                        "output_tokens": step.tokens.completion_tokens,
                        "cost": round(step_cost, 6),
                    }

        return MetricResult(
            metric_name=self.name,
            value=round(total_cost, 6),
            unit="USD",
            details=details
        )
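
As a worked example using the sample rates above, a single gpt-4o-mini call with 1,200 prompt tokens and 300 completion tokens would be priced as:

input_cost = (1_200 / 1_000_000) * 0.15   # 0.00018
output_cost = (300 / 1_000_000) * 0.60    # 0.00018
step_cost = input_cost + output_cost      # 0.00036 USD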

Retry Rate Metric

class RetryRateMetric(BaseMetric):
    """Calculate the rate of repeated tool calls."""

    name = "retry_rate"

    def calculate(self, trace: Trace, test_case: EvalCase) -> MetricResult:
        tool_calls = [step.tool_name for step in trace.tool_calls]

        if not tool_calls:
            return MetricResult(
                metric_name=self.name,
                value=0.0,
                details={"message": "No tool calls"}
            )

        unique_tools = set(tool_calls)
        total_calls = len(tool_calls)
        unique_count = len(unique_tools)

        # Retry rate = 1 - (unique / total)
        # 0.0 = no retries, 0.5 = half are retries
        retry_rate = 1 - (unique_count / total_calls)

        return MetricResult(
            metric_name=self.name,
            value=round(retry_rate, 3),
            details={
                "total_calls": total_calls,
                "unique_tools": unique_count,
                "repeated_calls": total_calls - unique_count,
            }
        )
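
For instance, two identical search calls followed by a fetch would score as:

tool_calls = ["search", "search", "fetch"]   # 3 calls, 2 unique tools
retry_rate = 1 - (2 / 3)                     # 0.333 after rounding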

Reasoning Depth Metric

class ReasoningDepthMetric(BaseMetric):
    """Measure the depth of agent reasoning."""

    name = "reasoning_depth"

    def calculate(self, trace: Trace, test_case: EvalCase) -> MetricResult:
        reasoning_steps = [
            step for step in trace.steps
            if step.type.value == "reasoning"
        ]

        if not reasoning_steps:
            return MetricResult(
                metric_name=self.name,
                value=0,
                details={"message": "No reasoning steps"}
            )

        # Calculate metrics
        count = len(reasoning_steps)
        total_length = sum(
            len(step.reasoning_text or "")
            for step in reasoning_steps
        )
        avg_length = total_length / count

        return MetricResult(
            metric_name=self.name,
            value=count,
            details={
                "reasoning_step_count": count,
                "total_characters": total_length,
                "avg_characters": round(avg_length, 1),
            }
        )

Using Custom Metrics

Programmatically

from evaldeck import Evaluator
from my_metrics import TokenCostMetric, RetryRateMetric

evaluator = Evaluator()
evaluator.add_metric(TokenCostMetric())
evaluator.add_metric(RetryRateMetric())

result = evaluator.evaluate(trace, test_case)

for metric in result.metrics:
    print(f"{metric.metric_name}: {metric.value} {metric.unit or ''}")

In Evaluator Constructor

evaluator = Evaluator(
    metrics=[
        StepCountMetric(),
        TokenUsageMetric(),
        TokenCostMetric(),
        RetryRateMetric(),
    ]
)

Best Practices

  1. Return sensible defaults for empty traces
  2. Include details for debugging
  3. Use appropriate units (ms, tokens, USD, etc.)
  4. Handle missing data gracefully
  5. Round values appropriately
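
The sketch below is a hypothetical SlowestStepMetric (not part of evaldeck) that illustrates these practices, using only the trace fields shown earlier:

class SlowestStepMetric(BaseMetric):
    """Report the duration of the slowest step."""

    name = "slowest_step"

    def calculate(self, trace: Trace, test_case: EvalCase) -> MetricResult:
        durations = [
            step.duration_ms
            for step in trace.steps
            if step.duration_ms is not None   # handle missing data gracefully
        ]

        if not durations:
            # Sensible default for empty traces
            return MetricResult(
                metric_name=self.name,
                value=0.0,
                unit="ms",
                details={"message": "No step durations recorded"},
            )

        return MetricResult(
            metric_name=self.name,
            value=round(max(durations), 2),   # round appropriately
            unit="ms",                        # appropriate unit
            details={                         # details for debugging
                "measured_steps": len(durations),
                "total_steps": len(trace.steps),
            },
        )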