Basic Usage¶
This example demonstrates the core Evaldeck workflow.
Step 1: Create a Trace¶
A trace captures the agent's execution:
from evaldeck import Trace, Step, TokenUsage
# Start a trace
# A Trace records one end-to-end agent run: the user input, every
# intermediate step (LLM calls and tool calls), and the final output.
trace = Trace(
    input="Book a flight from NYC to LA on March 15",
    framework="custom",
    agent_name="BookingAgent",
)

# Reasoning step: the model extracts structured booking details.
parse_step = Step.llm_call(
    model="gpt-4o-mini",
    input="Parse user request for booking details",
    output='{"from": "NYC", "to": "LA", "date": "March 15"}',
    token_usage=TokenUsage(prompt_tokens=50, completion_tokens=30, total_tokens=80),
)
trace.add_step(parse_step)

# Tool step: look up candidate flights.
search_step = Step.tool_call(
    tool_name="search_flights",
    tool_args={"from": "NYC", "to": "LA", "date": "2024-03-15"},
    tool_result={
        "flights": [
            {"id": "AA123", "price": 299, "departure": "08:00"},
            {"id": "UA456", "price": 349, "departure": "10:30"},
        ]
    },
)
trace.add_step(search_step)

# Tool step: book the cheaper of the two results above.
booking_step = Step.tool_call(
    tool_name="book_flight",
    tool_args={"flight_id": "AA123"},
    tool_result={"confirmation": "ABC123", "status": "confirmed"},
)
trace.add_step(booking_step)

# Mark the run finished and attach the agent's final answer.
trace.complete(output="Your flight AA123 from NYC to LA on March 15 is booked. Confirmation: ABC123")
Step 2: Define a Test Case¶
In Python¶
from evaldeck import EvalCase, ExpectedBehavior, Turn
# Expected behavior for the single booking turn: the agent must search
# and then book, must never cancel, and the final answer must surface
# the confirmation code — all within five steps.
booking_expectation = ExpectedBehavior(
    tools_called=["search_flights", "book_flight"],
    tools_not_called=["cancel_booking"],
    output_contains=["confirmation", "ABC123"],
    max_steps=5,
    task_completed=True,
)

test_case = EvalCase(
    name="book_flight_basic",
    description="Book a simple one-way flight",
    turns=[
        Turn(
            user="Book a flight from NYC to LA on March 15",
            expected=booking_expectation,
        )
    ],
    tags=["booking", "critical"],
)
In YAML¶
# tests/evals/book_flight_basic.yaml
name: book_flight_basic
description: Book a simple one-way flight
turns:
  - user: "Book a flight from NYC to LA on March 15"
    expected:
      tools_called:
        - search_flights
        - book_flight
      tools_not_called:
        - cancel_booking
      output_contains:
        - "confirmation"
        - "ABC123"
      max_steps: 5
      task_completed: true
tags:
  - booking
  - critical
Step 3: Evaluate¶
from evaldeck import Evaluator
evaluator = Evaluator()
result = evaluator.evaluate(trace, test_case)

# Headline summary for the run.
print(f"Test: {result.test_case_name}")
print(f"Passed: {result.passed}")
print(f"Duration: {result.duration_ms:.0f}ms")

# One verdict line per grader; failures also show expected vs. actual.
for grade in result.grades:
    print(f" {grade.grader_name}: {'PASS' if grade.passed else 'FAIL'}")
    if not grade.passed:
        print(f" Expected: {grade.expected}")
        print(f" Actual: {grade.actual}")

# Quantitative metrics gathered during evaluation.
for metric in result.metrics:
    print(f" {metric.metric_name}: {metric.value} {metric.unit or ''}")
Output:
Test: book_flight_basic
Passed: True
Duration: 5ms
tool_called: PASS
tool_not_called: PASS
contains: PASS
max_steps: PASS
task_completed: PASS
step_count: 3
token_usage: 80 tokens
tool_call_count: 2
Step 4: Run Multiple Tests¶
from evaldeck import EvalSuite
# Load from directory
# Load every test-case file found under the given directory.
suite = EvalSuite.from_directory("tests/evals")

# Or create programmatically.  NOTE: this rebinds `suite`, replacing the
# directory-loaded one above — the two constructions are alternatives,
# not steps.  `test_case1` ... `test_case3` are placeholders for EvalCase
# objects defined elsewhere.
suite = EvalSuite(
    name="booking_tests",
    test_cases=[test_case1, test_case2, test_case3]
)

# Run all tests
def run_agent(input: str) -> Trace:
    """Run your agent on `input` and return the captured Trace.

    Evaluator.evaluate_suite calls this once per test case.
    """
    # Your agent implementation
    ...

suite_result = evaluator.evaluate_suite(suite, run_agent)

# Summary: pass counts and pass rate across the whole suite.
print(f"\nResults: {suite_result.passed}/{suite_result.total} passed")
print(f"Pass rate: {suite_result.pass_rate:.1%}")
Complete Script¶
#!/usr/bin/env python3
"""Complete basic usage example."""
from evaldeck import (
Trace, Step, TokenUsage,
EvalCase, ExpectedBehavior, Turn,
Evaluator
)
def simulate_booking_agent(input: str) -> Trace:
    """Simulated booking agent for demonstration.

    Produces a fixed four-step trace (parse -> search -> book -> respond)
    so the evaluation flow can be shown without a real LLM or real tools.
    """
    trace = Trace(input=input)

    scripted_steps = (
        # The model parses the request into a structured intent.
        Step.llm_call(
            model="gpt-4o-mini",
            input=f"Parse: {input}",
            output='{"action": "book_flight", "from": "NYC", "to": "LA"}',
            token_usage=TokenUsage(prompt_tokens=30, completion_tokens=20, total_tokens=50),
        ),
        # The search tool returns one candidate flight.
        Step.tool_call(
            tool_name="search_flights",
            tool_args={"from": "NYC", "to": "LA"},
            tool_result={"flights": [{"id": "AA123", "price": 299}]},
        ),
        # The booking tool confirms the reservation.
        Step.tool_call(
            tool_name="book_flight",
            tool_args={"flight_id": "AA123"},
            tool_result={"confirmation": "ABC123"},
        ),
        # The model phrases the final confirmation message.
        Step.llm_call(
            model="gpt-4o-mini",
            input="Generate booking confirmation",
            output="Flight booked! Confirmation: ABC123",
            token_usage=TokenUsage(prompt_tokens=20, completion_tokens=15, total_tokens=35),
        ),
    )
    for step in scripted_steps:
        trace.add_step(step)

    trace.complete(output="Your flight is booked! Confirmation: ABC123")
    return trace
def main():
    """Define one eval case, run the simulated agent, and print a report."""
    # Define test
    test = EvalCase(
        name="booking_test",
        turns=[
            Turn(
                user="Book a flight from NYC to LA",
                expected=ExpectedBehavior(
                    tools_called=["search_flights", "book_flight"],
                    output_contains=["confirmation", "ABC123"],
                    max_steps=5
                )
            )
        ]
    )

    # Run agent on the user message of the first (and only) turn.
    # FIX: the EvalCase above is built from `turns`, so the user input lives
    # at `test.turns[0].user`; no `input` attribute is ever defined on it.
    trace = simulate_booking_agent(test.turns[0].user)

    # Evaluate the captured trace against the expected behavior.
    evaluator = Evaluator()
    result = evaluator.evaluate(trace, test)

    # Report: overall verdict, trace stats, then one line per grader.
    print(f"Test: {result.test_case_name}")
    print(f"Status: {'PASS' if result.passed else 'FAIL'}")
    print(f"Steps: {trace.step_count}")
    print(f"Tokens: {trace.total_tokens}")
    for grade in result.grades:
        symbol = "✓" if grade.passed else "✗"
        print(f" {symbol} {grade.grader_name}: {grade.message or grade.status}")


if __name__ == "__main__":
    main()