Stats
Actions
Tags
From example-skills
Provides testing patterns for AI agents covering tool use, multi-turn conversations, error recovery, non-deterministic outputs, mocks, metrics, and regression tests. For LLM agent QA.
How this skill is triggered — by the user, by Claude, or both
Slash command
/example-skills:agent-testing-patterns
The summary Claude sees in its skill listing — used to decide when to auto-load this skill
Test AI agent systems that use tools, make decisions, and produce non-deterministic outputs.
Test AI agent systems that use tools, make decisions, and produce non-deterministic outputs.
| Challenge | Cause | Strategy |
|---|---|---|
| Non-deterministic output | LLM randomness | Assert on structure, not exact text |
| Tool use sequences | Agent autonomy | Verify tool calls, not call order |
| Multi-turn state | Conversation context | Snapshot-based assertions |
| Cost | API calls | Mock LLM in unit tests |
| Latency | API round-trips | Parallel test execution |
| Flakiness | Model updates | Semantic assertions, not string matches |
        ╱╲
       ╱  ╲          E2E Agent Tests (few, expensive)
      ╱────╲         Full agent loop with real LLM
     ╱      ╲
    ╱────────╲       Integration Tests (moderate)
   ╱          ╲      Tool execution, state management
  ╱────────────╲
 ╱  Unit Tests  ╲    Tool implementations, parsers, validators
╱────────────────╲
import pytest
def test_file_read_tool():
    """Happy path: executing the tool on test.txt yields the expected content and success=True."""
    outcome = FileReadTool().execute({"path": "test.txt"})

    # Content is checked first, then the success flag.
    assert outcome["content"] == "expected content"
    assert outcome["success"] is True
def test_file_read_tool_missing_file():
    """A path that does not exist yields success=False and an error mentioning 'not found'."""
    outcome = FileReadTool().execute({"path": "nonexistent.txt"})

    assert outcome["success"] is False
    # Case-insensitive match so the exact capitalisation of the message is free to vary.
    error_text = outcome["error"].lower()
    assert "not found" in error_text
def test_tool_input_validation():
    """Executing with empty arguments raises ValueError matching 'path is required'."""
    reader = FileReadTool()
    empty_args = {}
    with pytest.raises(ValueError, match="path is required"):
        reader.execute(empty_args)
def test_parse_tool_call():
    """A well-formed JSON tool call is parsed into its tool name and argument dict."""
    payload = '{"tool": "search", "args": {"query": "python"}}'
    parsed = parse_tool_call(payload)

    assert parsed.tool == "search"
    assert parsed.args == {"query": "python"}
def test_parse_malformed_tool_call():
    """Input that is not JSON makes the parser return None."""
    assert parse_tool_call("not json at all") is None
class MockLLMClient:
    """Scripted stand-in for an LLM client.

    Each call to ``generate`` records its arguments in ``calls`` and returns
    the next entry of the ``responses`` passed to the constructor, in order.
    """

    def __init__(self, responses: list[dict]):
        """Store the scripted responses and start with an empty call log.

        Args:
            responses: Response dicts to hand out, one per ``generate`` call.
        """
        self.responses = iter(responses)
        self.calls: list[dict] = []

    async def generate(self, messages: list[dict], tools: list[dict] = None) -> dict:
        """Record the call and return the next scripted response.

        Args:
            messages: Messages passed by the caller; stored by reference.
            tools: Optional tool definitions passed by the caller.

        Returns:
            The next scripted response dict.

        Raises:
            RuntimeError: If every scripted response has already been returned.
        """
        self.calls.append({"messages": messages, "tools": tools})
        try:
            return next(self.responses)
        except StopIteration:
            # A StopIteration escaping a coroutine is converted by Python into
            # an opaque "coroutine raised StopIteration" RuntimeError. Raise
            # the same exception type with a message that names the real cause.
            raise RuntimeError(
                "MockLLMClient has no scripted response left "
                f"for call #{len(self.calls)}"
            ) from None
@pytest.fixture
def mock_agent():
    """Agent backed by a scripted LLM: one search tool call, then a final text answer."""
    scripted_turns = [
        # First LLM reply asks for a search instead of answering.
        {"content": None, "tool_calls": [{"name": "search", "args": {"query": "python packaging"}}]},
        # Second LLM reply is the final answer.
        {"content": "Based on the search results, here's how to package Python..."},
    ]
    llm = MockLLMClient(responses=scripted_turns)
    toolbox = [SearchTool(), FileReadTool()]
    return Agent(llm=llm, tools=toolbox)
@pytest.mark.asyncio
async def test_agent_uses_search_then_responds(mock_agent):
    """The agent performs exactly one search call and then returns a non-empty answer."""
    reply = await mock_agent.run("How do I package a Python project?")

    # Exactly one tool invocation, and it was a search whose query mentions Python.
    assert len(mock_agent.tool_history) == 1
    only_call = mock_agent.tool_history[0]
    assert only_call.tool_name == "search"
    assert "python" in only_call.args["query"].lower()

    # A final answer came back and it is not empty.
    assert reply.content is not None
    assert len(reply.content) > 0
@pytest.mark.asyncio
async def test_session_preserves_context():
    """Two consecutive runs on one agent share a single conversation history.

    The shared ``mock_agent`` fixture scripts only a single tool-using turn
    (two LLM responses), so a second ``run`` would exhaust its mock. This test
    therefore builds its own agent with one plain-text reply per turn.
    """
    llm = MockLLMClient(responses=[
        {"content": "Nice to meet you, Alice."},
        {"content": "Your name is Alice."},
    ])
    agent = Agent(llm=llm, tools=[SearchTool(), FileReadTool()])

    await agent.run("My name is Alice")
    result = await agent.run("What's my name?")

    # The second turn produced an answer.
    assert result.content is not None

    # Verify conversation history maintained
    assert len(agent.messages) == 4  # 2 user + 2 assistant

    # One LLM call per turn, and the second call still carried the first
    # user message, i.e. the context was actually sent to the model.
    assert len(llm.calls) == 2
    assert "My name is Alice" in str(llm.calls[1]["messages"])
@pytest.mark.e2e
@pytest.mark.asyncio
async def test_agent_creates_file(real_agent, tmp_path):
    """End-to-end: the agent writes a valid Python hello-world script to disk.

    Asserts on the outcome (file exists, is non-empty, uses ``print``, and
    compiles) rather than on the exact text the model produced.
    """
    # Build the target path once so the prompt and the assertions cannot
    # drift apart, and so the platform's path separator is used.
    hello_file = tmp_path / "hello.py"
    await real_agent.run(f"Create a Python hello world script at {hello_file}")

    assert hello_file.exists()

    # Explicit encoding: Python source defaults to UTF-8, whereas a bare
    # read_text() would use the locale encoding.
    content = hello_file.read_text(encoding="utf-8")
    assert content.strip()  # Non-empty
    assert "print" in content  # Must use print

    # Verify it's valid Python (raises SyntaxError otherwise)
    compile(content, "hello.py", "exec")
@pytest.mark.e2e
@pytest.mark.asyncio
async def test_agent_explains_concept(real_agent):
    """End-to-end: the explanation has a plausible length and mentions a relevant term."""
    reply = await real_agent.run("Explain what a circuit breaker pattern is in 2-3 sentences")

    # Semantic checks (not exact string matching): bounded length first...
    answer_length = len(reply.content)
    assert answer_length > 50
    assert answer_length < 1000

    # ...then at least one domain keyword, compared case-insensitively.
    lowered = reply.content.lower()
    expected_terms = ("fault", "failure", "threshold", "open", "closed")
    assert any(term in lowered for term in expected_terms)
@dataclass
class AgentEvalResult:
    """Metrics recorded for one evaluated test case."""

    # Whether the case was judged complete.
    task_completed: bool
    # Number of tool calls recorded.
    tool_calls_count: int
    # Token usage recorded.
    tokens_used: int
    # Wall-clock duration in milliseconds.
    latency_ms: float
    # Number of error recoveries recorded.
    error_recovery_count: int
async def evaluate_agent(agent, test_cases: list[dict]) -> list[AgentEvalResult]:
    """Run every test case through the agent and collect one result per case.

    Args:
        agent: Object exposing an awaitable ``run(prompt)`` plus the
            ``tool_history``, ``total_tokens`` and ``error_count`` attributes.
        test_cases: Dicts with a ``"prompt"`` and a ``"validator"`` callable
            that is applied to the value returned by ``agent.run``.

    Returns:
        One ``AgentEvalResult`` per test case, in input order.
    """
    report = []
    for spec in test_cases:
        started = time.perf_counter()
        try:
            outcome = await agent.run(spec["prompt"])
            passed = spec["validator"](outcome)
        except Exception:
            # Any failure in the agent or the validator counts as "not completed".
            passed = False
        elapsed_ms = (time.perf_counter() - started) * 1000

        # NOTE(review): the counters are read straight from the agent after
        # each case; whether they are per-case or cumulative depends on the
        # agent implementation — confirm.
        record = AgentEvalResult(
            task_completed=passed,
            tool_calls_count=len(agent.tool_history),
            tokens_used=agent.total_tokens,
            latency_ms=elapsed_ms,
            error_recovery_count=agent.error_count,
        )
        report.append(record)
    return report
def test_tool_call_format_regression():
    """Guard against changes to the tool-call format via a golden file."""
    # NOTE(review): `agent` is not a parameter here; presumably a module-level
    # object defined elsewhere — confirm.
    actual = agent.format_tool_call("search", {"query": "test"})
    golden = load_golden("tool_call_format.json")
    assert actual == golden
# Benchmark scenarios: the prompt to run, the tools the agent may use for it,
# and the token budget it must stay within.
BENCHMARK_CASES = [
    {
        "prompt": "List all Python files in the project",
        "expected_tools": ["glob"],
        "max_tokens": 500,
    },
    {
        "prompt": "Fix the syntax error in app.py",
        "expected_tools": ["read", "edit"],
        "max_tokens": 2000,
    },
]
async def run_benchmark(agent):
    """Run each benchmark case and assert tool usage and token budget.

    For every entry in ``BENCHMARK_CASES`` the agent is run on the prompt;
    afterwards the tools in ``agent.tool_history`` must all be among the
    case's ``expected_tools`` (plus ``"think"``) and ``agent.total_tokens``
    must not exceed the case's ``max_tokens``.
    """
    for case in BENCHMARK_CASES:
        await agent.run(case["prompt"])

        used_tools = {call.tool_name for call in agent.tool_history}
        # "think" is always permitted in addition to the case's own tools.
        allowed_tools = set(case["expected_tools"] + ["think"])
        assert used_tools <= allowed_tools

        assert agent.total_tokens <= case["max_tokens"]
npx claudepluginhub a-organvm/a-i--skills --plugin document-skills
Provides a checklist for code reviews covering functionality, security, performance, maintainability, tests, and quality. Use for pull requests, audits, team standards, and developer training.