tool-monitor-dashboard | adk-observability-safety

Stats

Actions

Tags

tool-monitor-dashboard | adk-observability-safety

tool-monitor-dashboard

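Track per-tool health metrics for an ADK 2.0 agent: latency, error rate, and call frequency. Cost is also a goal, but the emitters below do not record it; add a separate metric if you need it.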

Metric emitters

import time
from google.cloud import monitoring_v3
from google.adk.callbacks import on_before_tool_call, on_after_tool_call

# Google Cloud project ID used to build the "projects/{PROJECT}" resource name
# that metrics are written to. NOTE(review): "my-project" looks like a
# placeholder — replace with the real project ID before use.
PROJECT = "my-project"
# Single module-level Cloud Monitoring client, created at import time and
# reused by write_metric() for every write.
client = monitoring_v3.MetricServiceClient()

def write_metric(metric_type: str, value: float, labels: dict):
    """Write one double-valued point to a Cloud Monitoring custom metric.

    Args:
        metric_type: Suffix appended to ``custom.googleapis.com/adk/`` to form
            the full metric type, e.g. ``"tool/latency_ms"``.
        value: Sample to record; sent as the point's ``double_value``.
        labels: Metric labels merged into the series (e.g. ``{"tool": name}``).

    The series is attached to the ``global`` monitored resource and sent to
    ``projects/{PROJECT}`` through the module-level ``client``. The call is not
    awaited and is not wrapped in error handling, so any exception raised by
    the client propagates to the caller.
    """
    series = monitoring_v3.TimeSeries()
    series.metric.type = f"custom.googleapis.com/adk/{metric_type}"
    series.metric.labels.update(labels)
    series.resource.type = "global"

    # Keep sub-second precision on the end time. Truncating to whole seconds
    # gives two points written within the same second an identical timestamp.
    # NOTE(review): Cloud Monitoring also limits how often a single time series
    # may be written — confirm against the current quota documentation.
    seconds, nanos = divmod(time.time_ns(), 10**9)
    point = monitoring_v3.Point({
        "interval": {"end_time": {"seconds": seconds, "nanos": nanos}},
        "value": {"double_value": value},
    })
    series.points = [point]
    client.create_time_series(name=f"projects/{PROJECT}", time_series=[series])

@on_before_tool_call
async def tool_start(ctx, tool_name, args):
    """Stamp the wall-clock start time of a tool call into ``ctx.runtime``.

    The timestamp is stored under ``_tool_start_<tool_name>`` so that an
    after-call callback can pop it and compute the elapsed time. ``args`` is
    accepted but not used.
    """
    now = time.time()
    start_key = f"_tool_start_{tool_name}"
    ctx.runtime[start_key] = now

@on_after_tool_call
async def tool_metrics(ctx, tool_name, args, result):
    """Emit latency, error and call-count metrics for a finished tool call.

    Args:
        ctx: Callback context; ``ctx.runtime`` holds the start timestamp stored
            under ``_tool_start_<tool_name>``.
        tool_name: Name of the tool, used as the ``tool`` metric label.
        args: Tool arguments (unused).
        result: Tool result. ``None`` or a dict containing an ``"error"`` key
            is counted as a failure; anything else is counted as a success.

    If no start timestamp is found for the tool, nothing is emitted.
    """
    started = ctx.runtime.pop(f"_tool_start_{tool_name}", None)
    if started is None:
        # No matching start record (already consumed or never written).
        return
    latency_ms = (time.time() - started) * 1000

    # Only inspect dict results for an "error" key. A bare membership test
    # raises TypeError on non-container results (e.g. an int) and does
    # substring matching on strings, misclassifying text that merely mentions
    # the word "error".
    is_error = result is None or (isinstance(result, dict) and "error" in result)

    # write_metric() is called without await, so the three writes run to
    # completion one after another before this callback returns.
    write_metric("tool/latency_ms", latency_ms, {"tool": tool_name})
    write_metric("tool/error", 1.0 if is_error else 0.0, {"tool": tool_name})
    write_metric("tool/calls", 1.0, {"tool": tool_name})

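Prometheus alternative (register this in place of the Cloud Monitoring tool_metrics callback, not alongside it: both pop the same start timestamp, so whichever runs second finds none; tool_start is still required)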

from prometheus_client import Counter, Histogram, start_http_server

# Counter of tool invocations, labelled by tool name and outcome status
# (the callback below sets status to "ok" or "error").
tool_calls = Counter("adk_tool_calls_total", "Total tool invocations", ["tool", "status"])
# Histogram of tool call duration in seconds, labelled by tool name.
# No buckets are passed, so the client library's defaults apply.
tool_latency = Histogram("adk_tool_latency_seconds", "Tool latency", ["tool"])

# Runs at import time: starts the exporter's HTTP server on port 9100.
start_http_server(9100)  # /metrics endpoint

@on_after_tool_call
async def prom_metrics(ctx, tool_name, args, result):
    """Record Prometheus latency and call-count metrics for a finished tool call.

    Args:
        ctx: Callback context; ``ctx.runtime`` holds the start timestamp stored
            under ``_tool_start_<tool_name>``.
        tool_name: Name of the tool, used as the ``tool`` label.
        args: Tool arguments (unused).
        result: Tool result. A dict containing an ``"error"`` key is labelled
            ``status="error"``; everything else, including ``None``, is
            labelled ``status="ok"``.

    The call counter is always incremented; latency is observed only when a
    start timestamp was recorded for the tool.
    """
    started = ctx.runtime.pop(f"_tool_start_{tool_name}", None)
    if started is not None:
        tool_latency.labels(tool=tool_name).observe(time.time() - started)

    # Only inspect dict results for an "error" key. A bare membership test
    # raises TypeError on non-container results (e.g. an int) and does
    # substring matching on strings.
    # NOTE(review): a None result is counted as "ok" here, whereas the Cloud
    # Monitoring callback in this file counts None as an error — confirm which
    # is intended.
    failed = isinstance(result, dict) and "error" in result
    status = "error" if failed else "ok"
    tool_calls.labels(tool=tool_name, status=status).inc()

Grafana dashboard JSON (skeleton)

{
  "title": "ADK Agent Health",
  "panels": [
    {"title": "Tool Calls/min", "targets": [{"expr": "sum by(tool) (rate(adk_tool_calls_total[1m])) * 60"}]},
    {"title": "Tool p95 Latency",  "targets": [{"expr": "histogram_quantile(0.95, sum by (le, tool) (rate(adk_tool_latency_seconds_bucket[5m])))"}]},
    {"title": "Tool Error Rate",   "targets": [{"expr": "sum by(tool) (rate(adk_tool_calls_total{status=\"error\"}[5m])) / sum by(tool) (rate(adk_tool_calls_total[5m]))"}]}
  ]
}

SLO suggestions

Metric	Target
Tool error rate	< 1% over 5min
Tool p95 latency	< 2s for sync tools
Agent end-to-end latency	< 10s p95

Validation

/metrics endpoint scrapes cleanly
Cloud Monitoring metric appears in Metrics Explorer
Dashboard renders with non-zero data
Alerts fire correctly (test by simulating tool errors)

See also

logging-callback-setup for structured event logs
safety-policy-enforcer for incident-grade alerts