What gets measured gets managed. On-call experience is often evaluated anecdotally — "our on-call is pretty bad right now" — without the data to understand why or track improvement. Systematic on-call metrics make the experience measurable, make improvements verifiable, and make the case for investment in reliability work when teams are struggling.
Core On-Call Metrics
Mean Time to Acknowledge (MTTA)
How quickly the on-call engineer responds to a page:
def calculate_mtta(incidents, days=30):
    """
    Calculate Mean Time to Acknowledge (MTTA) for paged incidents.

    Low MTTA: engineers respond quickly (alert channel trusted, notifications working).
    High MTTA: engineers may be ignoring pages, missing notifications, or overwhelmed.

    Args:
        incidents: Iterable of incident objects exposing ``created_at``,
            ``paged_at`` and ``acknowledged_at`` (the latter two may be
            ``None``, in which case the incident is skipped).
        days: Size of the look-back window, counted back from now.

    Returns:
        A dict with ``mean_mtta_minutes``, ``median_mtta_minutes``,
        ``p95_mtta_minutes``, ``sample_size`` and ``assessment``, or
        ``{"error": ...}`` when no incident in the window was both paged and
        acknowledged.
    """
    import statistics

    # NOTE(review): utcnow() is naive, so incident timestamps are presumably
    # naive UTC as well -- confirm against the data source.
    cutoff = datetime.utcnow() - timedelta(days=days)

    ack_times_minutes = [
        (i.acknowledged_at - i.paged_at).total_seconds() / 60
        for i in incidents
        if i.created_at >= cutoff
        and i.acknowledged_at is not None
        and i.paged_at is not None
    ]
    if not ack_times_minutes:
        return {"error": "No acknowledged paged incidents in period"}

    sorted_times = sorted(ack_times_minutes)
    n = len(sorted_times)
    mean_minutes = sum(ack_times_minutes) / n

    return {
        "mean_mtta_minutes": round(mean_minutes, 1),
        # statistics.median averages the two middle values for even-sized
        # samples; indexing at n // 2 would report the upper one instead.
        "median_mtta_minutes": round(statistics.median(sorted_times), 1),
        # int(n * 0.95) is always < n, so the index cannot overrun. With 20
        # or fewer samples it resolves to the slowest acknowledgement.
        "p95_mtta_minutes": round(sorted_times[int(n * 0.95)], 1),
        "sample_size": n,
        "assessment": assess_mtta(mean_minutes),
    }
def assess_mtta(mean_minutes):
    """Translate a mean acknowledgement time in minutes into a rating.

    Returns "excellent" below 5 minutes, "good" below 15, "fair" below 30,
    and "poor" for anything else.
    """
    # (exclusive upper bound in minutes, rating), checked in ascending order.
    rating_bands = (
        (5, "excellent"),
        (15, "good"),
        (30, "fair"),
    )
    for upper_bound, rating in rating_bands:
        if mean_minutes < upper_bound:
            return rating
    # 30 minutes or more: worth investigating the notification setup.
    return "poor"
Mean Time to Resolve (MTTR)
Total time from incident detection to resolution:
def calculate_mttr_by_severity(incidents, days=30, severities=("P1", "P2", "P3", "P4")):
    """
    Calculate Mean Time to Resolve (MTTR) broken down by severity.

    P1 MTTR should be minutes; P3 MTTR can be hours.
    Mix them together and you lose insight.

    Args:
        incidents: Iterable of incident objects exposing ``created_at``,
            ``severity`` and ``resolved_at`` (``None`` while unresolved).
            It is traversed exactly once, so a generator is acceptable.
        days: Size of the look-back window, counted back from now.
        severities: Severity labels to report on, in output order. Incidents
            with any other severity are ignored.

    Returns:
        A dict keyed by severity. Every entry has the same keys: ``count``,
        ``mean_mttr_minutes``, ``median_mttr_minutes``, ``longest_minutes``.
        For a severity with no resolved incidents in the window the three
        statistics are ``None`` and the legacy ``mttr_minutes: None`` key is
        kept for existing consumers.
    """
    import statistics

    # NOTE(review): utcnow() is naive, so incident timestamps are presumably
    # naive UTC as well -- confirm against the data source.
    cutoff = datetime.utcnow() - timedelta(days=days)

    # Bucket resolution times in a single pass over the incidents.
    minutes_by_severity = {severity: [] for severity in severities}
    for incident in incidents:
        if incident.created_at < cutoff or incident.resolved_at is None:
            continue
        bucket = minutes_by_severity.get(incident.severity)
        if bucket is None:
            continue
        bucket.append(
            (incident.resolved_at - incident.created_at).total_seconds() / 60
        )

    results = {}
    for severity, resolution_times in minutes_by_severity.items():
        if not resolution_times:
            results[severity] = {
                "count": 0,
                "mttr_minutes": None,  # legacy key, kept for compatibility
                "mean_mttr_minutes": None,
                "median_mttr_minutes": None,
                "longest_minutes": None,
            }
            continue
        results[severity] = {
            "count": len(resolution_times),
            "mean_mttr_minutes": round(
                sum(resolution_times) / len(resolution_times), 1
            ),
            # True median: averages the two middle values for even counts.
            "median_mttr_minutes": round(statistics.median(resolution_times), 1),
            "longest_minutes": round(max(resolution_times), 1),
        }
    return results
Alert Volume Per Rotation
How many alerts does each on-call engineer receive?
def calculate_per_rotation_load(alerts, rotations, days=90):
    """
    Calculate alert load per on-call rotation.

    High alert volume per rotation → burnout risk.

    Args:
        alerts: Collection of alert objects exposing ``triggered_at`` and
            ``required_action``. It is scanned once per rotation, so it must
            be re-iterable (a list, not a generator).
        rotations: Iterable of rotation objects exposing ``id``,
            ``assigned_engineer``, ``start_date`` and ``end_date``.
        days: Rotations that ended more than this many days ago are skipped.

    Returns:
        A list with one dict per retained rotation, sorted by start date.
        ``alerts_per_day`` is based on the rotation's actual length rather
        than an assumed seven days.
    """
    # Computed once so every rotation is judged against the same window.
    # NOTE(review): utcnow() is naive, so rotation dates are presumably naive
    # UTC as well -- confirm against the data source.
    cutoff = datetime.utcnow() - timedelta(days=days)

    results = []
    for rotation in rotations:
        if rotation.end_date < cutoff:
            continue

        rotation_alerts = [
            a for a in alerts
            if rotation.start_date <= a.triggered_at <= rotation.end_date
        ]
        # Night hours (22:00-06:59) are read from the timestamp as stored.
        # NOTE(review): that is only the engineer's night if timestamps are in
        # their local time zone -- confirm.
        sleep_disrupting = [
            a for a in rotation_alerts
            if a.triggered_at.hour < 7 or a.triggered_at.hour >= 22
        ]
        total = len(rotation_alerts)
        actionable_count = sum(1 for a in rotation_alerts if a.required_action)
        noise_count = total - actionable_count

        # Use the real rotation length; anything shorter than a day counts as
        # one day so the rate stays finite for zero-length rotations.
        rotation_days = max(
            (rotation.end_date - rotation.start_date).total_seconds() / 86400,
            1.0,
        )

        results.append({
            "rotation_id": rotation.id,
            "engineer": rotation.assigned_engineer,
            "start_date": rotation.start_date.date().isoformat(),
            "end_date": rotation.end_date.date().isoformat(),
            "total_alerts": total,
            "alerts_per_day": round(total / rotation_days, 1),
            "sleep_disrupting_alerts": len(sleep_disrupting),
            "actionable_alerts": actionable_count,
            "noise_alerts": noise_count,
            "noise_rate": round(noise_count / total, 2) if total else 0,
            "burnout_risk": assess_burnout_risk(rotation_alerts, sleep_disrupting),
        })

    # ISO date strings sort chronologically.
    return sorted(results, key=lambda r: r["start_date"])
def assess_burnout_risk(all_alerts, sleep_alerts):
    """Rate a rotation's burnout risk as "low", "medium" or "high".

    "high" means more than 10 sleep-disrupting alerts or more than 70 alerts
    overall; "medium" means more than 4 or more than 35 respectively;
    anything below both medium thresholds is "low".
    """
    if len(sleep_alerts) <= 10 and len(all_alerts) <= 70:
        if len(sleep_alerts) <= 4 and len(all_alerts) <= 35:
            return "low"
        return "medium"
    return "high"
Alert Actionability Rate
What percentage of alerts required actual work?
def calculate_alert_actionability(alerts, days=30):
    """
    Measure what percentage of alerts were meaningful vs noise.

    High noise rate → engineers stop trusting alerts.
    Low noise rate → alerts are reliable signals.

    Args:
        alerts: Iterable of alert objects exposing ``triggered_at``,
            ``required_action``, ``auto_resolved_without_action`` and
            ``marked_as_false_positive``.
        days: Size of the look-back window, counted back from now.

    Returns:
        A dict with per-category counts, ``actionable_rate`` formatted as a
        whole-number percentage string, and an ``assessment`` holding a
        rating plus a recommendation; or ``{"error": ...}`` when no alert
        fell inside the window.
    """
    # NOTE(review): utcnow() is naive, so alert timestamps are presumably
    # naive UTC as well -- confirm against the data source.
    cutoff = datetime.utcnow() - timedelta(days=days)
    recent_alerts = [a for a in alerts if a.triggered_at >= cutoff]
    if not recent_alerts:
        return {"error": "No alerts in period"}

    total = len(recent_alerts)
    actionable_count = sum(1 for a in recent_alerts if a.required_action)
    auto_resolved_count = sum(
        1 for a in recent_alerts if a.auto_resolved_without_action
    )
    false_positive_count = sum(
        1 for a in recent_alerts if a.marked_as_false_positive
    )
    # "Unknown" means none of the three flags is set. Checking the flags
    # directly keeps this linear instead of scanning the category lists.
    unknown_count = sum(
        1 for a in recent_alerts
        if not (
            a.required_action
            or a.auto_resolved_without_action
            or a.marked_as_false_positive
        )
    )

    actionable_rate = actionable_count / total
    if actionable_rate > 0.9:
        rating = "excellent"
    elif actionable_rate > 0.7:
        rating = "good"
    elif actionable_rate > 0.5:
        rating = "needs_improvement"
    else:
        rating = "poor"

    # Derive the recommendation from the rating so the two can never disagree
    # (a rate of exactly 0.7 is "needs_improvement", so it must ask for tuning).
    if rating in ("excellent", "good"):
        recommendation = "Alerting is well-calibrated"
    else:
        recommendation = "Review and tune high-volume alert rules"

    return {
        "period_days": days,
        "total_alerts": total,
        "actionable_count": actionable_count,
        "actionable_rate": f"{actionable_rate:.0%}",
        "auto_resolved_count": auto_resolved_count,
        "false_positive_count": false_positive_count,
        "unknown_count": unknown_count,
        "assessment": {
            "actionability": rating,
            "recommendation": recommendation,
        },
    }
On-Call Equity Metrics
Track fairness in on-call distribution:
def analyze_on_call_equity(rotations, alerts, year, max_weeks_variation=2):
    """
    Analyze whether on-call burden is distributed fairly across the team.

    Args:
        rotations: Iterable of rotation objects exposing ``start_date``,
            ``end_date`` and ``assigned_engineer``. Only rotations whose
            start date falls in ``year`` are counted.
        alerts: Collection of alert objects exposing ``triggered_at``. It is
            scanned once per rotation, so it must be re-iterable.
        year: Calendar year to analyze.
        max_weeks_variation: Largest allowed gap between the most and least
            loaded engineer (in rotations) for the split to count as
            equitable.

    Returns:
        A dict with ``engineers`` (per-engineer totals) and
        ``equity_analysis`` (min/max/mean, plus stdev where reported, across
        engineers). When no rotation matches the year, ``engineers`` is empty,
        every statistic is ``None`` and ``equitable`` is ``True`` because
        there is no imbalance to report.
    """
    engineer_stats = {}
    for rotation in rotations:
        if rotation.start_date.year != year:
            continue

        eng = rotation.assigned_engineer
        stats = engineer_stats.setdefault(eng, {
            "engineer": eng,
            "weeks_on_call": 0,
            "total_alerts": 0,
            "sleep_disrupting_alerts": 0,
            # Never accumulated here; kept so the record shape is unchanged.
            "total_resolution_minutes": 0,
        })

        rotation_alerts = [
            a for a in alerts
            if rotation.start_date <= a.triggered_at <= rotation.end_date
        ]
        # Each rotation counts as one week on call.
        stats["weeks_on_call"] += 1
        stats["total_alerts"] += len(rotation_alerts)
        stats["sleep_disrupting_alerts"] += sum(
            1 for a in rotation_alerts
            if a.triggered_at.hour < 7 or a.triggered_at.hour >= 22
        )

    stats_list = list(engineer_stats.values())
    weeks_values = [s["weeks_on_call"] for s in stats_list]
    alert_values = [s["total_alerts"] for s in stats_list]
    sleep_values = [s["sleep_disrupting_alerts"] for s in stats_list]

    weeks_summary = _summarize_distribution(weeks_values)
    weeks_summary["equitable"] = (
        max(weeks_values) - min(weeks_values) <= max_weeks_variation
        if weeks_values else True
    )

    return {
        "engineers": stats_list,
        "equity_analysis": {
            "weeks_on_call": weeks_summary,
            "total_alerts": _summarize_distribution(alert_values),
            "sleep_disruptions": _summarize_distribution(
                sleep_values, include_stdev=False
            ),
        },
    }


def _summarize_distribution(values, include_stdev=True):
    """Return min/max/mean (and optionally stdev) for a list of counts."""
    import statistics

    if not values:
        # No data: report None rather than letting min()/mean() raise.
        summary = {"min": None, "max": None, "mean": None}
        if include_stdev:
            summary["stdev"] = None
        return summary

    summary = {
        "min": min(values),
        "max": max(values),
        "mean": round(statistics.mean(values), 1),
    }
    if include_stdev:
        # Sample stdev needs at least two points; report 0 for a single one.
        summary["stdev"] = (
            round(statistics.stdev(values), 1) if len(values) > 1 else 0
        )
    return summary
Dashboard for On-Call Health
Visualize metrics in a weekly review:
## On-Call Health Dashboard Template
### This Week's Rotation
Engineer: [Name]
Rotation dates: [Mon - Mon]
**Alert Summary:**
| Category | Count | vs Last Week |
|---|---|---|
| Total alerts | 12 | -3 (improving) |
| Sleep-disrupting | 2 | +1 |
| Actionable | 10 | -2 |
| Noise/false positive | 2 | -1 |
**Time Metrics:**
- MTTA: 4.2 min (excellent)
- MTTR (P1): 18 min (good)
- MTTR (P2): 47 min (good)
**Top Alert Sources (by volume):**
1. checkout-api-latency: 4 alerts (3 actionable)
2. database-connections: 3 alerts (2 actionable)
3. background-job-queue: 3 alerts (3 actionable)
### Trend (Last 8 Weeks)
[bar chart showing weekly alert volume trend]
**Observation:** checkout-api-latency alerts increased this week.
Review threshold setting and recent deployments.
### Action Items from This Rotation
- [ ] Investigate checkout-api latency alerts — threshold may be too sensitive
- [ ] Update database-connection runbook (current one has outdated query)
Setting Improvement Targets
Use metrics to set concrete improvement goals:
# Target thresholds for a healthy on-call rotation. The "per rotation"
# figures are sized for a one-week rotation.
ONCALL_HEALTH_TARGETS = {
    # --- Alert volume ---
    # At most 35 alerts per rotation, i.e. about 5 per day.
    "max_alerts_per_rotation": 35,
    # At most 5 sleep disruptions per week.
    "max_sleep_disrupting_per_rotation": 5,

    # --- Alert quality ---
    # At least 85% of alerts should require action.
    "min_actionability_rate": 0.85,
    # At most 15% of alerts may auto-resolve without action.
    "max_auto_resolution_rate": 0.15,

    # --- Response metrics ---
    # Pages should be acknowledged within 10 minutes.
    "max_mtta_minutes": 10,
    # P1 incidents should be resolved within 30 minutes.
    "target_p1_mttr_minutes": 30,
    # P2 incidents should be resolved within 2 hours.
    "target_p2_mttr_minutes": 120,

    # --- Equity ---
    # Engineers shouldn't differ by more than 2 on-call weeks per year.
    "max_weeks_variation": 2,
}
def assess_oncall_health(metrics, targets=ONCALL_HEALTH_TARGETS):
    """Check current on-call metrics against targets and list what to fix.

    Reads ``alerts_per_rotation``, ``actionability_rate`` and
    ``mean_mtta_minutes`` from ``metrics`` and compares each with its
    counterpart in ``targets``. Returns a dict holding every finding, an
    ``overall_health`` verdict ("good" only when there are no findings) and
    the subset of findings whose priority is "high".
    """
    def as_is(value):
        return value

    def as_percent(value):
        return f"{value:.0%}"

    def as_minutes(value):
        return f"{value} min"

    # One row per check: metric key, target key, breach test, finding label,
    # value formatter, suggested action, priority. Evaluated in this order.
    checks = (
        (
            "alerts_per_rotation",
            "max_alerts_per_rotation",
            lambda current, limit: current > limit,
            "alert_volume",
            as_is,
            "Audit top alert sources and tune thresholds",
            "high",
        ),
        (
            "actionability_rate",
            "min_actionability_rate",
            lambda current, limit: current < limit,
            "alert_quality",
            as_percent,
            "Identify and eliminate or tune non-actionable alerts",
            "high",
        ),
        (
            "mean_mtta_minutes",
            "max_mtta_minutes",
            lambda current, limit: current > limit,
            "acknowledgment_speed",
            as_minutes,
            "Check PagerDuty notification configuration and escalation policy",
            "medium",
        ),
    )

    findings = []
    for metric_key, target_key, breached, label, render, action, priority in checks:
        if not breached(metrics[metric_key], targets[target_key]):
            continue
        findings.append({
            "metric": label,
            "current": render(metrics[metric_key]),
            "target": render(targets[target_key]),
            "action": action,
            "priority": priority,
        })

    urgent = [finding for finding in findings if finding["priority"] == "high"]
    verdict = "needs_improvement" if findings else "good"
    return {
        "findings": findings,
        "overall_health": verdict,
        "priority_actions": urgent,
    }
Conclusion
On-call metrics turn subjective burnout signals into objective data that drives concrete improvements. Teams that track MTTA, MTTR, alert volume, and actionability rate over time can demonstrate to leadership that reliability investment reduces on-call burden, justify alert cleanup work with data showing the noise rate, and detect engineer burnout risk before someone quits. The virtuous cycle — measure, improve, measure again — is what separates teams that continuously improve their on-call experience from those that accept it as an unavoidable burden. AzMonitor's alert history provides the raw data for calculating these metrics, tracking which monitors generate the most noise, and understanding how detection time contributes to overall MTTR.
3 monitors free forever · No credit card needed · Set up in 2 minutes
Start monitoring free →