A multi-channel alerting strategy ensures that critical alerts always reach a human — even when one channel is unavailable, muted, or being ignored due to fatigue. Different channels serve different purposes, and understanding when to use each is the foundation of reliable alert delivery.
The Alert Channel Hierarchy
Think of alert channels in terms of their ability to interrupt and their audience:
| Channel | Interruption Level | Best For | Reaches When |
|---|---|---|---|
| Phone call | Highest | P1 critical only | Engineer is asleep, away from devices |
| SMS | High | P1-P2 critical | No internet, can't use apps |
| PagerDuty push | High | Primary on-call alerts | Engineer has smartphone, app installed |
| Slack DM | Medium | Secondary notifications | Engineer is working at desk |
| Slack channel | Medium | Team awareness | Anyone watching the channel |
| Email | Low | Status updates, daily digests | When they check email |
| Webhook | Variable | Custom integrations | Whatever the target does with it |
Designing the Alert Flow
# alert_channel_config.py
class AlertChannelStrategy:
    """
    Define multi-channel alert delivery based on severity and time.

    ``CHANNEL_CONFIG`` maps a severity name ("critical", "high", "medium",
    "low") to the channels that should be notified and to the
    acknowledgment/escalation settings for that severity. Only the
    "channels" and "require_acknowledgment" entries are read by this class;
    the escalation settings are carried for other consumers of the config.
    """

    CHANNEL_CONFIG = {
        "critical": {
            "description": "P1 - Production outage",
            "channels": [
                {
                    "type": "pagerduty",
                    "timing": "immediate",
                    "methods": ["push_notification", "phone_call", "sms"],
                    "escalation": True
                },
                {
                    "type": "slack",
                    "channel": "#prod-incidents",
                    "timing": "immediate",
                    "mention": "@channel"
                }
            ],
            "require_acknowledgment": True,
            "escalation_after_minutes": 5
        },
        "high": {
            "description": "P2 - Significant degradation",
            "channels": [
                {
                    "type": "pagerduty",
                    "timing": "immediate",
                    "methods": ["push_notification", "sms"],
                    "business_hours_only": False,
                    "escalation": True
                },
                {
                    "type": "slack",
                    "channel": "#prod-alerts",
                    "timing": "immediate",
                    "mention": "@oncall"
                }
            ],
            "require_acknowledgment": True,
            "escalation_after_minutes": 20
        },
        "medium": {
            "description": "P3 - Non-critical issue",
            "channels": [
                {
                    "type": "slack",
                    "channel": "#prod-alerts",
                    "timing": "immediate",
                    "mention": None
                },
                {
                    "type": "email",
                    "recipients": ["oncall-team@example.com"],
                    "timing": "immediate"
                }
            ],
            "require_acknowledgment": False,
            "escalation": False
        },
        "low": {
            "description": "P4 - Informational",
            "channels": [
                {
                    "type": "slack",
                    "channel": "#monitoring-digest",
                    "timing": "immediate"
                }
            ],
            "require_acknowledgment": False,
            "escalation": False
        }
    }

    def get_delivery_plan(self, alert, current_time):
        """
        Build a delivery plan for an alert based on severity and time.

        Args:
            alert: Object exposing ``id`` and ``severity`` attributes.
            current_time: Datetime used to evaluate channels flagged
                ``business_hours_only``.

        Returns:
            A dict with "alert_id", "severity", "channels" and
            "require_acknowledgment". Each channel entry is an independent
            copy of the configured channel with the routing-only
            ``business_hours_only`` flag removed. A severity that is not in
            ``CHANNEL_CONFIG`` is routed using the "medium" configuration.
        """
        import copy

        config = self.CHANNEL_CONFIG.get(alert.severity, self.CHANNEL_CONFIG["medium"])
        plan = {
            "alert_id": alert.id,
            "severity": alert.severity,
            "channels": [],
            "require_acknowledgment": config["require_acknowledgment"]
        }
        for channel_config in config["channels"]:
            # Channels restricted to business hours are skipped outside them.
            if channel_config.get("business_hours_only") and not self.is_business_hours(current_time):
                continue
            # Deep-copy so callers can mutate the plan (including nested lists
            # such as "methods") without corrupting the shared class config.
            entry = copy.deepcopy(channel_config)
            # The flag only drives routing; strip it whether it was True or False.
            entry.pop("business_hours_only", None)
            plan["channels"].append(entry)
        return plan

    def is_business_hours(self, dt, timezone="America/New_York", start_hour=9, end_hour=18):
        """
        Check whether ``dt`` falls within business hours in ``timezone``.

        Business hours are Monday through Friday, from ``start_hour``
        (inclusive) to ``end_hour`` (exclusive), evaluated after converting
        ``dt`` to ``timezone``. A naive ``dt`` is interpreted by
        ``datetime.astimezone`` as system-local time, so pass a
        timezone-aware datetime for predictable results.
        """
        import pytz

        tz = pytz.timezone(timezone)
        local_time = dt.astimezone(tz)
        is_weekday = local_time.weekday() < 5  # Monday=0, Friday=4
        is_work_hour = start_hour <= local_time.hour < end_hour
        return is_weekday and is_work_hour
Redundant Delivery for Critical Alerts
For P1 alerts, ensure redundant delivery through multiple independent paths:
import asyncio
import requests
async def deliver_critical_alert(alert, channels):
    """
    Deliver a critical alert through multiple channels simultaneously.
    Don't wait for one channel to succeed before trying others.

    Args:
        alert: The alert to deliver; passed through unchanged to each sender.
        channels: Iterable of channel config dicts, each with a "type" key of
            "pagerduty", "slack", "email" or "sms".

    Returns:
        A dict with "delivered_to" (channel type names that reported
        success), "failed_on" (dicts of "channel" and "error" for every
        channel that raised, reported failure, or has an unsupported type)
        and "fallback_triggered" (True when nothing was delivered and the
        emergency fallback was invoked).

    NOTE(review): this assumes each sender is awaitable and accepts
    ``(alert, channel)`` — confirm against the sender implementations.
    """
    dispatched = []  # channel names, index-aligned with `tasks`
    tasks = []
    failed_on = []
    for channel in channels:
        channel_type = channel["type"]
        if channel_type == "pagerduty":
            task = send_pagerduty_alert(alert, channel)
        elif channel_type == "slack":
            task = send_slack_alert(alert, channel)
        elif channel_type == "email":
            task = send_email_alert(alert, channel)
        elif channel_type == "sms":
            task = send_sms_alert(alert, channel)
        else:
            # Record instead of silently skipping; skipping used to shift the
            # result/channel pairing below and misattribute outcomes.
            failed_on.append({
                "channel": channel_type,
                "error": "unsupported channel type"
            })
            continue
        dispatched.append(channel_type)
        tasks.append(task)

    # Send all simultaneously — don't wait for one before sending others
    results = await asyncio.gather(*tasks, return_exceptions=True)

    # Check delivery success
    delivered_to = []
    for channel_name, result in zip(dispatched, results):
        # BaseException also covers asyncio.CancelledError returned by gather.
        if isinstance(result, BaseException):
            failed_on.append({
                "channel": channel_name,
                "error": str(result)
            })
            continue
        # Senders may return a dict with a "success" key or a plain truthy value.
        if isinstance(result, dict):
            succeeded = result.get("success")
            error = result.get("error", "delivery reported failure")
        else:
            succeeded = bool(result)
            error = "delivery reported failure"
        if succeeded:
            delivered_to.append(channel_name)
        else:
            failed_on.append({
                "channel": channel_name,
                "error": str(error)
            })

    # If ALL channels failed, try fallback
    fallback_triggered = not delivered_to
    if fallback_triggered:
        await send_emergency_fallback(alert)
    return {
        "delivered_to": delivered_to,
        "failed_on": failed_on,
        "fallback_triggered": fallback_triggered
    }
async def send_emergency_fallback(alert):
    """
    Emergency fallback when all primary channels fail.
    Tries, concurrently: SMS via the backup provider and email via the
    backup SMTP server.

    Returns:
        The list produced by ``asyncio.gather`` in the order
        [backup SMS, backup email]; each item is either that sender's return
        value or the exception it raised, so callers can tell whether the
        fallback itself got through.
    """
    fallback_channels = [
        send_sms_via_backup_provider(alert),
        send_email_via_backup_smtp(alert)
    ]
    # return_exceptions=True keeps one failing fallback from cancelling the other.
    return await asyncio.gather(*fallback_channels, return_exceptions=True)
SMS Alerting
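SMS is the most reliable backup channel — it works without an internet connection and, once the sending number is allowed through Do Not Disturb on the on-call phone, still gets through when app notifications are silenced: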
# Twilio SMS integration
from twilio.rest import Client
def send_sms_alert(
    to_numbers: list,
    alert_text: str,
    twilio_account_sid: str,
    twilio_auth_token: str,
    from_number: str
) -> list:
    """
    Send SMS alerts via Twilio.
    Keep messages concise — the body is cut to 160 characters so it fits in
    a single SMS.

    Args:
        to_numbers: Destination phone numbers.
        alert_text: The message text to send. For convenience an alert
            object accepted by ``format_sms_alert`` is also accepted and is
            formatted first.
        twilio_account_sid: Twilio account SID.
        twilio_auth_token: Twilio auth token.
        from_number: Sender phone number.

    Returns:
        One dict per destination number: {"number", "success": True, "sid"}
        on success, or {"number", "success": False, "error"} when sending to
        that number raised.
    """
    client = Client(twilio_account_sid, twilio_auth_token)
    # A plain string is already the message; only alert objects need formatting
    # (format_sms_alert reads attributes that a str does not have).
    if isinstance(alert_text, str):
        sms_text = alert_text
    else:
        sms_text = format_sms_alert(alert_text)
    # Format concise SMS (160 chars max for single SMS)
    sms_text = sms_text[:160]
    results = []
    for number in to_numbers:
        # Per-number best effort: one bad number must not block the rest.
        try:
            message = client.messages.create(
                body=sms_text,
                from_=from_number,
                to=number
            )
        except Exception as e:
            results.append({
                "number": number,
                "success": False,
                "error": str(e)
            })
        else:
            results.append({
                "number": number,
                "success": True,
                "sid": message.sid
            })
    return results
def format_sms_alert(alert) -> str:
    """
    Render an alert as a one-line SMS message.

    The message reads "<SEVERITY>: <service> - <short description>", built
    from the alert's ``severity``, ``service`` and ``short_description``
    attributes. Messages longer than 155 characters are shortened to 152
    characters plus "...", which keeps the result under the 160-character
    single-SMS limit.
    """
    max_len = 155
    suffix = "..."
    # Most important information first: severity, then service, then detail.
    headline = f"{alert.severity.upper()}: {alert.service}"
    text = f"{headline} - {alert.short_description}"
    if len(text) <= max_len:
        return text
    return text[:max_len - len(suffix)] + suffix
# Example SMS formats:
# "CRITICAL: checkout-api DOWN - 503 errors across all regions"
# "WARNING: auth-service SLOW - P99 latency 2.8s (target: 1s)"
Email Alerting for Non-Critical Notifications
Email works well for digest notifications and non-critical alerts:
def send_email_alert(
    recipients: list,
    alert: dict,
    smtp_config: dict
) -> bool:
    """
    Send formatted HTML email for monitoring alert.
    Use for P3/P4 alerts and daily digests, not P1.

    Args:
        recipients: Email addresses to send to.
        alert: Alert dict; "severity" and "title" are required, the other
            fields fall back to placeholder text.
        smtp_config: Dict with "host", "port", "username", "password" and
            "from_address". An optional "timeout" (seconds, default 10)
            bounds the SMTP connection so a stalled server cannot hang the
            sender.

    Returns:
        True if the message was handed to the SMTP server, False if
        connecting, logging in or sending failed (the failure is logged).
    """
    import logging
    import smtplib
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText

    # Build email
    msg = MIMEMultipart("alternative")
    msg["Subject"] = format_email_subject(alert)
    msg["From"] = smtp_config["from_address"]
    msg["To"] = ", ".join(recipients)

    # Plain text version
    text_body = (
        "\n"
        f"{alert['severity'].upper()}: {alert['title']}\n"
        f"Service: {alert.get('service', 'Unknown')}\n"
        f"Environment: {alert.get('environment', 'production')}\n"
        f"Status: {alert.get('status', 'Unknown')}\n"
        f"Started: {alert.get('started_at', 'Unknown')}\n"
        f"{alert.get('description', 'No description')}\n"
        f"View in monitoring: {alert.get('monitor_url', '')}\n"
        f"Runbook: {alert.get('runbook_url', '')}\n"
    )

    # HTML version (richer formatting)
    html_body = render_html_email_template(alert)
    # Per the "alternative" convention, the last attached part is preferred.
    msg.attach(MIMEText(text_body, "plain"))
    msg.attach(MIMEText(html_body, "html"))

    try:
        with smtplib.SMTP_SSL(
            smtp_config["host"],
            smtp_config["port"],
            timeout=smtp_config.get("timeout", 10)
        ) as server:
            server.login(smtp_config["username"], smtp_config["password"])
            server.sendmail(smtp_config["from_address"], recipients, msg.as_string())
        return True
    except Exception:
        # Best-effort channel: report failure to the caller, but keep the
        # reason in the logs instead of discarding it.
        logging.getLogger(__name__).exception(
            "Failed to send alert email to %s", recipients
        )
        return False
def format_email_subject(alert):
    """
    Build the email subject line for an alert.

    The subject is "[<PREFIX>] <title> — <service>", where the prefix is
    derived from the alert's severity ("ALERT" for an unrecognized
    severity) and the service defaults to "Unknown" when absent.
    """
    labels = dict(critical="URGENT", high="HIGH", medium="MEDIUM", low="INFO")
    severity = alert["severity"]
    tag = labels[severity] if severity in labels else "ALERT"
    title = alert["title"]
    service = alert.get("service", "Unknown")
    return f"[{tag}] {title} — {service}"
Webhook Integration for Custom Destinations
Webhooks enable custom integrations with any system:
def send_webhook_alert(
    webhook_url: str,
    alert: dict,
    secret: str = None,
    headers: dict = None
) -> dict:
    """
    Send alert as JSON webhook to any HTTP endpoint.
    Used for custom integrations (JIRA, ServiceNow, custom apps).

    Args:
        webhook_url: Endpoint that receives the POST request.
        alert: Alert dict; missing fields are sent as null.
        secret: Optional shared secret. When given, an HMAC-SHA256 hex
            digest of the exact request body is sent in the
            ``X-Webhook-Signature`` header as ``sha256=<digest>``.
        headers: Optional extra request headers; applied last, so they
            override the defaults on a name clash.

    Returns:
        A dict with "success" (True for a 2xx status), "status_code" and
        "response" (the first 500 characters of the response body).

    Raises:
        Whatever ``requests.post`` raises on connection errors or timeout;
        those are not caught here.
    """
    import hashlib
    import hmac
    import json
    from datetime import datetime, timezone

    # Same "...Z" UTC format as before, built from an aware "now" because
    # datetime.utcnow() is deprecated.
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    payload = {
        "event_type": "alert",
        "timestamp": timestamp,
        "alert": {
            "id": alert.get("id"),
            "severity": alert.get("severity"),
            "title": alert.get("title"),
            "description": alert.get("description"),
            "service": alert.get("service"),
            "environment": alert.get("environment"),
            "status": alert.get("status"),
            "started_at": alert.get("started_at"),
            "monitor_url": alert.get("monitor_url"),
            "runbook_url": alert.get("runbook_url")
        }
    }
    # Encode once so the signed bytes are exactly the bytes on the wire.
    payload_bytes = json.dumps(payload).encode()
    request_headers = {"Content-Type": "application/json"}

    # Add HMAC signature if secret provided
    if secret:
        signature = hmac.new(
            secret.encode(),
            payload_bytes,
            hashlib.sha256
        ).hexdigest()
        request_headers["X-Webhook-Signature"] = f"sha256={signature}"

    if headers:
        request_headers.update(headers)

    response = requests.post(
        webhook_url,
        data=payload_bytes,
        headers=request_headers,
        timeout=10
    )
    return {
        "success": 200 <= response.status_code < 300,
        "status_code": response.status_code,
        "response": response.text[:500]  # Truncate for logging
    }
Testing Alert Delivery
Regularly test that all channels are working:
def test_all_alert_channels():
    """
    Periodically verify all configured alert channels are functional.
    Send test alerts monthly or after configuration changes.

    Each channel is exercised independently: a sender that raises is
    recorded as a failure (with the error text) rather than aborting the
    run, so the remaining channels are still tested and the failure notice
    is still sent.

    Returns:
        A dict mapping a channel label ("pagerduty", "slack_<channel>",
        "email") to a result dict with at least a "success" entry; failed
        attempts that raised also carry an "error" entry.
    """
    test_alert = {
        "id": "test-001",
        "severity": "low",
        "title": "Monthly Alert Channel Test",
        "description": "This is a test alert to verify all notification channels are working correctly. No action required.",
        "service": "monitoring-system",
        "status": "test"
    }
    results = {}

    # Test PagerDuty
    pagerduty_note = "Resolve immediately in PagerDuty — this is a test"
    try:
        pd_result = send_pagerduty_alert(test_alert, PAGERDUTY_CONFIG)
        results["pagerduty"] = {
            "success": pd_result.get("success"),
            "note": pagerduty_note
        }
    except Exception as exc:
        results["pagerduty"] = {
            "success": False,
            "note": pagerduty_note,
            "error": str(exc)
        }

    # Test Slack
    for channel_name, webhook_url in SLACK_CHANNELS.items():
        try:
            slack_result = send_slack_alert(test_alert, {"channel": channel_name, "webhook_url": webhook_url})
            results[f"slack_{channel_name}"] = {"success": slack_result}
        except Exception as exc:
            results[f"slack_{channel_name}"] = {"success": False, "error": str(exc)}

    # Test Email
    try:
        email_result = send_email_alert(
            recipients=TEST_EMAIL_RECIPIENTS,
            alert=test_alert,
            smtp_config=SMTP_CONFIG
        )
        results["email"] = {"success": email_result}
    except Exception as exc:
        results["email"] = {"success": False, "error": str(exc)}

    # Log and alert on failures
    failed_channels = [name for name, result in results.items() if not result["success"]]
    if failed_channels:
        # Report broken alert channels through the backup Slack webhook, which
        # is configured separately from the channels under test. An error here
        # is deliberately left to propagate so it cannot go unnoticed.
        send_slack_alert(
            alert={
                "severity": "high",
                "title": f"Alert Channel Test Failed: {', '.join(failed_channels)}",
                "description": "These notification channels failed their monthly test. Investigate immediately."
            },
            config={"channel": "#engineering", "webhook_url": BACKUP_SLACK_WEBHOOK}
        )
    return results
Conclusion
Multi-channel alerting is insurance against single points of failure in your notification infrastructure. The strategy — escalating from Slack notifications to PagerDuty pushes to SMS to phone calls based on severity and response time — ensures that critical alerts always reach someone. The key discipline is using each channel for its purpose: Slack for team awareness, PagerDuty for primary on-call alerting, SMS for highest-urgency backup, and email for lower-priority notifications and digests. AzMonitor's flexible alert routing supports all these channels with configurable routing rules, ensuring the right person gets notified through the right channel when their services need attention.
3 monitors free forever · No credit card needed · Set up in 2 minutes
Start monitoring free →