WebSockets power real-time features: live chat, collaborative editing, live dashboards, trading platforms, multiplayer games. When your WebSocket infrastructure has problems, it doesn't just slow things down — the real-time nature of your application breaks entirely. Unlike HTTP, WebSocket connections are long-lived, bidirectional, and stateful, which means monitoring them requires different approaches than standard request-response APIs.
WebSocket vs HTTP Monitoring Differences
The fundamental difference is connection state:
| Aspect | HTTP Monitoring | WebSocket Monitoring |
|---|---|---|
| Connection duration | Milliseconds | Minutes to hours |
| Request pattern | Request → Response | Persistent bidirectional stream |
| Failure modes | Server error, timeout | Disconnect, message loss, latency |
| Health check | Single HTTP request | Establish + message exchange |
| Load balancing | Stateless, easy | Sticky sessions required |
| Scaling | Horizontal, straightforward | Stateful, requires coordination |
A WebSocket health check must establish a connection, exchange messages, measure latency, and cleanly disconnect — not just check if a port is open.
Building WebSocket Health Checks
A comprehensive WebSocket health check verifies the full lifecycle:
// websocket-health-check.js
const WebSocket = require('ws');
/**
 * Run an end-to-end WebSocket health check: connect, measure the ping/pong
 * round trip, send a `health_check` message and wait for a reply.
 *
 * @param {string} url - WebSocket endpoint to probe.
 * @param {number} [timeout=5000] - Overall budget for the whole check, in ms.
 * @returns {Promise<object>} Resolves with the result record once a reply is
 *   received. Rejects with the same record (its `error` field set) on timeout,
 *   socket error, constructor failure, or a close before any reply. Note that
 *   the rejection value is a plain object, not an `Error`.
 */
async function checkWebSocketHealth(url, timeout = 5000) {
  return new Promise((resolve, reject) => {
    const startTime = Date.now();
    const result = {
      url,
      connected: false,
      messageReceived: false,
      pingLatencyMs: null,
      connectionTimeMs: null,
      totalTimeMs: null,
      error: null,
    };

    // Written by the 'open' handler and read by the 'pong' handler, so it has
    // to live in the scope both handlers share.
    let pingStart = null;
    let settled = false;
    let timer = null;

    // Single exit point: the promise settles once, and events that arrive
    // afterwards (e.g. the 'close' that follows a timeout) cannot mutate
    // `result` any more.
    const settle = (finish, errorMessage = null) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      if (errorMessage !== null) result.error = errorMessage;
      result.totalTimeMs = Date.now() - startTime;
      finish(result);
    };

    let ws;
    try {
      ws = new WebSocket(url, {
        // Never let the handshake alone outlive the overall budget.
        handshakeTimeout: Math.min(3000, timeout),
        headers: {
          'User-Agent': 'AzMonitor-HealthCheck/1.0',
        },
      });
    } catch (constructError) {
      // Report constructor failures (such as a malformed URL) through the same
      // result record instead of leaving a timer armed against a missing socket.
      settle(reject, constructError.message);
      return;
    }

    timer = setTimeout(() => {
      settle(reject, 'Timeout');
      // Force the socket down; a graceful close could wait on an unresponsive peer.
      ws.terminate();
    }, timeout);

    const finishHealthy = () => {
      result.messageReceived = true;
      settle(resolve);
      ws.close(1000, 'Health check complete');
    };

    ws.on('open', () => {
      result.connected = true;
      result.connectionTimeMs = Date.now() - startTime;
      // Send ping
      pingStart = Date.now();
      ws.ping();
    });

    ws.on('pong', () => {
      // Ignore pongs we did not ask for, duplicates, and anything after settling.
      if (settled || pingStart === null || result.pingLatencyMs !== null) return;
      result.pingLatencyMs = Date.now() - pingStart;
      // Send test message
      ws.send(JSON.stringify({
        type: 'health_check',
        timestamp: Date.now(),
      }));
    });

    ws.on('message', (data) => {
      if (settled) return;
      let msg;
      try {
        msg = JSON.parse(data);
      } catch {
        // Non-JSON message is still a valid response
        finishHealthy();
        return;
      }
      // Other JSON traffic (e.g. a welcome banner) is ignored; keep waiting.
      if (msg?.type === 'health_check_response' || msg?.type === 'pong') {
        finishHealthy();
      }
    });

    ws.on('error', (error) => {
      settle(reject, error.message);
    });

    ws.on('close', (code) => {
      // Only reached unsettled when the peer closed before answering.
      settle(reject, `Connection closed unexpectedly: code ${code}`);
    });
  });
}
// Run a single health check against the chat endpoint and log the outcome.
(async () => {
  try {
    const report = await checkWebSocketHealth('wss://ws.example.com/chat');
    console.log('WebSocket healthy:', report);
    // Example values for a healthy endpoint:
    //   report.pingLatencyMs = 45
    //   report.connectionTimeMs = 23
    //   report.totalTimeMs = 68
  } catch (failure) {
    console.error('WebSocket unhealthy:', failure);
  }
})();
Server-Side WebSocket Metrics
Instrument your WebSocket server to expose meaningful metrics:
// WebSocket server with metrics (Node.js + ws library)
const WebSocket = require('ws');
const metrics = require('./metrics');
/**
 * Wraps a `WebSocket.Server` and reports connection, message, error and
 * close-code metrics through the `metrics` client, plus a ping/pong heartbeat
 * that terminates connections which stop answering.
 */
class MonitoredWebSocketServer {
  /**
   * @param {object} options - Passed unchanged to `new WebSocket.Server()`.
   */
  constructor(options) {
    this.wss = new WebSocket.Server(options);
    this.stats = {
      totalConnections: 0,
      activeConnections: 0,
      totalMessages: 0,
      totalErrors: 0,
      connectionsByStatus: {},
    };
    this.setupHandlers();
    this.startMetricsCollection();
  }

  /** Attach per-connection bookkeeping and metric reporting. */
  setupHandlers() {
    this.wss.on('connection', (ws, req) => {
      this.stats.totalConnections++;
      this.stats.activeConnections++;

      // NOTE(review): generateId is not defined in this listing — presumably
      // provided elsewhere in the module; confirm.
      const connectionId = generateId();
      const connectTime = Date.now();

      // Track per-connection state
      ws.connectionId = connectionId;
      ws.connectTime = connectTime;
      ws.messageCount = 0;
      ws.lastMessageTime = connectTime;
      // A new connection starts out alive, so the first heartbeat pings it
      // instead of relying on `undefined !== false`.
      ws.isAlive = true;

      metrics.increment('websocket.connections.total');
      metrics.gauge('websocket.connections.active', this.stats.activeConnections);

      ws.on('message', (data) => {
        const now = Date.now();
        // Take the measurement BEFORE refreshing lastMessageTime; doing it the
        // other way round always records 0. The value is the gap since the
        // previous message on this connection (or since connect).
        const messageLatency = now - ws.lastMessageTime;
        ws.lastMessageTime = now;

        this.stats.totalMessages++;
        ws.messageCount++;

        metrics.histogram('websocket.message.latency', messageLatency);
        metrics.increment('websocket.messages.total');
      });

      ws.on('close', (code, reason) => {
        this.stats.activeConnections--;
        const sessionDuration = Date.now() - connectTime;

        metrics.gauge('websocket.connections.active', this.stats.activeConnections);
        metrics.histogram('websocket.session.duration', sessionDuration);
        metrics.histogram('websocket.session.messages', ws.messageCount);

        // Track close codes
        metrics.increment('websocket.closes.total', { code: code.toString() });

        // Log unexpected closes (1000 = normal closure, 1001 = going away)
        if (code !== 1000 && code !== 1001) {
          console.warn('[WS_UNEXPECTED_CLOSE]', {
            connectionId,
            code,
            reason: reason.toString(),
            sessionDurationMs: sessionDuration,
            messageCount: ws.messageCount,
          });
        }
      });

      ws.on('error', (error) => {
        this.stats.totalErrors++;
        metrics.increment('websocket.errors.total', {
          error_type: error.code || 'unknown',
        });
        console.error('[WS_ERROR]', {
          connectionId,
          error: error.message,
          code: error.code,
        });
      });

      // A pong answers the most recent heartbeat ping.
      ws.on('pong', () => {
        ws.isAlive = true;
      });
    });
  }

  /**
   * Start the 30-second heartbeat. Each round terminates connections that
   * never answered the previous ping, pings the rest, and refreshes the
   * active-connection gauge. The interval is cleared when the server closes.
   */
  startMetricsCollection() {
    // Heartbeat to detect dead connections
    const heartbeatInterval = setInterval(() => {
      this.wss.clients.forEach((ws) => {
        if (ws.isAlive === false) {
          // Connection didn't respond to last ping
          console.warn('[WS_DEAD_CONNECTION]', {
            connectionId: ws.connectionId,
            sessionDurationMs: Date.now() - ws.connectTime,
          });
          ws.terminate();
          return;
        }
        // Flip to false now; the 'pong' handler flips it back before the next round.
        ws.isAlive = false;
        ws.ping();
      });
      metrics.gauge('websocket.connections.active', this.wss.clients.size);
    }, 30000);

    this.wss.on('close', () => clearInterval(heartbeatInterval));
  }
}
WebSocket Close Codes
WebSocket close codes tell you why connections ended:
| Code | Name | Meaning | Action |
|---|---|---|---|
| 1000 | Normal Closure | Clean close | Expected |
| 1001 | Going Away | Browser navigating away | Expected |
| 1006 | Abnormal Closure | Connection dropped without close frame | Investigate |
| 1011 | Internal Server Error | Server-side error | Investigate |
| 1012 | Service Restart | Server restarting | Monitor restart frequency |
| 1013 | Try Again Later | Server overloaded | Scale up |
| 4xxx | Application codes | Application-defined | Depends on your app |
Track close code distribution — if 1006 (abnormal closure) rate is high, you have a connectivity problem. If 1011 is high, your server has errors:
# Alert on abnormal WebSocket close codes
def check_close_code_health(metrics_client, window_minutes=15,
                            abnormal_rate_threshold=0.05,
                            disconnect_rate_threshold=0.02):
    """
    Analyze WebSocket close code distribution.
    High rates of 1006, 1011, 1012, or 1013 indicate problems.

    Args:
        metrics_client: Object exposing ``get_distribution(name, window=...,
            group_by=...)``; the result is used as a mapping of close-code
            string to count.
        window_minutes: Size of the look-back window, in minutes.
        abnormal_rate_threshold: Alert when the combined share of abnormal
            codes exceeds this fraction (default 5%).
        disconnect_rate_threshold: Alert when the share of 1006 closes alone
            exceeds this fraction (default 2%).

    Returns:
        ``{"status": "no_data"}`` when no closes were recorded; otherwise a
        dict with ``status`` ("alert" or "healthy"), ``total_closes``,
        ``abnormal_rate`` and the list of ``alerts``.
    """
    close_codes = metrics_client.get_distribution(
        'websocket.closes.total',
        window=f'{window_minutes}m',
        group_by='code'
    )
    total = sum(close_codes.values())
    if total == 0:
        # Nothing closed in the window, so no rate can be computed.
        return {"status": "no_data"}

    abnormal_codes = ['1006', '1011', '1012', '1013']
    breakdown = {code: close_codes.get(code, 0) for code in abnormal_codes}
    abnormal_rate = sum(breakdown.values()) / total
    disconnect_rate = breakdown['1006'] / total

    alerts = []
    if abnormal_rate > abnormal_rate_threshold:
        alerts.append({
            "condition": "high_abnormal_close_rate",
            "rate": abnormal_rate,
            "breakdown": breakdown
        })
    if disconnect_rate > disconnect_rate_threshold:
        alerts.append({
            "condition": "high_abnormal_disconnect_rate",
            # Report the measured rate, as the sibling alert above does.
            "rate": disconnect_rate,
            "message": (
                f"More than {disconnect_rate_threshold * 100:g}% of connections "
                "closing abnormally — connectivity issue?"
            )
        })
    return {
        "status": "alert" if alerts else "healthy",
        "total_closes": total,
        "abnormal_rate": abnormal_rate,
        "alerts": alerts
    }
Message Latency Monitoring
For real-time applications, message delivery latency is as important as connection availability:
// Client-side message latency measurement
/**
 * Measures WebSocket round-trip latency by sending periodic pings that carry
 * a unique ID and timing the matching pongs. Keeps a bounded history of recent
 * samples and warns on slow round trips.
 */
class LatencyTracker {
  /**
   * @param {object} ws - Socket exposing `on(event, handler)` and `ping(data)`.
   * @param {object} [options]
   * @param {number} [options.intervalMs=10000] - Delay between pings.
   * @param {number} [options.maxSamples=100] - Number of samples retained.
   * @param {number} [options.highLatencyMs=500] - Warn above this round trip.
   * @param {number} [options.pongTimeoutMs=60000] - Forget pings unanswered for this long.
   */
  constructor(ws, {
    intervalMs = 10000,
    maxSamples = 100,
    highLatencyMs = 500,
    pongTimeoutMs = 60000,
  } = {}) {
    this.ws = ws;
    this.pendingPings = new Map();
    this.latencyHistory = [];
    this.maxSamples = maxSamples;
    this.highLatencyMs = highLatencyMs;
    this.pongTimeoutMs = pongTimeoutMs;
    this.pingSequence = 0;

    ws.on('pong', (data) => {
      const pingId = data.toString();
      const pingTime = this.pendingPings.get(pingId);
      // Pongs for unknown or already-expired IDs are ignored.
      if (pingTime === undefined) return;

      const latency = Date.now() - pingTime;
      this.pendingPings.delete(pingId);
      this.latencyHistory.push(latency);
      // Keep only the most recent measurements
      while (this.latencyHistory.length > this.maxSamples) {
        this.latencyHistory.shift();
      }
      // Alert on high latency
      if (latency > this.highLatencyMs) {
        console.warn(`[WS_HIGH_LATENCY] ${latency}ms`);
      }
    });

    // Without this the interval would keep firing (and keep the tracker
    // alive) long after the socket is gone.
    ws.on('close', () => this.stop());

    // Send pings on a fixed schedule (every 10 seconds by default)
    this.timer = setInterval(() => this.ping(), intervalMs);
  }

  /** Send one ping tagged with a unique ID and remember when it left. */
  ping() {
    const { readyState } = this.ws;
    // 1 is OPEN; skip sockets that report any other state.
    if (readyState !== undefined && readyState !== 1) return;

    const now = Date.now();
    // Drop pings whose pong never arrived so the map cannot grow without bound.
    for (const [id, sentAt] of this.pendingPings) {
      if (now - sentAt > this.pongTimeoutMs) this.pendingPings.delete(id);
    }

    // The sequence suffix keeps IDs unique even for two pings in the same millisecond.
    const pingId = `${now}-${this.pingSequence++}`;
    this.pendingPings.set(pingId, now);
    this.ws.ping(pingId);
  }

  /** Stop the periodic pings and forget outstanding ones. Safe to call repeatedly. */
  stop() {
    clearInterval(this.timer);
    this.timer = null;
    this.pendingPings.clear();
  }

  /**
   * @returns {?object} `null` when there are no samples yet; otherwise
   *   `{ count, min, max, p50, p95, p99 }` over the retained history, in ms.
   */
  getStats() {
    if (this.latencyHistory.length === 0) return null;
    const sorted = [...this.latencyHistory].sort((a, b) => a - b);
    const pick = (fraction) => sorted[Math.floor(sorted.length * fraction)];
    return {
      count: sorted.length,
      min: sorted[0],
      max: sorted[sorted.length - 1],
      p50: pick(0.5),
      p95: pick(0.95),
      p99: pick(0.99),
    };
  }
}
Load Balancer Configuration for WebSockets
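An established WebSocket connection stays on the backend that accepted it, but applications that keep per-client state on the server also need reconnects to land on that same backend — which means sticky sessions (session affinity) at the load balancer. Monitor that this is working: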
# Nginx WebSocket configuration
# Pool of three WebSocket backends that the /ws location proxies to.
# NOTE(review): ip_hash selects the backend from the client address, so clients
# sharing one address (corporate NAT, another proxy in front) would all map to
# the same server — confirm this distribution is acceptable.
upstream ws_backend {
ip_hash; # Sticky sessions - requests from same IP go to same server
server ws1:8080;
server ws2:8080;
server ws3:8080;
}
# TLS listener on port 443 that proxies /ws to the ws_backend upstream.
# NOTE(review): no ssl_certificate / ssl_certificate_key directives appear in
# this listing — presumably configured elsewhere; confirm.
server {
listen 443 ssl http2;
location /ws {
proxy_pass http://ws_backend;
# Talk HTTP/1.1 to the backend and pass the Upgrade / Connection headers on,
# so the upgrade request reaches the backend instead of stopping at the proxy.
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
proxy_set_header Host $host;
# WebSocket timeouts
# NOTE(review): presumably these must exceed the longest expected gap between
# frames (the server heartbeat shown earlier pings every 30 s) — confirm.
proxy_read_timeout 3600; # 1 hour
proxy_send_timeout 3600;
# Don't buffer WebSocket traffic
proxy_buffering off;
}
}
Monitor that connections stay on the same backend:
def check_sticky_session(ws_url, expected_server_id, attempts=5):
    """
    Verify that reconnections go to the same backend server.
    Critical for WebSocket applications with server-side state.

    Opens ``attempts`` consecutive connections, reads the first message from
    each, and compares the ``server_id`` fields they report.

    Args:
        ws_url: WebSocket URL to connect to.
        expected_server_id: Backend the connections are expected to reach.
            Pass ``None`` to skip that comparison.
        attempts: Number of consecutive connections to open (default 5).

    Returns:
        When more than one backend was seen: a dict with
        ``sticky_sessions_working`` False, ``server_ids_seen`` and ``message``.
        Otherwise a dict with ``sticky_sessions_working`` True, the
        ``server_id`` reached, and ``matches_expected`` telling whether it is
        the expected backend.

    Raises:
        ValueError: If ``attempts`` is less than 1.
    """
    # json was used but never imported by the original listing; import it
    # locally alongside the WebSocket client.
    import json
    import websocket

    if attempts < 1:
        raise ValueError("attempts must be at least 1")

    server_ids = []
    for _ in range(attempts):
        ws = websocket.create_connection(ws_url)
        try:
            # Get server ID from initial message (if your app sends one)
            data = json.loads(ws.recv())
            server_ids.append(data.get('server_id'))
        finally:
            # Close the socket even when recv() or JSON parsing fails.
            ws.close()

    distinct_ids = set(server_ids)
    if len(distinct_ids) != 1:
        return {
            "sticky_sessions_working": False,
            "server_ids_seen": list(distinct_ids),
            "message": "Connections routing to multiple servers — check load balancer config"
        }

    server_id = server_ids[0]
    return {
        "sticky_sessions_working": True,
        "server_id": server_id,
        # Extra key so the previously-ignored expected_server_id is honoured
        # without changing the meaning of the existing keys.
        "matches_expected": expected_server_id is None or server_id == expected_server_id
    }
External WebSocket Monitoring
Configure AzMonitor-style external WebSocket monitoring:
# WebSocket monitor configuration
# External synthetic check: connect, send a ping message, wait for a pong
# reply, record latency, then disconnect expecting close code 1000.
# NOTE(review): indentation was lost in this listing; name/type/url/timeout,
# steps and assertions are presumably nested under `monitor` — restore the
# nesting before using this file.
monitor:
name: "WebSocket Server - Health"
type: websocket
url: "wss://ws.example.com/health"
# presumably milliseconds, like the *_ms fields below — confirm
timeout: 10000
steps:
- action: connect
assert_connected_within_ms: 1000
- action: send
message: '{"type": "ping", "id": "monitor-check"}'
- action: receive
timeout_ms: 3000
# NOTE(review): if this is a literal substring match, a reply serialized
# without the space ({"type":"pong"}) would not match — confirm the matcher
# semantics or loosen the pattern.
assert_message_contains: '"type": "pong"'
- action: measure_latency
- action: disconnect
expected_close_code: 1000
assertions:
- type: connection_latency
operator: less_than
value: 500 # ms
- type: message_latency
operator: less_than
value: 200 # ms
Conclusion
WebSocket monitoring requires thinking about connection lifecycle, message latency, and close code distribution rather than just HTTP status codes. Build server-side instrumentation that tracks active connections, message throughput, and abnormal closes. Create health check scripts that test the full WebSocket lifecycle. Configure load balancers for sticky sessions and verify they're working. Use external monitoring to continuously verify your WebSocket endpoint is reachable and responsive. AzMonitor's monitoring can verify your WebSocket upgrade handshake is working and that your service responds to connection attempts from multiple global locations, providing the external health check layer for your real-time infrastructure.
3 monitors free forever · No credit card needed · Set up in 2 minutes
Start monitoring free →