Node.js monitoring has some unique considerations compared to other runtimes. The single-threaded event loop means one CPU-intensive operation can block all requests. Memory management behaves differently, and V8's garbage collection can cause noticeable latency spikes. External monitoring catches availability issues; internal instrumentation reveals what's happening inside your process. You need both.
Building a Health Endpoint
Every Node.js application should expose a health endpoint that monitoring tools can check:
// health.js - Comprehensive health endpoint
const express = require('express');
const router = express.Router();
/**
 * Probe the database with a trivial query and time the round trip.
 *
 * @param {{ query: Function }} db - Client exposing an awaitable `query(sql)`.
 * @returns {Promise<object>} `{ status: 'healthy', latency_ms }` when the
 *   query succeeds, otherwise `{ status: 'unhealthy', error, latency_ms }`.
 */
async function checkDatabase(db) {
  const startedAt = Date.now();
  const elapsedMs = () => Date.now() - startedAt;

  try {
    await db.query('SELECT 1');
  } catch (error) {
    // A failed probe is reported as data, not thrown, so the caller can
    // still assemble a full health report.
    return { status: 'unhealthy', error: error.message, latency_ms: elapsedMs() };
  }

  return { status: 'healthy', latency_ms: elapsedMs() };
}
/**
 * Ping the Redis/cache client and time the round trip.
 *
 * A failing cache is reported as 'degraded' rather than 'unhealthy'.
 *
 * @param {{ ping: Function }} redis - Client exposing an awaitable `ping()`.
 * @returns {Promise<object>} `{ status: 'healthy', latency_ms }` when the ping
 *   succeeds, otherwise `{ status: 'degraded', error, latency_ms }`.
 */
async function checkCache(redis) {
  const startedAt = Date.now();
  let outcome;

  try {
    await redis.ping();
    outcome = { status: 'healthy' };
  } catch (error) {
    outcome = { status: 'degraded', error: error.message };
  }

  // Latency is appended last so the key order matches on both paths.
  return { ...outcome, latency_ms: Date.now() - startedAt };
}
/**
 * Probe an external HTTP dependency with a GET request capped at 2 seconds.
 *
 * @param {string} url - Endpoint to request.
 * @returns {Promise<object>} `{ status, http_status, latency_ms }` where
 *   `status` is 'healthy' for an ok response and 'degraded' otherwise, or
 *   `{ status: 'unhealthy', error, latency_ms }` when the request itself
 *   fails (network error, timeout/abort).
 */
async function checkExternalService(url) {
  const startedAt = Date.now();
  const elapsedMs = () => Date.now() - startedAt;

  try {
    const requestOptions = { signal: AbortSignal.timeout(2000) };
    const { ok, status: httpStatus } = await fetch(url, requestOptions);
    const status = ok ? 'healthy' : 'degraded';
    return { status, http_status: httpStatus, latency_ms: elapsedMs() };
  } catch (error) {
    return { status: 'unhealthy', error: error.message, latency_ms: elapsedMs() };
  }
}
/**
 * GET /health — combine dependency checks and process statistics into a
 * single JSON report.
 *
 * Responds 503 when any check is 'unhealthy'; 'degraded' and 'healthy' both
 * respond 200, with the distinction carried in the body's `status` field.
 * Reads the database and Redis clients from `req.app.locals`.
 */
router.get('/health', async (req, res) => {
  const { db, redis } = req.app.locals;

  // The checks are independent of one another and each reports its own
  // failures as a status object, so run them concurrently. The endpoint then
  // takes as long as the slowest check instead of the sum of all three.
  const [database, cache, payment_service] = await Promise.all([
    checkDatabase(db),
    checkCache(redis),
    checkExternalService('https://api.stripe.com/healthcheck'),
  ]);
  const checks = { database, cache, payment_service };

  // Process memory usage, converted from bytes to whole megabytes.
  const toMb = (bytes) => Math.round(bytes / 1024 / 1024);
  const memUsage = process.memoryUsage();
  const systemInfo = {
    uptime_seconds: Math.floor(process.uptime()),
    memory_heap_used_mb: toMb(memUsage.heapUsed),
    memory_heap_total_mb: toMb(memUsage.heapTotal),
    memory_rss_mb: toMb(memUsage.rss),
    node_version: process.version,
    pid: process.pid,
  };

  // Overall status is the worst individual status.
  const statuses = Object.values(checks).map((check) => check.status);
  let overallStatus = 'healthy';
  if (statuses.includes('unhealthy')) {
    overallStatus = 'unhealthy';
  } else if (statuses.includes('degraded')) {
    overallStatus = 'degraded';
  }

  const statusCode = overallStatus === 'unhealthy' ? 503 : 200;
  res.status(statusCode).json({
    status: overallStatus,
    timestamp: new Date().toISOString(),
    checks,
    system: systemInfo,
  });
});
module.exports = router;
Event Loop Monitoring
The Node.js event loop is a critical performance indicator. A blocked event loop means all requests are delayed:
// event-loop-monitor.js
const { monitorEventLoopDelay } = require('perf_hooks');
/**
 * Thin wrapper around perf_hooks' event-loop delay histogram that reports
 * its statistics in milliseconds.
 */
class EventLoopMonitor {
  /**
   * @param {number} [sampleInterval=100] - Passed to `monitorEventLoopDelay`
   *   as its `resolution` option.
   */
  constructor(sampleInterval = 100) {
    const histogram = monitorEventLoopDelay({ resolution: sampleInterval });
    histogram.enable();
    this.histogram = histogram;
  }

  /**
   * Snapshot the delay statistics recorded since the histogram was enabled
   * or last reset.
   *
   * @returns {object} min/max/mean and p50/p95/p99 delays (each raw value
   *   divided by 1e6), plus two booleans flagging whether p99 is above 10
   *   and above 100.
   */
  getMetrics() {
    const { histogram } = this;
    const toMs = (rawValue) => rawValue / 1e6;
    const p99Ms = toMs(histogram.percentile(99));

    return {
      min_ms: toMs(histogram.min),
      max_ms: toMs(histogram.max),
      mean_ms: toMs(histogram.mean),
      p50_ms: toMs(histogram.percentile(50)),
      p95_ms: toMs(histogram.percentile(95)),
      p99_ms: p99Ms,
      // Exceeds threshold = event loop is blocking
      exceeds_10ms: p99Ms > 10,
      exceeds_100ms: p99Ms > 100,
    };
  }

  /** Discard all recorded samples so the next snapshot covers a fresh window. */
  reset() {
    this.histogram.reset();
  }
}
// One shared monitor feeds both the endpoint and the periodic alert below.
const monitor = new EventLoopMonitor();

// Expose in metrics endpoint. The alert loop below resets the monitor every
// minute, so this reports statistics for the current window only.
app.get('/metrics/event-loop', (req, res) => {
  const snapshot = monitor.getMetrics();
  res.json(snapshot);
});

// Alert if event loop p99 exceeds 100ms, then start a fresh window.
setInterval(() => {
  const { p99_ms: p99DelayMs, max_ms: maxDelayMs } = monitor.getMetrics();
  const isBlocked = p99DelayMs > 100;

  if (isBlocked) {
    console.error('[EVENT_LOOP_BLOCKED]', {
      p99_delay_ms: p99DelayMs,
      max_delay_ms: maxDelayMs,
    });

    // Send alert to monitoring system
    alerting.send({
      severity: 'warning',
      name: 'Event Loop Blocked',
      value: p99DelayMs,
      threshold: 100,
    });
  }

  monitor.reset();
}, 60000); // Check every minute
Memory Monitoring
Node.js memory issues can cause gradual degradation before OOM crashes:
// memory-monitor.js
/**
 * Periodically samples `process.memoryUsage()`, emits gauges through the
 * ambient `metrics` client, and logs when used heap crosses the warning or
 * critical threshold.
 */
class MemoryMonitor {
  /**
   * @param {object} [options]
   * @param {number} [options.heapWarningMB=512] - Used-heap size (MB) above
   *   which `[MEMORY_HIGH]` is logged.
   * @param {number} [options.heapCriticalMB=768] - Used-heap size (MB) above
   *   which `[MEMORY_CRITICAL]` is logged.
   * @param {number} [options.checkInterval=30000] - Milliseconds between
   *   checks.
   */
  constructor(options = {}) {
    // `??` keeps an explicit 0 threshold instead of silently replacing it
    // with the default.
    this.heapWarningThreshold = options.heapWarningMB ?? 512;
    this.heapCriticalThreshold = options.heapCriticalMB ?? 768;
    // A 0 interval is not usable, so any falsy value falls back to 30 seconds.
    this.checkInterval = options.checkInterval || 30000;
    this.timer = null;
    this.startMonitoring();
  }

  /** Start the periodic check. Does nothing if it is already running. */
  startMonitoring() {
    if (this.timer !== null) {
      return;
    }
    this.timer = setInterval(() => {
      this.check();
    }, this.checkInterval);
    // Monitoring alone should not keep the process alive once all real work
    // (servers, sockets, other timers) has finished.
    this.timer.unref();
  }

  /** Stop the periodic check. Safe to call more than once. */
  stop() {
    if (this.timer !== null) {
      clearInterval(this.timer);
      this.timer = null;
    }
  }

  /**
   * Take one memory sample, emit gauges, and log if a threshold is exceeded.
   *
   * @returns {{ heapUsedMB: number, heapTotalMB: number, heapUsedPct: number }}
   *   `heapUsedPct` is relative to `heapTotal` from `process.memoryUsage()`,
   *   not to a configured heap limit.
   */
  check() {
    const bytesPerMb = 1024 * 1024;
    const mem = process.memoryUsage();
    const heapUsedMB = mem.heapUsed / bytesPerMb;
    const heapTotalMB = mem.heapTotal / bytesPerMb;
    const heapUsedPct = (mem.heapUsed / mem.heapTotal) * 100;

    // Emit metrics
    metrics.gauge('nodejs.memory.heap_used_mb', heapUsedMB);
    metrics.gauge('nodejs.memory.heap_total_mb', heapTotalMB);
    metrics.gauge('nodejs.memory.heap_used_pct', heapUsedPct);
    metrics.gauge('nodejs.memory.rss_mb', mem.rss / bytesPerMb);
    metrics.gauge('nodejs.memory.external_mb', mem.external / bytesPerMb);

    // Alert on thresholds
    if (heapUsedMB > this.heapCriticalThreshold) {
      console.error('[MEMORY_CRITICAL]', {
        heap_used_mb: heapUsedMB,
        heap_total_mb: heapTotalMB,
      });
      // Consider taking heap snapshot for analysis
      // and potentially restarting the process
    } else if (heapUsedMB > this.heapWarningThreshold) {
      console.warn('[MEMORY_HIGH]', {
        heap_used_mb: heapUsedMB,
      });
    }

    return { heapUsedMB, heapTotalMB, heapUsedPct };
  }
}
// Start sampling right away: warn above 512 MB of used heap, escalate to
// critical above 768 MB, and re-check every 30 seconds.
const memoryMonitor = new MemoryMonitor({
  checkInterval: 30 * 1000,
  heapWarningMB: 512,
  heapCriticalMB: 768,
});
Request/Response Monitoring with Middleware
Add comprehensive request monitoring as Express middleware:
// monitoring-middleware.js
const onHeaders = require('on-headers');
/**
 * Build Express middleware that records per-request timing metrics.
 *
 * For every request it registers an `onHeaders` hook on the response. When
 * the hook fires it records an `http.request.duration` histogram sample,
 * increments `http.requests.total`, and logs `[SLOW_REQUEST]` for requests
 * slower than the threshold.
 *
 * @param {object} [options]
 * @param {number} [options.slowThresholdMs=1000] - Requests that take longer
 *   than this many milliseconds are logged as slow.
 * @returns {Function} Express middleware `(req, res, next)`.
 */
function monitoringMiddleware(options = {}) {
  const slowThresholdMs = options?.slowThresholdMs ?? 1000;

  return function (req, res, next) {
    // Monotonic clock: unaffected by wall-clock adjustments mid-request.
    const startedAt = process.hrtime.bigint();

    // Capture response details
    onHeaders(res, () => {
      const durationMs = Number(process.hrtime.bigint() - startedAt) / 1e6;
      const { method } = req;
      const { statusCode } = res;

      // Record metrics
      metrics.histogram('http.request.duration', durationMs, {
        method,
        // Prefer the matched route pattern; fall back to the raw path when
        // no route matched.
        route: req.route?.path || req.path,
        status_code: statusCode,
        status_class: `${Math.floor(statusCode / 100)}xx`,
      });
      metrics.increment('http.requests.total', {
        method,
        status_code: statusCode,
      });

      // Log slow requests
      if (durationMs > slowThresholdMs) {
        console.warn('[SLOW_REQUEST]', {
          method,
          url: req.url,
          duration_ms: durationMs,
          status: statusCode,
          request_id: req.headers['x-request-id'],
        });
      }
    });

    next();
  };
}
// Apply to all routes. Express runs middleware in registration order, so
// register this before any routes that should be measured.
app.use(monitoringMiddleware());
Prometheus Metrics for Node.js
For teams using Prometheus, prom-client provides excellent Node.js metrics:
// prometheus-metrics.js
const client = require('prom-client');
// Enable default metrics (event loop lag, GC, memory, etc.)
// NOTE(review): prom-client's built-in metric names already begin with
// `nodejs_` or `process_`, so this prefix presumably produces names such as
// `nodejs_nodejs_eventloop_lag_seconds` — confirm that is intended.
client.collectDefaultMetrics({
prefix: 'nodejs_',
// Bucket boundaries for the GC duration histogram.
gcDurationBuckets: [0.001, 0.01, 0.1, 1, 2, 5],
});
// Custom application metrics
// These are only defined here; nothing in this snippet records values into
// them, so code that imports them must call observe/inc/set itself.
// Request latency histogram, labelled by method, route and status code.
const httpRequestDuration = new client.Histogram({
name: 'http_request_duration_seconds',
help: 'Duration of HTTP requests in seconds',
labelNames: ['method', 'route', 'status_code'],
buckets: [0.1, 0.5, 1, 2, 5, 10]
});
// Gauge for the number of currently active HTTP connections.
const activeConnections = new client.Gauge({
name: 'http_active_connections',
help: 'Number of active HTTP connections'
});
// Request counter, labelled by method and status code.
const requestsTotal = new client.Counter({
name: 'http_requests_total',
help: 'Total number of HTTP requests',
labelNames: ['method', 'status_code']
});
// Expose metrics endpoint
// Serialises every metric in the default registry. `metrics()` is async, so
// a failure is caught and handed to Express via `next`. Without that, a
// rejected promise in this handler would leave the request hanging on
// Express versions that do not forward async rejections automatically.
app.get('/metrics', async (req, res, next) => {
  try {
    const body = await client.register.metrics();
    res.set('Content-Type', client.register.contentType);
    res.end(body);
  } catch (error) {
    next(error);
  }
});
module.exports = {
httpRequestDuration,
activeConnections,
requestsTotal
};
Error Rate Monitoring
Track unhandled errors and exceptions:
// error-monitoring.js
// Capture unhandled promise rejections.
// `reason` may be any value, not just an Error, so every property read is
// guarded. `promise` (the rejected promise) is accepted but not used.
process.on('unhandledRejection', (reason, promise) => {
  const message = reason?.message || String(reason);
  const details = { reason: message, stack: reason?.stack };
  console.error('[UNHANDLED_REJECTION]', details);

  const errorType = reason?.constructor?.name || 'Unknown';
  metrics.increment('nodejs.errors.unhandled_rejection', { error_type: errorType });

  // Optional: terminate on unhandled rejection (recommended in production)
  // process.exit(1);
});
// Capture uncaught exceptions.
// JavaScript can throw any value (`throw null`, `throw 'text'`), so the
// thrown value is read defensively; otherwise the handler itself would throw
// on a non-Error value.
process.on('uncaughtException', (error) => {
  try {
    console.error('[UNCAUGHT_EXCEPTION]', {
      error: error?.message ?? String(error),
      stack: error?.stack,
    });
    metrics.increment('nodejs.errors.uncaught_exception', {
      error_type: error?.constructor?.name || 'Unknown',
    });
  } finally {
    // Always exit on uncaught exception - process is in unknown state.
    // `finally` guarantees the exit even if logging or metrics throw.
    process.exit(1);
  }
});
/**
 * Express error handler (4-argument signature required).
 *
 * Logs the error with request context, counts it in `http.errors.total`, and
 * answers with a JSON body. Messages of errors with a status below 500 are
 * returned to the client; anything else is masked as
 * 'Internal server error'.
 *
 * @param {Error} error - Error passed to `next(error)` or thrown by a route.
 *   Its `statusCode` property, when set, selects the HTTP status (default 500).
 * @param {object} req - Express request.
 * @param {object} res - Express response.
 * @param {Function} next - Receives the error when the response has already
 *   started and can no longer be rewritten.
 */
function errorHandler(error, req, res, next) {
  const statusCode = error.statusCode || 500;

  console.error('[EXPRESS_ERROR]', {
    error: error.message,
    stack: error.stack,
    method: req.method,
    url: req.url,
    request_id: req.headers['x-request-id'],
  });
  metrics.increment('http.errors.total', {
    error_type: error.constructor.name,
    status_code: statusCode,
  });

  // Once headers are on the wire the status and body cannot be changed, so
  // delegate to Express's default handler, which closes the connection.
  if (res.headersSent) {
    next(error);
    return;
  }

  res.status(statusCode).json({
    error: statusCode < 500 ? error.message : 'Internal server error',
  });
}
// Express expects error-handling middleware to be registered last, after all
// routes and other app.use() calls.
app.use(errorHandler);
External Monitoring Configuration for Node.js
Configure AzMonitor to check your Node.js app from the outside:
monitors:
# Basic availability
# NOTE(review): `localhost` only resolves on the machine running the app; an
# external monitor presumably needs the app's public URL here — confirm.
- name: "Node.js App - Health"
url: "http://localhost:3000/health"
method: GET
# NOTE(review): interval unit is presumably seconds — confirm.
interval: 60
assertions:
- type: status_code
value: 200
# The /health route answers 200 for both 'healthy' and 'degraded', so this
# body assertion is what flags a degraded dependency.
- type: json_path
path: "$.status"
operator: equals
value: "healthy"
- type: response_time
operator: less_than
value: 2000
# API endpoint check
# NOTE(review): no `interval` is set for this monitor, unlike the one above —
# confirm the default is acceptable.
- name: "Node.js API - Users"
url: "https://api.example.com/v1/users"
method: GET
headers:
Authorization: "Bearer ${MONITOR_TOKEN}"
assertions:
- type: status_code
value: 200
- type: json_path
path: "$.data"
operator: is_array
- type: response_time
operator: less_than
value: 500
Graceful Shutdown Monitoring
Make sure your Node.js app shuts down gracefully and logs each step of the shutdown, so that restarts and deploys (Kubernetes, PM2) don't drop in-flight requests:
// graceful-shutdown.js
// Start listening and keep the returned server handle; the shutdown routine
// below calls server.close() on it to stop accepting connections.
const server = app.listen(PORT, () => {
console.log(`Server started on port ${PORT}`);
});
// Set when the first signal arrives so repeated signals do not start a
// second shutdown sequence.
let isShuttingDown = false;

/**
 * Shut the process down in order: stop accepting connections, wait for
 * active requests, close the database and cache clients, then exit.
 *
 * Exits with code 0 on a clean shutdown and 1 if any step fails or the
 * sequence takes longer than 30 seconds.
 *
 * @param {string} signal - Name of the signal that triggered the shutdown
 *   (used for logging only).
 */
function gracefulShutdown(signal) {
  if (isShuttingDown) return;
  isShuttingDown = true;
  console.log(`[SHUTDOWN] Received ${signal}, shutting down gracefully...`);

  // Force exit after 30 seconds. Armed first so it covers every step below;
  // unref'd so this timer alone never keeps the process alive.
  const forceExitTimer = setTimeout(() => {
    console.error('[SHUTDOWN] Forced shutdown after 30s timeout');
    process.exit(1);
  }, 30000);
  forceExitTimer.unref();

  // Stop accepting new connections. The callback runs once every connection
  // has ended, and receives an error if the server was not running.
  server.close(async (closeError) => {
    if (closeError) {
      console.error('[SHUTDOWN] Error closing HTTP server:', closeError.message);
    } else {
      console.log('[SHUTDOWN] HTTP server closed');
    }

    try {
      // Finish in-flight requests
      await waitForActiveRequests();

      // Close database connections
      await db.end();
      console.log('[SHUTDOWN] Database connections closed');

      // Close cache connections
      await redis.quit();
      console.log('[SHUTDOWN] Cache connections closed');

      console.log('[SHUTDOWN] Graceful shutdown complete');
      process.exit(closeError ? 1 : 0);
    } catch (error) {
      console.error('[SHUTDOWN] Error during shutdown:', error.message);
      process.exit(1);
    }
  });

  // Idle keep-alive sockets would otherwise hold the server open until the
  // clients disconnect or the force-exit timer fires. Guarded because the
  // method is absent on older Node versions.
  server.closeIdleConnections?.();
}
// Trigger the same shutdown routine for both termination signals, passing
// the signal name through.
for (const signalName of ['SIGTERM', 'SIGINT']) {
  process.on(signalName, () => gracefulShutdown(signalName));
}
Node.js Monitoring Checklist
| Item | Implemented | Notes |
|---|---|---|
| Health endpoint at /health | | Should check all dependencies |
| Event loop lag monitoring | | Alert if p99 > 100ms |
| Memory monitoring | | Alert if heap > 80% of limit |
| Request duration histogram | | Track p95 and p99 |
| Error rate counter | | Track 4xx and 5xx separately |
| Unhandled rejection capture | | |
| Prometheus metrics endpoint | | If using Prometheus |
| External health check | | Via AzMonitor or similar |
| Graceful shutdown | | Critical for zero-downtime deploys |
| Process restart monitoring | | Alert if process restarts frequently |
Conclusion
Node.js monitoring requires attention to event loop health (unique to Node's architecture), memory patterns (garbage collection effects), and the full request lifecycle. Start with a comprehensive health endpoint that checks all dependencies, add middleware to track request metrics, and set up external monitoring to catch failures from the user's perspective. AzMonitor handles the external visibility — knowing your Node.js app is reachable and responding correctly — while internal instrumentation reveals the application-level details you need for optimization and debugging.
3 monitors free forever · No credit card needed · Set up in 2 minutes
Start monitoring free →