Go's concurrency model, efficient memory usage, and compiled nature make it excellent for high-performance services. But these same characteristics create unique monitoring requirements: goroutine leaks can silently exhaust resources, GC pressure affects latency in measurable ways, and channels blocking indefinitely can freeze services without triggering obvious errors. Here's how to build comprehensive observability for Go services.
Building the Health Endpoint
A well-structured health endpoint is the foundation of Go service monitoring:
// health.go
package main
import (
"context"
"encoding/json"
"net/http"
"runtime"
"time"
"database/sql"
)
// HealthStatus is the JSON document the health endpoint writes back.
type HealthStatus struct {
	// Status is the aggregate verdict; the handler sets it to "healthy",
	// "degraded" or "unhealthy".
	Status string `json:"status"`
	// Timestamp is the time the report was built, formatted as RFC 3339 in UTC.
	Timestamp string `json:"timestamp"`
	// Version is the application version string.
	Version string `json:"version"`
	// Uptime is the number of seconds elapsed since startTime.
	Uptime float64 `json:"uptime_seconds"`
	// Checks holds one result per dependency or probe, keyed by its name.
	Checks map[string]CheckResult `json:"checks"`
	// Runtime is a snapshot of Go runtime statistics.
	Runtime RuntimeInfo `json:"runtime"`
}
// CheckResult is the outcome of a single health probe.
type CheckResult struct {
	// Status is the verdict for this probe, e.g. "healthy", "degraded" or
	// "unhealthy".
	Status string `json:"status"`
	// LatencyMs is how long the probe took, in whole milliseconds. It is
	// dropped from the JSON when zero.
	LatencyMs int64 `json:"latency_ms,omitempty"`
	// Error carries the failure message. It is dropped from the JSON when
	// empty.
	Error string `json:"error,omitempty"`
}
// RuntimeInfo is a point-in-time snapshot of Go runtime statistics that is
// embedded in the health report.
type RuntimeInfo struct {
	// GoVersion is the value of runtime.Version().
	GoVersion string `json:"go_version"`
	// NumGoroutines is the value of runtime.NumGoroutine().
	NumGoroutines int `json:"num_goroutines"`
	// NumCPU is the value of runtime.NumCPU().
	NumCPU int `json:"num_cpu"`
	// GOMAXPROCS is the current setting, read with runtime.GOMAXPROCS(0).
	GOMAXPROCS int `json:"gomaxprocs"`
	// GCNumRuns is MemStats.NumGC.
	GCNumRuns uint32 `json:"gc_num_runs"`
	// AllocMB is MemStats.Alloc divided down to whole mebibytes.
	AllocMB uint64 `json:"alloc_mb"`
	// SysMB is MemStats.Sys divided down to whole mebibytes.
	SysMB uint64 `json:"sys_mb"`
}
// startTime is captured when the package is initialised; the health endpoint
// reports uptime relative to it.
var startTime = time.Now()

// healthHandler returns an http.HandlerFunc that reports service health as
// JSON.
//
// For every request it:
//   - pings db with a 5 second deadline derived from the request context and
//     records the result under checks["database"];
//   - snapshots Go runtime statistics;
//   - adds a "goroutines" check with status "degraded" when the goroutine
//     count exceeds 10000.
//
// The response status is 503 when the overall status is "unhealthy" and 200
// otherwise ("healthy" or "degraded"). If the report cannot be encoded the
// handler answers 500 instead of sending a success status with a broken body.
func healthHandler(db *sql.DB) http.HandlerFunc {
	const (
		checkTimeout           = 5 * time.Second
		goroutineWarnThreshold = 10000
		bytesPerMB             = 1024 * 1024
	)

	return func(w http.ResponseWriter, r *http.Request) {
		ctx, cancel := context.WithTimeout(r.Context(), checkTimeout)
		defer cancel()

		checks := make(map[string]CheckResult)
		overallStatus := "healthy"

		// Database check
		dbStart := time.Now()
		if err := db.PingContext(ctx); err != nil {
			checks["database"] = CheckResult{
				Status: "unhealthy",
				Error:  err.Error(),
			}
			overallStatus = "unhealthy"
		} else {
			checks["database"] = CheckResult{
				Status:    "healthy",
				LatencyMs: time.Since(dbStart).Milliseconds(),
			}
		}

		// Runtime info. Note that runtime.ReadMemStats briefly stops the
		// world, so this endpoint should not be polled at a very high rate.
		var memStats runtime.MemStats
		runtime.ReadMemStats(&memStats)
		runtimeInfo := RuntimeInfo{
			GoVersion:     runtime.Version(),
			NumGoroutines: runtime.NumGoroutine(),
			NumCPU:        runtime.NumCPU(),
			GOMAXPROCS:    runtime.GOMAXPROCS(0),
			GCNumRuns:     memStats.NumGC,
			AllocMB:       memStats.Alloc / bytesPerMB,
			SysMB:         memStats.Sys / bytesPerMB,
		}

		// Goroutine leak detection: a high count degrades the service but
		// never masks an "unhealthy" verdict from an earlier check.
		if runtimeInfo.NumGoroutines > goroutineWarnThreshold {
			checks["goroutines"] = CheckResult{
				Status: "degraded",
				Error:  "goroutine count unexpectedly high",
			}
			if overallStatus == "healthy" {
				overallStatus = "degraded"
			}
		}

		status := HealthStatus{
			Status:    overallStatus,
			Timestamp: time.Now().UTC().Format(time.RFC3339),
			Version:   AppVersion,
			Uptime:    time.Since(startTime).Seconds(),
			Checks:    checks,
			Runtime:   runtimeInfo,
		}

		statusCode := http.StatusOK
		if overallStatus == "unhealthy" {
			statusCode = http.StatusServiceUnavailable
		}

		// Encode before touching the response so an encoding failure can
		// still be reported with a proper error status.
		body, err := json.Marshal(status)
		if err != nil {
			http.Error(w, "failed to encode health status", http.StatusInternalServerError)
			return
		}

		w.Header().Set("Content-Type", "application/json")
		w.WriteHeader(statusCode)
		// The trailing newline keeps the body identical to what
		// json.Encoder.Encode produced. A write error here means the client
		// went away; there is nothing useful left to do with it.
		_, _ = w.Write(append(body, '\n'))
	}
}
Prometheus Metrics with promhttp
Go's Prometheus client library is excellent and widely used:
// metrics.go
package main
import (
	"fmt"
	"log"
	"net/http"
	"runtime"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Package-level Prometheus collectors. All of them are created through
// promauto, which registers each collector with the default registry as it is
// constructed.
var (
	// httpRequestsTotal counts handled HTTP requests, partitioned by method,
	// path and response status code.
	httpRequestsTotal = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "http_requests_total",
			Help: "Total number of HTTP requests",
		},
		[]string{"method", "path", "status_code"},
	)

	// httpRequestDuration observes request latency in seconds, partitioned by
	// method and path. Buckets run from 5ms to 10s.
	httpRequestDuration = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "http_request_duration_seconds",
			Help:    "HTTP request duration in seconds",
			Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10},
		},
		[]string{"method", "path"},
	)

	// goroutineCount holds the most recently sampled number of goroutines; it
	// is refreshed periodically by a background sampler.
	goroutineCount = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "go_goroutines_current",
		Help: "Current number of goroutines",
	})

	// Business metrics.

	// activeUsers is a gauge for the number of currently active users.
	activeUsers = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "app_active_users",
		Help: "Number of currently active users",
	})

	// paymentProcessed counts processed payments, partitioned by outcome
	// status and payment method.
	paymentProcessed = promauto.NewCounterVec(
		prometheus.CounterOpts{
			Name: "app_payments_total",
			Help: "Total number of payments processed",
		},
		[]string{"status", "method"},
	)

	// paymentAmount observes payment amounts in dollars, partitioned by
	// payment method.
	paymentAmount = promauto.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "app_payment_amount_dollars",
			Help:    "Payment amounts in dollars",
			Buckets: []float64{1, 5, 10, 50, 100, 500, 1000},
		},
		[]string{"method"},
	)
)
// metricsMiddleware wraps next so that every request increments
// httpRequestsTotal and records its latency in httpRequestDuration.
//
// The response writer is wrapped in a statusResponseWriter (defaulting to
// 200) so the status code written by next can be used as a label.
//
// NOTE(review): the raw URL path is used as a label value. If paths embed
// identifiers (e.g. /users/123) this produces one time series per distinct
// path — consider labelling with the route pattern instead.
func metricsMiddleware(next http.Handler) http.Handler {
	instrumented := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()

		// Capture the status code the downstream handler writes.
		rec := &statusResponseWriter{ResponseWriter: w, status: http.StatusOK}
		next.ServeHTTP(rec, r)

		elapsed := time.Since(began).Seconds()
		code := fmt.Sprintf("%d", rec.status)
		method, path := r.Method, r.URL.Path

		httpRequestsTotal.WithLabelValues(method, path, code).Inc()
		httpRequestDuration.WithLabelValues(method, path).Observe(elapsed)
	}
	return http.HandlerFunc(instrumented)
}
// statusResponseWriter wraps an http.ResponseWriter and remembers the final
// status code of the response so middleware can read it after the handler
// returns.
//
// Callers construct it with status preset to the implicit default, e.g.
// &statusResponseWriter{ResponseWriter: w, status: http.StatusOK}.
type statusResponseWriter struct {
	http.ResponseWriter
	// status is the final (non-informational) status code of the response.
	status int
	// wroteHeader is set once the final status has been decided, either by an
	// explicit WriteHeader or implicitly by the first Write.
	wroteHeader bool
}

// WriteHeader records status and forwards the call to the wrapped writer.
//
// Only the first final status is recorded: net/http ignores later WriteHeader
// calls, so recording them would make metrics and logs disagree with what the
// client actually received. 1xx informational codes (other than 101 Switching
// Protocols) may precede the final header and are therefore forwarded but not
// recorded.
func (w *statusResponseWriter) WriteHeader(status int) {
	informational := status >= 100 && status <= 199 && status != http.StatusSwitchingProtocols
	if !w.wroteHeader && !informational {
		w.status = status
		w.wroteHeader = true
	}
	// Always forward so the underlying writer keeps its own behaviour
	// (including its diagnostics for superfluous calls).
	w.ResponseWriter.WriteHeader(status)
}

// Write forwards p to the wrapped writer. A Write without a preceding
// WriteHeader commits the response with an implicit 200, so the recorded
// status is frozen at that point.
func (w *statusResponseWriter) Write(p []byte) (int, error) {
	if !w.wroteHeader {
		if w.status == 0 {
			w.status = http.StatusOK
		}
		w.wroteHeader = true
	}
	return w.ResponseWriter.Write(p)
}

// Unwrap returns the wrapped writer so that http.ResponseController can reach
// optional capabilities (flush, hijack, deadlines) of the original writer.
func (w *statusResponseWriter) Unwrap() http.ResponseWriter {
	return w.ResponseWriter
}
// startGoroutineMonitoring publishes the current goroutine count to the
// goroutineCount gauge, once immediately and then every 15 seconds from a
// background goroutine. Whenever a sample exceeds 10000 a warning is logged.
//
// The first sample is taken synchronously so the gauge does not report 0
// during the first interval after start-up.
//
// The background goroutine and its ticker live for the rest of the process;
// call this function once.
func startGoroutineMonitoring() {
	const (
		sampleInterval    = 15 * time.Second
		leakWarnThreshold = 10000
	)

	sample := func() {
		count := runtime.NumGoroutine()
		goroutineCount.Set(float64(count))
		if count > leakWarnThreshold {
			log.Printf("[GOROUTINE_LEAK_WARNING] Goroutine count: %d", count)
		}
	}

	sample()

	ticker := time.NewTicker(sampleInterval)
	go func() {
		for range ticker.C {
			sample()
		}
	}()
}
// setupMetrics mounts the Prometheus scrape handler on mux at /metrics and
// then starts the background goroutine-count sampler.
func setupMetrics(mux *http.ServeMux) {
	scrapeHandler := promhttp.Handler()
	mux.Handle("/metrics", scrapeHandler)

	startGoroutineMonitoring()
}
Goroutine Leak Detection
Goroutine leaks are Go's most insidious production problem:
// goroutine-monitor.go
package main
import (
"runtime"
"runtime/pprof"
"bytes"
"strings"
"time"
"log"
)
// GoroutineMonitor samples the process-wide goroutine count every 30 seconds
// and, whenever it exceeds a threshold, logs the count together with a
// per-entry-function breakdown of all live goroutines.
type GoroutineMonitor struct {
	baseline  int           // goroutine count when the monitor was created
	threshold int           // counts above this trigger a warning and a dump
	ticker    *time.Ticker  // drives the sampling loop
	done      chan struct{} // closed by Stop to end the sampling loop
}

// NewGoroutineMonitor records the current goroutine count as the baseline and
// starts the background sampling loop. Call Stop to end it.
func NewGoroutineMonitor(threshold int) *GoroutineMonitor {
	m := &GoroutineMonitor{
		baseline:  runtime.NumGoroutine(),
		threshold: threshold,
		ticker:    time.NewTicker(30 * time.Second),
		done:      make(chan struct{}),
	}
	go m.monitor()
	return m
}

// Stop halts the ticker and ends the background sampling loop so the monitor
// does not itself leak a goroutine. Calling Stop again afterwards is a no-op.
// Stop must not be called from several goroutines at the same time.
func (m *GoroutineMonitor) Stop() {
	if m.done == nil {
		return
	}
	select {
	case <-m.done:
		return // already stopped
	default:
	}
	m.ticker.Stop()
	close(m.done)
}

// monitor is the sampling loop. It runs until Stop is called.
func (m *GoroutineMonitor) monitor() {
	for {
		select {
		case <-m.done:
			return
		case <-m.ticker.C:
		}

		current := runtime.NumGoroutine()
		if current <= m.threshold {
			continue
		}
		growth := current - m.baseline
		log.Printf("[GOROUTINE_WARNING] Count: %d (baseline: %d, growth: %d)",
			current, m.baseline, growth)
		// Dump a breakdown of goroutines for diagnosis
		m.dumpStacks()
	}
}

// dumpStacks logs how many live goroutines exist per entry function, which
// shows what kind of goroutine is piling up.
func (m *GoroutineMonitor) dumpStacks() {
	var buf bytes.Buffer
	if err := pprof.Lookup("goroutine").WriteTo(&buf, 1); err != nil {
		log.Printf("[GOROUTINE_STACKS] failed to write goroutine profile: %v", err)
		return
	}
	log.Printf("[GOROUTINE_STACKS] %v", countGoroutinesByEntry(buf.String()))
}

// countGoroutinesByEntry parses a goroutine profile in the debug=1 text
// format and returns the number of goroutines per entry function.
//
// In that format identical stacks are grouped. Each group starts with a line
// "<count> @ <pc> <pc> ..." and is followed by one
// "# <pc> <function>+<offset> <file>:<line>" line per frame, innermost frame
// first. The last frame of a group is therefore the function the goroutine
// was started with.
func countGoroutinesByEntry(profile string) map[string]int {
	counts := make(map[string]int)

	groupSize := 0 // goroutines sharing the stack currently being read
	entry := ""    // outermost frame seen so far in the current group
	flush := func() {
		if groupSize > 0 && entry != "" {
			counts[entry] += groupSize
		}
		groupSize, entry = 0, ""
	}

	for _, line := range strings.Split(profile, "\n") {
		line = strings.TrimSpace(line)
		if n, ok := parseGroupCount(line); ok {
			flush()
			groupSize = n
			continue
		}
		if fn, ok := parseFrameFunc(line); ok {
			entry = fn // keep overwriting: the last frame wins
		}
	}
	flush()
	return counts
}

// parseGroupCount extracts <count> from a "<count> @ ..." group header line.
// The digits are parsed by hand to stay within this file's existing imports.
func parseGroupCount(line string) (int, bool) {
	idx := strings.Index(line, " @")
	if idx <= 0 {
		return 0, false
	}
	n := 0
	for _, c := range line[:idx] {
		if c < '0' || c > '9' {
			return 0, false
		}
		n = n*10 + int(c-'0')
	}
	return n, true
}

// parseFrameFunc extracts the function name from a
// "# <pc> <function>+<offset> <file>:<line>" frame line. Other "#" lines,
// such as "# labels: ...", are rejected because their second field is not a
// hexadecimal program counter.
func parseFrameFunc(line string) (string, bool) {
	fields := strings.Fields(line)
	if len(fields) < 3 || fields[0] != "#" || !strings.HasPrefix(fields[1], "0x") {
		return "", false
	}
	fn := fields[2]
	if i := strings.LastIndex(fn, "+0x"); i > 0 {
		fn = fn[:i]
	}
	return fn, true
}
Database Connection Pool Monitoring
Database connection pool exhaustion is a common Go production issue:
// db-monitoring.go
package main
import (
"database/sql"
"time"
"github.com/prometheus/client_golang/prometheus"
)
// monitorDBPool exports connection-pool statistics of db as Prometheus
// metrics labelled with db=dbName and refreshes them every 15 seconds from a
// background goroutine. It also logs a warning when more than 80% of the
// configured maximum number of open connections is in use.
//
// The collectors are shared between calls: if they are already registered
// with the default registry (because another database is being monitored)
// the existing ones are reused instead of panicking on a duplicate
// registration. Call it once per database; a second call with the same
// dbName would count connection waits twice.
//
// The background goroutine runs for the remaining lifetime of the process.
func monitorDBPool(db *sql.DB, dbName string) {
	labelNames := []string{"db"}

	dbOpenConnections := registerDBCollector(prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "db_open_connections",
			Help: "Number of open database connections",
		},
		labelNames,
	)).(*prometheus.GaugeVec)
	dbInUseConnections := registerDBCollector(prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "db_in_use_connections",
			Help: "Number of in-use database connections",
		},
		labelNames,
	)).(*prometheus.GaugeVec)
	dbIdleConnections := registerDBCollector(prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "db_idle_connections",
			Help: "Number of idle database connections",
		},
		labelNames,
	)).(*prometheus.GaugeVec)
	dbWaitCount := registerDBCollector(prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "db_wait_count_total",
			Help: "Total number of connections waited for",
		},
		labelNames,
	)).(*prometheus.CounterVec)

	go func() {
		ticker := time.NewTicker(15 * time.Second)
		var lastWaitCount int64
		for range ticker.C {
			// Take one snapshot per tick so every figure below is consistent.
			stats := db.Stats()

			dbOpenConnections.WithLabelValues(dbName).Set(float64(stats.OpenConnections))
			dbInUseConnections.WithLabelValues(dbName).Set(float64(stats.InUse))
			dbIdleConnections.WithLabelValues(dbName).Set(float64(stats.Idle))

			// sql.DBStats.WaitCount is cumulative; feed only the increase
			// since the previous tick into the counter.
			if waitDelta := stats.WaitCount - lastWaitCount; waitDelta > 0 {
				dbWaitCount.WithLabelValues(dbName).Add(float64(waitDelta))
				lastWaitCount = stats.WaitCount
			}

			// Alert if pool is nearly exhausted. A maximum of 0 means the
			// pool is unlimited, so utilization is undefined.
			if maxOpen := stats.MaxOpenConnections; maxOpen > 0 {
				utilization := float64(stats.InUse) / float64(maxOpen)
				if utilization > 0.8 {
					log.Printf("[DB_POOL_HIGH] Utilization: %.1f%%", utilization*100)
				}
			}
		}
	}()
}

// registerDBCollector registers c with the default Prometheus registry and
// returns the collector to use: c itself, or the previously registered
// equivalent when one already exists. Any other registration error panics,
// matching the behaviour of prometheus.MustRegister.
func registerDBCollector(c prometheus.Collector) prometheus.Collector {
	err := prometheus.Register(c)
	if err == nil {
		return c
	}
	if already, ok := err.(prometheus.AlreadyRegisteredError); ok {
		return already.ExistingCollector
	}
	panic(err)
}
Structured Logging for Go
Structured logs accelerate incident diagnosis:
// logging.go
package main
import (
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"net/http"
"time"
)
// logger is the package-wide structured logger. It is nil until initLogger
// has been called.
var logger *zap.Logger

// initLogger builds the package-wide logger from zap's production
// configuration, with the time field renamed to "timestamp" and encoded as
// ISO 8601. It panics if the logger cannot be built.
func initLogger() {
	cfg := zap.NewProductionConfig()
	cfg.EncoderConfig.TimeKey = "timestamp"
	cfg.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder

	var buildErr error
	if logger, buildErr = cfg.Build(); buildErr != nil {
		panic(buildErr)
	}
}
// loggingMiddleware wraps next and emits one structured log entry per request
// once the handler has returned.
//
// The entry is logged at Error level for 5xx responses, Warn for 4xx and Info
// otherwise, and carries the method, path, status, duration, remote address,
// user agent and the X-Request-ID request header.
func loggingMiddleware(next http.Handler) http.Handler {
	handle := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()
		rec := &statusResponseWriter{ResponseWriter: w, status: 200}

		next.ServeHTTP(rec, r)
		elapsed := time.Since(began)

		// Pick the log level from the response status class.
		emit := logger.Info
		switch {
		case rec.status >= 500:
			emit = logger.Error
		case rec.status >= 400:
			emit = logger.Warn
		}

		emit("HTTP request",
			zap.String("method", r.Method),
			zap.String("path", r.URL.Path),
			zap.Int("status", rec.status),
			zap.Duration("duration", elapsed),
			zap.String("ip", r.RemoteAddr),
			zap.String("user_agent", r.UserAgent()),
			zap.String("request_id", r.Header.Get("X-Request-ID")),
		)
	}
	return http.HandlerFunc(handle)
}
pprof Profiling Endpoints
Enable pprof for on-demand profiling in production (protect with authentication):
// profiling.go (only enable with auth in production)
package main
import (
"net/http"
_ "net/http/pprof"
)
// setupProfiling mounts the net/http/pprof handlers on mux under
// /debug/pprof/, guarded by a shared secret: a request is served only when
// its X-Profile-Key header equals authKey; otherwise it gets 401.
//
// An empty authKey disables the endpoint entirely (every request gets 401).
// Without this rule a request that omits the header would match, because
// Header.Get returns "" for a missing header.
//
// The pprof handlers themselves are registered on http.DefaultServeMux by the
// blank import of net/http/pprof; authorised requests are delegated there.
// Make sure http.DefaultServeMux is not served directly anywhere, or the
// profiling endpoints are reachable without the key.
//
// NOTE(review): the key is compared with !=, which is not constant-time.
// Consider crypto/subtle.ConstantTimeCompare if timing attacks are a concern.
func setupProfiling(mux *http.ServeMux, authKey string) {
	// Protected profiling endpoint
	profileHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if authKey == "" || r.Header.Get("X-Profile-Key") != authKey {
			http.Error(w, "Unauthorized", http.StatusUnauthorized)
			return
		}
		// Delegate to pprof
		http.DefaultServeMux.ServeHTTP(w, r)
	})
	mux.Handle("/debug/pprof/", profileHandler)
}
// Useful pprof endpoints:
// /debug/pprof/goroutine - goroutine stack traces
// /debug/pprof/heap - heap memory profile
// /debug/pprof/profile - CPU profile (30s default)
// /debug/pprof/trace - execution trace
External Monitoring Configuration
Configure AzMonitor for your Go service:
monitors:
- name: "Go API - Health Check"
url: "https://api.example.com/health"
method: GET
interval: 60
assertions:
- type: status_code
value: 200
- type: json_path
path: "$.status"
value: "healthy"
- type: json_path
path: "$.checks.database.status"
value: "healthy"
- type: response_time
operator: less_than
value: 500
- name: "Go Service - Goroutine Count"
url: "https://api.example.com/health"
method: GET
interval: 300
assertions:
- type: json_path
path: "$.runtime.num_goroutines"
operator: less_than
value: 5000
Go Monitoring Checklist
| Component | Implemented | Tool |
|---|---|---|
| Health endpoint /health | | Standard library |
| Prometheus metrics | | prometheus/client_golang |
| Request duration histogram | | prometheus/client_golang |
| Goroutine leak detection | | runtime package |
| Memory stats | | runtime.MemStats |
| DB connection pool monitoring | | database/sql Stats() |
| Structured logging | | zap or slog |
| pprof endpoints | | net/http/pprof |
| Graceful shutdown | | os/signal |
| External uptime check | | AzMonitor |
Conclusion
Go's standard library provides excellent primitives for monitoring, and the Prometheus ecosystem gives you powerful metrics collection with minimal overhead. Start with a health endpoint that checks all dependencies, add Prometheus metrics for HTTP and business metrics, monitor goroutine count for leak detection, and set up external checks to verify your service is reachable. AzMonitor's external health checks complement Go's internal observability — the external view confirms your service is responding correctly from users' perspective while your internal Prometheus metrics reveal why performance looks the way it does.
3 monitors free forever · No credit card needed · Set up in 2 minutes
Start monitoring free →