Vibecosystem observability

Structured logging with Pino/Winston, OpenTelemetry tracing, metrics collection, Grafana dashboards, and alerting rules.

Install

Source · Clone the upstream repo:
git clone https://github.com/vibeeval/vibecosystem

Claude Code · Install into ~/.claude/skills/:
T=$(mktemp -d) && git clone --depth=1 https://github.com/vibeeval/vibecosystem "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/observability" ~/.claude/skills/vibeeval-vibecosystem-observability && rm -rf "$T"

Manifest: skills/observability/SKILL.md

Source content

Observability Patterns

Three pillars of observability: logs, traces, and metrics. Each answers a different question: logs record what happened, traces show where time was spent across services, and metrics track how often and how much.

Structured Logging with Pino (Node.js)

Pino is one of the fastest Node.js loggers. Always emit structured JSON, never bare strings.

// logger.ts
import pino from 'pino'

export const logger = pino({
  level: process.env.LOG_LEVEL ?? 'info',
  formatters: {
    level(label) {
      return { level: label }        // emit "level":"info" not numeric
    }
  },
  base: {
    service: process.env.SERVICE_NAME ?? 'api',
    version: process.env.APP_VERSION ?? 'unknown',
    env: process.env.NODE_ENV ?? 'development'
  },
  timestamp: pino.stdTimeFunctions.isoTime,
  redact: {
    paths: ['req.headers.authorization', 'body.password', '*.token'],
    censor: '[REDACTED]'
  }
})
// Usage examples (user and req come from the surrounding request handler)
import { logger } from './logger'

// Child logger with request context
const reqLogger = logger.child({
  requestId: crypto.randomUUID(),
  userId: user.id,
  path: req.path
})

reqLogger.info('Processing payment')
reqLogger.warn({ amount, currency }, 'Payment above threshold')
reqLogger.error({ err }, 'Payment failed')

Structured Logging with Python (structlog)

# logging_config.py
import structlog
import logging

structlog.configure(
    processors=[
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,  # renders exc_info=True into an "exception" field
        structlog.processors.JSONRenderer(),
    ],
    wrapper_class=structlog.make_filtering_bound_logger(logging.DEBUG),
    context_class=dict,
    logger_factory=structlog.PrintLoggerFactory(),
)

log = structlog.get_logger()
# Usage
log.info("request.received", path="/api/users", method="GET")
log.warning("rate_limit.approaching", user_id=user.id, count=95, limit=100)
log.error("payment.failed", exc_info=True, order_id=order.id, amount=99.99)

# Bind context for duration of request
structlog.contextvars.bind_contextvars(request_id=request_id, user_id=user_id)
log.info("order.created")   # request_id and user_id included automatically
structlog.contextvars.clear_contextvars()

Log Levels Usage Guide

Level   When to use                          Example
trace   Detailed execution path (dev only)   Function entry/exit, loop iterations
debug   Diagnostic info for debugging        SQL queries, cache hit/miss
info    Normal operations                    Request received, job started, user login
warn    Unexpected but recoverable           Retry attempt, fallback used, slow query
error   Errors requiring investigation       DB connection failed, 3rd-party API error
fatal   Process must exit                    Config missing, port in use
// Good log message guidelines
// ✅ Include who, what, why, and relevant IDs
logger.info({ userId, orderId, amount }, 'order.created')

// ❌ Vague message, no context
logger.info('Order done')

// ✅ Error includes the actual error object
logger.error({ err, orderId }, 'order.payment.failed')

// ❌ Error swallowed or only string
logger.error('Payment error: ' + err.message)

Request Correlation IDs

Trace a request across multiple services by propagating a unique ID.

// Express middleware: assign or forward correlation ID
import { randomUUID } from 'crypto'
import { AsyncLocalStorage } from 'async_hooks'
import type { Request, Response, NextFunction } from 'express'

const requestContext = new AsyncLocalStorage<{ requestId: string; userId?: string }>()

export function correlationMiddleware(req: Request, res: Response, next: NextFunction) {
  const requestId = (req.headers['x-request-id'] as string) ?? randomUUID()

  res.setHeader('x-request-id', requestId)

  requestContext.run({ requestId }, () => {
    next()
  })
}

// Get context anywhere in call stack (no prop drilling)
export function getRequestId(): string {
  return requestContext.getStore()?.requestId ?? 'unknown'
}

// Logger auto-includes correlation ID
export function getLogger() {
  return logger.child({ requestId: getRequestId() })
}
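
The middleware assigns or forwards the ID, but crossing service boundaries also requires sending it on outbound calls. A sketch with a hypothetical helper, assuming Node 18+ global fetch and plain-object headers:

// Hypothetical helper: forward the current correlation ID downstream
export async function fetchWithCorrelation(url: string, init: RequestInit = {}) {
  return fetch(url, {
    ...init,
    headers: {
      ...(init.headers as Record<string, string> | undefined),
      'x-request-id': getRequestId()
    }
  })
}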

OpenTelemetry Tracing

// tracing.ts - must be imported FIRST before other modules
import { NodeSDK } from '@opentelemetry/sdk-node'
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
import { HttpInstrumentation } from '@opentelemetry/instrumentation-http'
import { ExpressInstrumentation } from '@opentelemetry/instrumentation-express'
import { PgInstrumentation } from '@opentelemetry/instrumentation-pg'

const sdk = new NodeSDK({
  serviceName: process.env.SERVICE_NAME ?? 'api',
  traceExporter: new OTLPTraceExporter({
    url: process.env.OTEL_EXPORTER_OTLP_TRACES_ENDPOINT ?? 'http://localhost:4318/v1/traces'  // signal-specific endpoint includes the /v1/traces path
  }),
  instrumentations: [
    new HttpInstrumentation(),
    new ExpressInstrumentation(),
    new PgInstrumentation()
  ]
})

sdk.start()

process.on('SIGTERM', () => sdk.shutdown())
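
To guarantee the instrumentation patches apply before any application module loads, preload the file instead of relying on import order (paths here assume compiled output in dist/):

# Preload tracing before the app entrypoint
node --require ./dist/tracing.js ./dist/server.js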
// Manual spans for business logic
import { trace, SpanStatusCode, context } from '@opentelemetry/api'

const tracer = trace.getTracer('payment-service')

async function processPayment(orderId: string, amount: number) {
  return tracer.startActiveSpan('payment.process', async (span) => {
    span.setAttributes({
      'order.id': orderId,
      'payment.amount': amount,
      'payment.currency': 'USD'
    })

    try {
      const result = await chargeCard(amount)
      span.setStatus({ code: SpanStatusCode.OK })
      return result
    } catch (error) {
      span.recordException(error as Error)
      span.setStatus({ code: SpanStatusCode.ERROR, message: (error as Error).message })
      throw error
    } finally {
      span.end()
    }
  })
}
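
Traces get far more useful when log lines carry the active trace context. A minimal sketch using pino's mixin option with the OpenTelemetry API; the trace_id/span_id field names are a common convention, not a requirement:

// Sketch: stamp the active trace context onto every log line
import pino from 'pino'
import { trace } from '@opentelemetry/api'

export const tracedLogger = pino({
  mixin() {
    const ctx = trace.getActiveSpan()?.spanContext()
    // Fields included only when a span is active
    return ctx ? { trace_id: ctx.traceId, span_id: ctx.spanId } : {}
  }
})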

Custom Metrics with Prometheus

// metrics.ts
import { Registry, Counter, Histogram, Gauge } from 'prom-client'

export const registry = new Registry()

// HTTP request counter
export const httpRequestTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code'],
  registers: [registry]
})

// Request duration histogram
export const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5],
  registers: [registry]
})

// Active connections gauge
export const activeConnections = new Gauge({
  name: 'active_connections',
  help: 'Number of active WebSocket connections',
  registers: [registry]
})
// Metrics middleware
export function metricsMiddleware(req: Request, res: Response, next: NextFunction) {
  const start = Date.now()

  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000
    const labels = {
      method: req.method,
      route: req.route?.path ?? req.path,
      status_code: String(res.statusCode)
    }
    httpRequestTotal.inc(labels)
    httpRequestDuration.observe(labels, duration)
  })

  next()
}

// Metrics endpoint (scrape target for Prometheus)
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', registry.contentType)
  res.send(await registry.metrics())
})
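
Beyond custom metrics, prom-client can also collect Node.js runtime metrics (event-loop lag, GC pauses, heap usage). A one-line sketch against the registry above:

// Optional: default Node.js runtime metrics on the same registry
import { collectDefaultMetrics } from 'prom-client'

collectDefaultMetrics({ register: registry })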

Error Tracking with Sentry

// sentry.ts
import * as Sentry from '@sentry/node'
import { nodeProfilingIntegration } from '@sentry/profiling-node'

Sentry.init({
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV,
  release: process.env.APP_VERSION,
  integrations: [nodeProfilingIntegration()],
  tracesSampleRate: process.env.NODE_ENV === 'production' ? 0.1 : 1.0,
  profilesSampleRate: 0.1,
  beforeSend(event, hint) {
    // Strip PII from errors
    if (event.user) {
      delete event.user.email
      delete event.user.ip_address
    }
    return event
  }
})

// Capture with context
try {
  await processOrder(orderId)
} catch (error) {
  Sentry.withScope((scope) => {
    scope.setTag('order.id', orderId)
    scope.setLevel('error')
    Sentry.captureException(error)
  })
  throw error
}

Grafana Dashboard Templates

// dashboard panel: Request Rate (PromQL)
{
  "title": "Request Rate",
  "type": "timeseries",
  "targets": [{
    "expr": "sum(rate(http_requests_total[5m])) by (route)",
    "legendFormat": "{{route}}"
  }]
}
# PromQL expressions for common panels

# Request rate (req/s over 5 min window)
sum(rate(http_requests_total[5m])) by (route, method)

# Error rate (%)
sum(rate(http_requests_total{status_code=~"5.."}[5m]))
  / sum(rate(http_requests_total[5m])) * 100

# Latency percentiles
histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))
histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))

# Apdex score (satisfied < 0.25s, tolerated < 1s; thresholds must match bucket
# boundaries, and 0.25 and 1 exist in the Histogram buckets defined above)
(
  sum(rate(http_request_duration_seconds_bucket{le="0.25"}[5m]))
  + sum(rate(http_request_duration_seconds_bucket{le="1"}[5m]))
) / 2 / sum(rate(http_request_duration_seconds_count[5m]))

Alert Rules (SLO-Based)

# prometheus/alerts.yml
groups:
  - name: slo.alerts
    rules:
      # Simple error-rate threshold against the 1% SLO (burn-rate variant below)
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status_code=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m]))
          ) > 0.01
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Error rate above 1% SLO"
          description: "Error rate is {{ $value | humanizePercentage }}"

      # p99 latency SLO breach
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 1.0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "p99 latency above 1s SLO"

      # Service availability
      - alert: ServiceDown
        expr: up{job="api"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "API service is down"

Health Check Monitoring

// Composite health check endpoint
interface HealthStatus {
  status: 'healthy' | 'degraded' | 'unhealthy'
  checks: Record<string, { status: string; latencyMs?: number; error?: string }>
}

app.get('/health/detailed', async (req, res) => {
  const checks: HealthStatus['checks'] = {}

  // Database check ('warn' when reachable but slow, so 'degraded' below is
  // reachable; the 250ms threshold is illustrative)
  const dbStart = Date.now()
  try {
    await db.execute('SELECT 1')
    const latencyMs = Date.now() - dbStart
    checks.database = { status: latencyMs > 250 ? 'warn' : 'ok', latencyMs }
  } catch (err) {
    checks.database = { status: 'fail', error: (err as Error).message }
  }

  // Redis check
  const redisStart = Date.now()
  try {
    await redis.ping()
    checks.redis = { status: 'ok', latencyMs: Date.now() - redisStart }
  } catch (err) {
    checks.redis = { status: 'fail', error: (err as Error).message }
  }

  const allHealthy = Object.values(checks).every(c => c.status === 'ok')
  const anyFailing = Object.values(checks).some(c => c.status === 'fail')

  const overall: HealthStatus['status'] = allHealthy
    ? 'healthy'
    : anyFailing ? 'unhealthy' : 'degraded'

  res.status(allHealthy ? 200 : 503).json({ status: overall, checks })
})
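
Keep a separate shallow liveness endpoint. The detailed check above suits readiness probes and dashboards; wiring dependency checks into liveness can trigger restart storms when a shared dependency blips. A one-line sketch (the route name is a convention, not a standard):

// Shallow liveness probe: answers only "is the process up?"
app.get('/health/live', (_req, res) => res.status(200).send('ok'))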

Dynamic Log Level in Production

// Change log level without restart
import { logger } from './logger'

app.put('/admin/log-level', requireAdminAuth, (req, res) => {
  const { level } = req.body
  const validLevels = ['trace', 'debug', 'info', 'warn', 'error', 'fatal']

  if (!validLevels.includes(level)) {
    return res.status(400).json({ error: 'Invalid level' })
  }

  logger.level = level
  logger.info({ level }, 'Log level changed')
  res.json({ level })
})

Log Rotation and Retention

# logrotate config: /etc/logrotate.d/app
/var/log/app/*.log {
  daily
  rotate 14          # keep 14 days
  compress
  delaycompress
  missingok
  notifempty
  postrotate
    kill -USR1 $(cat /var/run/app.pid) 2>/dev/null || true
  endscript
}
# Docker logging with size-based rotation
services:
  api:
    logging:
      driver: json-file
      options:
        max-size: "50m"
        max-file: "5"
        labels: "service,version"

APM Integration (Datadog-style without vendor lock-in)

# OpenTelemetry Collector config: otel-collector.yml
# Ships to multiple backends simultaneously
receivers:
  otlp:
    protocols:
      grpc:
      http:

processors:
  batch:
  resourcedetection:

exporters:
  otlp/datadog:
    endpoint: https://api.datadoghq.com/v1/traces
    headers:
      dd-api-key: ${DD_API_KEY}
  prometheus:
    endpoint: 0.0.0.0:8889
  loki:
    endpoint: http://loki:3100/loki/api/v1/push

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch, resourcedetection]
      exporters: [otlp/datadog]
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [prometheus]
    logs:
      receivers: [otlp]
      processors: [batch]
      exporters: [loki]
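
A minimal way to run that config locally, assuming the contrib collector image (which bundles the loki exporter); file paths and ports mirror the config above:

# docker-compose service for the collector
services:
  otel-collector:
    image: otel/opentelemetry-collector-contrib:latest
    command: ["--config=/etc/otel/config.yml"]
    volumes:
      - ./otel-collector.yml:/etc/otel/config.yml:ro
    ports:
      - "4317:4317"   # OTLP gRPC ingest
      - "4318:4318"   # OTLP HTTP ingest
      - "8889:8889"   # Prometheus scrape endpoint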

Key principle: correlate logs, traces, and metrics by the same requestId / traceId. Emit structured JSON from day one; retrofitting is painful. Set up alerts on SLO burn rate, not absolute thresholds.