Vibecosystem observability
Structured logging with Pino/Winston, OpenTelemetry tracing, metrics collection, Grafana dashboards, and alerting rules.
install
source · Clone the upstream repo
git clone https://github.com/vibeeval/vibecosystem
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/vibeeval/vibecosystem "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/observability" ~/.claude/skills/vibeeval-vibecosystem-observability && rm -rf "$T"
manifest:
skills/observability/SKILL.md
Observability Patterns
The three pillars of observability are logs, traces, and metrics. Each answers a different question: logs record discrete events, traces show where time goes across a request, and metrics track aggregate behavior over time.
Structured Logging with Pino (Node.js)
Pino is the fastest Node.js logger. Always emit JSON; never plain strings.
```typescript
// logger.ts
import pino from 'pino'

export const logger = pino({
  level: process.env.LOG_LEVEL ?? 'info',
  formatters: {
    level(label) {
      return { level: label } // emit "level":"info" not numeric
    }
  },
  base: {
    service: process.env.SERVICE_NAME ?? 'api',
    version: process.env.APP_VERSION ?? 'unknown',
    env: process.env.NODE_ENV ?? 'development'
  },
  timestamp: pino.stdTimeFunctions.isoTime,
  redact: {
    paths: ['req.headers.authorization', 'body.password', '*.token'],
    censor: '[REDACTED]'
  }
})
```
```typescript
// Usage examples
import { logger } from './logger'

// Child logger with request context
const reqLogger = logger.child({
  requestId: crypto.randomUUID(),
  userId: user.id,
  path: req.path
})

reqLogger.info('Processing payment')
reqLogger.warn({ amount, currency }, 'Payment above threshold')
reqLogger.error({ err }, 'Payment failed')
```
Structured Logging with Python (structlog)
```python
# logging_config.py
import structlog
import logging

structlog.configure(
    processors=[
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.JSONRenderer(),
    ],
    wrapper_class=structlog.make_filtering_bound_logger(logging.DEBUG),
    context_class=dict,
    logger_factory=structlog.PrintLoggerFactory(),
)

log = structlog.get_logger()
```
```python
# Usage
log.info("request.received", path="/api/users", method="GET")
log.warning("rate_limit.approaching", user_id=user.id, count=95, limit=100)
log.error("payment.failed", exc_info=True, order_id=order.id, amount=99.99)

# Bind context for duration of request
structlog.contextvars.bind_contextvars(request_id=request_id, user_id=user_id)
log.info("order.created")  # request_id and user_id included automatically
structlog.contextvars.clear_contextvars()
```
Log Levels Usage Guide
| Level | When to Use | Example |
|---|---|---|
| trace | Detailed execution path (dev only) | Function entry/exit, loop iterations |
| debug | Diagnostic info for debugging | SQL queries, cache hit/miss |
| info | Normal operations | Request received, job started, user login |
| warn | Unexpected but recoverable | Retry attempt, fallback used, slow query |
| error | Errors requiring investigation | DB connection failed, 3rd party API error |
| fatal | Process must exit | Config missing, port in use |
```typescript
// Good log message guidelines

// ✅ Include who, what, why, and relevant IDs
logger.info({ userId, orderId, amount }, 'order.created')
// ❌ Vague message, no context
logger.info('Order done')

// ✅ Error includes the actual error object
logger.error({ err, orderId }, 'order.payment.failed')
// ❌ Error swallowed or only string
logger.error('Payment error: ' + err.message)
```
Request Correlation IDs
Trace a request across multiple services by propagating a unique ID.
```typescript
// Express middleware: assign or forward correlation ID
import { randomUUID } from 'crypto'
import { AsyncLocalStorage } from 'async_hooks'

const requestContext = new AsyncLocalStorage<{ requestId: string; userId?: string }>()

export function correlationMiddleware(req: Request, res: Response, next: NextFunction) {
  const requestId = (req.headers['x-request-id'] as string) ?? randomUUID()
  res.setHeader('x-request-id', requestId)
  requestContext.run({ requestId }, () => {
    next()
  })
}

// Get context anywhere in call stack (no prop drilling)
export function getRequestId(): string {
  return requestContext.getStore()?.requestId ?? 'unknown'
}

// Logger auto-includes correlation ID
export function getLogger() {
  return logger.child({ requestId: getRequestId() })
}
```
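The middleware above covers inbound requests; to carry the same ID across service boundaries, outbound calls also need to forward it. A minimal sketch, assuming the `getRequestId()` helper above and Node 18+ global `fetch` (the wrapper name and downstream URL are illustrative):

```typescript
// Forward the correlation ID on outbound HTTP calls
export async function fetchWithCorrelation(url: string, init: RequestInit = {}) {
  const headers = new Headers(init.headers)
  headers.set('x-request-id', getRequestId()) // downstream correlationMiddleware reuses this ID
  return fetch(url, { ...init, headers })
}

// Usage: the downstream service logs the same requestId
const res = await fetchWithCorrelation('http://inventory-service/api/stock/123')
```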
OpenTelemetry Tracing
```typescript
// tracing.ts - must be imported FIRST before other modules
import { NodeSDK } from '@opentelemetry/sdk-node'
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'
import { HttpInstrumentation } from '@opentelemetry/instrumentation-http'
import { ExpressInstrumentation } from '@opentelemetry/instrumentation-express'
import { PgInstrumentation } from '@opentelemetry/instrumentation-pg'

const sdk = new NodeSDK({
  serviceName: process.env.SERVICE_NAME ?? 'api',
  traceExporter: new OTLPTraceExporter({
    url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT ?? 'http://localhost:4318/v1/traces'
  }),
  instrumentations: [
    new HttpInstrumentation(),
    new ExpressInstrumentation(),
    new PgInstrumentation()
  ]
})

sdk.start()
process.on('SIGTERM', () => sdk.shutdown())
```
```typescript
// Manual spans for business logic
import { trace, SpanStatusCode, context } from '@opentelemetry/api'

const tracer = trace.getTracer('payment-service')

async function processPayment(orderId: string, amount: number) {
  return tracer.startActiveSpan('payment.process', async (span) => {
    span.setAttributes({
      'order.id': orderId,
      'payment.amount': amount,
      'payment.currency': 'USD'
    })
    try {
      const result = await chargeCard(amount)
      span.setStatus({ code: SpanStatusCode.OK })
      return result
    } catch (error) {
      span.recordException(error as Error)
      span.setStatus({ code: SpanStatusCode.ERROR, message: (error as Error).message })
      throw error
    } finally {
      span.end()
    }
  })
}
```
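Nested helpers don't need the span object passed down: the currently active span can be looked up from context with `trace.getActiveSpan()`. A small sketch (the `recordRetry` helper is a hypothetical example, not part of the skill):

```typescript
import { trace } from '@opentelemetry/api'

// Attach an event to whatever span is currently active (no-op when none is recording)
export function recordRetry(attempt: number, reason: string) {
  trace.getActiveSpan()?.addEvent('payment.retry', { attempt, reason })
}
```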
Custom Metrics with Prometheus
```typescript
// metrics.ts
import { Registry, Counter, Histogram, Gauge } from 'prom-client'

export const registry = new Registry()

// HTTP request counter
export const httpRequestTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code'],
  registers: [registry]
})

// Request duration histogram
export const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5],
  registers: [registry]
})

// Active connections gauge
export const activeConnections = new Gauge({
  name: 'active_connections',
  help: 'Number of active WebSocket connections',
  registers: [registry]
})
```
```typescript
// Metrics middleware
export function metricsMiddleware(req: Request, res: Response, next: NextFunction) {
  const start = Date.now()
  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000
    const labels = {
      method: req.method,
      route: req.route?.path ?? req.path,
      status_code: String(res.statusCode)
    }
    httpRequestTotal.inc(labels)
    httpRequestDuration.observe(labels, duration)
  })
  next()
}

// Metrics endpoint (scrape target for Prometheus)
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', registry.contentType)
  res.send(await registry.metrics())
})
```
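The `activeConnections` gauge defined in metrics.ts only reports useful data if something updates it. A sketch assuming a WebSocket server object named `wss` (hypothetical here, e.g. from the `ws` package):

```typescript
// Track WebSocket connections with the gauge from metrics.ts
import { activeConnections } from './metrics'

wss.on('connection', (socket) => {
  activeConnections.inc()
  socket.on('close', () => activeConnections.dec())
})
```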
Error Tracking with Sentry
```typescript
// sentry.ts
import * as Sentry from '@sentry/node'
import { nodeProfilingIntegration } from '@sentry/profiling-node'

Sentry.init({
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV,
  release: process.env.APP_VERSION,
  integrations: [nodeProfilingIntegration()],
  tracesSampleRate: process.env.NODE_ENV === 'production' ? 0.1 : 1.0,
  profilesSampleRate: 0.1,
  beforeSend(event, hint) {
    // Strip PII from errors
    if (event.user) {
      delete event.user.email
      delete event.user.ip_address
    }
    return event
  }
})

// Capture with context
try {
  await processOrder(orderId)
} catch (error) {
  Sentry.withScope((scope) => {
    scope.setTag('order.id', orderId)
    scope.setLevel('error')
    Sentry.captureException(error)
  })
  throw error
}
```
Grafana Dashboard Templates
```jsonc
// dashboard panel: Request Rate (PromQL)
{
  "title": "Request Rate",
  "type": "timeseries",
  "targets": [{
    "expr": "sum(rate(http_requests_total[5m])) by (route)",
    "legendFormat": "{{route}}"
  }]
}
```
```promql
# PromQL expressions for common panels

# Request rate (req/s over 5 min window)
sum(rate(http_requests_total[5m])) by (route, method)

# Error rate (%)
sum(rate(http_requests_total{status_code=~"5.."}[5m]))
  / sum(rate(http_requests_total[5m])) * 100

# Latency percentiles
histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))
histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))

# Apdex score (satisfied < 0.3s, tolerated < 1.2s)
(
  sum(rate(http_request_duration_seconds_bucket{le="0.3"}[5m]))
  + sum(rate(http_request_duration_seconds_bucket{le="1.2"}[5m]))
) / 2
/ sum(rate(http_request_duration_seconds_count[5m]))
```
Alert Rules (SLO-Based)
```yaml
# prometheus/alerts.yml
groups:
  - name: slo.alerts
    rules:
      # Error budget burn rate (fast burn = page immediately)
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status_code=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m]))
          ) > 0.01
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Error rate above 1% SLO"
          description: "Error rate is {{ $value | humanizePercentage }}"

      # p99 latency SLO breach
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 1.0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "p99 latency above 1s SLO"

      # Service availability
      - alert: ServiceDown
        expr: up{job="api"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "API service is down"
```
Health Check Monitoring
```typescript
// Composite health check endpoint
interface HealthStatus {
  status: 'healthy' | 'degraded' | 'unhealthy'
  checks: Record<string, { status: string; latencyMs?: number; error?: string }>
}

app.get('/health/detailed', async (req, res) => {
  const checks: HealthStatus['checks'] = {}

  // Database check
  const dbStart = Date.now()
  try {
    await db.execute('SELECT 1')
    checks.database = { status: 'ok', latencyMs: Date.now() - dbStart }
  } catch (err) {
    checks.database = { status: 'fail', error: (err as Error).message }
  }

  // Redis check
  const redisStart = Date.now()
  try {
    await redis.ping()
    checks.redis = { status: 'ok', latencyMs: Date.now() - redisStart }
  } catch (err) {
    checks.redis = { status: 'fail', error: (err as Error).message }
  }

  const allHealthy = Object.values(checks).every(c => c.status === 'ok')
  const anyFailing = Object.values(checks).some(c => c.status === 'fail')
  const overall: HealthStatus['status'] =
    allHealthy ? 'healthy' : anyFailing ? 'unhealthy' : 'degraded'

  res.status(allHealthy ? 200 : 503).json({ status: overall, checks })
})
```
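A common complement is splitting liveness from readiness so an orchestrator doesn't restart the process just because a dependency is down. A sketch reusing the same Express app and `db` handle (the route paths are an assumption, not part of the skill):

```typescript
// Liveness: process is up; never touches dependencies, so a DB outage won't trigger restarts
app.get('/health/live', (_req, res) => res.status(200).json({ status: 'ok' }))

// Readiness: gate traffic on critical dependencies only
app.get('/health/ready', async (_req, res) => {
  try {
    await db.execute('SELECT 1')
    res.status(200).json({ status: 'ready' })
  } catch {
    res.status(503).json({ status: 'not_ready' })
  }
})
```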
Dynamic Log Level in Production
```typescript
// Change log level without restart
import { logger } from './logger'

app.put('/admin/log-level', requireAdminAuth, (req, res) => {
  const { level } = req.body
  const validLevels = ['trace', 'debug', 'info', 'warn', 'error', 'fatal']
  if (!validLevels.includes(level)) {
    return res.status(400).json({ error: 'Invalid level' })
  }
  logger.level = level
  logger.info({ level }, 'Log level changed')
  res.json({ level })
})
```
Log Rotation and Retention
```
# logrotate config: /etc/logrotate.d/app
/var/log/app/*.log {
    daily
    rotate 14          # keep 14 days
    compress
    delaycompress
    missingok
    notifempty
    postrotate
        kill -USR1 $(cat /var/run/app.pid) 2>/dev/null || true
    endscript
}
```
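The `postrotate` hook sends `SIGUSR1` so the application can reopen its log file after rotation. A sketch of the matching application side with Pino, assuming logs are written to /var/log/app/app.log (this handler is an assumption, not part of the logrotate config above):

```typescript
// logger-file.ts: file destination that can be reopened after logrotate
import pino from 'pino'

const destination = pino.destination({ dest: '/var/log/app/app.log', sync: false })
export const fileLogger = pino({ level: 'info' }, destination)

// logrotate's postrotate sends USR1; reopen the now-rotated file handle
process.on('SIGUSR1', () => destination.reopen())
```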
```yaml
# Docker logging with size-based rotation
services:
  api:
    logging:
      driver: json-file
      options:
        max-size: "50m"
        max-file: "5"
        labels: "service,version"
```
APM Integration (Datadog-style without vendor lock-in)
```yaml
# OpenTelemetry collector config: otel-collector.yml
# ships to multiple backends simultaneously
# (receiver and processor definitions omitted for brevity)
exporters:
  otlp/datadog:
    endpoint: https://api.datadoghq.com/v1/traces
    headers:
      dd-api-key: ${DD_API_KEY}
  prometheus:
    endpoint: 0.0.0.0:8889
  loki:
    endpoint: http://loki:3100/loki/api/v1/push

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [batch, resourcedetection]
      exporters: [otlp/datadog]
    metrics:
      receivers: [otlp, prometheus]
      exporters: [prometheus]
    logs:
      receivers: [otlp]
      exporters: [loki]
```
Key principle: Correlate logs, traces, and metrics by the same
requestId/traceId. Emit structured JSON from day one — retrofitting is painful. Set up alerts on SLO burn rate, not absolute thresholds.
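One way to get that correlation without touching every call site is to inject the active trace context into each log line. A sketch extending the logger.ts config above with Pino's `mixin` hook, assuming the OpenTelemetry SDK from the tracing section is already initialized:

```typescript
// logger with trace correlation: every log line carries the active trace_id/span_id
import pino from 'pino'
import { trace } from '@opentelemetry/api'

export const logger = pino({
  level: process.env.LOG_LEVEL ?? 'info',
  // mixin runs on every log call and merges its result into the log object
  mixin() {
    const span = trace.getActiveSpan()
    if (!span) return {}
    const { traceId, spanId } = span.spanContext()
    return { trace_id: traceId, span_id: spanId }
  }
})
```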