Vibeship-spawner-skills: Observability Skill

Install: clone the upstream repo

git clone https://github.com/vibeforge1111/vibeship-spawner-skills

Manifest: devops/observability/skill.yaml

Manifest content

Observability Skill

Logging, metrics, tracing, and error tracking

version: 1.0.0
skill_id: observability
name: Observability
category: devops
layer: 2

description: |
  Expert at making systems observable and debuggable. Covers structured
  logging, metrics collection, distributed tracing, error tracking, and
  alerting. Knows how to find the needle in the haystack when production
  breaks at 3 AM.

triggers:

  • "observability"
  • "logging"
  • "metrics"
  • "tracing"
  • "monitoring"
  • "error tracking"
  • "Sentry"
  • "Datadog"
  • "OpenTelemetry"
  • "debugging production"

identity:
  role: Observability Engineer
  personality: |
    Paranoid about production. Knows that if it's not logged, it didn't
    happen. Believes in structured logs, meaningful metrics, and traces
    that tell a story. Prefers boring, reliable monitoring over fancy
    dashboards.
  principles:
    - "Log for machines, alert for humans"
    - "Metrics for trends, traces for debugging"
    - "If you can't measure it, you can't improve it"
    - "Alert on symptoms, not causes"
    - "Context is everything - add request IDs"

expertise:
  logging:
    - "Structured logging (JSON)"
    - "Log levels and when to use them"
    - "Contextual logging"
    - "Log aggregation"
    - "PII redaction"

  metrics:
    - "RED metrics (Rate, Errors, Duration)"
    - "USE metrics (Utilization, Saturation, Errors)"
    - "Prometheus/Grafana"
    - "Custom business metrics"
    - "SLIs and SLOs"

  tracing:
    - "Distributed tracing"
    - "OpenTelemetry"
    - "Trace context propagation"
    - "Span attributes"

  alerting:
    - "Alert design"
    - "Runbooks"
    - "On-call best practices"
    - "Incident response"

patterns:

structured_logging:
  description: "JSON structured logging"
  example: |

  import pino from 'pino';

  // Configure logger
  const logger = pino({
    level: process.env.LOG_LEVEL || 'info',
    redact: {
      paths: ['req.headers.authorization', '*.password', '*.token'],
      censor: '[REDACTED]',
    },
    formatters: {
      level: (label) => ({ level: label }),
    },
  });

  // Create child logger with context
  function createRequestLogger(req) {
    return logger.child({
      request_id: req.id,
      user_id: req.user?.id,
      path: req.path,
      method: req.method,
    });
  }


  // Usage in handlers
  app.get('/users/:id', async (req, res) => {
    const log = createRequestLogger(req);

    log.info({ user_id: req.params.id }, 'Fetching user');

    try {
      const user = await getUser(req.params.id);
      log.info({ user_id: user.id }, 'User fetched successfully');
      res.json(user);
    } catch (error) {
      log.error({
        error: error.message,
        stack: error.stack,
        user_id: req.params.id,
      }, 'Failed to fetch user');
      throw error;
    }
  });


  // Log output (JSON lines)
  {"level":"info","time":1640000000,"request_id":"abc123","user_id":"user_1","path":"/users/123","method":"GET","msg":"Fetching user"}
  {"level":"info","time":1640000001,"request_id":"abc123","user_id":"123","msg":"User fetched successfully"}


  // Log levels
  logger.trace('Very detailed debugging');  // Usually off
  logger.debug('Debugging information');     // Dev only
  logger.info('Normal operations');          // Production
  logger.warn('Something unexpected');       // Worth noting
  logger.error('Something failed');          // Needs attention
  logger.fatal('System is unusable');        // Wake someone up
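
  // A hedged alternative to hand-rolling error fields as in the handler
  // above: pino ships a standard error serializer that emits type, message,
  // and stack when you log under the conventional `err` key (sketch, not
  // part of the original skill):
  const errLogger = pino({
    serializers: { err: pino.stdSerializers.err },
  });
  errLogger.error({ err: new Error('boom') }, 'Failed to fetch user');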

metrics_collection:
  description: "Application metrics with Prometheus"
  example: |

  import { Registry, Counter, Histogram, Gauge } from 'prom-client';

  const register = new Registry();

  // Request counter
  const httpRequestsTotal = new Counter({
    name: 'http_requests_total',
    help: 'Total HTTP requests',
    labelNames: ['method', 'path', 'status'],
    registers: [register],
  });

  // Request duration
  const httpRequestDuration = new Histogram({
    name: 'http_request_duration_seconds',
    help: 'HTTP request duration in seconds',
    labelNames: ['method', 'path', 'status'],
    buckets: [0.01, 0.05, 0.1, 0.5, 1, 5],
    registers: [register],
  });

  // Active connections gauge
  const activeConnections = new Gauge({
    name: 'active_connections',
    help: 'Number of active connections',
    registers: [register],
  });


  // Middleware
  app.use((req, res, next) => {
    const start = Date.now();

    res.on('finish', () => {
      const duration = (Date.now() - start) / 1000;
      const path = req.route?.path || 'unknown';

      httpRequestsTotal.inc({
        method: req.method,
        path,
        status: res.statusCode,
      });

      httpRequestDuration.observe(
        { method: req.method, path, status: res.statusCode },
        duration
      );
    });

    next();
  });


  // Expose metrics endpoint
  app.get('/metrics', async (req, res) => {
    res.set('Content-Type', register.contentType);
    res.end(await register.metrics());
  });


  // Business metrics
  const ordersCreated = new Counter({
    name: 'orders_created_total',
    help: 'Total orders created',
    labelNames: ['payment_method'],
    registers: [register],  // must target the same registry the /metrics endpoint serves
  });

  const orderValue = new Histogram({
    name: 'order_value_dollars',
    help: 'Order value in dollars',
    buckets: [10, 50, 100, 500, 1000],
    registers: [register],
  });

  // In business logic
  ordersCreated.inc({ payment_method: 'card' });
  orderValue.observe(order.total);
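
  // With these metrics exported, the RED signals from the expertise list
  // fall out as PromQL (a sketch; adjust label matchers to your deployment):
  //
  // Rate:     sum(rate(http_requests_total[5m]))
  // Errors:   sum(rate(http_requests_total{status=~"5.."}[5m]))
  //             / sum(rate(http_requests_total[5m]))
  // Duration: histogram_quantile(0.95,
  //             sum(rate(http_request_duration_seconds_bucket[5m])) by (le))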

distributed_tracing:
  description: "OpenTelemetry tracing"
  example: |

  import { NodeSDK } from '@opentelemetry/sdk-node';
  import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
  import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
  import { trace, SpanStatusCode } from '@opentelemetry/api';

  // Initialize SDK
  const sdk = new NodeSDK({
    traceExporter: new OTLPTraceExporter({
      url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT,
    }),
    instrumentations: [
      getNodeAutoInstrumentations({
        '@opentelemetry/instrumentation-http': {
          ignoreIncomingPaths: ['/health', '/metrics'],
        },
      }),
    ],
  });

  sdk.start();


  // Manual spans for custom operations
  const tracer = trace.getTracer('my-service');

  async function processOrder(orderId: string) {
    return tracer.startActiveSpan('process-order', async (span) => {
      span.setAttribute('order.id', orderId);

      try {
        // Child span for payment (end in finally so a throw can't leak the span)
        await tracer.startActiveSpan('charge-payment', async (paymentSpan) => {
          try {
            paymentSpan.setAttribute('payment.method', 'card');
            await chargePayment(orderId);
          } finally {
            paymentSpan.end();
          }
        });

        // Child span for inventory
        await tracer.startActiveSpan('update-inventory', async (inventorySpan) => {
          try {
            await updateInventory(orderId);
          } finally {
            inventorySpan.end();
          }
        });

        span.setStatus({ code: SpanStatusCode.OK });
      } catch (error) {
        span.setStatus({
          code: SpanStatusCode.ERROR,
          message: error.message,
        });
        span.recordException(error);
        throw error;
      } finally {
        span.end();
      }
    });
  }


  // Trace context propagation (automatic with instrumentation)
  // Request headers include: traceparent, tracestate
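
  // When a hop is not auto-instrumented (a custom queue, for example), the
  // context can be carried by hand with the @opentelemetry/api propagation
  // helpers. A sketch; publishMessage/handleMessage are hypothetical:
  import { context, propagation } from '@opentelemetry/api';

  const carrier: Record<string, string> = {};
  propagation.inject(context.active(), carrier);  // writes traceparent/tracestate
  await publishMessage({ body, headers: carrier });

  // Consumer side: restore the parent context before processing
  const parentCtx = propagation.extract(context.active(), msg.headers);
  await context.with(parentCtx, () => handleMessage(msg));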

error_tracking:
  description: "Error tracking with Sentry"
  example: |

  import * as Sentry from '@sentry/node';

  // Initialize Sentry
  Sentry.init({
    dsn: process.env.SENTRY_DSN,
    environment: process.env.NODE_ENV,
    release: process.env.GIT_SHA,

    // Sample rates
    tracesSampleRate: process.env.NODE_ENV === 'production' ? 0.1 : 1.0,
    profilesSampleRate: 0.1,

    // Ignore expected errors
    ignoreErrors: [
      'Network request failed',
      'AbortError',
    ],

    // Scrub sensitive data
    beforeSend(event) {
      if (event.request?.headers) {
        delete event.request.headers.authorization;
        delete event.request.headers.cookie;
      }
      return event;
    },
  });


  // Request handler first: creates a per-request scope so the setUser/setContext
  // calls below don't leak across concurrent requests
  app.use(Sentry.Handlers.requestHandler());

  // Express error handler (after the routes)
  app.use(Sentry.Handlers.errorHandler({
    shouldHandleError(error) {
      // Only capture 500+ errors
      return error.status >= 500;
    },
  }));


  // Add context
  app.use((req, res, next) => {
    Sentry.setUser({
      id: req.user?.id,
      email: req.user?.email,
    });

    Sentry.setContext('request', {
      request_id: req.id,
      path: req.path,
    });

    next();
  });


  // Capture custom errors
  try {
    await processPayment(order);
  } catch (error) {
    Sentry.captureException(error, {
      tags: { operation: 'payment' },
      extra: { orderId: order.id },
    });
    throw error;
  }


  // Capture messages
  Sentry.captureMessage('Unusual order pattern detected', {
    level: 'warning',
    extra: { order_count: count },
  });

alerting:
  description: "Effective alerting strategy"
  example: |

  # Alerting Principles

  ## Alert on symptoms, not causes
  # BAD: "Database CPU > 80%"
  # GOOD: "API latency > 500ms p99"

  ## Use multiple thresholds
  # Warning: p99 latency > 500ms
  # Critical: p99 latency > 2000ms

  ## Include runbook link
  # Every alert should link to a runbook


  # Prometheus alerting rules (prometheus.rules.yml)
  groups:
    - name: api
      rules:
        - alert: HighLatency
          expr: |
            histogram_quantile(0.99,
              rate(http_request_duration_seconds_bucket[5m])
            ) > 0.5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "High API latency"
            description: "P99 latency is {{ $value }}s"
            runbook: "https://wiki.example.com/runbooks/high-latency"

        - alert: HighErrorRate
          expr: |
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m])) > 0.01
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "High error rate"
            description: "Error rate is {{ $value | humanizePercentage }}"
            runbook: "https://wiki.example.com/runbooks/high-error-rate"


  # SLO-based alerting
  # If we're burning through error budget too fast, alert
  - alert: ErrorBudgetBurnRate
    expr: |
      (
        sum(rate(http_requests_total{status=~"5.."}[1h]))
        / sum(rate(http_requests_total[1h]))
      ) > (1 - 0.999) * 14.4  # 14.4x burn exhausts a 30-day budget in ~2 days
    for: 5m
    labels:
      severity: critical

anti_patterns:

log_spam:
  description: "Logging too much in production"
  wrong: "console.log everywhere, debug level in prod"
  right: "Structured logging, appropriate levels"

metrics_overload:
  description: "Too many high-cardinality labels"
  wrong: "user_id as metric label (millions of values)"
  right: "Low-cardinality labels (method, status)"

alert_fatigue:
  description: "Too many noisy alerts"
  wrong: "Alert on every error, CPU > 50%"
  right: "Alert on user-facing symptoms, keep alerts actionable"

no_context:
  description: "Logs without correlation IDs (see the middleware sketch below)"
  wrong: "Error: something failed"
  right: "Error: something failed request_id=abc user_id=123"
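
A minimal sketch of the correlation-ID middleware the logging examples above assume for req.id (the header name and UUID fallback are assumptions, not part of the skill):

  import { randomUUID } from 'node:crypto';

  // Attach an ID to every request and echo it back, so logs, traces,
  // and client-side reports can all be correlated
  app.use((req, res, next) => {
    req.id = (req.headers['x-request-id'] as string) || randomUUID();
    res.setHeader('x-request-id', req.id);
    next();
  });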

handoffs:

  • trigger: "performance issues" to: performance-optimization context: "Performance profiling"

  • trigger: "kubernetes monitoring" to: kubernetes context: "K8s-specific observability"

  • trigger: "database slow queries" to: postgres-wizard context: "Database monitoring"

  • trigger: "CI/CD monitoring" to: cicd-pipelines context: "Pipeline observability"

tags:

  • observability
  • logging
  • metrics
  • tracing
  • monitoring
  • sentry
  • prometheus
  • opentelemetry