Vibecosystem resilience-patterns
Circuit breaker, bulkhead, retry with jitter, graceful shutdown, and health check patterns for production resilience.
install
source · Clone the upstream repo
git clone https://github.com/vibeeval/vibecosystem
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/vibeeval/vibecosystem "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/resilience-patterns" ~/.claude/skills/vibeeval-vibecosystem-resilience-patterns && rm -rf "$T"
manifest:
skills/resilience-patterns/SKILL.md
Resilience Patterns
Production-grade patterns for surviving failures without cascading.
Circuit Breaker
// States: CLOSED (normal) → OPEN (blocking) → HALF_OPEN (testing)
type CircuitState = 'CLOSED' | 'OPEN' | 'HALF_OPEN'

class CircuitBreaker {
  private state: CircuitState = 'CLOSED'
  private failureCount = 0
  private lastFailureTime = 0
  private successCount = 0

  constructor(
    private readonly failureThreshold = 5,
    private readonly recoveryTimeout = 30_000, // ms
    private readonly halfOpenMaxCalls = 3
  ) {}

  async call<T>(fn: () => Promise<T>): Promise<T> {
    if (this.state === 'OPEN') {
      if (Date.now() - this.lastFailureTime > this.recoveryTimeout) {
        this.state = 'HALF_OPEN'
        this.successCount = 0
      } else {
        throw new Error('Circuit breaker is OPEN — request rejected')
      }
    }
    try {
      const result = await fn()
      this.onSuccess()
      return result
    } catch (err) {
      this.onFailure()
      throw err
    }
  }

  private onSuccess(): void {
    if (this.state === 'HALF_OPEN') {
      this.successCount++
      if (this.successCount >= this.halfOpenMaxCalls) {
        this.state = 'CLOSED'
        this.failureCount = 0
      }
    } else {
      this.failureCount = 0
    }
  }

  private onFailure(): void {
    this.failureCount++
    this.lastFailureTime = Date.now()
    if (this.failureCount >= this.failureThreshold) {
      this.state = 'OPEN'
    }
  }

  getState(): CircuitState {
    return this.state
  }
}

// Usage with Opossum (production library)
// import CircuitBreaker from 'opossum'
// const breaker = new CircuitBreaker(riskyCall, { timeout: 3000, errorThresholdPercentage: 50 })

const paymentBreaker = new CircuitBreaker(5, 30_000)

async function chargeUser(userId: string, amount: number) {
  return paymentBreaker.call(() => paymentService.charge(userId, amount))
}
Retry with Exponential Backoff + Jitter
interface RetryOptions {
  maxAttempts?: number
  baseDelayMs?: number
  maxDelayMs?: number
  jitter?: boolean
  retryIf?: (error: unknown) => boolean
}

async function withRetry<T>(
  fn: () => Promise<T>,
  options: RetryOptions = {}
): Promise<T> {
  const {
    maxAttempts = 3,
    baseDelayMs = 500,
    maxDelayMs = 15_000,
    jitter = true,
    retryIf = () => true
  } = options

  let lastError: unknown
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await fn()
    } catch (err) {
      lastError = err
      if (attempt === maxAttempts || !retryIf(err)) throw err
      // Exponential backoff: 500ms, 1s, 2s, 4s...
      const base = Math.min(baseDelayMs * Math.pow(2, attempt - 1), maxDelayMs)
      // Full jitter: random between 0 and base (avoids synchronized retries)
      const delay = jitter ? Math.random() * base : base
      console.warn(`Attempt ${attempt}/${maxAttempts} failed, retrying in ${Math.round(delay)}ms`)
      await new Promise(resolve => setTimeout(resolve, delay))
    }
  }
  throw lastError
}

// Usage: only retry transient errors
await withRetry(
  () => externalApi.fetchData(),
  {
    maxAttempts: 4,
    retryIf: (err) => err instanceof NetworkError || (err as any)?.status >= 500
  }
)
Bulkhead Pattern
// Isolate failure domains: separate thread pools / queues per service
class Bulkhead {
  private activeCount = 0
  private queue: Array<() => void> = []

  constructor(
    private maxConcurrent: number,
    private maxQueueSize: number = 50
  ) {}

  async execute<T>(fn: () => Promise<T>): Promise<T> {
    if (this.activeCount >= this.maxConcurrent) {
      if (this.queue.length >= this.maxQueueSize) {
        throw new Error('Bulkhead queue full — request rejected')
      }
      await new Promise<void>((resolve, reject) => {
        const waiter = () => { clearTimeout(timeout); resolve() }
        const timeout = setTimeout(() => {
          // Remove the timed-out waiter from the queue so a later
          // wake-up isn't wasted on a caller that has already rejected
          const i = this.queue.indexOf(waiter)
          if (i !== -1) this.queue.splice(i, 1)
          reject(new Error('Bulkhead queue timeout'))
        }, 5000)
        this.queue.push(waiter)
      })
    }
    this.activeCount++
    try {
      return await fn()
    } finally {
      this.activeCount--
      if (this.queue.length > 0) {
        const next = this.queue.shift()!
        next()
      }
    }
  }

  getStats() {
    return { activeCount: this.activeCount, queueLength: this.queue.length }
  }
}

// Separate bulkheads per downstream service
const paymentBulkhead = new Bulkhead(10, 20)
const emailBulkhead = new Bulkhead(5, 10)
const dbBulkhead = new Bulkhead(30, 100)
Timeout Policies
// Never let a call hang indefinitely
function withTimeout<T>(
  fn: () => Promise<T>,
  timeoutMs: number,
  label = 'operation'
): Promise<T> {
  return Promise.race([
    fn(),
    new Promise<never>((_, reject) =>
      setTimeout(
        () => reject(new Error(`${label} timed out after ${timeoutMs}ms`)),
        timeoutMs
      )
    )
  ])
}

// Connect timeout vs read timeout (different values)
const fetchWithTimeouts = async (url: string) => {
  const controller = new AbortController()
  const connectTimeout = setTimeout(() => controller.abort(), 2000) // connect: 2s
  try {
    const response = await fetch(url, { signal: controller.signal })
    clearTimeout(connectTimeout)
    // Read timeout: 10s for body streaming
    return await withTimeout(() => response.json(), 10_000, 'response body read')
  } finally {
    clearTimeout(connectTimeout)
  }
}
Fallback Chain
async function getMarketData(id: string): Promise<Market> {
  return withFallbacks([
    { name: 'primary-db', fn: () => primaryDb.market.findUnique({ where: { id } }) },
    { name: 'redis-cache', fn: () => redis.get(`market:${id}`).then(v => v ? JSON.parse(v) : null) },
    { name: 'replica-db', fn: () => replicaDb.market.findUnique({ where: { id } }) },
    { name: 'stale-cache', fn: () => staleCache.get(id) }
  ])
}

async function withFallbacks<T>(
  strategies: Array<{ name: string; fn: () => Promise<T | null> }>
): Promise<T> {
  for (const { name, fn } of strategies) {
    try {
      const result = await fn()
      if (result != null) return result
    } catch (err) {
      console.warn(`Strategy '${name}' failed:`, (err as Error).message)
    }
  }
  throw new Error('All fallback strategies exhausted')
}
Health Check Endpoints
import express from 'express'

const app = express()

// Liveness: is the process alive? (k8s restarts if fails)
app.get('/live', (_req, res) => {
  res.json({ status: 'ok', timestamp: new Date().toISOString() })
})

// Readiness: is the app ready to serve traffic? (k8s removes from LB if fails)
app.get('/ready', async (_req, res) => {
  const checks = await Promise.allSettled([
    checkDatabase(),
    checkRedis(),
    checkDependencies()
  ])
  const results = checks.map((c, i) => ({
    name: ['database', 'redis', 'dependencies'][i],
    status: c.status === 'fulfilled' ? 'ok' : 'fail',
    error: c.status === 'rejected' ? (c.reason as Error).message : undefined
  }))
  const allHealthy = results.every(r => r.status === 'ok')
  res.status(allHealthy ? 200 : 503).json({
    status: allHealthy ? 'ready' : 'not-ready',
    checks: results
  })
})

// Health: detailed diagnostics for ops team
app.get('/health', async (_req, res) => {
  const [dbMs, redisMs] = await Promise.all([pingDb(), pingRedis()])
  res.json({
    status: 'ok',
    uptime: process.uptime(),
    memory: process.memoryUsage(),
    db: { latencyMs: dbMs },
    redis: { latencyMs: redisMs }
  })
})

async function checkDatabase(): Promise<void> {
  await db.$queryRaw`SELECT 1`
}

async function checkRedis(): Promise<void> {
  const pong = await redis.ping()
  if (pong !== 'PONG') throw new Error('Redis ping failed')
}
Graceful Shutdown
// Handle SIGTERM (k8s, Docker stop) without dropping in-flight requests
import http from 'node:http'

let isShuttingDown = false

async function gracefulShutdown(server: http.Server): Promise<void> {
  console.log('SIGTERM received, starting graceful shutdown...')
  isShuttingDown = true

  // Stop accepting new requests
  server.close(async () => {
    console.log('HTTP server closed')
    try {
      // Drain active jobs
      await jobQueue.close()
      // Close DB connections
      await db.$disconnect()
      // Close Redis
      await redis.quit()
      console.log('Graceful shutdown complete')
      process.exit(0)
    } catch (err) {
      console.error('Error during shutdown:', err)
      process.exit(1)
    }
  })

  // Force kill after 30s if drain stalls
  setTimeout(() => {
    console.error('Graceful shutdown timed out, forcing exit')
    process.exit(1)
  }, 30_000)
}

// Reject new requests during shutdown (register before the routes)
app.use((_req, res, next) => {
  if (isShuttingDown) {
    res.setHeader('Connection', 'close')
    return res.status(503).json({ error: 'Server is shutting down' })
  }
  next()
})

process.on('SIGTERM', () => gracefulShutdown(server))
process.on('SIGINT', () => gracefulShutdown(server))
Idempotency Keys
// Safe to retry without double-charging / double-creating
async function processPaymentIdempotent(
  idempotencyKey: string,
  payload: PaymentPayload
): Promise<PaymentResult> {
  const lockKey = `idempotency:${idempotencyKey}`

  // Claim the key atomically (SET NX): a plain get-then-set check races,
  // letting two concurrent requests with the same key both reach the gateway
  const claimed = await redis.set(lockKey, 'IN_PROGRESS', 'EX', 86_400, 'NX')
  if (!claimed) {
    const existing = await redis.get(lockKey)
    if (existing && existing !== 'IN_PROGRESS') {
      return JSON.parse(existing) as PaymentResult
    }
    throw new Error('Request with this idempotency key is still in progress')
  }

  const result = await paymentGateway.charge(payload)

  // Store result for 24 hours
  await redis.setex(lockKey, 86_400, JSON.stringify(result))
  return result
}

// Client sends Idempotency-Key header, retry safe
router.post('/payments', async (req, res) => {
  const key = req.headers['idempotency-key'] as string
  if (!key) return res.status(400).json({ error: 'Idempotency-Key header required' })
  const result = await processPaymentIdempotent(key, req.body)
  res.json({ success: true, data: result })
})
Remember: Resilience is composed — combine circuit breaker + retry + bulkhead + timeout for defense in depth. Never apply retry without a circuit breaker, or you'll amplify load on a failing service.
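A minimal sketch of that composition, reusing the CircuitBreaker, Bulkhead, withRetry, and withTimeout helpers defined above. The inventoryApi client and all thresholds are hypothetical placeholders, not part of this skill: the bulkhead caps concurrency, retry wraps the breaker so an OPEN circuit fails fast, and the timeout sits innermost so no single attempt can hang.

// Defense in depth: bulkhead → retry → circuit breaker → timeout → call
// inventoryApi and the thresholds below are illustrative placeholders
const inventoryBreaker = new CircuitBreaker(5, 30_000)
const inventoryBulkhead = new Bulkhead(10, 20)

async function getInventory(sku: string) {
  return inventoryBulkhead.execute(() =>
    withRetry(
      () => inventoryBreaker.call(() =>
        withTimeout(() => inventoryApi.fetch(sku), 3_000, 'inventory fetch')
      ),
      {
        maxAttempts: 3,
        // Do not retry while the breaker is rejecting; retrying would
        // re-add the load the breaker just shed
        retryIf: (err) => !String(err).includes('Circuit breaker is OPEN')
      }
    )
  )
}

Order matters here: with retry outside the breaker, repeated failures trip the breaker once, and every subsequent attempt fails fast instead of stacking full-length timeouts against a dependency that is already down.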