Claude-code-plugins-plus-skills cohere-observability
install
source · Clone the upstream repo
git clone https://github.com/jeremylongshore/claude-code-plugins-plus-skills
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/jeremylongshore/claude-code-plugins-plus-skills "$T" && mkdir -p ~/.claude/skills && cp -r "$T/plugins/saas-packs/cohere-pack/skills/cohere-observability" ~/.claude/skills/jeremylongshore-claude-code-plugins-plus-skills-cohere-observability && rm -rf "$T"
manifest:
plugins/saas-packs/cohere-pack/skills/cohere-observability/SKILL.md · source content
Cohere Observability
Overview
Set up production observability for Cohere API v2 with Prometheus metrics, OpenTelemetry tracing, structured logging, and Prometheus alerting rules (routed through Alertmanager). Tracks per-endpoint latency, token usage (as a proxy for cost), and error rates.
Prerequisites
- Prometheus or compatible metrics backend
- OpenTelemetry SDK installed
- `cohere-ai` SDK v7+
Instructions
Step 1: Metrics Collection
import { Registry, Counter, Histogram, Gauge } from 'prom-client';

/** Registry that every metric declared below registers itself on. */
const registry = new Registry();

/** Upper bounds, in seconds, of the latency histogram buckets. */
const latencyBucketsSeconds = [0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30];

/** Per-endpoint request counter, labelled by endpoint, model and outcome status. */
const requestCounter = new Counter({
  registers: [registry],
  name: 'cohere_requests_total',
  help: 'Total Cohere API requests',
  labelNames: ['endpoint', 'model', 'status'],
});

/** Latency histogram of Cohere requests, labelled by endpoint and model. */
const requestDuration = new Histogram({
  registers: [registry],
  name: 'cohere_request_duration_seconds',
  help: 'Cohere request duration',
  labelNames: ['endpoint', 'model'],
  buckets: latencyBucketsSeconds,
});

/** Token usage counter; the `direction` label is either `input` or `output`. */
const tokenCounter = new Counter({
  registers: [registry],
  name: 'cohere_tokens_total',
  help: 'Total tokens consumed',
  labelNames: ['endpoint', 'model', 'direction'],
});

/** Error counter, labelled by endpoint and status code. */
const errorCounter = new Counter({
  registers: [registry],
  name: 'cohere_errors_total',
  help: 'Cohere errors by status code',
  labelNames: ['endpoint', 'status_code'],
});

/** Gauge for the remaining rate-limit capacity per endpoint. */
const rateLimitGauge = new Gauge({
  registers: [registry],
  name: 'cohere_rate_limit_remaining',
  help: 'Remaining rate limit capacity',
  labelNames: ['endpoint'],
});
Step 2: Instrumented Client Wrapper
import { CohereClientV2, CohereError, CohereTimeoutError } from 'cohere-ai';

const cohere = new CohereClientV2();

/** Token counts as read from `usage.billedUnits` on a response; either field may be absent. */
interface InstrumentedBilledUnits {
  inputTokens?: number;
  outputTokens?: number;
}

/** Reads `usage.billedUnits` from an arbitrary response value without resorting to `any`. */
function readInstrumentedBilledUnits(result: unknown): InstrumentedBilledUnits | undefined {
  if (typeof result !== 'object' || result === null) return undefined;
  const usage = (result as { usage?: { billedUnits?: InstrumentedBilledUnits | null } | null }).usage;
  return usage?.billedUnits ?? undefined;
}

/**
 * Maps a thrown value to the `status_code` label used on `cohere_errors_total`.
 *
 * The timeout check runs first so the result does not depend on how the two
 * SDK error classes relate to each other. Anything that is not an SDK error,
 * or an SDK error without a status code, is reported as `unknown` rather than
 * being dropped or stringified to "undefined".
 */
function errorStatusLabel(err: unknown): string {
  if (err instanceof CohereTimeoutError) return 'timeout';
  if (err instanceof CohereError && err.statusCode != null) return String(err.statusCode);
  return 'unknown';
}

/** Adds `count` to the token counter only when it is a positive, finite number. */
function recordTokens(
  endpoint: string,
  model: string,
  direction: 'input' | 'output',
  count: number | undefined
): void {
  if (typeof count === 'number' && Number.isFinite(count) && count > 0) {
    tokenCounter.inc({ endpoint, model, direction }, count);
  }
}

/**
 * Runs `operation` and records request, latency, token and error metrics for it.
 *
 * @param endpoint - Value for the `endpoint` metric label (e.g. `chat`).
 * @param model - Value for the `model` metric label.
 * @param operation - The call to measure.
 * @returns Whatever `operation` resolves to.
 * @throws Re-throws whatever `operation` rejects with, after recording it.
 */
async function instrumentedCall<T>(
  endpoint: string,
  model: string,
  operation: () => Promise<T>
): Promise<T> {
  const stopTimer = requestDuration.startTimer({ endpoint, model });

  let result: T;
  try {
    result = await operation();
  } catch (err) {
    requestCounter.inc({ endpoint, model, status: 'error' });
    // Every failure is counted, so the error counter stays comparable with the request counter.
    errorCounter.inc({ endpoint, status_code: errorStatusLabel(err) });
    throw err;
  } finally {
    // Observe the duration exactly once, whether the call succeeded or failed.
    stopTimer();
  }

  // Bookkeeping happens outside the try block so a metrics failure can never
  // be counted as a failed API call.
  requestCounter.inc({ endpoint, model, status: 'success' });
  const billed = readInstrumentedBilledUnits(result);
  recordTokens(endpoint, model, 'input', billed?.inputTokens);
  recordTokens(endpoint, model, 'output', billed?.outputTokens);

  return result;
}

// Usage
const response = await instrumentedCall('chat', 'command-a-03-2025', () =>
  cohere.chat({
    model: 'command-a-03-2025',
    messages: [{ role: 'user', content: query }],
  })
);
Step 3: OpenTelemetry Tracing
import { trace, SpanStatusCode, SpanKind } from '@opentelemetry/api';

const tracer = trace.getTracer('cohere-client', '1.0.0');

/**
 * Runs `operation` inside an active CLIENT span named `cohere.<endpoint>`.
 *
 * The span carries the model and endpoint as attributes, plus token usage when
 * the response exposes `usage.billedUnits`. On failure the exception is
 * recorded on the span, the span status is set to ERROR, and the error is
 * re-thrown. The span is always ended.
 *
 * @param endpoint - Endpoint name; also used in the span name.
 * @param model - Model identifier stored in `cohere.model`.
 * @param operation - The call to trace.
 * @returns Whatever `operation` resolves to.
 * @throws Re-throws whatever `operation` rejects with.
 */
async function tracedCohereCall<T>(
  endpoint: string,
  model: string,
  operation: () => Promise<T>
): Promise<T> {
  return tracer.startActiveSpan(
    `cohere.${endpoint}`,
    { kind: SpanKind.CLIENT },
    async (span) => {
      span.setAttribute('cohere.model', model);
      span.setAttribute('cohere.endpoint', endpoint);

      try {
        const result = await operation();

        // Add token usage to the span when the response carries it.
        const payload: unknown = result;
        const usage = (
          payload as
            | { usage?: { billedUnits?: { inputTokens?: number; outputTokens?: number } | null } | null }
            | null
            | undefined
        )?.usage?.billedUnits;
        if (usage) {
          span.setAttribute('cohere.tokens.input', usage.inputTokens ?? 0);
          span.setAttribute('cohere.tokens.output', usage.outputTokens ?? 0);
        }

        span.setStatus({ code: SpanStatusCode.OK });
        return result;
      } catch (err: unknown) {
        // A thrown value is not guaranteed to be an Error, so narrow before reading `.message`.
        const message = err instanceof Error ? err.message : String(err);
        span.setStatus({ code: SpanStatusCode.ERROR, message });
        span.recordException(err instanceof Error ? err : message);
        if (err instanceof CohereError) {
          span.setAttribute('cohere.error.status', err.statusCode ?? 0);
        }
        throw err;
      } finally {
        span.end();
      }
    }
  );
}
Step 4: Structured Logging
import pino from 'pino';

const logger = pino({ name: 'cohere', level: process.env.LOG_LEVEL ?? 'info' });

/**
 * Emits one structured log entry for a Cohere call.
 *
 * Entries with status `error` are logged at `error` level, everything else at
 * `info`. Fields in `meta` are spread last, so they override the standard
 * fields if the keys collide.
 *
 * @param endpoint - Endpoint name.
 * @param model - Model identifier.
 * @param durationMs - Elapsed time of the call in milliseconds.
 * @param status - Outcome of the call.
 * @param meta - Optional extra fields to attach to the entry.
 */
function logCohereCall(
  endpoint: string,
  model: string,
  durationMs: number,
  status: 'success' | 'error',
  meta?: Record<string, unknown>
) {
  logger[status === 'error' ? 'error' : 'info']({
    service: 'cohere',
    endpoint,
    model,
    durationMs,
    status,
    ...meta,
  });
}

/**
 * Classifies a thrown value for the `error` log field.
 *
 * Only a real SDK timeout is reported as `timeout`. An SDK error yields its
 * status code when it has one; everything else is `unknown`.
 */
function describeLoggedFailure(err: unknown): number | string {
  if (err instanceof CohereTimeoutError) return 'timeout';
  if (err instanceof CohereError) return err.statusCode ?? 'unknown';
  return 'unknown';
}

/**
 * Combined instrumentation: tracing wraps metrics, which wrap a logged call to `fn`.
 *
 * @param endpoint - Endpoint name passed to every layer.
 * @param model - Model identifier passed to every layer.
 * @param fn - The call to observe.
 * @returns Whatever `fn` resolves to.
 * @throws Re-throws whatever `fn` rejects with, after logging it.
 */
async function observedCall<T>(
  endpoint: string,
  model: string,
  fn: () => Promise<T>
): Promise<T> {
  return tracedCohereCall(endpoint, model, () =>
    instrumentedCall(endpoint, model, async () => {
      const start = Date.now();
      try {
        const result = await fn();
        const payload: unknown = result;
        logCohereCall(endpoint, model, Date.now() - start, 'success', {
          tokens: (payload as { usage?: { billedUnits?: unknown } | null } | null | undefined)?.usage
            ?.billedUnits,
        });
        return result;
      } catch (err) {
        logCohereCall(endpoint, model, Date.now() - start, 'error', {
          error: describeLoggedFailure(err),
        });
        throw err;
      }
    })
  );
}
Step 5: Alert Rules
# prometheus/cohere-alerts.yml
groups:
  - name: cohere
    rules:
      # Error ratio per endpoint. The two counters carry different label sets
      # (errors: endpoint, status_code; requests: endpoint, model, status), so
      # both sides are aggregated down to the shared `endpoint` label first.
      # Without that, the division matches no series and the alert never fires.
      - alert: CohereHighErrorRate
        expr: |
          sum by (endpoint) (rate(cohere_errors_total[5m]))
            /
          sum by (endpoint) (rate(cohere_requests_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Cohere error rate > 5%"
          description: "{{ $labels.endpoint }} error rate: {{ $value | humanizePercentage }}"

      - alert: CohereRateLimited
        expr: rate(cohere_errors_total{status_code="429"}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Cohere rate limiting detected"

      - alert: CohereHighLatency
        expr: |
          histogram_quantile(0.95,
            rate(cohere_request_duration_seconds_bucket[5m])
          ) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Cohere P95 latency > 10s"

      # Fire on *recent* 401s only. Comparing the raw counter to 0 would keep
      # the alert firing forever after a single failure, because counters do
      # not go back down. The `or` branch covers a series that appeared within
      # the last 5 minutes, where increase() has no earlier sample to compare.
      - alert: CohereAuthFailure
        expr: |
          increase(cohere_errors_total{status_code="401"}[5m]) > 0
            or
          (
            cohere_errors_total{status_code="401"}
              unless
            cohere_errors_total{status_code="401"} offset 5m
          )
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Cohere authentication failure — check API key"

      # rate() is per second; increase() over 1h gives tokens per hour, which
      # is the unit the 100K threshold is expressed in.
      - alert: CohereHighTokenBurn
        expr: sum(increase(cohere_tokens_total[1h])) > 100000
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Cohere token burn rate > 100K/hour"
Step 6: Metrics Endpoint
// GET /metrics
import express from 'express';

const app = express();

/**
 * Serves the contents of `registry` in the Prometheus exposition format.
 *
 * Collection is awaited before any header is written. If it rejects, the
 * handler answers with a 500 instead of leaving the request hanging, since an
 * async route handler's rejection is not guaranteed to be caught by the framework.
 */
app.get('/metrics', async (_req, res) => {
  try {
    const body = await registry.metrics();
    res.set('Content-Type', registry.contentType);
    res.send(body);
  } catch {
    res.status(500).end('Failed to collect metrics');
  }
});
Dashboard Panels (Grafana)
| Panel | Query | Type |
|---|---|---|
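| Request Rate | `sum by (endpoint) (rate(cohere_requests_total[5m]))` | Time series |
| Error Rate | `sum(rate(cohere_errors_total[5m])) / sum(rate(cohere_requests_total[5m]))` | Stat |
| P50/P95 Latency | `histogram_quantile(0.95, sum by (le) (rate(cohere_request_duration_seconds_bucket[5m])))` (use `0.5` for P50) | Time series |
| Token Usage | `sum by (model, direction) (increase(cohere_tokens_total[1h]))` | Bar chart |
| Errors by Code | `sum by (status_code) (increase(cohere_errors_total[1h]))` | Pie chart |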
Output
- Prometheus metrics for requests, latency, tokens, and errors
- OpenTelemetry traces with Cohere-specific attributes
- Structured JSON logging with pino
- Prometheus alerting rules (for Alertmanager) covering error rate, rate limiting, latency, auth failures, and token burn (cost)
Error Handling
| Issue | Cause | Solution |
|---|---|---|
| Missing token metrics | Usage not in response | Check that the response includes `usage.billedUnits` |
| High cardinality | Too many model labels | Use model family, not exact version |
| Alert storm | Threshold too low | Tune thresholds for your traffic |
| Trace gaps | Missing context propagation | Ensure OTel context flows through async |
Resources
Next Steps
For incident response, see `cohere-incident-runbook`.