git clone https://github.com/vibeforge1111/vibeship-spawner-skills
ai/ai-observability/skill.yaml

AI Observability Skill
LLM monitoring, tracing, cost tracking, and evaluation
id: ai-observability
name: ai-observability
category: ai
description: |
  Implement comprehensive observability for LLM applications including
  tracing (Langfuse/Helicone), cost tracking, token optimization, RAG
  evaluation metrics (RAGAS), hallucination detection, and production
  monitoring. Essential for debugging, optimizing costs, and ensuring
  AI output quality.
version: 1.0.0
author: vibeship
tags:
- llm-monitoring
- tracing
- langfuse
- helicone
- cost-tracking
- ragas
- evaluation
- hallucination-detection
- prompt-caching
principles:
- name: "Trace Every LLM Call"
  description: |
    Production AI apps without tracing are flying blind. Every LLM call
    should be traced with inputs, outputs, latency, tokens, and cost.
    Use structured spans for multi-step chains and agents.
- name: "Measure What Matters"
  description: |
    Track metrics that correlate with user value: faithfulness for RAG,
    answer relevancy, latency percentiles, cost per successful outcome.
    Vanity metrics (total calls) don't improve product quality.
- name: "Cost Is a First-Class Metric"
  description: |
    Token costs can explode overnight with agent loops or context growth.
    Track cost per user, per feature, per model. Set budgets and alerts.
    Prompt caching can cut costs by 50-90%.
- name: "Evaluate Continuously"
  description: |
    Run automated evals on production samples. RAGAS metrics (faithfulness,
    relevancy, context precision) catch quality degradation before users
    complain. A score above 0.8 is generally good.
patterns:
- name: "Langfuse Tracing Setup"
  description: "Open-source LLM tracing with full observability"
  when_to_use: "Need detailed tracing, open-source, self-host option"
  implementation: |
// lib/langfuse.ts
import { Langfuse } from "langfuse";
import OpenAI from "openai";
// Initialize Langfuse client
export const langfuse = new Langfuse({
  publicKey: process.env.LANGFUSE_PUBLIC_KEY!,
  secretKey: process.env.LANGFUSE_SECRET_KEY!,
  baseUrl: process.env.LANGFUSE_HOST || "https://cloud.langfuse.com",
});
// Wrap OpenAI with tracing
const openai = new OpenAI();
interface TracedCompletionOptions {
  model: string;
  messages: OpenAI.ChatCompletionMessageParam[];
  userId?: string;
  sessionId?: string;
  metadata?: Record<string, unknown>;
  tags?: string[];
}
export async function tracedCompletion({
  model,
  messages,
  userId,
  sessionId,
  metadata,
  tags,
}: TracedCompletionOptions) {
  // Create trace
  const trace = langfuse.trace({
    name: "llm-completion",
    userId,
    sessionId,
    metadata,
    tags,
  });
  // Create generation span
  const generation = trace.generation({
    name: "openai-completion",
    model,
    input: messages,
    metadata: { modelProvider: "openai" },
  });

  const startTime = Date.now();

  try {
    const response = await openai.chat.completions.create({ model, messages });
    const latencyMs = Date.now() - startTime;
    const output = response.choices[0].message;

    // End generation with output
    generation.end({
      output,
      usage: {
        promptTokens: response.usage?.prompt_tokens,
        completionTokens: response.usage?.completion_tokens,
        totalTokens: response.usage?.total_tokens,
      },
      metadata: { latencyMs, finishReason: response.choices[0].finish_reason },
    });

    // Calculate cost
    const cost = calculateCost(
      model,
      response.usage?.prompt_tokens || 0,
      response.usage?.completion_tokens || 0
    );

    trace.update({
      metadata: { ...metadata, totalCost: cost, latencyMs },
    });

    return { response, traceId: trace.id, cost, latencyMs };
  } catch (error) {
    generation.end({
      level: "ERROR",
      statusMessage: error instanceof Error ? error.message : "Unknown error",
    });
    throw error;
  } finally {
    // Flush to ensure data is sent
    await langfuse.flushAsync();
  }
}
// Model pricing (per 1M tokens, USD)
const MODEL_PRICING: Record<string, { input: number; output: number }> = {
  "gpt-4o": { input: 5, output: 15 },
  "gpt-4o-mini": { input: 0.15, output: 0.6 },
  "gpt-4-turbo": { input: 10, output: 30 },
  "claude-sonnet-4-20250514": { input: 3, output: 15 },
  "claude-3-5-haiku-latest": { input: 0.8, output: 4 },
};
function calculateCost(
  model: string,
  inputTokens: number,
  outputTokens: number
): number {
  const pricing = MODEL_PRICING[model] || MODEL_PRICING["gpt-4o"];
  return (
    (inputTokens / 1_000_000) * pricing.input +
    (outputTokens / 1_000_000) * pricing.output
  );
}
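// Usage sketch (illustrative, not part of the Langfuse API): trace one
// call and log its cost. The user id, tags, and prompt are placeholders.
async function example() {
  const { response, traceId, cost, latencyMs } = await tracedCompletion({
    model: "gpt-4o-mini",
    messages: [{ role: "user", content: "Summarize our Q3 results." }],
    userId: "user_123",
    tags: ["summarization"],
  });
  console.log(`trace=${traceId} cost=$${cost.toFixed(4)} latency=${latencyMs}ms`);
  return response.choices[0].message.content;
}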
- name: "Helicone Proxy Setup"
  description: "Zero-code LLM observability via proxy"
  when_to_use: "Want minimal setup, automatic cost tracking"
  implementation: |
// lib/helicone-openai.ts
import OpenAI from "openai";
// Use Helicone proxy for automatic tracing
const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY!,
  baseURL: "https://oai.helicone.ai/v1",
  defaultHeaders: {
    "Helicone-Auth": `Bearer ${process.env.HELICONE_API_KEY}`,
  },
});

// Add custom properties for filtering
interface HeliconeOptions {
  userId?: string;
  sessionId?: string;
  promptId?: string;
  cache?: boolean;
  retryEnabled?: boolean;
  rateLimitPolicy?: string;
}
export function createHeliconeClient(options: HeliconeOptions) {
  const headers: Record<string, string> = {
    "Helicone-Auth": `Bearer ${process.env.HELICONE_API_KEY}`,
  };

  if (options.userId) headers["Helicone-User-Id"] = options.userId;
  if (options.sessionId) headers["Helicone-Session-Id"] = options.sessionId;
  if (options.promptId) headers["Helicone-Prompt-Id"] = options.promptId;
  if (options.cache) headers["Helicone-Cache-Enabled"] = "true";
  if (options.retryEnabled) headers["Helicone-Retry-Enabled"] = "true";
  if (options.rateLimitPolicy) headers["Helicone-RateLimit-Policy"] = options.rateLimitPolicy;

  return new OpenAI({
    apiKey: process.env.OPENAI_API_KEY!,
    baseURL: "https://oai.helicone.ai/v1",
    defaultHeaders: headers,
  });
}
// For Anthropic
import Anthropic from "@anthropic-ai/sdk";
export function createHeliconeAnthropic(options: HeliconeOptions) {
  const headers: Record<string, string> = {
    "Helicone-Auth": `Bearer ${process.env.HELICONE_API_KEY}`,
  };

  if (options.userId) headers["Helicone-User-Id"] = options.userId;
  if (options.cache) headers["Helicone-Cache-Enabled"] = "true";

  return new Anthropic({
    apiKey: process.env.ANTHROPIC_API_KEY!,
    baseURL: "https://anthropic.helicone.ai",
    defaultHeaders: headers,
  });
}
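// Usage sketch (illustrative): a per-user client with caching enabled,
// so requests appear in Helicone filterable by user. The prompt is a placeholder.
async function example(userId: string) {
  const client = createHeliconeClient({ userId, cache: true });
  const res = await client.chat.completions.create({
    model: "gpt-4o-mini",
    messages: [{ role: "user", content: "Hello!" }],
  });
  return res.choices[0].message.content;
}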
- name: "Cost Tracking Dashboard"
  description: "Track and budget LLM costs per user/feature"
  when_to_use: "Need granular cost visibility and budgets"
  implementation: |
// lib/cost-tracking.ts
import { db } from "./db";
interface TokenUsage {
  userId: string;
  model: string;
  inputTokens: number;
  outputTokens: number;
  cost: number;
  feature?: string;
  traceId?: string;
}
// Record usage
export async function recordUsage(usage: TokenUsage) {
  await db.tokenUsage.create({
    data: { ...usage, timestamp: new Date() },
  });
}
// Get user's usage for a period
export async function getUserUsage(
  userId: string,
  period: "day" | "week" | "month" = "month"
) {
  const startDate = new Date();
  if (period === "day") startDate.setDate(startDate.getDate() - 1);
  else if (period === "week") startDate.setDate(startDate.getDate() - 7);
  else startDate.setMonth(startDate.getMonth() - 1);
  const usage = await db.tokenUsage.aggregate({
    where: { userId, timestamp: { gte: startDate } },
    _sum: { inputTokens: true, outputTokens: true, cost: true },
    _count: true,
  });

  return {
    totalInputTokens: usage._sum.inputTokens || 0,
    totalOutputTokens: usage._sum.outputTokens || 0,
    totalCost: usage._sum.cost || 0,
    requestCount: usage._count,
  };
}
// Check budget before request
export async function checkBudget(
  userId: string,
  estimatedCost: number
): Promise<{ allowed: boolean; remaining: number; reason?: string }> {
  const user = await db.user.findUnique({
    where: { id: userId },
    select: { monthlyBudget: true, tier: true },
  });
  if (!user) {
    return { allowed: false, remaining: 0, reason: "User not found" };
  }

  const currentUsage = await getUserUsage(userId, "month");
  const remaining = user.monthlyBudget - currentUsage.totalCost;

  if (remaining < estimatedCost) {
    return {
      allowed: false,
      remaining,
      reason: `Budget exceeded. Used $${currentUsage.totalCost.toFixed(2)} of $${user.monthlyBudget} limit.`,
    };
  }

  return { allowed: true, remaining };
}
// Cost breakdown by feature
export async function getCostByFeature(period: "day" | "week" | "month") {
  const startDate = new Date();
  if (period === "day") startDate.setDate(startDate.getDate() - 1);
  else if (period === "week") startDate.setDate(startDate.getDate() - 7);
  else startDate.setMonth(startDate.getMonth() - 1);
  return db.tokenUsage.groupBy({
    by: ["feature"],
    where: { timestamp: { gte: startDate } },
    _sum: { cost: true, inputTokens: true, outputTokens: true },
    _count: true,
  });
}
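// Usage sketch (illustrative; token counts and cost are made-up numbers):
// gate a request on the budget, then record what it actually cost.
async function guardedCall(userId: string, estimatedCost: number) {
  const budget = await checkBudget(userId, estimatedCost);
  if (!budget.allowed) throw new Error(budget.reason);

  // ... make the LLM call here, then record actual usage ...
  await recordUsage({
    userId,
    model: "gpt-4o-mini",
    inputTokens: 1200,
    outputTokens: 300,
    cost: 0.00036, // 1200/1M * $0.15 + 300/1M * $0.60
    feature: "chat",
  });
}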
- name: "RAG Evaluation with RAGAS"
  description: "Automated RAG quality evaluation"
  when_to_use: "Have RAG pipeline, need to measure quality"
  implementation: |
// lib/rag-evaluation.ts
import Anthropic from "@anthropic-ai/sdk";
const anthropic = new Anthropic();
interface RAGSample {
  question: string;
  contexts: string[];
  answer: string;
  groundTruth?: string; // Optional for some metrics
}
interface RAGASScores {
  faithfulness: number;
  answerRelevancy: number;
  contextPrecision: number;
  contextRecall?: number; // Requires ground truth
}
// Evaluate faithfulness: Is the answer supported by the context?
async function evaluateFaithfulness(sample: RAGSample): Promise<number> {
  const prompt = `You are evaluating whether an answer is faithful to the provided context.
Context: ${sample.contexts.join("\n\n")}
Question: ${sample.question}

Answer: ${sample.answer}
Task:
- List each claim made in the answer.
- For each claim, determine if it can be inferred from the context.
- Calculate the ratio of supported claims to total claims.
Return JSON: { "claims": [{"claim": "...", "supported": true/false, "evidence": "..."}], "faithfulness_score": 0.0-1.0 }`;
  const response = await anthropic.messages.create({
    model: "claude-sonnet-4-20250514",
    max_tokens: 1024,
    messages: [{ role: "user", content: prompt }],
  });

  const text = response.content[0].type === "text" ? response.content[0].text : "";
  const json = JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] || "{}");
  return json.faithfulness_score || 0;
}

// Evaluate answer relevancy: Is the answer relevant to the question?
async function evaluateAnswerRelevancy(sample: RAGSample): Promise<number> {
  const prompt = `Evaluate how relevant the answer is to the question.
Question: ${sample.question}

Answer: ${sample.answer}
Score from 0-1:
- 1.0: Directly and completely answers the question
- 0.7-0.9: Answers the question with minor irrelevant info
- 0.4-0.6: Partially answers, missing key aspects
- 0.1-0.3: Mostly irrelevant with some tangential relation
- 0.0: Completely irrelevant
Return JSON: {"score": 0.0-1.0, "reasoning": "..."}`;
  const response = await anthropic.messages.create({
    model: "claude-sonnet-4-20250514",
    max_tokens: 256,
    messages: [{ role: "user", content: prompt }],
  });

  const text = response.content[0].type === "text" ? response.content[0].text : "";
  const json = JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] || "{}");
  return json.score || 0;
}

// Evaluate context precision: Are the retrieved contexts relevant?
async function evaluateContextPrecision(sample: RAGSample): Promise<number> {
  const prompt = `Evaluate the precision of retrieved contexts for answering the question.
Question: ${sample.question}
Contexts:
${sample.contexts.map((c, i) => `[${i + 1}] ${c}`).join("\n\n")}
For each context, determine if it's relevant to answering the question.

Return JSON: { "context_relevance": [{"index": 1, "relevant": true/false, "reason": "..."}], "precision_score": 0.0-1.0 }`;
  const response = await anthropic.messages.create({
    model: "claude-sonnet-4-20250514",
    max_tokens: 512,
    messages: [{ role: "user", content: prompt }],
  });

  const text = response.content[0].type === "text" ? response.content[0].text : "";
  const json = JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] || "{}");
  return json.precision_score || 0;
}

// Full RAGAS evaluation
export async function evaluateRAGAS(sample: RAGSample): Promise<RAGASScores> {
  const [faithfulness, answerRelevancy, contextPrecision] = await Promise.all([
    evaluateFaithfulness(sample),
    evaluateAnswerRelevancy(sample),
    evaluateContextPrecision(sample),
  ]);

  return { faithfulness, answerRelevancy, contextPrecision };
}

// Batch evaluation with aggregation
export async function evaluateBatch(
  samples: RAGSample[]
): Promise<{
  individual: Array<RAGSample & { scores: RAGASScores }>;
  aggregate: RAGASScores;
}> {
  const results = await Promise.all(
    samples.map(async (sample) => ({
      ...sample,
      scores: await evaluateRAGAS(sample),
    }))
  );

  // Calculate averages
  const aggregate: RAGASScores = {
    faithfulness:
      results.reduce((sum, r) => sum + r.scores.faithfulness, 0) / results.length,
    answerRelevancy:
      results.reduce((sum, r) => sum + r.scores.answerRelevancy, 0) / results.length,
    contextPrecision:
      results.reduce((sum, r) => sum + r.scores.contextPrecision, 0) / results.length,
  };

  return { individual: results, aggregate };
}
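// Usage sketch (illustrative): evaluate a batch of production samples and
// flag regressions against the "score above 0.8 is generally good" rule
// of thumb from the principles above.
async function example(samples: RAGSample[]) {
  const { aggregate } = await evaluateBatch(samples);
  if (aggregate.faithfulness < 0.8) {
    console.warn(`Faithfulness degraded: ${aggregate.faithfulness.toFixed(2)}`);
  }
}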
- name: "Prompt Caching Optimization"
  description: "Reduce costs 50-90% with prompt caching"
  when_to_use: "Have repeated system prompts or context"
  implementation: |
// lib/prompt-caching.ts
import Anthropic from "@anthropic-ai/sdk";
const anthropic = new Anthropic();
interface CachedMessage {
  role: "user" | "assistant";
  content: string | Anthropic.ContentBlock[];
}
// Anthropic prompt caching with cache_control
export async function cachedAnthropicCall(options: {
  systemPrompt: string;
  staticContext?: string;
  userMessage: string;
  model?: string;
}) {
  const {
    systemPrompt,
    staticContext,
    userMessage,
    model = "claude-sonnet-4-20250514",
  } = options;
  // Build content blocks with cache markers
  const systemContent: Anthropic.ContentBlockParam[] = [
    {
      type: "text",
      text: systemPrompt,
      cache_control: { type: "ephemeral" }, // Cache this
    },
  ];

  if (staticContext) {
    systemContent.push({
      type: "text",
      text: staticContext,
      cache_control: { type: "ephemeral" }, // Cache this too
    });
  }

  const response = await anthropic.messages.create({
    model,
    max_tokens: 1024,
    system: systemContent,
    messages: [{ role: "user", content: userMessage }],
  });

  // Check cache performance
  const usage = response.usage;
  const cachedTokens = (usage as any).cache_read_input_tokens || 0;
  const cacheHit = cachedTokens > 0;
  const uncachedTokens = usage.input_tokens - cachedTokens;

  return {
    response,
    cacheStats: {
      hit: cacheHit,
      cachedTokens,
      uncachedTokens,
      savingsPercent: cacheHit
        ? Math.round((cachedTokens / usage.input_tokens) * 100)
        : 0,
    },
  };
}
// OpenAI automatic caching (just structure prompts correctly)
import OpenAI from "openai";
const openai = new OpenAI();
export async function optimizedOpenAICall(options: {
  staticSystemPrompt: string;
  staticContext?: string;
  dynamicUserMessage: string;
  model?: string;
}) {
  const {
    staticSystemPrompt,
    staticContext,
    dynamicUserMessage,
    model = "gpt-4o",
  } = options;
  // OpenAI caches automatically for prompts > 1024 tokens.
  // Structure: static content first, dynamic content last.
  const messages: OpenAI.ChatCompletionMessageParam[] = [
    {
      role: "system",
      content: staticContext
        ? `${staticSystemPrompt}\n\nContext:\n${staticContext}`
        : staticSystemPrompt,
    },
    { role: "user", content: dynamicUserMessage },
  ];

  const response = await openai.chat.completions.create({ model, messages });

  // OpenAI reports cached tokens under usage.prompt_tokens_details
  const cached = (response.usage as any)?.prompt_tokens_details?.cached_tokens || 0;

  return {
    response,
    cacheStats: {
      cachedTokens: cached,
      uncachedTokens: (response.usage?.prompt_tokens || 0) - cached,
    },
  };
}
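// Usage sketch (illustrative): cache savings show up from the second call
// onward, once the cached prefix is warm.
declare const LONG_SYSTEM_PROMPT: string; // hypothetical long, stable prompt

async function example() {
  await cachedAnthropicCall({
    systemPrompt: LONG_SYSTEM_PROMPT,
    userMessage: "First question", // cold: writes the cache
  });
  const second = await cachedAnthropicCall({
    systemPrompt: LONG_SYSTEM_PROMPT,
    userMessage: "Second question", // warm: reads the cache
  });
  console.log(second.cacheStats.savingsPercent); // high on large stable prompts
}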
- name: "Hallucination Detection"
  description: "Detect and prevent LLM hallucinations"
  when_to_use: "Need to verify factual accuracy, especially in RAG"
  implementation: |
// lib/hallucination-detection.ts
import Anthropic from "@anthropic-ai/sdk";
const anthropic = new Anthropic();
interface FactCheckResult {
  isHallucination: boolean;
  confidence: number;
  unsupportedClaims: string[];
  suggestedCorrections?: string[];
}
// Self-consistency hallucination check:
// generate multiple responses and check for consistency.
export async function selfConsistencyCheck(
  prompt: string,
  originalResponse: string,
  numSamples: number = 3
): Promise<{
  consistent: boolean;
  consistencyScore: number;
  samples: string[];
}> {
  // Generate additional samples
  const samples: string[] = [];
  for (let i = 0; i < numSamples; i++) {
    const response = await anthropic.messages.create({
      model: "claude-sonnet-4-20250514",
      max_tokens: 1024,
      messages: [{ role: "user", content: prompt }],
      temperature: 0.7, // Add variation
    });
    const text = response.content[0].type === "text" ? response.content[0].text : "";
    samples.push(text);
  }

  // Check consistency across samples
  const consistencyPrompt = `Compare these responses for factual consistency:
Original response: ${originalResponse}
Additional samples:
${samples.map((s, i) => `Sample ${i + 1}: ${s}`).join("\n\n")}
Identify any factual claims that differ significantly between responses.

Return JSON: { "consistent": true/false, "consistency_score": 0.0-1.0, "inconsistent_claims": ["claim1", "claim2"] }`;
  const checkResponse = await anthropic.messages.create({
    model: "claude-sonnet-4-20250514",
    max_tokens: 512,
    messages: [{ role: "user", content: consistencyPrompt }],
  });

  const text = checkResponse.content[0].type === "text" ? checkResponse.content[0].text : "";
  const result = JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] || "{}");

  return {
    consistent: result.consistent ?? true,
    consistencyScore: result.consistency_score ?? 1.0,
    samples,
  };
}

// Ground-truth fact checking against context
export async function factCheckAgainstContext(
  response: string,
  context: string
): Promise<FactCheckResult> {
  const prompt = `You are a fact-checker. Verify if the response is supported by the context.
Context: ${context}
Response to verify: ${response}
Instructions:
- Extract each factual claim from the response.
- Check if each claim is supported by the context.
- A claim is a hallucination if it cannot be inferred from the context.
Return JSON: { "claims": [ {"claim": "...", "supported": true/false, "evidence": "quote from context or 'not found'"} ], "is_hallucination": true/false, "confidence": 0.0-1.0, "unsupported_claims": ["claim1", "claim2"], "suggested_corrections": ["correction1"] }`;
  const checkResponse = await anthropic.messages.create({
    model: "claude-sonnet-4-20250514",
    max_tokens: 1024,
    messages: [{ role: "user", content: prompt }],
  });

  const text = checkResponse.content[0].type === "text" ? checkResponse.content[0].text : "";
  const result = JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] || "{}");

  return {
    isHallucination: result.is_hallucination ?? false,
    confidence: result.confidence ?? 1.0,
    unsupportedClaims: result.unsupported_claims || [],
    suggestedCorrections: result.suggested_corrections,
  };
}
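// Usage sketch (illustrative; the 0.7 confidence threshold and fallback
// message are arbitrary choices, not part of any API): fact-check a RAG
// answer against its retrieved context before returning it.
async function guardedAnswer(answer: string, retrievedContext: string) {
  const check = await factCheckAgainstContext(answer, retrievedContext);
  if (check.isHallucination && check.confidence > 0.7) {
    return check.suggestedCorrections?.[0] ?? "I couldn't verify that answer.";
  }
  return answer;
}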
anti_patterns:
- name: "No Tracing in Production"
  description: "Running LLM apps without observability"
  why_bad: |
    Debugging production issues is impossible. Can't identify slow calls,
    high-cost requests, or quality issues. Flying completely blind.
  instead: "Add Langfuse/Helicone from day one. Setup takes <10 minutes."
- name: "Ignoring Token Costs"
  description: "Not tracking costs per user/feature"
  why_bad: |
    A single agent loop can spend $50+ in minutes. One power user can
    consume the entire monthly budget. Costs spiral without visibility.
  instead: "Track costs per request, set per-user budgets, alert on anomalies."
- name: "No RAG Quality Metrics"
  description: "Deploying RAG without evaluation"
  why_bad: |
    Retrieval quality degrades silently. Users get irrelevant results.
    Can't measure impact of changes to chunking, embeddings, or prompts.
  instead: "Run RAGAS evals on a sample of production queries weekly."
- name: "Missing Prompt Caching"
  description: "Repeated system prompts without caching"
  why_bad: |
    The same 2000-token system prompt sent with every request. At 1000
    requests/day, that's 2M tokens wasted, roughly $10/day on just
    repeated context.
  instead: "Use Anthropic cache_control or structure prompts for OpenAI auto-cache."
- name: "Batch Size One"
  description: "Processing items one at a time"
  why_bad: |
    Each request pays API overhead. Rate limits hit faster. Costs run
    higher. Processing 100 items takes 100x the time and overhead.
  instead: "Batch API calls where possible. Use async patterns for parallelism."
handoffs:
- to: "semantic-search"
  when: "Need to evaluate RAG retrieval quality"
  context: "Use observability to measure search relevance and precision"
- to: "ai-safety-alignment"
  when: "Need content moderation and guardrails"
  context: "Safety metrics are part of overall observability"
- to: "backend"
  when: "Need to build cost tracking database"
  context: "Backend for storing usage data and dashboards"
- to: "ai-code-generation"
  when: "Monitoring code generation quality"
  context: "Track code validity, test pass rates as metrics"
references:
- title: "Langfuse Documentation" url: "https://langfuse.com/docs/observability/overview"
- title: "Helicone LLM Observability Guide" url: "https://www.helicone.ai/blog/the-complete-guide-to-LLM-observability-platforms"
- title: "RAGAS Evaluation Metrics" url: "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/"
- title: "Prompt Caching Guide 2025" url: "https://promptbuilder.cc/blog/prompt-caching-token-economics-2025"
- title: "NeMo Guardrails" url: "https://github.com/NVIDIA-NeMo/Guardrails"