Vibeship-spawner-skills ai-observability

AI Observability Skill

install
source · Clone the upstream repo
git clone https://github.com/vibeforge1111/vibeship-spawner-skills
manifest: ai/ai-observability/skill.yaml
source content

AI Observability Skill

LLM monitoring, tracing, cost tracking, and evaluation

id: ai-observability name: ai-observability category: ai description: | Implement comprehensive observability for LLM applications including tracing (Langfuse/Helicone), cost tracking, token optimization, RAG evaluation metrics (RAGAS), hallucination detection, and production monitoring. Essential for debugging, optimizing costs, and ensuring AI output quality.

version: 1.0.0 author: vibeship tags:

  • llm-monitoring
  • tracing
  • langfuse
  • helicone
  • cost-tracking
  • ragas
  • evaluation
  • hallucination-detection
  • prompt-caching

principles:

  • name: "Trace Every LLM Call" description: | Production AI apps without tracing are flying blind. Every LLM call should be traced with inputs, outputs, latency, tokens, and cost. Use structured spans for multi-step chains and agents.

  • name: "Measure What Matters" description: | Track metrics that correlate with user value: faithfulness for RAG, answer relevancy, latency percentiles, cost per successful outcome. Vanity metrics (total calls) don't improve product quality.

  • name: "Cost Is a First-Class Metric" description: | Token costs can explode overnight with agent loops or context growth. Track cost per user, per feature, per model. Set budgets and alerts. Prompt caching can cut costs by 50-90%.

  • name: "Evaluate Continuously" description: | Run automated evals on production samples. RAGAS metrics (faithfulness, relevancy, context precision) catch quality degradation before users complain. Score > 0.8 is generally good.

patterns:

  • name: "Langfuse Tracing Setup" description: "Open-source LLM tracing with full observability" when_to_use: "Need detailed tracing, open-source, self-host option" implementation: | // lib/langfuse.ts import { Langfuse } from "langfuse"; import OpenAI from "openai";

    // Initialize Langfuse client export const langfuse = new Langfuse({ publicKey: process.env.LANGFUSE_PUBLIC_KEY!, secretKey: process.env.LANGFUSE_SECRET_KEY!, baseUrl: process.env.LANGFUSE_HOST || "https://cloud.langfuse.com", });

    // Wrap OpenAI with tracing const openai = new OpenAI();

    /** Arguments accepted by `tracedCompletion`. */
    interface TracedCompletionOptions {
      /** Model id sent to the chat-completions call and recorded on the generation span. */
      model: string;
      /** Conversation sent as the completion input. */
      messages: OpenAI.ChatCompletionMessageParam[];
      /** Attached to the Langfuse trace when provided. */
      userId?: string;
      /** Attached to the Langfuse trace when provided. */
      sessionId?: string;
      /** Free-form trace metadata; cost and latency are merged into it after the call. */
      metadata?: Record<string, unknown>;
      /** Attached to the Langfuse trace when provided. */
      tags?: string[];
    }

    /**
     * Runs one OpenAI chat completion wrapped in a Langfuse trace.
     *
     * A trace and a generation span are opened before the call. On success the
     * span is closed with the output message, token usage, latency and finish
     * reason, and the trace metadata is extended with the computed cost. On
     * failure the span is closed at ERROR level and the original error is
     * rethrown.
     *
     * @returns The raw completion response plus the trace id, cost and latency in ms.
     * @throws Whatever the OpenAI call (or the bookkeeping after it) throws.
     */
    export async function tracedCompletion({
      model,
      messages,
      userId,
      sessionId,
      metadata,
      tags,
    }: TracedCompletionOptions) {
      // Create trace
      const trace = langfuse.trace({
        name: "llm-completion",
        userId,
        sessionId,
        metadata,
        tags,
      });

      // Create generation span
      const generation = trace.generation({
        name: "openai-completion",
        model,
        input: messages,
        metadata: { modelProvider: "openai" },
      });

      const startTime = Date.now();

      try {
        const response = await openai.chat.completions.create({
          model,
          messages,
        });

        const latencyMs = Date.now() - startTime;
        const output = response.choices[0].message;

        // End generation with output
        generation.end({
          output,
          usage: {
            promptTokens: response.usage?.prompt_tokens,
            completionTokens: response.usage?.completion_tokens,
            totalTokens: response.usage?.total_tokens,
          },
          metadata: {
            latencyMs,
            finishReason: response.choices[0].finish_reason,
          },
        });

        // Calculate cost (missing usage is treated as zero tokens)
        const cost = calculateCost(
          model,
          response.usage?.prompt_tokens ?? 0,
          response.usage?.completion_tokens ?? 0
        );

        trace.update({
          metadata: { ...metadata, totalCost: cost, latencyMs },
        });

        return {
          response,
          traceId: trace.id,
          cost,
          latencyMs,
        };
      } catch (error) {
        generation.end({
          level: "ERROR",
          statusMessage: error instanceof Error ? error.message : "Unknown error",
        });
        throw error;
      } finally {
        // Flush so the data is sent, but never let a telemetry failure leak out:
        // an exception thrown from `finally` would replace the real provider
        // error, or turn a successful completion into a failed call.
        try {
          await langfuse.flushAsync();
        } catch (flushError) {
          console.warn(
            "Langfuse flush failed:",
            flushError instanceof Error ? flushError.message : flushError
          );
        }
      }
    }

    // Model pricing (per 1M tokens)
    // NOTE(review): presumably USD list prices; providers revise these, so
    // verify against the current pricing pages before relying on the totals.
    const MODEL_PRICING: Record<string, { input: number; output: number }> = {
      "gpt-4o": { input: 5, output: 15 },
      "gpt-4o-mini": { input: 0.15, output: 0.6 },
      "gpt-4-turbo": { input: 10, output: 30 },
      "claude-sonnet-4-20250514": { input: 3, output: 15 },
      "claude-3-5-haiku-latest": { input: 0.8, output: 4 },
    };

    /**
     * Computes the cost of one call from its token counts.
     *
     * Models without an entry in `MODEL_PRICING` are priced as "gpt-4o".
     *
     * @param model Model id used as the pricing-table key.
     * @param inputTokens Prompt tokens consumed.
     * @param outputTokens Completion tokens produced.
     * @returns Cost in the same currency unit as the pricing table.
     */
    function calculateCost(
      model: string,
      inputTokens: number,
      outputTokens: number
    ): number {
      // Own-property check: a plain `MODEL_PRICING[model] || fallback` lookup
      // resolves inherited keys such as "toString" to a truthy non-pricing value,
      // which made the result NaN.
      const isKnownModel = Object.prototype.hasOwnProperty.call(MODEL_PRICING, model);
      const pricing = MODEL_PRICING[isKnownModel ? model : "gpt-4o"];

      return (
        (inputTokens / 1_000_000) * pricing.input +
        (outputTokens / 1_000_000) * pricing.output
      );
    }

  • name: "Helicone Proxy Setup" description: "Zero-code LLM observability via proxy" when_to_use: "Want minimal setup, automatic cost tracking" implementation: | // lib/helicone-openai.ts import OpenAI from "openai";

    // Use Helicone proxy for automatic tracing
    const openai = new OpenAI({
      apiKey: process.env.OPENAI_API_KEY!,
      baseURL: "https://oai.helicone.ai/v1",
      defaultHeaders: {
        // Template literal restored: without backticks this line is not valid TypeScript.
        "Helicone-Auth": `Bearer ${process.env.HELICONE_API_KEY}`,
      },
    });

    // Add custom properties for filtering
    /** Per-client Helicone settings; each one that is set becomes a `Helicone-*` request header. */
    interface HeliconeOptions {
      /** Sent as `Helicone-User-Id`. */
      userId?: string;
      /** Sent as `Helicone-Session-Id`. */
      sessionId?: string;
      /** Sent as `Helicone-Prompt-Id`. */
      promptId?: string;
      /** When true, sends `Helicone-Cache-Enabled: true`. */
      cache?: boolean;
      /** When true, sends `Helicone-Retry-Enabled: true`. */
      retryEnabled?: boolean;
      /** Sent verbatim as `Helicone-RateLimit-Policy`. */
      rateLimitPolicy?: string;
    }

    /**
     * Builds an OpenAI client that sends its requests through the Helicone proxy.
     *
     * Every option that is set (truthy) is translated into the matching
     * `Helicone-*` default header; unset options add no header.
     *
     * @param options Per-client Helicone settings.
     * @returns A new OpenAI client pointed at `https://oai.helicone.ai/v1`.
     */
    export function createHeliconeClient(options: HeliconeOptions) {
      const headers: Record<string, string> = {
        // Template literal restored: without backticks this line is not valid TypeScript.
        "Helicone-Auth": `Bearer ${process.env.HELICONE_API_KEY}`,
      };

      if (options.userId) {
        headers["Helicone-User-Id"] = options.userId;
      }
      if (options.sessionId) {
        headers["Helicone-Session-Id"] = options.sessionId;
      }
      if (options.promptId) {
        headers["Helicone-Prompt-Id"] = options.promptId;
      }
      if (options.cache) {
        headers["Helicone-Cache-Enabled"] = "true";
      }
      if (options.retryEnabled) {
        headers["Helicone-Retry-Enabled"] = "true";
      }
      if (options.rateLimitPolicy) {
        headers["Helicone-RateLimit-Policy"] = options.rateLimitPolicy;
      }

      return new OpenAI({
        apiKey: process.env.OPENAI_API_KEY!,
        baseURL: "https://oai.helicone.ai/v1",
        defaultHeaders: headers,
      });
    }

    // For Anthropic import Anthropic from "@anthropic-ai/sdk";

    /**
     * Builds an Anthropic client that sends its requests through the Helicone proxy.
     *
     * Honors the same options as the OpenAI variant: every option that is set
     * (truthy) becomes the matching `Helicone-*` default header. Previously only
     * `userId` and `cache` were forwarded and the rest were silently dropped.
     *
     * @param options Per-client Helicone settings.
     * @returns A new Anthropic client pointed at `https://anthropic.helicone.ai`.
     */
    export function createHeliconeAnthropic(options: HeliconeOptions) {
      const headers: Record<string, string> = {
        // Template literal restored: without backticks this line is not valid TypeScript.
        "Helicone-Auth": `Bearer ${process.env.HELICONE_API_KEY}`,
      };

      // Header name -> value; falsy values mean "option not set" and add no header.
      const optionalHeaders: Array<[string, string | undefined]> = [
        ["Helicone-User-Id", options.userId],
        ["Helicone-Session-Id", options.sessionId],
        ["Helicone-Prompt-Id", options.promptId],
        ["Helicone-Cache-Enabled", options.cache ? "true" : undefined],
        ["Helicone-Retry-Enabled", options.retryEnabled ? "true" : undefined],
        ["Helicone-RateLimit-Policy", options.rateLimitPolicy],
      ];
      for (const [headerName, headerValue] of optionalHeaders) {
        if (headerValue) headers[headerName] = headerValue;
      }

      return new Anthropic({
        apiKey: process.env.ANTHROPIC_API_KEY!,
        baseURL: "https://anthropic.helicone.ai",
        defaultHeaders: headers,
      });
    }

  • name: "Cost Tracking Dashboard" description: "Track and budget LLM costs per user/feature" when_to_use: "Need granular cost visibility and budgets" implementation: | // lib/cost-tracking.ts import { db } from "./db";

    /** One usage record as handed to `recordUsage`. */
    interface TokenUsage {
      userId: string;
      model: string;
      inputTokens: number;
      outputTokens: number;
      /** Cost of the call; summed per user and per feature by the reporting queries. */
      cost: number;
      /** Optional grouping key for the per-feature cost breakdown. */
      feature?: string;
      /** Optional id of the observability trace for this call. */
      traceId?: string;
    }

    // Record usage
    /**
     * Stores one usage record, stamped with the current time.
     *
     * @param usage Token counts and cost for a single call.
     */
    export async function recordUsage(usage: TokenUsage) {
      await db.tokenUsage.create({
        data: {
          ...usage,
          timestamp: new Date(),
        },
      });
    }

    // Get user's usage for period
    /**
     * Sums a user's token usage and cost over a rolling window ending now.
     *
     * @param userId User whose records are aggregated.
     * @param period Window length: the last day, 7 days, or calendar-month step back.
     * @returns Totals for the window; sums with no matching rows are reported as 0.
     */
    export async function getUserUsage(
      userId: string,
      period: "day" | "week" | "month" = "month"
    ) {
      const startDate = new Date();
      if (period === "day") {
        startDate.setDate(startDate.getDate() - 1);
      } else if (period === "week") {
        startDate.setDate(startDate.getDate() - 7);
      } else {
        // A bare setMonth(getMonth() - 1) overflows when the previous month is
        // shorter (Mar 31 -> "Feb 31" -> Mar 3), silently shrinking the window.
        // Step back from day 1, then clamp to the target month's last day.
        const dayOfMonth = startDate.getDate();
        startDate.setDate(1);
        startDate.setMonth(startDate.getMonth() - 1);
        const daysInTargetMonth = new Date(
          startDate.getFullYear(),
          startDate.getMonth() + 1,
          0
        ).getDate();
        startDate.setDate(Math.min(dayOfMonth, daysInTargetMonth));
      }

      const usage = await db.tokenUsage.aggregate({
        where: {
          userId,
          timestamp: { gte: startDate },
        },
        _sum: {
          inputTokens: true,
          outputTokens: true,
          cost: true,
        },
        _count: true,
      });

      return {
        totalInputTokens: usage._sum.inputTokens || 0,
        totalOutputTokens: usage._sum.outputTokens || 0,
        totalCost: usage._sum.cost || 0,
        requestCount: usage._count,
      };
    }

    // Check budget before request
    /**
     * Decides whether a request with the given estimated cost fits in the
     * user's remaining monthly budget.
     *
     * NOTE(review): the check and the later usage write are separate steps, so
     * concurrent requests can presumably both pass; confirm whether that matters.
     *
     * @param userId User to look up.
     * @param estimatedCost Expected cost of the request about to be made.
     * @returns `allowed`, the remaining budget, and a `reason` when denied.
     */
    export async function checkBudget(
      userId: string,
      estimatedCost: number
    ): Promise<{ allowed: boolean; remaining: number; reason?: string }> {
      const user = await db.user.findUnique({
        where: { id: userId },
        select: { monthlyBudget: true, tier: true },
      });

      if (!user) {
        return { allowed: false, remaining: 0, reason: "User not found" };
      }

      const currentUsage = await getUserUsage(userId, "month");
      const remaining = user.monthlyBudget - currentUsage.totalCost;

      if (remaining < estimatedCost) {
        return {
          allowed: false,
          remaining,
          reason: `Budget exceeded. Used $${currentUsage.totalCost.toFixed(2)} of $${user.monthlyBudget} limit.`,
        };
      }

      return { allowed: true, remaining };
    }

    // Cost breakdown by feature
    /**
     * Groups usage records in a rolling window ending now by `feature`, with
     * summed cost, input tokens, output tokens and a row count per group.
     *
     * @param period Window length: the last day, 7 days, or calendar-month step back.
     * @returns The grouped query result, unmodified.
     */
    export async function getCostByFeature(period: "day" | "week" | "month") {
      const startDate = new Date();
      if (period === "day") {
        startDate.setDate(startDate.getDate() - 1);
      } else if (period === "week") {
        startDate.setDate(startDate.getDate() - 7);
      } else {
        // A bare setMonth(getMonth() - 1) overflows when the previous month is
        // shorter (Mar 31 -> "Feb 31" -> Mar 3), silently shrinking the window.
        // Step back from day 1, then clamp to the target month's last day.
        const dayOfMonth = startDate.getDate();
        startDate.setDate(1);
        startDate.setMonth(startDate.getMonth() - 1);
        const daysInTargetMonth = new Date(
          startDate.getFullYear(),
          startDate.getMonth() + 1,
          0
        ).getDate();
        startDate.setDate(Math.min(dayOfMonth, daysInTargetMonth));
      }

      return db.tokenUsage.groupBy({
        by: ["feature"],
        where: { timestamp: { gte: startDate } },
        _sum: { cost: true, inputTokens: true, outputTokens: true },
        _count: true,
      });
    }

  • name: "RAG Evaluation with RAGAS" description: "Automated RAG quality evaluation" when_to_use: "Have RAG pipeline, need to measure quality" implementation: | // lib/rag-evaluation.ts import Anthropic from "@anthropic-ai/sdk";

    const anthropic = new Anthropic();

    /** One question/answer pair together with the contexts retrieved for it. */
    interface RAGSample {
      question: string;
      /** Retrieved passages the answer is judged against. */
      contexts: string[];
      answer: string;
      /** Optional for some metrics. */
      groundTruth?: string;
    }

    /** Evaluation scores for one sample (or averaged over a batch). */
    interface RAGASScores {
      faithfulness: number;
      answerRelevancy: number;
      contextPrecision: number;
      /** Requires ground truth. */
      contextRecall?: number;
    }

    // Evaluate faithfulness: Is answer supported by context?
    /**
     * Asks the judge model what fraction of the answer's claims are supported
     * by the retrieved contexts.
     *
     * @param sample Question, contexts and answer to judge.
     * @returns `faithfulness_score` from the judge's JSON, clamped to [0, 1];
     *   0 when the reply has no text block, no JSON object, malformed JSON, or
     *   a non-numeric score.
     */
    async function evaluateFaithfulness(sample: RAGSample): Promise<number> {
      const prompt = `You are evaluating whether an answer is faithful to the provided context.

Context: ${sample.contexts.join("\n\n")}

Question: ${sample.question} Answer: ${sample.answer}

Task:

  1. List each claim made in the answer.
  2. For each claim, determine if it can be inferred from the context.
  3. Calculate the ratio of supported claims to total claims.

Return JSON: { "claims": [{"claim": "...", "supported": true/false, "evidence": "..."}], "faithfulness_score": 0.0-1.0 }`;

      const response = await anthropic.messages.create({
        model: "claude-sonnet-4-20250514",
        max_tokens: 1024,
        messages: [{ role: "user", content: prompt }],
      });

      // An empty content array must not crash the evaluation.
      const firstBlock = response.content[0];
      const text = firstBlock?.type === "text" ? firstBlock.text : "";

      let parsed: { faithfulness_score?: unknown } = {};
      try {
        parsed = JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] ?? "{}");
      } catch {
        // Malformed JSON from the judge is treated like a missing score.
      }

      const score = parsed.faithfulness_score;
      return typeof score === "number" && Number.isFinite(score)
        ? Math.min(1, Math.max(0, score))
        : 0;
    }

  // Evaluate answer relevancy: Is answer relevant to question?
  /**
   * Asks the judge model how relevant the answer is to the question.
   *
   * @param sample Question and answer to judge (contexts are not sent).
   * @returns `score` from the judge's JSON, clamped to [0, 1]; 0 when the
   *   reply has no text block, no JSON object, malformed JSON, or a
   *   non-numeric score.
   */
  async function evaluateAnswerRelevancy(sample: RAGSample): Promise<number> {
    const prompt = `Evaluate how relevant the answer is to the question.

Question: ${sample.question} Answer: ${sample.answer}

Score from 0-1:

  • 1.0: Directly and completely answers the question
  • 0.7-0.9: Answers the question with minor irrelevant info
  • 0.4-0.6: Partially answers, missing key aspects
  • 0.1-0.3: Mostly irrelevant with some tangential relation
  • 0.0: Completely irrelevant

Return JSON: {"score": 0.0-1.0, "reasoning": "..."}`;

    const response = await anthropic.messages.create({
      model: "claude-sonnet-4-20250514",
      max_tokens: 256,
      messages: [{ role: "user", content: prompt }],
    });

    // An empty content array must not crash the evaluation.
    const firstBlock = response.content[0];
    const text = firstBlock?.type === "text" ? firstBlock.text : "";

    let parsed: { score?: unknown } = {};
    try {
      parsed = JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] ?? "{}");
    } catch {
      // Malformed JSON from the judge is treated like a missing score.
    }

    const score = parsed.score;
    return typeof score === "number" && Number.isFinite(score)
      ? Math.min(1, Math.max(0, score))
      : 0;
  }

  // Evaluate context precision: Are retrieved contexts relevant?
  /**
   * Asks the judge model how many of the retrieved contexts are relevant to
   * the question. Contexts are numbered from 1 in the prompt.
   *
   * @param sample Question and retrieved contexts to judge.
   * @returns `precision_score` from the judge's JSON, clamped to [0, 1]; 0
   *   when the reply has no text block, no JSON object, malformed JSON, or a
   *   non-numeric score.
   */
  async function evaluateContextPrecision(sample: RAGSample): Promise<number> {
    // Inner template literal restored: its backticks were lost, leaving invalid syntax.
    const prompt = `Evaluate the precision of retrieved contexts for answering the question.

Question: ${sample.question}

Contexts: ${sample.contexts.map((c, i) => `[${i + 1}] ${c}`).join("\n\n")}

For each context, determine if it's relevant to answering the question. Return JSON: { "context_relevance": [{"index": 1, "relevant": true/false, "reason": "..."}], "precision_score": 0.0-1.0 }`;

    const response = await anthropic.messages.create({
      model: "claude-sonnet-4-20250514",
      max_tokens: 512,
      messages: [{ role: "user", content: prompt }],
    });

    // An empty content array must not crash the evaluation.
    const firstBlock = response.content[0];
    const text = firstBlock?.type === "text" ? firstBlock.text : "";

    let parsed: { precision_score?: unknown } = {};
    try {
      parsed = JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] ?? "{}");
    } catch {
      // Malformed JSON from the judge is treated like a missing score.
    }

    const score = parsed.precision_score;
    return typeof score === "number" && Number.isFinite(score)
      ? Math.min(1, Math.max(0, score))
      : 0;
  }

  // Full RAGAS evaluation
  /**
   * Scores one sample on faithfulness, answer relevancy and context precision.
   * The three judge calls are started together and awaited as a group, so the
   * first rejection rejects the whole evaluation.
   */
  export async function evaluateRAGAS(sample: RAGSample): Promise<RAGASScores> {
    const pendingScores = [
      evaluateFaithfulness(sample),
      evaluateAnswerRelevancy(sample),
      evaluateContextPrecision(sample),
    ] as const;

    const settled = await Promise.all(pendingScores);

    const scores: RAGASScores = {
      faithfulness: settled[0],
      answerRelevancy: settled[1],
      contextPrecision: settled[2],
    };
    return scores;
  }

  // Batch evaluation with aggregation
  /**
   * Evaluates every sample concurrently and averages the three scores.
   *
   * @param samples Samples to score; may be empty.
   * @returns Each sample with its scores, plus the per-metric mean. For an
   *   empty batch every mean is 0 (previously 0/0 produced NaN).
   */
  export async function evaluateBatch(
    samples: RAGSample[]
  ): Promise<{
    individual: Array<RAGSample & { scores: RAGASScores }>;
    aggregate: RAGASScores;
  }> {
    const results = await Promise.all(
      samples.map(async (sample) => ({
        ...sample,
        scores: await evaluateRAGAS(sample),
      }))
    );

    // Mean of one metric across the batch, guarding the empty-batch division.
    const mean = (pick: (scores: RAGASScores) => number): number =>
      results.length === 0
        ? 0
        : results.reduce((sum, r) => sum + pick(r.scores), 0) / results.length;

    // Calculate averages
    const aggregate: RAGASScores = {
      faithfulness: mean((s) => s.faithfulness),
      answerRelevancy: mean((s) => s.answerRelevancy),
      contextPrecision: mean((s) => s.contextPrecision),
    };

    return { individual: results, aggregate };
  }
  • name: "Prompt Caching Optimization" description: "Reduce costs 50-90% with prompt caching" when_to_use: "Have repeated system prompts or context" implementation: | // lib/prompt-caching.ts import Anthropic from "@anthropic-ai/sdk";

    const anthropic = new Anthropic();

    /** A conversation turn whose content is either plain text or a list of content blocks. */
    interface CachedMessage {
      role: "user" | "assistant";
      content: string | Anthropic.ContentBlock[];
    }

    // Anthropic prompt caching with cache_control
    /**
     * Sends one message with the system prompt (and optional static context)
     * marked as cacheable, then reports how much of the prompt came from cache.
     *
     * @param options.systemPrompt System text, marked with an ephemeral cache breakpoint.
     * @param options.staticContext Optional second system block, also marked cacheable.
     * @param options.userMessage The dynamic user turn.
     * @param options.model Model id; defaults to "claude-sonnet-4-20250514".
     * @returns The raw response plus `cacheStats`: `hit`, `cachedTokens` (read
     *   from cache), `uncachedTokens` (everything else in the prompt) and
     *   `savingsPercent` (share of prompt tokens read from cache, 0–100).
     */
    export async function cachedAnthropicCall(options: {
      systemPrompt: string;
      staticContext?: string;
      userMessage: string;
      model?: string;
    }) {
      const { systemPrompt, staticContext, userMessage, model = "claude-sonnet-4-20250514" } = options;

      // Build content blocks with cache markers
      const systemContent: Anthropic.ContentBlockParam[] = [
        {
          type: "text",
          text: systemPrompt,
          cache_control: { type: "ephemeral" }, // Cache this
        },
      ];

      if (staticContext) {
        systemContent.push({
          type: "text",
          text: staticContext,
          cache_control: { type: "ephemeral" }, // Cache this too
        });
      }

      const response = await anthropic.messages.create({
        model,
        max_tokens: 1024,
        system: systemContent,
        messages: [{ role: "user", content: userMessage }],
      });

      // Check cache performance. The cache counters are widened locally instead
      // of casting to `any`, since older SDK typings may not declare them.
      const usage = response.usage as typeof response.usage & {
        cache_read_input_tokens?: number | null;
        cache_creation_input_tokens?: number | null;
      };
      const cachedTokens = usage.cache_read_input_tokens ?? 0;
      const cacheWriteTokens = usage.cache_creation_input_tokens ?? 0;

      // `input_tokens` already excludes cache reads and cache writes, so the
      // full prompt size is the sum of all three counters. Subtracting cache
      // reads from `input_tokens` could go negative, and dividing by it could
      // exceed 100% or divide by zero.
      const uncachedTokens = usage.input_tokens + cacheWriteTokens;
      const totalPromptTokens = cachedTokens + uncachedTokens;
      const cacheHit = cachedTokens > 0;

      return {
        response,
        cacheStats: {
          hit: cacheHit,
          cachedTokens,
          uncachedTokens,
          savingsPercent: cacheHit
            ? Math.round((cachedTokens / totalPromptTokens) * 100)
            : 0,
        },
      };
    }

    // OpenAI automatic caching (just structure prompts correctly) import OpenAI from "openai";

    const openai = new OpenAI();

    /**
     * Calls OpenAI with the static content first and the dynamic content last,
     * so repeated prefixes can be served from OpenAI's automatic prompt cache.
     *
     * @param options.staticSystemPrompt System text that stays the same across calls.
     * @param options.staticContext Optional static context appended to the system message.
     * @param options.dynamicUserMessage The per-request user turn.
     * @param options.model Model id; defaults to "gpt-4o".
     * @returns The raw response plus `cacheStats` with cached and uncached prompt tokens.
     */
    export async function optimizedOpenAICall(options: {
      staticSystemPrompt: string;
      staticContext?: string;
      dynamicUserMessage: string;
      model?: string;
    }) {
      const { staticSystemPrompt, staticContext, dynamicUserMessage, model = "gpt-4o" } = options;

      // OpenAI caches automatically for prompts > 1024 tokens
      // Structure: static content first, dynamic content last
      const messages: OpenAI.ChatCompletionMessageParam[] = [
        {
          role: "system",
          content: staticContext
            ? `${staticSystemPrompt}\n\nContext:\n${staticContext}`
            : staticSystemPrompt,
        },
        {
          role: "user",
          content: dynamicUserMessage,
        },
      ];

      const response = await openai.chat.completions.create({
        model,
        messages,
      });

      // OpenAI reports cached prompt tokens under usage.prompt_tokens_details,
      // not directly on usage; reading usage.cached_tokens always yielded 0.
      // Widened locally because older SDK typings may not declare the field.
      const usage = response.usage as
        | (NonNullable<typeof response.usage> & {
            prompt_tokens_details?: { cached_tokens?: number | null } | null;
          })
        | null
        | undefined;
      const cached = usage?.prompt_tokens_details?.cached_tokens ?? 0;

      return {
        response,
        cacheStats: {
          cachedTokens: cached,
          uncachedTokens: (usage?.prompt_tokens ?? 0) - cached,
        },
      };
    }

  • name: "Hallucination Detection" description: "Detect and prevent LLM hallucinations" when_to_use: "Need to verify factual accuracy, especially in RAG" implementation: | // lib/hallucination-detection.ts import Anthropic from "@anthropic-ai/sdk";

    const anthropic = new Anthropic();

    /** Verdict returned by `factCheckAgainstContext`. */
    interface FactCheckResult {
      /** True when the checker reported the response as a hallucination. */
      isHallucination: boolean;
      /** Checker-reported confidence in the verdict. */
      confidence: number;
      /** Claims the checker could not support from the context. */
      unsupportedClaims: string[];
      /** Corrections proposed by the checker, when it supplied any. */
      suggestedCorrections?: string[];
    }

    // Self-consistency hallucination check
    // Generate multiple responses and check for consistency
    /**
     * Re-asks the same prompt `numSamples` times at temperature 0.7, then has
     * the model compare the original response with the new samples.
     *
     * NOTE(review): when the comparison reply has no usable verdict, the result
     * defaults to consistent with score 1.0 (fails open); confirm that is wanted.
     *
     * @param prompt Prompt to re-sample.
     * @param originalResponse The response being checked.
     * @param numSamples How many extra samples to draw (default 3).
     * @returns The verdict, its score, and the extra samples in request order.
     */
    export async function selfConsistencyCheck(
      prompt: string,
      originalResponse: string,
      numSamples: number = 3
    ): Promise<{
      consistent: boolean;
      consistencyScore: number;
      samples: string[];
    }> {
      // Generate additional samples. The draws are independent, so they run
      // concurrently; Promise.all keeps them in request order.
      const pendingSamples: Array<Promise<string>> = [];
      for (let i = 0; i < numSamples; i++) {
        pendingSamples.push(
          anthropic.messages
            .create({
              model: "claude-sonnet-4-20250514",
              max_tokens: 1024,
              messages: [{ role: "user", content: prompt }],
              temperature: 0.7, // Add variation
            })
            .then((response) => {
              const firstBlock = response.content[0];
              return firstBlock?.type === "text" ? firstBlock.text : "";
            })
        );
      }
      const samples = await Promise.all(pendingSamples);

      // Check consistency across samples.
      // Inner template literal restored: its backticks were lost, leaving invalid syntax.
      const consistencyPrompt = `Compare these responses for factual consistency:
    

Original response: ${originalResponse}

Additional samples: ${samples.map((s, i) => `Sample ${i + 1}: ${s}`).join("\n\n")}

Identify any factual claims that differ significantly between responses. Return JSON: { "consistent": true/false, "consistency_score": 0.0-1.0, "inconsistent_claims": ["claim1", "claim2"] }`;

      const checkResponse = await anthropic.messages.create({
        model: "claude-sonnet-4-20250514",
        max_tokens: 512,
        messages: [{ role: "user", content: consistencyPrompt }],
      });

      const verdictBlock = checkResponse.content[0];
      const text = verdictBlock?.type === "text" ? verdictBlock.text : "";

      let result: { consistent?: boolean; consistency_score?: number } = {};
      try {
        result = JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] ?? "{}");
      } catch {
        // Malformed JSON is handled like a reply with no JSON at all.
      }

      return {
        consistent: result.consistent ?? true,
        consistencyScore: result.consistency_score ?? 1.0,
        samples,
      };
    }

  // Ground truth fact checking against context
  /**
   * Asks the model to verify each factual claim in `response` against `context`.
   *
   * NOTE(review): when the checker reply has no usable verdict, the result
   * defaults to "not a hallucination" with confidence 1.0 (fails open);
   * confirm that is wanted.
   *
   * @param response Text whose claims are verified.
   * @param context Reference text the claims must be inferable from.
   * @returns The checker's verdict mapped onto `FactCheckResult`.
   */
  export async function factCheckAgainstContext(
    response: string,
    context: string
  ): Promise<FactCheckResult> {
    const prompt = `You are a fact-checker. Verify if the response is supported by the context.

Context: ${context}

Response to verify: ${response}

Instructions:

  1. Extract each factual claim from the response.
  2. Check if each claim is supported by the context.
  3. A claim is a hallucination if it cannot be inferred from the context.

Return JSON: { "claims": [ {"claim": "...", "supported": true/false, "evidence": "quote from context or 'not found'"} ], "is_hallucination": true/false, "confidence": 0.0-1.0, "unsupported_claims": ["claim1", "claim2"], "suggested_corrections": ["correction1"] }`;

    const checkResponse = await anthropic.messages.create({
      model: "claude-sonnet-4-20250514",
      max_tokens: 1024,
      messages: [{ role: "user", content: prompt }],
    });

    // An empty content array must not crash the check.
    const verdictBlock = checkResponse.content[0];
    const text = verdictBlock?.type === "text" ? verdictBlock.text : "";

    let result: {
      is_hallucination?: boolean;
      confidence?: number;
      unsupported_claims?: string[];
      suggested_corrections?: string[];
    } = {};
    try {
      result = JSON.parse(text.match(/\{[\s\S]*\}/)?.[0] ?? "{}");
    } catch {
      // Malformed JSON is handled like a reply with no JSON at all.
    }

    return {
      isHallucination: result.is_hallucination ?? false,
      confidence: result.confidence ?? 1.0,
      unsupportedClaims: result.unsupported_claims || [],
      suggestedCorrections: result.suggested_corrections,
    };
  }

anti_patterns:

  • name: "No Tracing in Production" description: "Running LLM apps without observability" why_bad: | Debugging production issues is impossible. Can't identify slow calls, high-cost requests, or quality issues. Flying completely blind. instead: "Add Langfuse/Helicone from day one. Setup takes <10 minutes."

  • name: "Ignoring Token Costs" description: "Not tracking costs per user/feature" why_bad: | A single agent loop can spend $50+ in minutes. One power user can consume the entire monthly budget. Costs spiral without visibility. instead: "Track costs per request, set per-user budgets, alert on anomalies."

  • name: "No RAG Quality Metrics" description: "Deploying RAG without evaluation" why_bad: | Retrieval quality degrades silently. Users get irrelevant results. Can't measure impact of changes to chunking, embeddings, or prompts. instead: "Run RAGAS evals on a sample of production queries weekly."

  • name: "Missing Prompt Caching" description: "Repeated system prompts without caching" why_bad: | Same 2000-token system prompt sent with every request. At 1000 requests/day, that's 2M tokens wasted = $10/day on just repeated context. instead: "Use Anthropic cache_control or structure prompts for OpenAI auto-cache."

  • name: "Batch Size One" description: "Processing items one at a time" why_bad: | API overhead per request. Rate limits hit faster. Costs higher. Processing 100 items takes 100x the time and overhead. instead: "Batch API calls where possible. Use async patterns for parallelism."

handoffs:

  • to: "semantic-search" when: "Need to evaluate RAG retrieval quality" context: "Use observability to measure search relevance and precision"

  • to: "ai-safety-alignment" when: "Need content moderation and guardrails" context: "Safety metrics are part of overall observability"

  • to: "backend" when: "Need to build cost tracking database" context: "Backend for storing usage data and dashboards"

  • to: "ai-code-generation" when: "Monitoring code generation quality" context: "Track code validity, test pass rates as metrics"

references: