claude-skill-registry · langfuse-cost-tuning

Install
source · Clone the upstream repo
git clone https://github.com/majiayu000/claude-skill-registry
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/data/langfuse-cost-tuning" ~/.claude/skills/majiayu000-claude-skill-registry-langfuse-cost-tuning && rm -rf "$T"
manifest: skills/data/langfuse-cost-tuning/SKILL.md
source content

Langfuse Cost Tuning

Overview

Track, analyze, and optimize LLM costs using Langfuse observability data.

Prerequisites

  • Langfuse tracing instrumented to capture token usage
  • Understanding of LLM pricing models
  • Access to Langfuse analytics dashboard
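
The Langfuse client can be configured explicitly or via environment variables; a minimal setup sketch (the constructor options and variable names below follow the Langfuse JS SDK defaults — verify against your SDK version):

import { Langfuse } from "langfuse";

const langfuse = new Langfuse({
  publicKey: process.env.LANGFUSE_PUBLIC_KEY, // pk-lf-...
  secretKey: process.env.LANGFUSE_SECRET_KEY, // sk-lf-...
  baseUrl: process.env.LANGFUSE_BASEURL ?? "https://cloud.langfuse.com",
});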

LLM Cost Reference

| Model | Input (per 1M tokens) | Output (per 1M tokens) |
| --- | --- | --- |
| GPT-4 Turbo | $10.00 | $30.00 |
| GPT-4o | $5.00 | $15.00 |
| GPT-4o-mini | $0.15 | $0.60 |
| GPT-3.5 Turbo | $0.50 | $1.50 |
| Claude 3 Opus | $15.00 | $75.00 |
| Claude 3 Sonnet | $3.00 | $15.00 |
| Claude 3 Haiku | $0.25 | $1.25 |
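
As a worked example: a single GPT-4o call with 10,000 prompt tokens and 2,000 completion tokens costs (10,000 / 1,000,000) × $5.00 + (2,000 / 1,000,000) × $15.00 = $0.08. Published rates change frequently, so verify the table above against the providers' current pricing pages.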

Instructions

Step 1: Track Token Usage in Generations

import { Langfuse } from "langfuse";
import OpenAI from "openai";

const langfuse = new Langfuse();
const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment

// Model pricing configuration
const MODEL_PRICING: Record<
  string,
  { input: number; output: number }
> = {
  "gpt-4-turbo": { input: 10.0, output: 30.0 },
  "gpt-4o": { input: 5.0, output: 15.0 },
  "gpt-4o-mini": { input: 0.15, output: 0.6 },
  "gpt-3.5-turbo": { input: 0.5, output: 1.5 },
  "claude-3-opus": { input: 15.0, output: 75.0 },
  "claude-3-sonnet": { input: 3.0, output: 15.0 },
  "claude-3-haiku": { input: 0.25, output: 1.25 },
};

function calculateCost(
  model: string,
  promptTokens: number,
  completionTokens: number
): number {
  const pricing = MODEL_PRICING[model] || { input: 0, output: 0 };
  const inputCost = (promptTokens / 1_000_000) * pricing.input;
  const outputCost = (completionTokens / 1_000_000) * pricing.output;
  return inputCost + outputCost;
}

// Track with cost metadata
async function tracedLLMCall(
  trace: ReturnType<typeof langfuse.trace>,
  model: string,
  messages: any[]
) {
  const generation = trace.generation({
    name: "llm-call",
    model,
    input: messages,
  });

  const response = await openai.chat.completions.create({
    model,
    messages,
  });

  const usage = response.usage!;
  const cost = calculateCost(model, usage.prompt_tokens, usage.completion_tokens);

  generation.end({
    output: response.choices[0].message,
    usage: {
      promptTokens: usage.prompt_tokens,
      completionTokens: usage.completion_tokens,
      totalTokens: usage.total_tokens,
    },
    metadata: {
      cost_usd: cost,
      model_version: model,
    },
  });

  return response;
}
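
A minimal usage sketch for the helper above (the trace name and messages are illustrative; flushAsync ensures buffered events reach Langfuse before the process exits):

const trace = langfuse.trace({ name: "support-chat" });
const response = await tracedLLMCall(trace, "gpt-4o-mini", [
  { role: "user", content: "Summarize our refund policy." },
]);
await langfuse.flushAsync();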

Step 2: Create Cost Dashboard Queries

// Fetch cost data from Langfuse
async function getCostAnalytics(days: number = 30) {
  const langfuse = new Langfuse();

  const fromDate = new Date();
  fromDate.setDate(fromDate.getDate() - days);

  const generations = await langfuse.fetchGenerations({
    fromTimestamp: fromDate,
  });

  // Aggregate costs
  const costByModel: Record<string, number> = {};
  const costByDay: Record<string, number> = {};
  const tokensByModel: Record<string, { prompt: number; completion: number }> = {};

  for (const gen of generations.data) {
    const model = gen.model || "unknown";
    const date = new Date(gen.startTime).toISOString().split("T")[0];

    // Get cost from metadata or calculate
    const cost = gen.metadata?.cost_usd || calculateCost(
      model,
      gen.usage?.promptTokens || 0,
      gen.usage?.completionTokens || 0
    );

    // Aggregate by model
    costByModel[model] = (costByModel[model] || 0) + cost;

    // Aggregate by day
    costByDay[date] = (costByDay[date] || 0) + cost;

    // Token usage by model
    if (!tokensByModel[model]) {
      tokensByModel[model] = { prompt: 0, completion: 0 };
    }
    tokensByModel[model].prompt += gen.usage?.promptTokens || 0;
    tokensByModel[model].completion += gen.usage?.completionTokens || 0;
  }

  return {
    totalCost: Object.values(costByModel).reduce((a, b) => a + b, 0),
    costByModel,
    costByDay,
    tokensByModel,
    generationCount: generations.data.length,
  };
}
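
getCostAnalytics above reads only a single page of results. The Langfuse fetch APIs are paginated, so busy projects should walk every page; a sketch, assuming fetchGenerations accepts page/limit parameters and returns a meta.totalPages field like the SDK's other fetch methods (verify against your SDK version):

async function fetchAllGenerations(fromTimestamp: Date) {
  const langfuse = new Langfuse();
  const all: any[] = [];
  let page = 1;

  // Request pages until the reported total is reached
  while (true) {
    const res = await langfuse.fetchGenerations({ fromTimestamp, page, limit: 100 });
    all.push(...res.data);
    if (page >= (res.meta?.totalPages ?? 1)) break;
    page++;
  }

  return all;
}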

Step 3: Implement Cost Alerts

interface CostAlert {
  type: "daily" | "hourly" | "per-request";
  threshold: number;
  action: "warn" | "block" | "notify";
}

const DAILY_BUDGET_USD = 100;

const COST_ALERTS: CostAlert[] = [
  { type: "daily", threshold: 100, action: "warn" },
  { type: "daily", threshold: 500, action: "notify" },
  { type: "per-request", threshold: 1, action: "warn" },
];

class CostMonitor {
  private hourlySpend: Map<string, number> = new Map();
  private dailySpend: Map<string, number> = new Map();

  trackCost(cost: number) {
    const hourKey = new Date().toISOString().slice(0, 13);
    const dayKey = new Date().toISOString().slice(0, 10);

    this.hourlySpend.set(
      hourKey,
      (this.hourlySpend.get(hourKey) || 0) + cost
    );
    this.dailySpend.set(
      dayKey,
      (this.dailySpend.get(dayKey) || 0) + cost
    );

    this.checkAlerts(cost);
  }

  private checkAlerts(requestCost: number) {
    const now = new Date().toISOString();
    const hourKey = now.slice(0, 13);
    const dayKey = now.slice(0, 10);
    const dailyTotal = this.dailySpend.get(dayKey) || 0;
    const hourlyTotal = this.hourlySpend.get(hourKey) || 0;

    for (const alert of COST_ALERTS) {
      let currentValue: number;

      switch (alert.type) {
        case "daily":
          currentValue = dailyTotal;
          break;
        case "hourly":
          currentValue = hourlyTotal;
          break;
        case "per-request":
          currentValue = requestCost;
          break;
      }

      if (currentValue >= alert.threshold) {
        this.triggerAlert(alert, currentValue);
      }
    }
  }

  private triggerAlert(alert: CostAlert, value: number) {
    const message = `Cost alert: ${alert.type} spend $${value.toFixed(2)} exceeded threshold $${alert.threshold}`;

    switch (alert.action) {
      case "warn":
        console.warn(message);
        break;
      case "notify":
        this.sendNotification(message);
        break;
      case "block":
        throw new Error(`Request blocked: ${message}`);
    }
  }

  private async sendNotification(message: string) {
    // Send to Slack, PagerDuty, etc.
    await fetch(process.env.SLACK_WEBHOOK_URL!, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ text: message }),
    });
  }

  getStats() {
    const dayKey = new Date().toISOString().slice(0, 10);
    const dailySpend = this.dailySpend.get(dayKey) || 0;
    return {
      dailySpend,
      dailyBudgetRemaining: DAILY_BUDGET_USD - dailySpend,
    };
  }
}

const costMonitor = new CostMonitor();
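
To wire the monitor into Step 1, record each generation's cost right after it is calculated; a minimal sketch:

// Inside tracedLLMCall, after calculateCost(...):
costMonitor.trackCost(cost); // throws if a "block" alert fires

// Elsewhere, e.g. in a health endpoint:
console.log(costMonitor.getStats()); // { dailySpend, dailyBudgetRemaining }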

Step 4: Implement Cost Optimization Strategies

// Model selection based on task complexity
interface ModelSelector {
  selectModel(task: string, inputLength: number): string;
}

class CostOptimizedModelSelector implements ModelSelector {
  selectModel(task: string, inputLength: number): string {
    // Simple tasks -> cheaper model
    const simpleTasks = ["summarize", "classify", "extract"];
    if (simpleTasks.some((t) => task.toLowerCase().includes(t))) {
      return "gpt-4o-mini";
    }

    // Short inputs -> cheaper model
    if (inputLength < 500) {
      return "gpt-4o-mini";
    }

    // Complex tasks -> more capable model
    const complexTasks = ["analyze", "reason", "code", "math"];
    if (complexTasks.some((t) => task.toLowerCase().includes(t))) {
      return "gpt-4o";
    }

    // Default to the economical model
    return "gpt-4o-mini";
  }
}

// Prompt optimization to reduce tokens
function optimizePrompt(prompt: string): string {
  // Remove excessive whitespace
  let optimized = prompt.replace(/\s+/g, " ").trim();

  // Remove redundant instructions
  optimized = optimized.replace(/please |kindly |could you /gi, "");

  return optimized;
}

// Caching for repeated queries
// callLLM is a placeholder for your actual provider call (e.g. tracedLLMCall above)
declare function callLLM(prompt: string, model: string): Promise<string>;

const responseCache = new Map<string, { response: string; timestamp: Date }>();

async function cachedLLMCall(
  prompt: string,
  model: string,
  ttlMs: number = 3600000
): Promise<string> {
  const cacheKey = `${model}:${prompt}`;
  const cached = responseCache.get(cacheKey);

  if (cached && Date.now() - cached.timestamp.getTime() < ttlMs) {
    console.log("Cache hit - saved API call");
    return cached.response;
  }

  const response = await callLLM(prompt, model);
  responseCache.set(cacheKey, { response, timestamp: new Date() });

  return response;
}
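
The cache key above embeds the full prompt, which bloats memory for long prompts; hashing keeps keys fixed-length. A sketch using Node's built-in crypto module:

import { createHash } from "node:crypto";

// Derive a fixed-length cache key from the model and prompt
function cacheKeyFor(model: string, prompt: string): string {
  return createHash("sha256").update(`${model}:${prompt}`).digest("hex");
}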

Step 5: Generate Cost Reports

async function generateCostReport(period: "daily" | "weekly" | "monthly") {
  const days = period === "daily" ? 1 : period === "weekly" ? 7 : 30;
  const analytics = await getCostAnalytics(days);

  const report = `
# LLM Cost Report - ${period.charAt(0).toUpperCase() + period.slice(1)}
Generated: ${new Date().toISOString()}

## Summary
- Total Cost: $${analytics.totalCost.toFixed(2)}
- Total Generations: ${analytics.generationCount}
- Average Cost per Generation: $${analytics.generationCount > 0 ? (analytics.totalCost / analytics.generationCount).toFixed(4) : "0.0000"}

## Cost by Model
${Object.entries(analytics.costByModel)
  .sort(([, a], [, b]) => b - a)
  .map(([model, cost]) => `- ${model}: $${cost.toFixed(2)}`)
  .join("\n")}

## Token Usage by Model
${Object.entries(analytics.tokensByModel)
  .map(
    ([model, tokens]) =>
      `- ${model}: ${tokens.prompt.toLocaleString()} prompt, ${tokens.completion.toLocaleString()} completion`
  )
  .join("\n")}

## Recommendations
${generateRecommendations(analytics)}
`;

  return report;
}

function generateRecommendations(analytics: any): string {
  const recommendations: string[] = [];

  // Check for expensive model overuse
  const gpt4Cost = analytics.costByModel["gpt-4-turbo"] || 0;
  const totalCost = analytics.totalCost;

  if (gpt4Cost > totalCost * 0.5) {
    recommendations.push(
      "- Consider using GPT-4o or GPT-4o-mini for simpler tasks to reduce costs"
    );
  }

  // Check for high output token ratio
  for (const [model, tokens] of Object.entries(analytics.tokensByModel)) {
    const { prompt, completion } = tokens as any;
    if (completion > prompt * 2) {
      recommendations.push(
        `- ${model}: High output ratio. Consider limiting max_tokens or response length`
      );
    }
  }

  return recommendations.length > 0
    ? recommendations.join("\n")
    : "- No immediate optimization opportunities identified";
}
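
A usage sketch that writes the weekly report to disk (the output path is illustrative):

import { writeFile } from "node:fs/promises";

const report = await generateCostReport("weekly");
await writeFile("reports/llm-cost-weekly.md", report);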

Output

  • Token usage tracking in generations
  • Cost analytics dashboard
  • Real-time cost alerts
  • Model selection optimization
  • Caching and prompt optimization
  • Automated cost reports

Cost Optimization Strategies

| Strategy | Potential Savings | Implementation Effort |
| --- | --- | --- |
| Model downgrade | 50-90% | Low |
| Prompt optimization | 10-30% | Low |
| Response caching | 20-60% | Medium |
| Batch processing | 10-20% | Medium |
| Sampling | Variable | Medium |
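
One reading of the Sampling row: route only a fraction of traffic to the premium model and the rest to a budget model; a minimal sketch (the 10% ratio and model names are illustrative):

// Send ~10% of traffic to the premium model, the rest to the budget model
function sampledModel(ratio = 0.1): string {
  return Math.random() < ratio ? "gpt-4o" : "gpt-4o-mini";
}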

Error Handling

| Issue | Cause | Solution |
| --- | --- | --- |
| Missing usage data | SDK not capturing usage | Verify generation.end() includes usage |
| Inaccurate costs | Stale pricing | Update MODEL_PRICING regularly |
| Budget exceeded | No alerts configured | Implement cost alerts (Step 3) |
| Report failures | API rate limits | Add pagination to fetchGenerations |

Next Steps

For the reference architecture, see langfuse-reference-architecture.