claude-skill-registry · langfuse-cost-tuning
install
source · Clone the upstream repo

```bash
git clone https://github.com/majiayu000/claude-skill-registry
```

Claude Code · Install into ~/.claude/skills/

```bash
T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/data/langfuse-cost-tuning" ~/.claude/skills/majiayu000-claude-skill-registry-langfuse-cost-tuning && rm -rf "$T"
```
manifest: skills/data/langfuse-cost-tuning/SKILL.md
Langfuse Cost Tuning
Overview
Track, analyze, and optimize LLM costs using Langfuse observability data.
Prerequisites
- Langfuse tracing with token usage
- Understanding of LLM pricing models
- Access to Langfuse analytics dashboard
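If tracing is not yet wired up, a minimal client setup sketch follows; the environment variable and option names match the Langfuse JS SDK at the time of writing, so verify them against your version:

```typescript
import { Langfuse } from "langfuse";

// With no options, the client reads LANGFUSE_SECRET_KEY,
// LANGFUSE_PUBLIC_KEY, and LANGFUSE_BASEURL from the environment.
const langfuse = new Langfuse();

// Or configure explicitly:
const explicit = new Langfuse({
  secretKey: process.env.LANGFUSE_SECRET_KEY,
  publicKey: process.env.LANGFUSE_PUBLIC_KEY,
  baseUrl: "https://cloud.langfuse.com",
});
```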
LLM Cost Reference
| Model | Input (per 1M) | Output (per 1M) |
|---|---|---|
| GPT-4 Turbo | $10.00 | $30.00 |
| GPT-4o | $5.00 | $15.00 |
| GPT-4o-mini | $0.15 | $0.60 |
| GPT-3.5 Turbo | $0.50 | $1.50 |
| Claude 3 Opus | $15.00 | $75.00 |
| Claude 3 Sonnet | $3.00 | $15.00 |
| Claude 3 Haiku | $0.25 | $1.25 |
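As a worked example from the table: a GPT-4o call with 10,000 input tokens and 2,000 output tokens costs (10,000 / 1,000,000) × $5.00 + (2,000 / 1,000,000) × $15.00 = $0.05 + $0.03 = $0.08. Prices change frequently; treat this table as a snapshot and keep it in sync with provider pricing pages.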
Instructions
Step 1: Track Token Usage in Generations
```typescript
import OpenAI from "openai";
import { Langfuse } from "langfuse";

const langfuse = new Langfuse();
const openai = new OpenAI();

// Model pricing configuration (USD per 1M tokens; update as providers change prices)
const MODEL_PRICING: Record<string, { input: number; output: number }> = {
  "gpt-4-turbo": { input: 10.0, output: 30.0 },
  "gpt-4o": { input: 5.0, output: 15.0 },
  "gpt-4o-mini": { input: 0.15, output: 0.6 },
  "gpt-3.5-turbo": { input: 0.5, output: 1.5 },
  "claude-3-opus": { input: 15.0, output: 75.0 },
  "claude-3-sonnet": { input: 3.0, output: 15.0 },
  "claude-3-haiku": { input: 0.25, output: 1.25 },
};

function calculateCost(
  model: string,
  promptTokens: number,
  completionTokens: number
): number {
  const pricing = MODEL_PRICING[model] || { input: 0, output: 0 };
  const inputCost = (promptTokens / 1_000_000) * pricing.input;
  const outputCost = (completionTokens / 1_000_000) * pricing.output;
  return inputCost + outputCost;
}

// Track with cost metadata
async function tracedLLMCall(
  trace: ReturnType<typeof langfuse.trace>,
  model: string,
  messages: any[]
) {
  const generation = trace.generation({
    name: "llm-call",
    model,
    input: messages,
  });

  const response = await openai.chat.completions.create({ model, messages });

  const usage = response.usage!;
  const cost = calculateCost(model, usage.prompt_tokens, usage.completion_tokens);

  generation.end({
    output: response.choices[0].message,
    usage: {
      promptTokens: usage.prompt_tokens,
      completionTokens: usage.completion_tokens,
      totalTokens: usage.total_tokens,
    },
    metadata: {
      cost_usd: cost,
      model_version: model,
    },
  });

  return response;
}
```
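A minimal usage sketch, assuming one trace per user request (the trace name and message content are illustrative):

```typescript
const trace = langfuse.trace({ name: "support-request" });

await tracedLLMCall(trace, "gpt-4o-mini", [
  { role: "user", content: "Summarize this ticket: ..." },
]);

// Flush buffered events before a short-lived process exits
await langfuse.flushAsync();
```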
Step 2: Create Cost Dashboard Queries
```typescript
// Fetch cost data from Langfuse
async function getCostAnalytics(days: number = 30) {
  const langfuse = new Langfuse();
  const fromDate = new Date();
  fromDate.setDate(fromDate.getDate() - days);

  const generations = await langfuse.fetchGenerations({
    fromTimestamp: fromDate,
  });

  // Aggregate costs
  const costByModel: Record<string, number> = {};
  const costByDay: Record<string, number> = {};
  const tokensByModel: Record<string, { prompt: number; completion: number }> = {};

  for (const gen of generations.data) {
    const model = gen.model || "unknown";
    const date = new Date(gen.startTime).toISOString().split("T")[0];

    // Prefer the cost recorded at trace time; fall back to recomputing it
    const cost =
      gen.metadata?.cost_usd ||
      calculateCost(
        model,
        gen.usage?.promptTokens || 0,
        gen.usage?.completionTokens || 0
      );

    // Aggregate by model and by day
    costByModel[model] = (costByModel[model] || 0) + cost;
    costByDay[date] = (costByDay[date] || 0) + cost;

    // Token usage by model
    if (!tokensByModel[model]) {
      tokensByModel[model] = { prompt: 0, completion: 0 };
    }
    tokensByModel[model].prompt += gen.usage?.promptTokens || 0;
    tokensByModel[model].completion += gen.usage?.completionTokens || 0;
  }

  return {
    totalCost: Object.values(costByModel).reduce((a, b) => a + b, 0),
    costByModel,
    costByDay,
    tokensByModel,
    generationCount: generations.data.length,
  };
}
```
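A quick usage sketch that prints the aggregates from the function above:

```typescript
const analytics = await getCostAnalytics(7);

console.log(
  `Last 7 days: $${analytics.totalCost.toFixed(2)} across ${analytics.generationCount} generations`
);
for (const [model, cost] of Object.entries(analytics.costByModel)) {
  console.log(`  ${model}: $${cost.toFixed(2)}`);
}
```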
Step 3: Implement Cost Alerts
```typescript
interface CostAlert {
  type: "daily" | "hourly" | "per-request";
  threshold: number;
  action: "warn" | "block" | "notify";
}

const COST_ALERTS: CostAlert[] = [
  { type: "daily", threshold: 100, action: "warn" },
  { type: "daily", threshold: 500, action: "notify" },
  { type: "per-request", threshold: 1, action: "warn" },
];

class CostMonitor {
  private hourlySpend: Map<string, number> = new Map();
  private dailySpend: Map<string, number> = new Map();

  trackCost(cost: number) {
    const hourKey = new Date().toISOString().slice(0, 13);
    const dayKey = new Date().toISOString().slice(0, 10);

    this.hourlySpend.set(hourKey, (this.hourlySpend.get(hourKey) || 0) + cost);
    this.dailySpend.set(dayKey, (this.dailySpend.get(dayKey) || 0) + cost);

    this.checkAlerts(cost);
  }

  private checkAlerts(requestCost: number) {
    const dayKey = new Date().toISOString().slice(0, 10);
    const dailyTotal = this.dailySpend.get(dayKey) || 0;

    for (const alert of COST_ALERTS) {
      let currentValue: number;
      switch (alert.type) {
        case "daily":
          currentValue = dailyTotal;
          break;
        case "per-request":
          currentValue = requestCost;
          break;
        default:
          continue;
      }
      if (currentValue >= alert.threshold) {
        this.triggerAlert(alert, currentValue);
      }
    }
  }

  private triggerAlert(alert: CostAlert, value: number) {
    const message = `Cost alert: ${alert.type} spend $${value.toFixed(2)} exceeded threshold $${alert.threshold}`;
    switch (alert.action) {
      case "warn":
        console.warn(message);
        break;
      case "notify":
        this.sendNotification(message);
        break;
      case "block":
        throw new Error(`Request blocked: ${message}`);
    }
  }

  private async sendNotification(message: string) {
    // Send to Slack, PagerDuty, etc.
    await fetch(process.env.SLACK_WEBHOOK_URL!, {
      method: "POST",
      body: JSON.stringify({ text: message }),
    });
  }

  getStats() {
    const dayKey = new Date().toISOString().slice(0, 10);
    return {
      dailySpend: this.dailySpend.get(dayKey) || 0,
      dailyBudgetRemaining: 100 - (this.dailySpend.get(dayKey) || 0),
    };
  }
}

const costMonitor = new CostMonitor();
```
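To put the monitor to work, the natural hook is the Step 1 helper: record each request's cost right after the generation ends. A sketch (note that trackCost throws when a "block" alert fires, which aborts the request):

```typescript
// Inside tracedLLMCall, after generation.end({ ... }):
costMonitor.trackCost(cost);

// Elsewhere (e.g. a health endpoint), expose current spend:
const { dailySpend, dailyBudgetRemaining } = costMonitor.getStats();
```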
Step 4: Implement Cost Optimization Strategies
```typescript
// Model selection based on task complexity
interface ModelSelector {
  selectModel(task: string, inputLength: number): string;
}

class CostOptimizedModelSelector implements ModelSelector {
  selectModel(task: string, inputLength: number): string {
    // Simple tasks -> cheaper model
    const simpleTasks = ["summarize", "classify", "extract"];
    if (simpleTasks.some((t) => task.toLowerCase().includes(t))) {
      return "gpt-4o-mini";
    }

    // Short inputs -> cheaper model
    if (inputLength < 500) {
      return "gpt-4o-mini";
    }

    // Complex tasks -> more capable model
    const complexTasks = ["analyze", "reason", "code", "math"];
    if (complexTasks.some((t) => task.toLowerCase().includes(t))) {
      return "gpt-4o";
    }

    // Default to mid-tier
    return "gpt-4o-mini";
  }
}

// Prompt optimization to reduce tokens
function optimizePrompt(prompt: string): string {
  // Remove excessive whitespace
  let optimized = prompt.replace(/\s+/g, " ").trim();
  // Remove redundant instructions
  optimized = optimized.replace(/please |kindly |could you /gi, "");
  return optimized;
}

// Caching for repeated queries
const responseCache = new Map<string, { response: string; timestamp: Date }>();

async function cachedLLMCall(
  prompt: string,
  model: string,
  ttlMs: number = 3600000
): Promise<string> {
  const cacheKey = `${model}:${prompt}`;
  const cached = responseCache.get(cacheKey);

  if (cached && Date.now() - cached.timestamp.getTime() < ttlMs) {
    console.log("Cache hit - saved API call");
    return cached.response;
  }

  // callLLM is a placeholder for your provider call (e.g. tracedLLMCall above)
  const response = await callLLM(prompt, model);
  responseCache.set(cacheKey, { response, timestamp: new Date() });
  return response;
}
```
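Putting the pieces together: a usage sketch that selects a model, trims the prompt, and goes through the cache. The task label and the use of character count as a length proxy are illustrative assumptions:

```typescript
const selector = new CostOptimizedModelSelector();

async function answer(task: string, prompt: string): Promise<string> {
  const model = selector.selectModel(task, prompt.length); // char count as a cheap length proxy
  return cachedLLMCall(optimizePrompt(prompt), model);
}

const summary = await answer("summarize", "Please summarize the following report: ...");
```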
Step 5: Generate Cost Reports
```typescript
async function generateCostReport(period: "daily" | "weekly" | "monthly") {
  const days = period === "daily" ? 1 : period === "weekly" ? 7 : 30;
  const analytics = await getCostAnalytics(days);

  // Guard against division by zero when there are no generations
  const avgCost =
    analytics.generationCount > 0
      ? analytics.totalCost / analytics.generationCount
      : 0;

  const report = `
# LLM Cost Report - ${period.charAt(0).toUpperCase() + period.slice(1)}
Generated: ${new Date().toISOString()}

## Summary
- Total Cost: $${analytics.totalCost.toFixed(2)}
- Total Generations: ${analytics.generationCount}
- Average Cost per Generation: $${avgCost.toFixed(4)}

## Cost by Model
${Object.entries(analytics.costByModel)
  .sort(([, a], [, b]) => b - a)
  .map(([model, cost]) => `- ${model}: $${cost.toFixed(2)}`)
  .join("\n")}

## Token Usage by Model
${Object.entries(analytics.tokensByModel)
  .map(
    ([model, tokens]) =>
      `- ${model}: ${tokens.prompt.toLocaleString()} prompt, ${tokens.completion.toLocaleString()} completion`
  )
  .join("\n")}

## Recommendations
${generateRecommendations(analytics)}
`;

  return report;
}

function generateRecommendations(analytics: any): string {
  const recommendations: string[] = [];

  // Check for expensive model overuse
  const gpt4Cost = analytics.costByModel["gpt-4-turbo"] || 0;
  const totalCost = analytics.totalCost;
  if (gpt4Cost > totalCost * 0.5) {
    recommendations.push(
      "- Consider using GPT-4o or GPT-4o-mini for simpler tasks to reduce costs"
    );
  }

  // Check for high output token ratio
  for (const [model, tokens] of Object.entries(analytics.tokensByModel)) {
    const { prompt, completion } = tokens as any;
    if (completion > prompt * 2) {
      recommendations.push(
        `- ${model}: High output ratio. Consider limiting max_tokens or response length`
      );
    }
  }

  return recommendations.length > 0
    ? recommendations.join("\n")
    : "- No immediate optimization opportunities identified";
}
```
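A usage sketch: run the report on a schedule and write it to disk, or post it to the same webhook used for alerts:

```typescript
import { writeFileSync } from "node:fs";

const report = await generateCostReport("weekly");
writeFileSync(`cost-report-${new Date().toISOString().slice(0, 10)}.md`, report);
```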
Output
- Token usage tracking in generations
- Cost analytics dashboard
- Real-time cost alerts
- Model selection optimization
- Caching and prompt optimization
- Automated cost reports
Cost Optimization Strategies
| Strategy | Potential Savings | Implementation Effort |
|---|---|---|
| Model downgrade | 50-90% | Low |
| Prompt optimization | 10-30% | Low |
| Response caching | 20-60% | Medium |
| Batch processing | 10-20% | Medium |
| Sampling | Variable | Medium |
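For the "Sampling" row, one common reading is routing only a small random sample of traffic to a premium model (for quality spot-checks) while the bulk uses a cheaper default. A minimal sketch, with the sample rate and model names as illustrative assumptions:

```typescript
// Route ~5% of traffic to the premium model for quality spot-checks;
// everything else uses the cheaper default. Rate and models are illustrative.
const PREMIUM_SAMPLE_RATE = 0.05;

function pickModel(defaultModel = "gpt-4o-mini", premiumModel = "gpt-4o"): string {
  return Math.random() < PREMIUM_SAMPLE_RATE ? premiumModel : defaultModel;
}
```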
Error Handling
| Issue | Cause | Solution |
|---|---|---|
| Missing usage data | SDK not capturing | Verify generation.end() includes usage |
| Inaccurate costs | Wrong pricing | Update MODEL_PRICING regularly |
| Budget exceeded | No alerts | Implement cost alerts |
| Report failures | API limits | Add pagination to fetchGenerations |
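For the "Report failures" row, wrapping fetchGenerations in a pagination loop keeps reports under API limits. A minimal sketch; the page and limit parameter names are assumptions to verify against your Langfuse SDK version:

```typescript
async function fetchAllGenerations(fromTimestamp: Date) {
  const langfuse = new Langfuse();
  const all: any[] = [];
  let page = 1;

  // Parameter names (page, limit) are assumed; confirm against your SDK.
  while (true) {
    const batch = await langfuse.fetchGenerations({ fromTimestamp, page, limit: 100 });
    all.push(...batch.data);
    if (batch.data.length < 100) break; // last page reached
    page++;
  }
  return all;
}
```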
Next Steps
For reference architecture, see langfuse-reference-architecture.