# Source: https://github.com/vibeforge1111/vibeship-spawner-skills
# ai-agents/prompt-caching/skill.yaml
id: prompt-caching
name: Prompt Caching
version: 1.0.0
layer: 2
description: Caching strategies for LLM prompts including Anthropic prompt caching, response caching, and CAG (Cache Augmented Generation)
owns:
- prompt-cache
- response-cache
- kv-cache
- cag-patterns
- cache-invalidation
pairs_with:
- context-window-management
- rag-implementation
- conversation-memory
ecosystem:
  primary_tools:
    - name: Anthropic Prompt Caching
      description: Native prompt caching in Claude API
      url: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
    - name: Redis
      description: In-memory cache for responses
      url: https://redis.io
    - name: OpenAI Caching
      description: Automatic caching in OpenAI API
      url: https://platform.openai.com/docs/guides/caching
prerequisites:
  knowledge:
    - Caching fundamentals
    - LLM API usage
    - Hash functions
  skills_recommended:
    - context-window-management
limits:
  does_not_cover:
    - CDN caching
    - Database query caching
    - Static asset caching
  boundaries:
    - Focus is LLM-specific caching
    - Covers prompt and response caching
tags:
- caching
- llm
- performance
- optimization
- cost
triggers:
- prompt caching
- cache prompt
- response cache
- cag
- cache augmented
identity: |
  You're a caching specialist who has reduced LLM costs by 90% through
  strategic caching. You've implemented systems that cache at multiple
  levels: prompt prefixes, full responses, and semantic similarity matches.

  You understand that LLM caching is different from traditional caching—
  prompts have prefixes that can be cached, responses vary with temperature,
  and semantic similarity often matters more than exact match.

  Your core principles:
  - Cache at the right level—prefix, response, or both
  - Know your cache hit rates—measure or you can't improve
  - Invalidation is hard—design for it upfront
  - CAG vs RAG tradeoff—understand when each wins
  - Cost awareness—caching should save money
patterns:
  - name: Anthropic Prompt Caching
    description: Use Claude's native prompt caching for repeated prefixes
    when: Using Claude API with stable system prompts or context
    example: |
      import Anthropic from '@anthropic-ai/sdk';

      const client = new Anthropic();

      // Cache the stable parts of your prompt
      async function queryWithCaching(userQuery: string) {
        const response = await client.messages.create({
          model: "claude-sonnet-4-20250514",
          max_tokens: 1024,
          system: [
            {
              type: "text",
              text: LONG_SYSTEM_PROMPT, // Your detailed instructions
              cache_control: { type: "ephemeral" } // Cache this!
            },
            {
              type: "text",
              text: KNOWLEDGE_BASE, // Large static context
              cache_control: { type: "ephemeral" }
            }
          ],
          messages: [
            { role: "user", content: userQuery } // Dynamic part
          ]
        });

        // Check cache usage
        console.log(`Cache read: ${response.usage.cache_read_input_tokens}`);
        console.log(`Cache write: ${response.usage.cache_creation_input_tokens}`);

        return response;
      }

      // Cost savings: 90% reduction on cached tokens
      // Latency savings: Up to 2x faster
  - name: Response Caching
    description: Cache full LLM responses for identical or similar queries
    when: Same queries asked repeatedly
    example: |
      import { createHash } from 'crypto';
      import Redis from 'ioredis';

      const redis = new Redis(process.env.REDIS_URL);

      class ResponseCache {
        private ttl = 3600; // 1 hour default

        // Exact match caching
        async getCached(prompt: string): Promise<string | null> {
          const key = this.hashPrompt(prompt);
          return await redis.get(`response:${key}`);
        }

        async setCached(prompt: string, response: string): Promise<void> {
          const key = this.hashPrompt(prompt);
          await redis.set(`response:${key}`, response, 'EX', this.ttl);
        }

        private hashPrompt(prompt: string): string {
          return createHash('sha256').update(prompt).digest('hex');
        }

        // Semantic similarity caching
        async getSemanticallySimilar(
          prompt: string,
          threshold: number = 0.95
        ): Promise<string | null> {
          const embedding = await embed(prompt);
          const similar = await this.vectorCache.search(embedding, 1);

          if (similar.length && similar[0].similarity > threshold) {
            return await redis.get(`response:${similar[0].id}`);
          }
          return null;
        }

        // Temperature-aware caching
        async getCachedWithParams(
          prompt: string,
          params: { temperature: number; model: string }
        ): Promise<string | null> {
          // Only cache low-temperature responses
          if (params.temperature > 0.5) return null;

          const key = this.hashPrompt(
            `${prompt}|${params.model}|${params.temperature}`
          );
          return await redis.get(`response:${key}`);
        }
      }
  - name: Cache Augmented Generation (CAG)
    description: Pre-cache documents in prompt instead of RAG retrieval
    when: Document corpus is stable and fits in context
    example: |
      // CAG: Pre-compute document context, cache in prompt
      // Better than RAG when:
      // - Documents are stable
      // - Total fits in context window
      // - Latency is critical

      class CAGSystem {
        private cachedContext: string | null = null;
        private lastUpdate: number = 0;

        async buildCachedContext(documents: Document[]): Promise<void> {
          // Pre-process and format documents
          const formatted = documents.map(d =>
            `## ${d.title}\n${d.content}`
          ).join('\n\n');

          // Store with timestamp
          this.cachedContext = formatted;
          this.lastUpdate = Date.now();
        }

        async query(userQuery: string): Promise<string> {
          // Use cached context directly in prompt
          const response = await client.messages.create({
            model: "claude-sonnet-4-20250514",
            max_tokens: 1024,
            system: [
              {
                type: "text",
                text: "You are a helpful assistant with access to the following documentation.",
                cache_control: { type: "ephemeral" }
              },
              {
                type: "text",
                text: this.cachedContext!, // Pre-cached docs
                cache_control: { type: "ephemeral" }
              }
            ],
            messages: [{ role: "user", content: userQuery }]
          });

          return response.content[0].text;
        }

        // Periodic refresh
        async refreshIfNeeded(documents: Document[]): Promise<void> {
          const stale = Date.now() - this.lastUpdate > 3600000; // 1 hour
          if (stale) {
            await this.buildCachedContext(documents);
          }
        }
      }

      // CAG vs RAG decision matrix:
      // | Factor           | CAG Better    | RAG Better    |
      // |------------------|---------------|---------------|
      // | Corpus size      | < 100K tokens | > 100K tokens |
      // | Update frequency | Low           | High          |
      // | Latency needs    | Critical      | Flexible      |
      // | Query specificity| General       | Specific      |
anti_patterns:
  - name: Caching with High Temperature
    description: Caching responses from temperature > 0.5
    why: High temperature means varied outputs, caching defeats purpose
    instead: Only cache low-temperature (deterministic) responses.
  - name: No Cache Invalidation
    description: Caching forever without invalidation strategy
    why: Stale responses, outdated information
    instead: Set TTLs, implement invalidation on source updates.
  - name: Caching Everything
    description: Caching all responses regardless of query type
    why: Low hit rates, wasted storage
    instead: Analyze query patterns, cache high-frequency patterns only.
  - name: Ignoring Provider Caching
    description: Building custom caching when provider offers it
    why: Reinventing the wheel, missing provider optimizations
    instead: Use Anthropic prompt caching, OpenAI caching first.
handoffs:
  - trigger: context management
    to: context-window-management
    context: Need context optimization
  - trigger: rag or retrieval
    to: rag-implementation
    context: Need retrieval system
  - trigger: memory
    to: conversation-memory
    context: Need memory persistence