# ai-agents/context-window-management/skill.yaml
id: context-window-management
name: Context Window Management
version: 1.0.0
layer: 2
description: Strategies for managing LLM context windows, including summarization, trimming, routing, and avoiding context rot
owns:
- context-engineering
- context-summarization
- context-trimming
- context-routing
- token-counting
- context-prioritization
pairs_with:
- rag-implementation
- conversation-memory
- prompt-caching
- llm-npc-dialogue
ecosystem:
  primary_tools:
    - name: tiktoken
      description: OpenAI's tokenizer for counting tokens
      url: https://github.com/openai/tiktoken
    - name: LangChain
      description: Framework with context management utilities
      url: https://langchain.com
    - name: Claude API
      description: 200K+ context with caching support
      url: https://docs.anthropic.com
prerequisites:
  knowledge:
    - LLM fundamentals
    - Tokenization basics
    - Prompt engineering
  skills_recommended:
    - prompt-engineering
limits:
  does_not_cover:
    - RAG implementation details
    - Model fine-tuning
    - Embedding models
  boundaries:
    - Focus is context optimization
    - Covers strategies, not specific implementations
tags:
- llm
- context
- tokens
- memory
- summarization
- optimization
triggers:
- context window
- token limit
- context management
- context engineering
- long context
- context overflow
identity: |
  You're a context engineering specialist who has optimized LLM applications handling millions of conversations. You've seen systems hit token limits, suffer context rot, and lose critical information mid-dialogue.

  You understand that context is a finite resource with diminishing returns. More tokens don't mean better results; the art is in curating the right information. You know the serial position effect, the lost-in-the-middle problem, and when to summarize versus when to retrieve.

  Your core principles:
  - Context is finite: even with 2M tokens, treat it as precious
  - Recency and primacy matter: put important info at the start and end
  - Summarize, don't truncate: preserve meaning when reducing
  - Route intelligently: use the right model for the context size
  - Monitor token usage: costs scale with context
  - Test with real conversations: synthetic tests miss edge cases
history: |
  Context window evolution:
  - 2022: GPT-3 with a 4K context window
  - 2023: GPT-4 reaches 32K, Claude 100K
  - 2024: Claude 200K, Gemini 1M, GPT-4 Turbo 128K
  - 2025: Gemini 2M, context caching mainstream, and "context engineering" becomes a distinct discipline
contrarian_insights: |
  What most developers get wrong:

  - "Bigger context window = better" (WRONG)
    Context rot is real. After ~50K tokens, accuracy degrades. A focused 20K context often outperforms a bloated 200K context.

  - "Just use RAG for everything" (WRONG)
    RAG adds latency, complexity, and retrieval failures. For known, stable context, cache it in the prompt. Use RAG only for dynamic, large corpora.

  - "Summarize the oldest messages" (PARTIALLY WRONG)
    Older messages may contain critical context (user preferences, key decisions). Summarize by importance, not just recency.
patterns:
  - name: Tiered Context Strategy
    description: Different strategies based on context size
    when: Building any multi-turn conversation system
    example: |
      // countTokens, summarizeOldMessages, recentMessages, and retrieveRelevant
      // are assumed helpers; PreparedContext pairs messages with a model name.
      interface ContextTier {
        maxTokens: number;
        strategy: 'full' | 'summarize' | 'rag';
        model: string;
      }

      const TIERS: ContextTier[] = [
        { maxTokens: 8000, strategy: 'full', model: 'claude-3-haiku' },
        { maxTokens: 32000, strategy: 'full', model: 'claude-3-5-sonnet' },
        { maxTokens: 100000, strategy: 'summarize', model: 'claude-3-5-sonnet' },
        { maxTokens: Infinity, strategy: 'rag', model: 'claude-3-5-sonnet' }
      ];

      async function selectStrategy(messages: Message[]): Promise<ContextTier> {
        const tokens = await countTokens(messages);
        for (const tier of TIERS) {
          if (tokens <= tier.maxTokens) {
            return tier;
          }
        }
        return TIERS[TIERS.length - 1];
      }

      async function prepareContext(messages: Message[]): Promise<PreparedContext> {
        const tier = await selectStrategy(messages);

        switch (tier.strategy) {
          case 'full':
            return { messages, model: tier.model };
          case 'summarize': {
            const summary = await summarizeOldMessages(messages);
            return { messages: [summary, ...recentMessages(messages)], model: tier.model };
          }
          case 'rag': {
            const relevant = await retrieveRelevant(messages);
            return { messages: [...relevant, ...recentMessages(messages)], model: tier.model };
          }
        }
      }
  - name: Serial Position Optimization
    description: Place important content at start and end
    when: Constructing prompts with significant context
    example: |
      // LLMs weight beginning and end more heavily
      // Structure prompts to leverage this
      // (summarize, formatMessages, and extractKeyConstraints are assumed helpers)

      function buildOptimalPrompt(components: {
        systemPrompt: string;
        criticalContext: string;
        conversationHistory: Message[];
        currentQuery: string;
      }): string {
        // START: System instructions (always first)
        const parts = [components.systemPrompt];

        // CRITICAL CONTEXT: Right after system (high primacy)
        if (components.criticalContext) {
          parts.push(`## Key Context\n${components.criticalContext}`);
        }

        // MIDDLE: Conversation history (lower weight)
        // Summarize if long, keep recent messages full
        const history = components.conversationHistory;
        if (history.length > 10) {
          const oldSummary = summarize(history.slice(0, -5));
          const recent = history.slice(-5);
          parts.push(`## Earlier Conversation (Summary)\n${oldSummary}`);
          parts.push(`## Recent Messages\n${formatMessages(recent)}`);
        } else {
          parts.push(`## Conversation\n${formatMessages(history)}`);
        }

        // END: Current query (high recency)
        // Restate critical requirements here
        parts.push(`## Current Request\n${components.currentQuery}`);

        // FINAL: Reminder of key constraints
        parts.push(`Remember: ${extractKeyConstraints(components.systemPrompt)}`);

        return parts.join('\n\n');
      }
  - name: Intelligent Summarization
    description: Summarize by importance, not just recency
    when: Context exceeds optimal size
    example: |
      interface MessageWithMetadata extends Message {
        timestamp: number;        // For restoring chronological order
        importance: number;       // 0-1 score
        hasCriticalInfo: boolean; // User preferences, decisions
        referenced: boolean;      // Was this referenced later?
      }

      async function smartSummarize(
        messages: MessageWithMetadata[],
        targetTokens: number
      ): Promise<Message[]> {
        // Sort by importance; Array.prototype.sort is stable,
        // so tied scores preserve original order
        const score = (m: MessageWithMetadata) =>
          m.importance + (m.hasCriticalInfo ? 0.5 : 0) + (m.referenced ? 0.3 : 0);
        const sorted = [...messages].sort((a, b) => score(b) - score(a));

        const keep: MessageWithMetadata[] = [];
        const summarizePool: MessageWithMetadata[] = [];
        let currentTokens = 0;

        for (const msg of sorted) {
          const msgTokens = await countTokens([msg]);
          if (currentTokens + msgTokens < targetTokens * 0.7) {
            keep.push(msg);
            currentTokens += msgTokens;
          } else {
            summarizePool.push(msg);
          }
        }

        // Restore original chronological order for the kept messages
        keep.sort((a, b) => a.timestamp - b.timestamp);

        // Summarize the low-importance messages into a single system note
        if (summarizePool.length === 0) return keep;
        const summary = await llm.complete(`
          Summarize these messages, preserving:
          - Any user preferences or decisions
          - Key facts that might be referenced later
          - The overall flow of conversation

          Messages:
          ${formatMessages(summarizePool)}
        `);
        return [{ role: 'system', content: `[Earlier context: ${summary}]` }, ...keep];
      }
  - name: Token Budget Allocation
    description: Allocate token budget across context components
    when: Need predictable context management
    example: |
      interface TokenBudget {
        system: number;          // System prompt
        criticalContext: number; // User prefs, key info
        history: number;         // Conversation history
        query: number;           // Current query
        response: number;        // Reserved for response
      }

      function allocateBudget(totalTokens: number): TokenBudget {
        return {
          system: Math.floor(totalTokens * 0.10),          // 10%
          criticalContext: Math.floor(totalTokens * 0.15), // 15%
          history: Math.floor(totalTokens * 0.40),         // 40%
          query: Math.floor(totalTokens * 0.10),           // 10%
          response: Math.floor(totalTokens * 0.25),        // 25%
        };
      }

      // truncateToTokens and summarizeToTokens are assumed helpers that cut
      // or compress text to fit a token budget
      async function buildWithBudget(
        components: ContextComponents,
        modelMaxTokens: number
      ): Promise<PreparedContext> {
        const budget = allocateBudget(modelMaxTokens);

        // Truncate/summarize each component to fit its budget
        const prepared = {
          system: truncateToTokens(components.system, budget.system),
          criticalContext: truncateToTokens(
            components.criticalContext,
            budget.criticalContext
          ),
          history: await summarizeToTokens(components.history, budget.history),
          query: truncateToTokens(components.query, budget.query),
        };

        // Reallocate unused budget
        const used = await countTokens(Object.values(prepared).join('\n'));
        const remaining = modelMaxTokens - used - budget.response;
        if (remaining > 0) {
          // Give extra to history (most valuable for conversation)
          prepared.history = await summarizeToTokens(
            components.history,
            budget.history + remaining
          );
        }

        return prepared;
      }
anti_patterns:
  - name: Naive Truncation
    description: Cutting off oldest messages when limit reached
    why: Loses critical early context, breaks conversation flow
    instead: Summarize old context, preserve critical information.
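    example: |
      // A minimal sketch of the contrast, assuming the Message type and an
      // async summarize() helper like those in the patterns above.

      // BAD: drop everything but the last N messages
      function naiveTruncate(messages: Message[], n: number): Message[] {
        return messages.slice(-n); // early decisions and preferences are lost
      }

      // BETTER: compress old messages into a summary, keep recent ones intact
      async function truncateWithSummary(
        messages: Message[],
        n: number
      ): Promise<Message[]> {
        const old = messages.slice(0, -n);
        const recent = messages.slice(-n);
        if (old.length === 0) return recent;
        const summary = await summarize(old);
        return [{ role: 'system', content: `[Earlier context: ${summary}]` }, ...recent];
      }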
  - name: Ignoring Token Costs
    description: Not tracking or optimizing token usage
    why: Costs spiral, latency increases, context rot sets in
    instead: Monitor tokens, set budgets, optimize continuously.
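    example: |
      // A hedged sketch of per-request token tracking. The 4-chars-per-token
      // estimate is a rough heuristic; use a real tokenizer (e.g. tiktoken,
      // listed under ecosystem) for exact counts.
      const estimateTokens = (text: string): number => Math.ceil(text.length / 4);

      interface UsageLog { requests: number; tokens: number; }
      const usage: UsageLog = { requests: 0, tokens: 0 };

      function trackRequest(prompt: string, budget: number): void {
        const tokens = estimateTokens(prompt);
        usage.requests += 1;
        usage.tokens += tokens;
        if (tokens > budget) {
          console.warn(`Prompt at ~${tokens} tokens exceeds budget of ${budget}`);
        }
      }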
  - name: One-Size-Fits-All
    description: Same context strategy for all conversations
    why: Short conversations don't need RAG, long ones need summarization
    instead: Adaptive strategies based on conversation characteristics.
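    example: |
      // Illustrative sketch: pick a strategy from conversation shape rather
      // than applying one strategy everywhere. Thresholds are assumptions;
      // see the Tiered Context Strategy pattern for a fuller version.
      async function chooseStrategy(
        conv: Message[]
      ): Promise<'full' | 'summarize' | 'rag'> {
        const tokens = await countTokens(conv);
        const turns = conv.length;
        if (turns < 10 && tokens < 8000) return 'full'; // short chats: send everything
        if (tokens < 100000) return 'summarize';        // long chats: compress the middle
        return 'rag';                                   // huge corpora: retrieve selectively
      }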
  - name: Lost-in-Middle Placement
    description: Putting critical info in middle of long prompts
    why: LLMs underweight middle content (primacy/recency bias)
    instead: Put critical info at start and end, summaries in middle.
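    example: |
      // Before/after layout sketch; systemPrompt, history, CRITICAL_RULES,
      // and query are assumed pre-formatted strings. See the Serial Position
      // Optimization pattern for the full approach.

      // BAD: critical rules buried mid-prompt, where models attend least
      const buried = [systemPrompt, history, CRITICAL_RULES, query].join('\n\n');

      // BETTER: critical rules up front, restated at the end
      const positioned = [
        systemPrompt, CRITICAL_RULES, history, query,
        `Remember: ${CRITICAL_RULES}`,
      ].join('\n\n');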
handoffs:
  - trigger: rag or retrieval
    to: rag-implementation
    context: Need retrieval system
  - trigger: conversation memory
    to: conversation-memory
    context: Need memory persistence
  - trigger: caching
    to: prompt-caching
    context: Need caching strategies