Claude-skill-registry-data memory-rag-instrumentation
Instrument RAG retrieval, memory operations, and context management
install
source · Clone the upstream repo
git clone https://github.com/majiayu000/claude-skill-registry-data
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry-data "$T" && mkdir -p ~/.claude/skills && cp -r "$T/data/memory-rag-instrumentation" ~/.claude/skills/majiayu000-claude-skill-registry-data-memory-rag-instrumentation && rm -rf "$T"
manifest:
data/memory-rag-instrumentation/SKILL.md — source content
Memory and RAG Instrumentation
Instrument retrieval-augmented generation and memory operations for quality debugging.
Core Principle
RAG observability answers:
- What was retrieved? (sources, scores)
- Was it relevant? (quality signals)
- How much context was used?
- Did retrieval affect the response?
Retrieval Span Attributes
# Retrieval attributes grouped by priority tier. Dict insertion order is
# the order in which the attributes are written to the span.
retrieval_attributes = {
    # Required (P0)
    "retrieval.source": "vector_store",
    "retrieval.query_length": 150,
    "retrieval.results_count": 5,
    "retrieval.latency_ms": 45,
    # Quality signals (P1)
    "retrieval.top_score": 0.89,
    "retrieval.avg_score": 0.72,
    "retrieval.min_score": 0.55,
    "retrieval.above_threshold": 4,  # Count above relevance threshold
    # Context usage (P1)
    "retrieval.tokens_retrieved": 2500,
    "retrieval.tokens_used": 2000,  # After truncation
    "retrieval.context_window_pct": 0.25,  # Share of context window, as a fraction
    # Source tracking (P2)
    "retrieval.sources": ["doc1.pdf", "doc2.pdf"],
    "retrieval.collection": "knowledge_base",
}
for attribute_name, attribute_value in retrieval_attributes.items():
    span.set_attribute(attribute_name, attribute_value)
Retrieval Pipeline Stages
Query Processing
with tracer.start_span("retrieval.query_process") as span:
    # Record the raw query size, then one boolean per optional
    # preprocessing step (only the truthiness of each output is kept).
    span.set_attribute("query.original_length", len(query))
    for flag_key, step_output in (
        ("query.expanded", expansion),
        ("query.rewritten", rewrite),
    ):
        span.set_attribute(flag_key, bool(step_output))
    # Process query
Vector Search
with tracer.start_span("retrieval.vector_search") as span:
    # Search configuration recorded on the span before the search runs.
    search_settings = {
        "vector.index": "main_index",
        "vector.k": 10,
        "vector.ef_search": 100,  # HNSW param
        "vector.distance_metric": "cosine",
    }
    for setting_key, setting_value in search_settings.items():
        span.set_attribute(setting_key, setting_value)
    # Execute search
Reranking
with tracer.start_span("retrieval.rerank") as span:
    # Reranker identity plus how many candidates went in and came out.
    rerank_metrics = {
        "rerank.model": "cohere-rerank-v3",
        "rerank.input_count": 10,
        "rerank.output_count": 5,
        "rerank.score_improvement": 0.15,
    }
    for metric_key, metric_value in rerank_metrics.items():
        span.set_attribute(metric_key, metric_value)
    # Rerank results
Context Assembly
with tracer.start_span("retrieval.context_assembly") as span:
    # Token budget versus what was actually selected for the prompt.
    assembly_stats = {
        "context.chunks_selected": 5,
        "context.total_tokens": 2500,
        "context.max_tokens": 4000,
        "context.truncated": False,
    }
    for stat_key, stat_value in assembly_stats.items():
        span.set_attribute(stat_key, stat_value)
    # Assemble context
Memory Operations
Short-term Memory (Conversation)
# Short-term (conversation) memory: written to the span in this order.
conversation_memory = {
    "memory.type": "conversation",
    "memory.messages_stored": 10,
    "memory.tokens_stored": 3500,
    "memory.window_size": 20,
    "memory.pruned_count": 5,
}
for memory_key, memory_value in conversation_memory.items():
    span.set_attribute(memory_key, memory_value)
Long-term Memory (Persistent)
# Long-term (persistent) memory: written to the span in this order.
persistent_memory = {
    "memory.type": "persistent",
    "memory.operation": "write",  # read, write, delete
    "memory.key": "user_preferences",
    "memory.store": "redis",
    "memory.ttl_seconds": 86400,
}
for memory_key, memory_value in persistent_memory.items():
    span.set_attribute(memory_key, memory_value)
Episodic Memory
# Episodic memory: written to the span in this order.
episodic_memory = {
    "memory.type": "episodic",
    "memory.episode_id": "session_123",
    "memory.events_count": 15,
    "memory.summary_generated": True,
}
for memory_key, memory_value in episodic_memory.items():
    span.set_attribute(memory_key, memory_value)
Quality Signals
Track signals that indicate retrieval quality:
# Quality signals, written to the span in this order: scores first,
# then the failure indicators.
quality_signals = {
    # Relevance scoring
    "quality.relevance_score": 0.85,
    "quality.coverage_score": 0.70,  # How well query is covered
    "quality.diversity_score": 0.60,  # Source diversity
    # Failure signals
    "quality.no_results": False,
    "quality.below_threshold": 2,  # Count below threshold
    "quality.fallback_used": False,
}
for signal_key, signal_value in quality_signals.items():
    span.set_attribute(signal_key, signal_value)
Framework Integration
LangChain Retrievers
from langchain.retrievers import VectorStoreRetriever
from langfuse.decorators import observe


@observe(name="retrieval.search")
def search_documents(query: str, k: int = 5):
    """Retrieve documents for ``query`` and record retrieval metrics.

    Relies on ``retriever`` and ``get_current_span`` being available in
    the enclosing module; neither is defined in this snippet.

    Args:
        query: The search text handed to the retriever.
        k: Requested result count. NOTE(review): this value is not
            forwarded to the retriever here, so the retriever's own
            configuration decides how many documents come back.

    Returns:
        The documents returned by ``retriever.get_relevant_documents``.
    """
    span = get_current_span()
    results = retriever.get_relevant_documents(query)

    span.set_attribute("retrieval.query_length", len(query))
    span.set_attribute("retrieval.results_count", len(results))
    # Empty result sets are a failure signal worth recording explicitly.
    span.set_attribute("quality.no_results", not results)

    # Indexing results[0] on an empty result set would raise IndexError
    # and turn a "nothing found" search into a crash, so the top score is
    # only recorded when there is a first result to read it from.
    # Presumably results are ordered best-first — confirm for the
    # retriever in use.
    if results:
        span.set_attribute(
            "retrieval.top_score", results[0].metadata.get("score", 0)
        )
    return results
LlamaIndex
from llama_index.core import VectorStoreIndex
from langfuse.decorators import observe


@observe(name="retrieval.query")
def query_index(query: str):
    """Run ``query`` through the index's query engine and return the response.

    The number of source nodes on the response is recorded on the current
    span as ``retrieval.source_nodes``. Relies on ``index`` and
    ``get_current_span`` being available in the enclosing module.
    """
    engine = index.as_query_engine()
    response = engine.query(query)
    # The span is looked up only after the query has completed.
    get_current_span().set_attribute(
        "retrieval.source_nodes", len(response.source_nodes)
    )
    return response
Context Window Management
Track context usage to avoid truncation issues:
# Context window size, in tokens, for each known model name.
MODEL_CONTEXT_LIMITS = {
    "claude-3-opus": 200_000,
    "claude-3-5-sonnet": 200_000,
    "gpt-4-turbo": 128_000,
    "gpt-4o": 128_000,
}


def track_context_usage(model: str, tokens_used: int):
    """Record how much of ``model``'s context window ``tokens_used`` fills.

    Writes four attributes to ``span``, a name this function expects to
    find in the enclosing module scope at call time (it is not a
    parameter). Models missing from ``MODEL_CONTEXT_LIMITS`` are measured
    against a 100,000-token limit.

    Args:
        model: Model name used to look up the context limit.
        tokens_used: Number of tokens consumed.
    """
    try:
        limit = MODEL_CONTEXT_LIMITS[model]
    except KeyError:
        limit = 100_000

    utilization = tokens_used / limit
    recorded = (
        ("context.tokens_used", tokens_used),
        ("context.limit", limit),
        # Stored as a fraction rounded to two decimals, not as 0-100.
        ("context.utilization_pct", round(utilization, 2)),
        # Flagged once more than 80% of the window is in use.
        ("context.near_limit", utilization > 0.8),
    )
    for attribute_key, attribute_value in recorded:
        span.set_attribute(attribute_key, attribute_value)
Anti-Patterns
- Logging full retrieved documents (storage explosion)
- Missing relevance scores (can't debug quality)
- No reranking metrics (hidden quality drop)
- Ignoring context window usage (truncation bugs)
Related Skills
- LLM instrumentation: llm-call-tracing
- Quality metrics: evaluation-quality