Claude-skill-registry error-retry-tracking
Instrument error handling, retries, fallbacks, and failure patterns
install
source · Clone the upstream repo
git clone https://github.com/majiayu000/claude-skill-registry
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/data/error-retry-tracking" ~/.claude/skills/majiayu000-claude-skill-registry-error-retry-tracking && rm -rf "$T"
manifest:
skills/data/error-retry-tracking/SKILL.md — source content
Error and Retry Tracking
Instrument error handling to understand failure patterns and recovery behavior.
Core Principle
Error observability answers:
- What failed and why?
- How many retries before success/failure?
- What fallbacks were used?
- What's the recovery rate?
- Are errors correlated (rate limits, outages)?
Error Classification
Transient vs. Permanent
# Exception class names treated as transient (worth retrying).
TRANSIENT_ERRORS = [
    "RateLimitError",
    "TimeoutError",
    "ServiceUnavailable",
    "ConnectionError",
]

# Exception class names treated as permanent (retrying will not help).
PERMANENT_ERRORS = [
    "InvalidRequestError",
    "AuthenticationError",
    "ContentPolicyViolation",
    "ContextLengthExceeded",
]


def classify_error(error: Exception) -> str:
    """Classify an exception as ``"transient"``, ``"permanent"`` or ``"unknown"``.

    The class of ``error`` and then each of its base classes are compared by
    name against ``TRANSIENT_ERRORS`` and ``PERMANENT_ERRORS``. Walking the
    MRO means subclasses are classified like their parents (for example
    ``ConnectionResetError`` is transient because it derives from
    ``ConnectionError``); the most specific matching class wins.

    Args:
        error: The exception instance to classify.

    Returns:
        ``"transient"``, ``"permanent"``, or ``"unknown"`` when no class in
        the exception's hierarchy is listed.
    """
    # __mro__ starts with the concrete class, so an exact name match still
    # takes precedence over anything inherited.
    for cls in type(error).__mro__:
        name = cls.__name__
        if name in TRANSIENT_ERRORS:
            return "transient"
        if name in PERMANENT_ERRORS:
            return "permanent"
    return "unknown"
Error Span Attributes
# Error identification (P0) span.set_attribute("error.type", "RateLimitError") span.set_attribute("error.message", "Rate limit exceeded") span.set_attribute("error.category", "transient") span.set_attribute("error.source", "llm_provider") # Provider context (P1) span.set_attribute("error.provider", "anthropic") span.set_attribute("error.model", "claude-3-opus") span.set_attribute("error.status_code", 429) span.set_attribute("error.request_id", "req_abc123") # Timing context (P1) span.set_attribute("error.retry_after_ms", 60000) span.set_attribute("error.occurred_at_step", 3) span.set_attribute("error.time_into_request_ms", 2500) # Impact (P2) span.set_attribute("error.tokens_wasted", 1500) # Tokens sent before failure span.set_attribute("error.cost_wasted_usd", 0.015)
Retry Span Attributes
# Retry tracking (P0) span.set_attribute("retry.attempt", 2) span.set_attribute("retry.max_attempts", 3) span.set_attribute("retry.strategy", "exponential_backoff") # Timing (P1) span.set_attribute("retry.delay_ms", 2000) span.set_attribute("retry.total_wait_ms", 3500) span.set_attribute("retry.jitter_ms", 150) # Outcome (P0) span.set_attribute("retry.success", True) span.set_attribute("retry.final_attempt", 2) span.set_attribute("retry.exhausted", False)
Retry Wrapper Pattern
import time
from functools import wraps

from langfuse.decorators import observe


def with_retry(
    max_attempts: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    exponential_base: float = 2.0,
):
    """Build a decorator that retries a function with exponential backoff.

    Every call of the decorated function is traced, and the retry behaviour
    is recorded as ``retry.*`` / ``error.*`` attributes on the span. Errors
    that ``classify_error`` reports as ``"permanent"`` are re-raised at once
    without further attempts.

    Args:
        max_attempts: Total number of calls to try, including the first.
        base_delay: Delay in seconds before the second attempt.
        max_delay: Upper bound in seconds for any single delay.
        exponential_base: Factor the delay grows by after each failure.

    Returns:
        A decorator that wraps a callable with the retry loop.

    Raises:
        ValueError: If ``max_attempts`` is less than 1. (Without this check
            the loop body never runs and the wrapper would end in
            ``raise None``, which surfaces as a confusing ``TypeError``.)
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    def decorator(func):
        @wraps(func)
        @observe(name=f"{func.__name__}.with_retry")
        def wrapper(*args, **kwargs):
            # NOTE(review): get_current_span and classify_error are not
            # imported in this snippet; they are assumed to be in scope
            # (classify_error is defined under "Error Classification").
            span = get_current_span()
            span.set_attribute("retry.max_attempts", max_attempts)
            span.set_attribute("retry.strategy", "exponential_backoff")

            total_wait = 0.0

            def record_outcome(success, exhausted, attempt):
                # Record the outcome on every exit path so successful and
                # aborted calls also report how long they spent waiting.
                span.set_attribute("retry.success", success)
                span.set_attribute("retry.exhausted", exhausted)
                span.set_attribute("retry.final_attempt", attempt)
                span.set_attribute("retry.total_wait_ms", total_wait * 1000)

            for attempt in range(1, max_attempts + 1):
                span.set_attribute("retry.attempt", attempt)
                try:
                    result = func(*args, **kwargs)
                except Exception as e:
                    category = classify_error(e)  # classify once per failure
                    span.set_attribute("error.type", type(e).__name__)
                    span.set_attribute("error.category", category)

                    if category == "permanent":
                        # Retrying cannot help; abort without sleeping.
                        span.set_attribute("retry.abort_reason", "permanent_error")
                        record_outcome(False, False, attempt)
                        raise

                    if attempt == max_attempts:
                        # Out of attempts: surface the last error unchanged.
                        record_outcome(False, True, attempt)
                        raise

                    delay = min(
                        base_delay * (exponential_base ** (attempt - 1)),
                        max_delay,
                    )
                    total_wait += delay
                    span.add_event("retry.waiting", {"delay_ms": delay * 1000})
                    time.sleep(delay)
                else:
                    record_outcome(True, False, attempt)
                    return result

        return wrapper

    return decorator


# Example usage. `client` is assumed to be an already-configured LLM client
# defined elsewhere (not shown in this snippet).
@with_retry(max_attempts=3)
def call_llm(messages):
    return client.messages.create(messages=messages)
Fallback Tracking
# Fallback span attributes span.set_attribute("fallback.triggered", True) span.set_attribute("fallback.reason", "primary_model_unavailable") span.set_attribute("fallback.from_model", "claude-3-opus") span.set_attribute("fallback.to_model", "claude-3-sonnet") span.set_attribute("fallback.quality_impact", "reduced") # Fallback chain span.set_attribute("fallback.chain", ["opus", "sonnet", "haiku"]) span.set_attribute("fallback.chain_position", 2)
Rate Limit Handling
# Rate limit specific attributes span.set_attribute("rate_limit.type", "tokens_per_minute") span.set_attribute("rate_limit.limit", 100000) span.set_attribute("rate_limit.remaining", 0) span.set_attribute("rate_limit.reset_at", "2024-01-15T10:01:00Z") span.set_attribute("rate_limit.retry_after_ms", 45000) # Proactive rate limiting span.set_attribute("rate_limit.preemptive_wait", True) span.set_attribute("rate_limit.tokens_queued", 5000)
Circuit Breaker Pattern
from enum import Enum


class CircuitState(Enum):
    """States a circuit breaker can be in."""

    CLOSED = "closed"  # Normal operation
    OPEN = "open"  # Failing, reject requests
    HALF_OPEN = "half_open"  # Testing recovery


# Circuit breaker attributes
# Literal values are illustrative. `span` is assumed to be the active tracing
# span and `timestamp` a previously captured time value; neither is defined
# in this snippet.
span.set_attribute("circuit.state", "open")
span.set_attribute("circuit.failure_count", 5)
span.set_attribute("circuit.failure_threshold", 5)
span.set_attribute("circuit.last_failure_at", timestamp)
span.set_attribute("circuit.opens_at", timestamp)
span.set_attribute("circuit.half_open_attempts", 0)
Error Aggregation
Track error patterns:
# Per-session error summary span.set_attribute("session.total_errors", 3) span.set_attribute("session.transient_errors", 2) span.set_attribute("session.permanent_errors", 1) span.set_attribute("session.retry_success_rate", 0.67) # Per-provider health span.set_attribute("provider.health", "degraded") span.set_attribute("provider.error_rate_1h", 0.05) span.set_attribute("provider.avg_latency_1h_ms", 2500)
Framework Integration
LangChain Retry
from langchain.chat_models import ChatAnthropic
from langfuse.callback import CallbackHandler

# Retries and the request timeout are configured on the model object itself.
llm = ChatAnthropic(
    model="claude-3-opus",
    max_retries=3,
    request_timeout=30,
)

# Callbacks capture retry behavior
# `messages` is assumed to be prepared by the caller (not shown here).
handler = CallbackHandler()
response = llm.invoke(messages, config={"callbacks": [handler]})
Tenacity Integration
from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential
from langfuse.decorators import observe


def _is_retryable(exc: BaseException) -> bool:
    """Return True when ``exc`` is worth another attempt.

    Permanent errors are not retried, since retrying them only wastes cost.
    Non-``Exception`` signals such as ``KeyboardInterrupt`` are never retried.
    ``classify_error`` is the helper defined under "Error Classification" and
    is assumed to be in scope.
    """
    return isinstance(exc, Exception) and classify_error(exc) != "permanent"


# @observe is outermost, so all attempts fall inside the single "llm.call"
# observation rather than one per attempt.
@observe(name="llm.call")
@retry(
    retry=retry_if_exception(_is_retryable),
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=60),
)
def call_with_tenacity(messages):
    """Call the LLM, retrying non-permanent failures up to three attempts.

    `client` is assumed to be an already-configured LLM client defined
    elsewhere (not shown in this snippet).
    """
    return client.messages.create(messages=messages)
Anti-Patterns
- Catching all exceptions silently (hidden failures)
- No retry tracking (can't optimize retry config)
- Missing error classification (can't distinguish transient vs. permanent)
- No fallback logging (unclear degradation)
- Retrying permanent errors (wasted cost)
Related Skills
- LLM error context: llm-call-tracing
- Tool error handling: tool-call-tracking