Claude-skill-registry error-retry-tracking

Instrument error handling, retries, fallbacks, and failure patterns

install

source · Clone the upstream repo

git clone https://github.com/majiayu000/claude-skill-registry

Claude Code · Install into ~/.claude/skills/

T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/data/error-retry-tracking" ~/.claude/skills/majiayu000-claude-skill-registry-error-retry-tracking && rm -rf "$T"

manifest: skills/data/error-retry-tracking/SKILL.md

source content

Error and Retry Tracking

Instrument error handling to understand failure patterns and recovery behavior.

Core Principle

Error observability answers:

What failed and why?
How many retries before success/failure?
What fallbacks were used?
What's the recovery rate?
Are errors correlated (rate limits, outages)?

Error Classification

Transient vs. Permanent

# Exception class names treated as transient, i.e. worth retrying.
TRANSIENT_ERRORS = [
    "RateLimitError",
    "TimeoutError",
    "ServiceUnavailable",
    "ConnectionError",
]

# Exception class names treated as permanent, i.e. retrying is wasted cost.
PERMANENT_ERRORS = [
    "InvalidRequestError",
    "AuthenticationError",
    "ContentPolicyViolation",
    "ContextLengthExceeded",
]


def classify_error(error: Exception) -> str:
    """Classify an exception as "transient", "permanent", or "unknown".

    Matching is done on class names, so no provider SDK has to be imported.
    The exception's own class is checked first, then each base class in
    method-resolution order; this way subclasses of a listed error (for
    example the builtin ``ConnectionResetError``, a ``ConnectionError``)
    inherit the category of their nearest listed ancestor instead of
    falling through to "unknown".

    Args:
        error: The exception instance to classify.

    Returns:
        "transient" if the nearest matching class name is in
        ``TRANSIENT_ERRORS``, "permanent" if it is in ``PERMANENT_ERRORS``,
        otherwise "unknown".
    """
    for cls in type(error).__mro__:
        name = cls.__name__
        if name in TRANSIENT_ERRORS:
            return "transient"
        if name in PERMANENT_ERRORS:
            return "permanent"
    return "unknown"

Error Span Attributes

# Example error attributes; every value below is an illustrative literal.
# NOTE(review): `span` is not defined in this snippet; it is assumed to be
# the active tracing span exposing set_attribute() — confirm for your tracer.
# NOTE(review): P0/P1/P2 look like priority tiers (P0 first) — confirm.
# Error identification (P0)
span.set_attribute("error.type", "RateLimitError")
span.set_attribute("error.message", "Rate limit exceeded")
span.set_attribute("error.category", "transient")  # a classify_error() label
span.set_attribute("error.source", "llm_provider")

# Provider context (P1)
span.set_attribute("error.provider", "anthropic")
span.set_attribute("error.model", "claude-3-opus")
span.set_attribute("error.status_code", 429)
span.set_attribute("error.request_id", "req_abc123")

# Timing context (P1)
span.set_attribute("error.retry_after_ms", 60000)
span.set_attribute("error.occurred_at_step", 3)
span.set_attribute("error.time_into_request_ms", 2500)

# Impact (P2)
span.set_attribute("error.tokens_wasted", 1500)  # Tokens sent before failure
span.set_attribute("error.cost_wasted_usd", 0.015)

Retry Span Attributes

# Example retry attributes; every value below is an illustrative literal.
# NOTE(review): `span` is not defined in this snippet; it is assumed to be
# the active tracing span exposing set_attribute() — confirm for your tracer.
# Retry tracking (P0)
span.set_attribute("retry.attempt", 2)
span.set_attribute("retry.max_attempts", 3)
span.set_attribute("retry.strategy", "exponential_backoff")

# Timing (P1)
span.set_attribute("retry.delay_ms", 2000)
span.set_attribute("retry.total_wait_ms", 3500)
span.set_attribute("retry.jitter_ms", 150)

# Outcome (P0)
span.set_attribute("retry.success", True)
span.set_attribute("retry.final_attempt", 2)
span.set_attribute("retry.exhausted", False)

Retry Wrapper Pattern

from functools import wraps
from langfuse.decorators import observe
import time

def with_retry(
    max_attempts: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    exponential_base: float = 2.0,
):
    """Build a decorator that retries a callable with capped exponential backoff.

    The wrapped call runs inside an ``observe`` span named
    ``"<function name>.with_retry"`` and records ``retry.*`` and ``error.*``
    attributes on the current span. Errors that ``classify_error`` labels
    "permanent" are re-raised immediately; every other ``Exception`` is
    retried until ``max_attempts`` calls have been made, after which the
    last error is raised.

    Args:
        max_attempts: Total number of calls to make, including the first.
            Must be at least 1.
        base_delay: Seconds to sleep after the first failed attempt.
        max_delay: Upper bound, in seconds, for any single sleep.
        exponential_base: Factor the delay is multiplied by per failed attempt.

    Returns:
        A decorator that wraps a function in the retry loop.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
    """
    if max_attempts < 1:
        # Without this guard the loop below never runs and every call would
        # end in ``raise None``, i.e. a confusing TypeError.
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts}")

    def decorator(func):
        @wraps(func)
        @observe(name=f"{func.__name__}.with_retry")
        def wrapper(*args, **kwargs):
            # NOTE(review): get_current_span is not imported in this snippet.
            # It must return an object exposing set_attribute() and
            # add_event() — confirm which tracing API is meant to provide it.
            span = get_current_span()
            span.set_attribute("retry.max_attempts", max_attempts)
            span.set_attribute("retry.strategy", "exponential_backoff")

            last_error = None
            total_wait = 0.0  # seconds slept so far across all attempts

            def record_outcome(final_attempt, *, success, exhausted):
                # Record the same outcome attributes on every exit path so
                # successful, aborted and exhausted calls are comparable.
                span.set_attribute("retry.success", success)
                span.set_attribute("retry.final_attempt", final_attempt)
                span.set_attribute("retry.exhausted", exhausted)
                span.set_attribute("retry.total_wait_ms", total_wait * 1000)

            for attempt in range(1, max_attempts + 1):
                span.set_attribute("retry.attempt", attempt)
                try:
                    result = func(*args, **kwargs)
                except Exception as e:
                    last_error = e
                    category = classify_error(e)
                    span.set_attribute("error.type", type(e).__name__)
                    span.set_attribute("error.category", category)

                    if category == "permanent":
                        # Retrying cannot help; surface the error right away.
                        record_outcome(attempt, success=False, exhausted=False)
                        span.set_attribute("retry.abort_reason", "permanent_error")
                        raise

                    if attempt < max_attempts:
                        delay = min(
                            base_delay * (exponential_base ** (attempt - 1)),
                            max_delay,
                        )
                        total_wait += delay
                        span.add_event("retry.waiting", {"delay_ms": delay * 1000})
                        time.sleep(delay)
                else:
                    # Kept outside the try body so a failure while recording
                    # the outcome can never re-run an already successful call.
                    record_outcome(attempt, success=True, exhausted=False)
                    return result

            record_outcome(max_attempts, success=False, exhausted=True)
            raise last_error

        return wrapper
    return decorator

@with_retry(max_attempts=3)
def call_llm(messages):
    """Send ``messages`` via ``client.messages.create``, with up to three attempts.

    NOTE(review): ``client`` is not defined in this snippet; it is assumed
    to be an already-configured API client — confirm.
    """
    response = client.messages.create(messages=messages)
    return response

Fallback Tracking

# Fallback span attributes (illustrative literal values).
# NOTE(review): `span` is not defined in this snippet; it is assumed to be
# the active tracing span exposing set_attribute() — confirm for your tracer.
span.set_attribute("fallback.triggered", True)
span.set_attribute("fallback.reason", "primary_model_unavailable")
span.set_attribute("fallback.from_model", "claude-3-opus")
span.set_attribute("fallback.to_model", "claude-3-sonnet")
span.set_attribute("fallback.quality_impact", "reduced")

# Fallback chain
span.set_attribute("fallback.chain", ["opus", "sonnet", "haiku"])
# NOTE(review): "sonnet" is the second entry of the chain above, so
# position 2 appears to be 1-based — confirm.
span.set_attribute("fallback.chain_position", 2)

Rate Limit Handling

# Rate limit specific attributes (illustrative literal values).
# NOTE(review): `span` is not defined in this snippet; it is assumed to be
# the active tracing span exposing set_attribute() — confirm for your tracer.
span.set_attribute("rate_limit.type", "tokens_per_minute")
span.set_attribute("rate_limit.limit", 100000)
span.set_attribute("rate_limit.remaining", 0)
span.set_attribute("rate_limit.reset_at", "2024-01-15T10:01:00Z")  # timestamp passed as a string
span.set_attribute("rate_limit.retry_after_ms", 45000)

# Proactive rate limiting
span.set_attribute("rate_limit.preemptive_wait", True)
span.set_attribute("rate_limit.tokens_queued", 5000)

Circuit Breaker Pattern

from enum import Enum

class CircuitState(Enum):
    """States of a circuit breaker."""

    # Normal operation
    CLOSED = "closed"
    # Failing, reject requests
    OPEN = "open"
    # Testing recovery
    HALF_OPEN = "half_open"

# Circuit breaker attributes (illustrative values).
# NOTE(review): `span` and `timestamp` are not defined in this snippet;
# supply the active tracing span and a real timestamp value.
span.set_attribute("circuit.state", "open")  # same string as CircuitState.OPEN.value
span.set_attribute("circuit.failure_count", 5)
span.set_attribute("circuit.failure_threshold", 5)
span.set_attribute("circuit.last_failure_at", timestamp)
# NOTE(review): with the state already "open", the key "opens_at" may be
# intended as "opened_at" — confirm before relying on this name.
span.set_attribute("circuit.opens_at", timestamp)
span.set_attribute("circuit.half_open_attempts", 0)

Error Aggregation

Track error patterns:

# Aggregated error attributes (illustrative literal values).
# NOTE(review): `span` is not defined in this snippet; it is assumed to be
# the active tracing span exposing set_attribute() — confirm for your tracer.
# Per-session error summary
span.set_attribute("session.total_errors", 3)
span.set_attribute("session.transient_errors", 2)
span.set_attribute("session.permanent_errors", 1)
span.set_attribute("session.retry_success_rate", 0.67)  # fraction, not percent

# Per-provider health
span.set_attribute("provider.health", "degraded")
span.set_attribute("provider.error_rate_1h", 0.05)
span.set_attribute("provider.avg_latency_1h_ms", 2500)

Framework Integration

LangChain Retry

# NOTE(review): these import paths depend on the installed LangChain and
# Langfuse versions — confirm them against your environment.
from langchain.chat_models import ChatAnthropic
from langfuse.callback import CallbackHandler

# Retry count and timeout are configured on the model object itself.
llm = ChatAnthropic(
    model="claude-3-opus",
    max_retries=3,
    request_timeout=30,  # presumably seconds — confirm
)

# Callbacks capture retry behavior
# NOTE(review): `messages` is not defined in this snippet; supply the
# conversation to send.
handler = CallbackHandler()
response = llm.invoke(messages, config={"callbacks": [handler]})

Tenacity Integration

from tenacity import retry, stop_after_attempt, wait_exponential
from langfuse.decorators import observe

# @observe is the outermost decorator, so it wraps the retrying callable
# produced by @retry rather than each individual attempt.
# NOTE(review): no `retry=` predicate is passed, so tenacity presumably
# retries on any exception, including ones classify_error() would label
# "permanent" — confirm, and consider filtering them out.
# NOTE(review): without `reraise=True`, exhausting the attempts presumably
# raises tenacity's RetryError instead of the original exception — confirm.
@observe(name="llm.call")
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=60),
)
def call_with_tenacity(messages):
    # NOTE(review): `client` is not defined in this snippet; it is assumed
    # to be an already-configured API client — confirm.
    return client.messages.create(messages=messages)

Anti-Patterns

Catching all exceptions silently (hidden failures)
No retry tracking (can't optimize retry config)
Missing error classification (can't distinguish transient vs. permanent)
No fallback logging (unclear degradation)
Retrying permanent errors (wasted cost)

Related Skills

- `llm-call-tracing` - LLM error context
- `tool-call-tracking` - Tool error handling