Claude-skill-registry decision-tracing

Trace agent decision-making, tool selection, and reasoning chains

install

source · Clone the upstream repo

git clone https://github.com/majiayu000/claude-skill-registry

Claude Code · Install into ~/.claude/skills/

T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/data/decision-tracing" ~/.claude/skills/majiayu000-claude-skill-registry-decision-tracing && rm -rf "$T"

manifest: skills/data/decision-tracing/SKILL.md

Decision Tracing

Understand why agents make decisions, not just what they did.

Core Principle

For every agent action, capture:

What options were available
What was chosen and why
What context influenced the decision
Was it correct in hindsight

This enables debugging failures and optimizing decision quality.

Decision Span Attributes

# P0 - Always capture
span.set_attribute("decision.type", "tool_selection")
span.set_attribute("decision.chosen", "web_search")
span.set_attribute("decision.confidence", 0.85)

# P1 - For analysis
span.set_attribute("decision.options", ["web_search", "calculator", "code_exec"])
span.set_attribute("decision.options_count", 3)
span.set_attribute("decision.reasoning", "User asked about current events")

# P2 - For debugging
span.set_attribute("decision.context_tokens", 1500)
span.set_attribute("decision.model", "claude-3-5-sonnet")

Tool Selection Tracing

from langfuse.decorators import observe, langfuse_context

@observe(name="decision.tool_selection")
def trace_tool_selection(
    response,
    available_tools: list[str],
) -> dict:
    """Trace which tool was selected and why."""

    # Extract tool choice from response
    tool_calls = response.tool_calls or []
    chosen_tools = [tc.function.name for tc in tool_calls]

    langfuse_context.update_current_observation(
        metadata={
            "decision_type": "tool_selection",
            "available_tools": available_tools,
            "chosen_tools": chosen_tools,
            "num_tools_called": len(chosen_tools),
            "called_parallel": len(chosen_tools) > 1,
        }
    )

    # If model provided reasoning (e.g., in <thinking> tags)
    if hasattr(response, "thinking"):
        langfuse_context.update_current_observation(
            metadata={
                "reasoning_provided": True,
                "reasoning_length": len(response.thinking),
            }
        )

    return {
        "chosen": chosen_tools,
        "available": available_tools,
    }

Routing Decision Tracing

@observe(name="decision.routing")
def trace_routing_decision(
    task: str,
    routed_to: str,
    available_agents: list[str],
    routing_scores: dict[str, float] = None,
) -> dict:
    """Trace agent/model routing decisions."""

    langfuse_context.update_current_observation(
        metadata={
            "decision_type": "routing",
            "routed_to": routed_to,
            "available_agents": available_agents,
            "scores": routing_scores,
            "top_score": max(routing_scores.values()) if routing_scores else None,
            "score_margin": calculate_margin(routing_scores) if routing_scores else None,
        }
    )

    return {"routed_to": routed_to}

def route_to_agent(task: str) -> str:
    """Route task to appropriate agent."""

    # Classifier-based routing
    scores = {
        "researcher": classify_score(task, "research"),
        "coder": classify_score(task, "coding"),
        "writer": classify_score(task, "writing"),
    }

    chosen = max(scores, key=scores.get)

    trace_routing_decision(
        task=task,
        routed_to=chosen,
        available_agents=list(scores.keys()),
        routing_scores=scores,
    )

    return chosen

Chain of Thought Tracing

@observe(name="decision.reasoning")
def trace_reasoning_chain(
    response,
    structured_output: bool = False,
) -> dict:
    """Extract and trace reasoning from agent responses."""

    # Parse thinking/reasoning from response
    reasoning = extract_reasoning(response)

    langfuse_context.update_current_observation(
        metadata={
            "decision_type": "reasoning",
            "has_reasoning": reasoning is not None,
            "reasoning_steps": count_steps(reasoning) if reasoning else 0,
            "reasoning_length": len(reasoning) if reasoning else 0,
        }
    )

    # If structured output, trace the decision structure
    if structured_output and hasattr(response, "parsed"):
        langfuse_context.update_current_observation(
            metadata={
                "structured_decision": True,
                "decision_fields": list(response.parsed.__fields__.keys()),
            }
        )

    return {
        "reasoning": reasoning,
        "steps": count_steps(reasoning) if reasoning else 0,
    }

Multi-Step Decision Tracing

@observe(name="agent.run")
def run_agent_with_decision_tracing(task: str) -> str:
    """Full agent loop with decision tracing."""

    messages = [{"role": "user", "content": task}]
    decisions = []

    for step in range(max_steps):
        with langfuse_context.observation(name=f"step.{step}") as step_span:
            # Get LLM response
            response = call_llm(messages)

            # Trace the decision made at this step
            decision = {
                "step": step,
                "type": classify_decision_type(response),
                "action": None,
                "reasoning": extract_reasoning(response),
            }

            if response.tool_calls:
                # Tool use decision
                decision["action"] = "tool_call"
                decision["tools"] = [tc.function.name for tc in response.tool_calls]

                step_span.set_attribute("decision.type", "tool_call")
                step_span.set_attribute("decision.tools", decision["tools"])

            elif response.stop_reason == "end_turn":
                # Decision to respond
                decision["action"] = "respond"

                step_span.set_attribute("decision.type", "respond")
                step_span.set_attribute("decision.final", True)

            decisions.append(decision)

            # Continue loop...

    # Log full decision chain
    langfuse_context.update_current_observation(
        metadata={
            "decision_chain": decisions,
            "total_decisions": len(decisions),
            "tool_decisions": sum(1 for d in decisions if d["action"] == "tool_call"),
        }
    )

    return result

Decision Quality Scoring

@observe(name="decision.evaluate")
def evaluate_decision_quality(
    decision: dict,
    outcome: dict,
    ground_truth: dict = None,
) -> dict:
    """Score the quality of a decision after seeing the outcome."""

    scores = {}

    # Was the right tool chosen?
    if decision["type"] == "tool_call":
        if ground_truth and "expected_tool" in ground_truth:
            scores["tool_correct"] = decision["tools"][0] == ground_truth["expected_tool"]

        # Did the tool call succeed?
        scores["tool_succeeded"] = outcome.get("tool_success", False)

    # Was the decision efficient?
    scores["tokens_used"] = outcome.get("tokens", 0)
    scores["steps_taken"] = outcome.get("steps", 0)

    # Did it lead to task completion?
    scores["task_completed"] = outcome.get("success", False)

    langfuse_context.update_current_observation(
        metadata={
            "decision_type": decision["type"],
            "quality_scores": scores,
            "overall_quality": calculate_overall(scores),
        }
    )

    return scores

Tool Selection Analysis

def analyze_tool_selection_patterns(traces: list) -> dict:
    """Analyze tool selection patterns across traces."""

    patterns = {
        "tool_usage": {},           # tool -> count
        "tool_success_rate": {},    # tool -> success rate
        "tool_by_task_type": {},    # task_type -> tool distribution
        "unnecessary_calls": 0,      # Tools called but not needed
        "missing_calls": 0,          # Tools needed but not called
    }

    for trace in traces:
        for decision in trace.get("decisions", []):
            if decision["type"] == "tool_call":
                for tool in decision["tools"]:
                    patterns["tool_usage"][tool] = patterns["tool_usage"].get(tool, 0) + 1

    return patterns

Decision Replay for Debugging

@observe(name="decision.replay")
def replay_decision(
    trace_id: str,
    step: int,
    new_context: dict = None,
) -> dict:
    """Replay a decision with same or modified context."""

    # Fetch original trace
    original = langfuse.get_trace(trace_id)
    original_decision = original.decisions[step]

    # Reconstruct context at that step
    context = reconstruct_context(original, step)
    if new_context:
        context.update(new_context)

    # Re-run decision with same/modified context
    new_response = call_llm(context["messages"])
    new_decision = extract_decision(new_response)

    langfuse_context.update_current_observation(
        metadata={
            "replay_of": trace_id,
            "original_step": step,
            "original_decision": original_decision,
            "new_decision": new_decision,
            "decision_changed": new_decision != original_decision,
            "context_modified": new_context is not None,
        }
    )

    return {
        "original": original_decision,
        "replayed": new_decision,
        "changed": new_decision != original_decision,
    }

Decision Attribution

@observe(name="decision.attribution")
def trace_decision_attribution(
    decision: dict,
    context_sources: list[dict],
) -> dict:
    """Trace what context influenced a decision."""

    # Analyze which context pieces were relevant
    relevant_sources = []
    for source in context_sources:
        relevance = calculate_relevance(decision, source)
        if relevance > 0.5:
            relevant_sources.append({
                "source_id": source["id"],
                "source_type": source["type"],
                "relevance": relevance,
            })

    langfuse_context.update_current_observation(
        metadata={
            "decision_type": decision["type"],
            "context_sources_total": len(context_sources),
            "context_sources_relevant": len(relevant_sources),
            "top_source": relevant_sources[0]["source_id"] if relevant_sources else None,
            "attribution": relevant_sources[:3],  # Top 3
        }
    )

    return {
        "decision": decision,
        "attributed_to": relevant_sources,
    }

Dashboard Metrics

# Decision quality metrics
decision_metrics = {
    # Accuracy
    "tool_selection_accuracy": "% correct tool choices",
    "routing_accuracy": "% correct agent routing",

    # Efficiency
    "avg_decisions_per_task": "Average decisions before completion",
    "unnecessary_tool_calls": "Tool calls that didn't help",
    "backtrack_rate": "% of tasks requiring backtracking",

    # Reasoning
    "reasoning_provided_rate": "% with explicit reasoning",
    "reasoning_quality_score": "Avg reasoning quality (via eval)",

    # Outcomes
    "decision_to_success_rate": "% of decisions leading to success",
    "first_decision_correct_rate": "% first decision was right",
}

Anti-Patterns

Anti-Pattern	Problem	Fix
Only logging chosen action	Can't analyze alternatives	Log available options
No confidence scores	Can't identify uncertain decisions	Log model confidence
Missing context at decision time	Can't replay/debug	Snapshot context
No decision-outcome linking	Can't measure quality	Track outcome per decision
Aggregating all decisions	Lose granular insight	Trace each decision point

Related Skills

```
tool-call-tracking
```
- Tool execution details
```
multi-agent-coordination
```
- Agent routing
```
evaluation-quality
```
- Decision quality scoring