Claude-skill-registry decision-tracing

Trace agent decision-making, tool selection, and reasoning chains

install
source · Clone the upstream repo
git clone https://github.com/majiayu000/claude-skill-registry
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/data/decision-tracing" ~/.claude/skills/majiayu000-claude-skill-registry-decision-tracing && rm -rf "$T"
manifest: skills/data/decision-tracing/SKILL.md
source content

Decision Tracing

Understand why agents make decisions, not just what they did.

Core Principle

For every agent action, capture:

  1. What options were available
  2. What was chosen and why
  3. What context influenced the decision
  4. Was it correct in hindsight?

This enables debugging failures and optimizing decision quality.

Decision Span Attributes

# NOTE(review): assumes `span` is an OpenTelemetry-style span object already
# in scope — confirm against the tracing setup used by the caller.
# P0 - Always capture
span.set_attribute("decision.type", "tool_selection")
span.set_attribute("decision.chosen", "web_search")
span.set_attribute("decision.confidence", 0.85)

# P1 - For analysis
span.set_attribute("decision.options", ["web_search", "calculator", "code_exec"])
span.set_attribute("decision.options_count", 3)
span.set_attribute("decision.reasoning", "User asked about current events")

# P2 - For debugging
span.set_attribute("decision.context_tokens", 1500)
span.set_attribute("decision.model", "claude-3-5-sonnet")

Tool Selection Tracing

from langfuse.decorators import observe, langfuse_context

@observe(name="decision.tool_selection")
def trace_tool_selection(
    response,
    available_tools: list[str],
) -> dict:
    """Trace which tool was selected and why.

    Args:
        response: LLM response object; expected to expose ``tool_calls``
            (possibly ``None``) and, optionally, a ``thinking`` attribute
            holding the model's reasoning text.
        available_tools: Names of the tools the model could have chosen.

    Returns:
        dict with ``chosen`` (tool names actually called) and
        ``available`` (the candidate tool names).
    """
    # Extract tool choice from response; tool_calls may be None.
    tool_calls = response.tool_calls or []
    chosen_tools = [tc.function.name for tc in tool_calls]

    metadata = {
        "decision_type": "tool_selection",
        "available_tools": available_tools,
        "chosen_tools": chosen_tools,
        "num_tools_called": len(chosen_tools),
        "called_parallel": len(chosen_tools) > 1,
    }

    # If model provided reasoning (e.g., in <thinking> tags), record it too.
    if hasattr(response, "thinking"):
        metadata["reasoning_provided"] = True
        metadata["reasoning_length"] = len(response.thinking)

    # Single update instead of two successive calls, so we don't depend on
    # how repeated update_current_observation calls merge their metadata.
    langfuse_context.update_current_observation(metadata=metadata)

    return {
        "chosen": chosen_tools,
        "available": available_tools,
    }

Routing Decision Tracing

@observe(name="decision.routing")
def trace_routing_decision(
    task: str,
    routed_to: str,
    available_agents: list[str],
    routing_scores: dict[str, float] | None = None,
) -> dict:
    """Trace agent/model routing decisions.

    Args:
        task: The task text being routed (recorded via the decorator).
        routed_to: Name of the agent the task was routed to.
        available_agents: All candidate agent names.
        routing_scores: Optional classifier score per agent; when provided,
            the top score and the winning margin are logged as well.

    Returns:
        dict echoing the chosen agent under ``routed_to``.
    """
    langfuse_context.update_current_observation(
        metadata={
            "decision_type": "routing",
            "routed_to": routed_to,
            "available_agents": available_agents,
            "scores": routing_scores,
            # Score-derived fields are None when no scores were supplied.
            "top_score": max(routing_scores.values()) if routing_scores else None,
            "score_margin": calculate_margin(routing_scores) if routing_scores else None,
        }
    )

    return {"routed_to": routed_to}

def route_to_agent(task: str) -> str:
    """Score each candidate agent for the task and route to the winner."""
    # Classifier-based routing: one relevance score per agent role.
    role_by_agent = {
        "researcher": "research",
        "coder": "coding",
        "writer": "writing",
    }
    scores = {
        agent: classify_score(task, role)
        for agent, role in role_by_agent.items()
    }

    # Highest-scoring agent wins; ties resolve to the first-inserted key,
    # exactly as max(scores, key=scores.get) would.
    winner = max(scores, key=scores.get)

    trace_routing_decision(
        task=task,
        routed_to=winner,
        available_agents=list(scores),
        routing_scores=scores,
    )

    return winner

Chain of Thought Tracing

@observe(name="decision.reasoning")
def trace_reasoning_chain(
    response,
    structured_output: bool = False,
) -> dict:
    """Extract and trace reasoning from agent responses.

    Args:
        response: LLM response; reasoning text is pulled out via
            ``extract_reasoning``. May also carry a ``parsed`` attribute
            when structured output was requested.
        structured_output: When True and ``response.parsed`` exists, the
            parsed model's field names are logged as the decision structure.

    Returns:
        dict with the extracted ``reasoning`` text (or None) and the
        number of reasoning ``steps``.
    """
    # Parse thinking/reasoning from response.
    reasoning = extract_reasoning(response)
    # Hoisted: previously count_steps(reasoning) was computed twice per call.
    steps = count_steps(reasoning) if reasoning else 0

    metadata = {
        "decision_type": "reasoning",
        "has_reasoning": reasoning is not None,
        "reasoning_steps": steps,
        "reasoning_length": len(reasoning) if reasoning else 0,
    }

    # If structured output, trace the decision structure too.
    if structured_output and hasattr(response, "parsed"):
        metadata["structured_decision"] = True
        # NOTE(review): __fields__ is the Pydantic v1 attribute; v2 renamed
        # it to model_fields — confirm which Pydantic version is in use.
        metadata["decision_fields"] = list(response.parsed.__fields__.keys())

    # Single update so the structured-output fields cannot clobber or be
    # clobbered by the base metadata.
    langfuse_context.update_current_observation(metadata=metadata)

    return {
        "reasoning": reasoning,
        "steps": steps,
    }

Multi-Step Decision Tracing

@observe(name="agent.run")
def run_agent_with_decision_tracing(task: str, max_steps: int = 10) -> str:
    """Full agent loop with decision tracing.

    Args:
        task: The user task to run the agent on.
        max_steps: Upper bound on agent loop iterations (was previously an
            undefined global name, which raised NameError).

    Returns:
        The agent's final text response ("" if no respond decision occurred
        within ``max_steps``).
    """
    messages = [{"role": "user", "content": task}]
    decisions = []
    result = ""  # previously referenced at return but never assigned

    for step in range(max_steps):
        with langfuse_context.observation(name=f"step.{step}") as step_span:
            # Get LLM response for the current conversation state.
            response = call_llm(messages)

            # Trace the decision made at this step.
            decision = {
                "step": step,
                "type": classify_decision_type(response),
                "action": None,
                "reasoning": extract_reasoning(response),
            }

            if response.tool_calls:
                # Tool use decision.
                decision["action"] = "tool_call"
                decision["tools"] = [tc.function.name for tc in response.tool_calls]

                step_span.set_attribute("decision.type", "tool_call")
                step_span.set_attribute("decision.tools", decision["tools"])

            elif response.stop_reason == "end_turn":
                # Decision to respond — this ends the loop.
                decision["action"] = "respond"

                step_span.set_attribute("decision.type", "respond")
                step_span.set_attribute("decision.final", True)

            decisions.append(decision)

            if decision["action"] == "respond":
                # NOTE(review): assumes the final answer text is on
                # response.content — confirm against the LLM client used.
                result = response.content
                break

            # Continue loop: execute tools, append results to messages...

    # Log full decision chain on the enclosing agent.run observation.
    langfuse_context.update_current_observation(
        metadata={
            "decision_chain": decisions,
            "total_decisions": len(decisions),
            "tool_decisions": sum(1 for d in decisions if d["action"] == "tool_call"),
        }
    )

    return result

Decision Quality Scoring

@observe(name="decision.evaluate")
def evaluate_decision_quality(
    decision: dict,
    outcome: dict,
    ground_truth: dict | None = None,
) -> dict:
    """Score the quality of a decision after seeing the outcome.

    Args:
        decision: Traced decision record; expects at least ``type`` and,
            for tool calls, a ``tools`` list.
        outcome: Observed result; read keys are ``tool_success``,
            ``tokens``, ``steps``, and ``success``.
        ground_truth: Optional reference answer; ``expected_tool`` enables
            the tool-correctness score.

    Returns:
        dict of individual quality scores.
    """
    scores = {}

    # Was the right tool chosen?
    if decision["type"] == "tool_call":
        # Guard: an empty tools list would make [0] raise IndexError.
        if ground_truth and "expected_tool" in ground_truth and decision.get("tools"):
            scores["tool_correct"] = decision["tools"][0] == ground_truth["expected_tool"]

        # Did the tool call succeed?
        scores["tool_succeeded"] = outcome.get("tool_success", False)

    # Was the decision efficient?
    scores["tokens_used"] = outcome.get("tokens", 0)
    scores["steps_taken"] = outcome.get("steps", 0)

    # Did it lead to task completion?
    scores["task_completed"] = outcome.get("success", False)

    langfuse_context.update_current_observation(
        metadata={
            "decision_type": decision["type"],
            "quality_scores": scores,
            "overall_quality": calculate_overall(scores),
        }
    )

    return scores

Tool Selection Analysis

def analyze_tool_selection_patterns(traces: list) -> dict:
    """Analyze tool selection patterns across traces.

    Args:
        traces: Trace dicts; each may carry a ``decisions`` list whose
            tool-call entries have ``type == "tool_call"`` and a ``tools``
            list of tool names.

    Returns:
        Patterns dict; only ``tool_usage`` is populated here — the other
        keys are placeholders for downstream analysis.
    """
    patterns = {
        "tool_usage": {},           # tool -> count
        "tool_success_rate": {},    # tool -> success rate
        "tool_by_task_type": {},    # task_type -> tool distribution
        "unnecessary_calls": 0,      # Tools called but not needed
        "missing_calls": 0,          # Tools needed but not called
    }

    for trace in traces:
        for decision in trace.get("decisions", []):
            # .get: stay defensive like trace.get above — a decision
            # without a "type" key previously raised KeyError.
            if decision.get("type") == "tool_call":
                for tool in decision.get("tools", []):
                    patterns["tool_usage"][tool] = patterns["tool_usage"].get(tool, 0) + 1

    return patterns

Decision Replay for Debugging

@observe(name="decision.replay")
def replay_decision(
    trace_id: str,
    step: int,
    new_context: dict | None = None,
) -> dict:
    """Replay a decision with same or modified context.

    Args:
        trace_id: Langfuse trace id of the original run.
        step: Index of the decision to replay within that trace.
        new_context: Optional overrides merged into the reconstructed
            context before re-running the decision.

    Returns:
        dict with the original decision, the replayed decision, and
        whether they differ.
    """
    # Fetch original trace.
    # NOTE(review): `langfuse` client is assumed to be in module scope —
    # confirm where it is instantiated.
    original = langfuse.get_trace(trace_id)
    original_decision = original.decisions[step]

    # Reconstruct context at that step, then apply any overrides.
    context = reconstruct_context(original, step)
    if new_context:
        context.update(new_context)

    # Re-run decision with same/modified context.
    new_response = call_llm(context["messages"])
    new_decision = extract_decision(new_response)

    langfuse_context.update_current_observation(
        metadata={
            "replay_of": trace_id,
            "original_step": step,
            "original_decision": original_decision,
            "new_decision": new_decision,
            "decision_changed": new_decision != original_decision,
            "context_modified": new_context is not None,
        }
    )

    return {
        "original": original_decision,
        "replayed": new_decision,
        "changed": new_decision != original_decision,
    }

Decision Attribution

@observe(name="decision.attribution")
def trace_decision_attribution(
    decision: dict,
    context_sources: list[dict],
) -> dict:
    """Trace what context influenced a decision.

    Args:
        decision: Traced decision record (``type`` is logged).
        context_sources: Candidate context pieces, each with ``id`` and
            ``type`` keys; relevance to the decision is computed per source.

    Returns:
        dict with the decision and the relevant sources, most relevant first.
    """
    # Analyze which context pieces were relevant (threshold 0.5).
    relevant_sources = []
    for source in context_sources:
        relevance = calculate_relevance(decision, source)
        if relevance > 0.5:
            relevant_sources.append({
                "source_id": source["id"],
                "source_type": source["type"],
                "relevance": relevance,
            })

    # Fix: sort by relevance so [0] really is the top source and [:3] really
    # is the top 3 — previously the list was in input order.
    relevant_sources.sort(key=lambda s: s["relevance"], reverse=True)

    langfuse_context.update_current_observation(
        metadata={
            "decision_type": decision["type"],
            "context_sources_total": len(context_sources),
            "context_sources_relevant": len(relevant_sources),
            "top_source": relevant_sources[0]["source_id"] if relevant_sources else None,
            "attribution": relevant_sources[:3],  # Top 3
        }
    )

    return {
        "decision": decision,
        "attributed_to": relevant_sources,
    }

Dashboard Metrics

# Decision quality metrics: dashboard metric name -> human-readable
# description of what it measures.
decision_metrics = {
    # Accuracy
    "tool_selection_accuracy": "% correct tool choices",
    "routing_accuracy": "% correct agent routing",

    # Efficiency
    "avg_decisions_per_task": "Average decisions before completion",
    "unnecessary_tool_calls": "Tool calls that didn't help",
    "backtrack_rate": "% of tasks requiring backtracking",

    # Reasoning
    "reasoning_provided_rate": "% with explicit reasoning",
    "reasoning_quality_score": "Avg reasoning quality (via eval)",

    # Outcomes
    "decision_to_success_rate": "% of decisions leading to success",
    "first_decision_correct_rate": "% first decision was right",
}

Anti-Patterns

| Anti-Pattern | Problem | Fix |
| --- | --- | --- |
| Only logging chosen action | Can't analyze alternatives | Log available options |
| No confidence scores | Can't identify uncertain decisions | Log model confidence |
| Missing context at decision time | Can't replay/debug | Snapshot context |
| No decision-outcome linking | Can't measure quality | Track outcome per decision |
| Aggregating all decisions | Lose granular insight | Trace each decision point |

Related Skills

  • tool-call-tracking
    - Tool execution details
  • multi-agent-coordination
    - Agent routing
  • evaluation-quality
    - Decision quality scoring