Claude-skill-registry decision-tracing
Trace agent decision-making, tool selection, and reasoning chains
install
source · Clone the upstream repo
git clone https://github.com/majiayu000/claude-skill-registry
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/data/decision-tracing" ~/.claude/skills/majiayu000-claude-skill-registry-decision-tracing && rm -rf "$T"
manifest:
skills/data/decision-tracing/SKILL.md · source content
Decision Tracing
Understand why agents make decisions, not just what they did.
Core Principle
For every agent action, capture:
- What options were available
- What was chosen and why
- What context influenced the decision
- Was it correct in hindsight
This enables debugging failures and optimizing decision quality.
Decision Span Attributes
# P0 - Always capture: which kind of decision was made, what was picked,
# and how confident the agent was.
span.set_attribute("decision.type", "tool_selection")
span.set_attribute("decision.chosen", "web_search")
span.set_attribute("decision.confidence", 0.85)

# P1 - For analysis: the alternatives that were on the table and the
# stated reason for the choice. The count is derived from the option list
# so the two attributes cannot drift apart.
decision_options = ["web_search", "calculator", "code_exec"]
span.set_attribute("decision.options", decision_options)
span.set_attribute("decision.options_count", len(decision_options))
span.set_attribute("decision.reasoning", "User asked about current events")

# P2 - For debugging: context size and the model that made the decision.
span.set_attribute("decision.context_tokens", 1500)
span.set_attribute("decision.model", "claude-3-5-sonnet")
Tool Selection Tracing
from langfuse.decorators import observe, langfuse_context


@observe(name="decision.tool_selection")
def trace_tool_selection(
    response,
    available_tools: list[str],
) -> dict:
    """Trace which tool was selected and why.

    Args:
        response: Model response. It must expose ``tool_calls`` (a sequence
            of objects with ``function.name``, or a falsy value) and may
            expose ``thinking``.
        available_tools: Names of the tools the model could have chosen.

    Returns:
        A dict with the chosen tool names under ``"chosen"`` and the
        available tool names under ``"available"``.
    """
    # Extract tool choice from response; a falsy tool_calls means no tools.
    tool_calls = response.tool_calls or []
    chosen_tools = [tc.function.name for tc in tool_calls]

    metadata = {
        "decision_type": "tool_selection",
        "available_tools": available_tools,
        "chosen_tools": chosen_tools,
        "num_tools_called": len(chosen_tools),
        "called_parallel": len(chosen_tools) > 1,
    }

    # If model provided reasoning (e.g., in <thinking> tags). The attribute
    # can exist but be None, in which case len() would raise, so test the
    # value rather than mere attribute presence.
    thinking = getattr(response, "thinking", None)
    if thinking is not None:
        metadata["reasoning_provided"] = True
        metadata["reasoning_length"] = len(thinking)

    # Single update so the result does not depend on whether successive
    # metadata updates are merged or replaced by the SDK.
    langfuse_context.update_current_observation(metadata=metadata)

    return {
        "chosen": chosen_tools,
        "available": available_tools,
    }
Routing Decision Tracing
@observe(name="decision.routing")
def trace_routing_decision(
    task: str,
    routed_to: str,
    available_agents: list[str],
    routing_scores: dict[str, float] = None,
) -> dict:
    """Record an agent/model routing decision on the current observation.

    Args:
        task: The task being routed (not written to the metadata here).
        routed_to: Name of the agent that was selected.
        available_agents: Names of all agents that could have been selected.
        routing_scores: Optional per-agent scores. When empty or omitted,
            the top score and score margin are recorded as ``None``.

    Returns:
        A dict containing the selected agent under ``"routed_to"``.
    """
    if routing_scores:
        top_score = max(routing_scores.values())
        # calculate_margin is defined elsewhere in the project.
        score_margin = calculate_margin(routing_scores)
    else:
        top_score = None
        score_margin = None

    routing_metadata = {
        "decision_type": "routing",
        "routed_to": routed_to,
        "available_agents": available_agents,
        "scores": routing_scores,
        "top_score": top_score,
        "score_margin": score_margin,
    }
    langfuse_context.update_current_observation(metadata=routing_metadata)

    return {"routed_to": routed_to}


def route_to_agent(task: str) -> str:
    """Pick the highest-scoring agent for ``task`` and trace the choice.

    Returns:
        The name of the chosen agent.
    """
    # Classifier-based routing: agent name -> category passed to the scorer.
    agent_categories = {
        "researcher": "research",
        "coder": "coding",
        "writer": "writing",
    }
    scores = {
        agent: classify_score(task, category)
        for agent, category in agent_categories.items()
    }

    chosen = max(scores, key=lambda agent: scores[agent])

    trace_routing_decision(
        task=task,
        routed_to=chosen,
        available_agents=list(scores),
        routing_scores=scores,
    )

    return chosen
Chain of Thought Tracing
@observe(name="decision.reasoning")
def trace_reasoning_chain(
    response,
    structured_output: bool = False,
) -> dict:
    """Extract and trace reasoning from agent responses.

    Args:
        response: Model response passed to ``extract_reasoning``. When
            ``structured_output`` is true, its optional ``parsed`` attribute
            is inspected for field names.
        structured_output: Whether to also record the structure of a parsed
            (structured) decision.

    Returns:
        A dict with the extracted reasoning under ``"reasoning"`` and the
        step count under ``"steps"`` (0 when there is no reasoning).
    """
    # Parse thinking/reasoning from response.
    reasoning = extract_reasoning(response)
    # Count once and reuse for both the metadata and the return value.
    steps = count_steps(reasoning) if reasoning else 0

    metadata = {
        "decision_type": "reasoning",
        "has_reasoning": reasoning is not None,
        "reasoning_steps": steps,
        "reasoning_length": len(reasoning) if reasoning else 0,
    }

    # If structured output, trace the decision structure. `parsed` may be
    # present but None (e.g. parsing failed), so check the value itself.
    parsed = getattr(response, "parsed", None) if structured_output else None
    if parsed is not None:
        model_cls = type(parsed)
        # Prefer the Pydantic v2 `model_fields` mapping and fall back to the
        # v1 `__fields__` mapping; both are looked up on the class.
        fields = (
            getattr(model_cls, "model_fields", None)
            or getattr(model_cls, "__fields__", None)
            or {}
        )
        metadata["structured_decision"] = True
        metadata["decision_fields"] = list(fields)

    # Single update so the result does not depend on whether successive
    # metadata updates are merged or replaced by the SDK.
    langfuse_context.update_current_observation(metadata=metadata)

    return {
        "reasoning": reasoning,
        "steps": steps,
    }
Multi-Step Decision Tracing
@observe(name="agent.run")
def run_agent_with_decision_tracing(task: str, max_steps: int = 10) -> str:
    """Full agent loop with decision tracing.

    Args:
        task: The user task, sent as the first message.
        max_steps: Upper bound on the number of LLM calls. Previously this
            was an undefined free name; it is now a parameter with a default.

    Returns:
        The final answer, or an empty string when the step budget ran out
        before the model chose to respond.
    """
    messages = [{"role": "user", "content": task}]
    decisions = []
    result = ""  # Defined up front so the final return cannot hit a NameError.

    for step in range(max_steps):
        # NOTE(review): confirm that `langfuse_context.observation` and
        # `step_span.set_attribute` exist in the installed SDK version.
        with langfuse_context.observation(name=f"step.{step}") as step_span:
            # Get LLM response
            response = call_llm(messages)

            # Trace the decision made at this step
            decision = {
                "step": step,
                "type": classify_decision_type(response),
                "action": None,
                "reasoning": extract_reasoning(response),
            }

            if response.tool_calls:
                # Tool use decision
                decision["action"] = "tool_call"
                decision["tools"] = [tc.function.name for tc in response.tool_calls]
                step_span.set_attribute("decision.type", "tool_call")
                step_span.set_attribute("decision.tools", decision["tools"])
            elif response.stop_reason == "end_turn":
                # Decision to respond
                decision["action"] = "respond"
                step_span.set_attribute("decision.type", "respond")
                step_span.set_attribute("decision.final", True)
                # NOTE(review): assumes the final answer text is exposed as
                # `response.content` - confirm against call_llm's return type.
                result = getattr(response, "content", "")

            decisions.append(decision)

        # A final response ends the run; without this the loop would keep
        # calling the model until the step budget is exhausted.
        if decision["action"] == "respond":
            break

        # Continue loop... (execute the requested tools and append their
        # results to `messages` before the next step)

    # Log full decision chain
    langfuse_context.update_current_observation(
        metadata={
            "decision_chain": decisions,
            "total_decisions": len(decisions),
            "tool_decisions": sum(1 for d in decisions if d["action"] == "tool_call"),
        }
    )

    return result
Decision Quality Scoring
@observe(name="decision.evaluate")
def evaluate_decision_quality(
    decision: dict,
    outcome: dict,
    ground_truth: dict = None,
) -> dict:
    """Score the quality of a decision after seeing the outcome.

    Args:
        decision: Decision record; must contain ``"type"``. For
            ``"tool_call"`` decisions, ``"tools"`` lists the chosen tools.
        outcome: Observed outcome. Reads ``"tool_success"``, ``"tokens"``,
            ``"steps"`` and ``"success"``, each with a default.
        ground_truth: Optional reference data. When it contains
            ``"expected_tool"``, the first chosen tool is compared to it.

    Returns:
        The dict of computed scores.
    """
    scores = {}
    decision_type = decision["type"]

    # Was the right tool chosen?
    if decision_type == "tool_call":
        if ground_truth and "expected_tool" in ground_truth:
            # A tool_call record with a missing or empty tool list counts as
            # an incorrect choice instead of raising KeyError/IndexError.
            tools = decision.get("tools") or []
            scores["tool_correct"] = (
                bool(tools) and tools[0] == ground_truth["expected_tool"]
            )

        # Did the tool call succeed?
        scores["tool_succeeded"] = outcome.get("tool_success", False)

    # Was the decision efficient?
    scores["tokens_used"] = outcome.get("tokens", 0)
    scores["steps_taken"] = outcome.get("steps", 0)

    # Did it lead to task completion?
    scores["task_completed"] = outcome.get("success", False)

    langfuse_context.update_current_observation(
        metadata={
            "decision_type": decision_type,
            "quality_scores": scores,
            # calculate_overall is defined elsewhere in the project.
            "overall_quality": calculate_overall(scores),
        }
    )

    return scores
Tool Selection Analysis
def analyze_tool_selection_patterns(traces: list) -> dict:
    """Analyze tool selection patterns across traces.

    Counts how often each tool appears in ``"tool_call"`` decisions.
    Decision records without a ``"type"``, and tool-call records without a
    ``"tools"`` list, are skipped so one partial record does not abort the
    whole analysis.

    Args:
        traces: Mappings that may carry a ``"decisions"`` list of decision
            dicts.

    Returns:
        A dict with ``"tool_usage"`` (tool -> count) filled in. The other
        keys (``"tool_success_rate"``, ``"tool_by_task_type"``,
        ``"unnecessary_calls"``, ``"missing_calls"``) are returned at their
        initial empty/zero values; this function does not compute them.
    """
    patterns = {
        "tool_usage": {},  # tool -> count
        "tool_success_rate": {},  # tool -> success rate
        "tool_by_task_type": {},  # task_type -> tool distribution
        "unnecessary_calls": 0,  # Tools called but not needed
        "missing_calls": 0,  # Tools needed but not called
    }
    tool_usage = patterns["tool_usage"]

    for trace in traces:
        # `or []` also covers a "decisions" key explicitly set to None.
        for decision in trace.get("decisions") or []:
            if decision.get("type") != "tool_call":
                continue
            for tool in decision.get("tools") or []:
                tool_usage[tool] = tool_usage.get(tool, 0) + 1

    return patterns
Decision Replay for Debugging
@observe(name="decision.replay")
def replay_decision(
    trace_id: str,
    step: int,
    new_context: dict = None,
) -> dict:
    """Re-run one recorded decision, optionally with altered context.

    Args:
        trace_id: Identifier of the trace to fetch.
        step: Index into the trace's recorded decisions.
        new_context: Optional overrides applied on top of the reconstructed
            context when non-empty.

    Returns:
        A dict with the ``"original"`` decision, the ``"replayed"`` decision,
        and a ``"changed"`` flag telling whether the two differ.
    """
    # Fetch original trace and the decision recorded at the requested step.
    # NOTE(review): `langfuse` must be a client object defined elsewhere.
    original = langfuse.get_trace(trace_id)
    original_decision = original.decisions[step]

    # Reconstruct context at that step, then layer any overrides on top.
    context = reconstruct_context(original, step)
    if new_context:
        context.update(new_context)

    # Re-run decision with same/modified context.
    new_decision = extract_decision(call_llm(context["messages"]))
    decision_changed = new_decision != original_decision

    replay_metadata = {
        "replay_of": trace_id,
        "original_step": step,
        "original_decision": original_decision,
        "new_decision": new_decision,
        "decision_changed": decision_changed,
        # True whenever an override dict was passed, even an empty one.
        "context_modified": new_context is not None,
    }
    langfuse_context.update_current_observation(metadata=replay_metadata)

    return {
        "original": original_decision,
        "replayed": new_decision,
        "changed": decision_changed,
    }
Decision Attribution
@observe(name="decision.attribution")
def trace_decision_attribution(
    decision: dict,
    context_sources: list[dict],
) -> dict:
    """Trace what context influenced a decision.

    Args:
        decision: Decision record; must contain ``"type"``.
        context_sources: Candidate sources, each with ``"id"`` and ``"type"``.

    Returns:
        A dict with the ``"decision"`` and, under ``"attributed_to"``, the
        sources whose relevance exceeds 0.5, in their input order.
    """
    # Analyze which context pieces were relevant.
    relevant_sources = []
    for source in context_sources:
        # calculate_relevance is defined elsewhere in the project.
        relevance = calculate_relevance(decision, source)
        if relevance > 0.5:
            relevant_sources.append({
                "source_id": source["id"],
                "source_type": source["type"],
                "relevance": relevance,
            })

    # Rank by relevance so "top_source" and the top-3 attribution really are
    # the most relevant sources rather than the first ones encountered.
    # sorted() is stable, so ties keep their input order.
    ranked = sorted(
        relevant_sources,
        key=lambda entry: entry["relevance"],
        reverse=True,
    )

    langfuse_context.update_current_observation(
        metadata={
            "decision_type": decision["type"],
            "context_sources_total": len(context_sources),
            "context_sources_relevant": len(relevant_sources),
            "top_source": ranked[0]["source_id"] if ranked else None,
            "attribution": ranked[:3],  # Top 3
        }
    )

    return {
        "decision": decision,
        "attributed_to": relevant_sources,
    }
Dashboard Metrics
# Decision quality metrics: metric name -> human-readable description.
decision_metrics = {
    # Accuracy
    "tool_selection_accuracy": "% correct tool choices",
    "routing_accuracy": "% correct agent routing",

    # Efficiency
    "avg_decisions_per_task": "Average decisions before completion",
    "unnecessary_tool_calls": "Tool calls that didn't help",
    "backtrack_rate": "% of tasks requiring backtracking",

    # Reasoning
    "reasoning_provided_rate": "% with explicit reasoning",
    "reasoning_quality_score": "Avg reasoning quality (via eval)",

    # Outcomes
    "decision_to_success_rate": "% of decisions leading to success",
    "first_decision_correct_rate": "% first decision was right",
}
Anti-Patterns
| Anti-Pattern | Problem | Fix |
|---|---|---|
| Only logging chosen action | Can't analyze alternatives | Log available options |
| No confidence scores | Can't identify uncertain decisions | Log model confidence |
| Missing context at decision time | Can't replay/debug | Snapshot context |
| No decision-outcome linking | Can't measure quality | Track outcome per decision |
| Aggregating all decisions | Lose granular insight | Trace each decision point |
Related Skills
- Tool execution details: tool-call-tracking
- Agent routing: multi-agent-coordination
- Decision quality scoring: evaluation-quality