# claude-skill-registry · guardrails-safety
Instrument safety checks, content filters, and guardrails for agent outputs
## Install

Source · Clone the upstream repo:

```bash
git clone https://github.com/majiayu000/claude-skill-registry
```

Claude Code · Install into `~/.claude/skills/`:

```bash
T=$(mktemp -d) \
  && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" \
  && mkdir -p ~/.claude/skills \
  && cp -r "$T/skills/data/guardrails-safety" \
      ~/.claude/skills/majiayu000-claude-skill-registry-guardrails-safety \
  && rm -rf "$T"
```

Manifest: `skills/data/guardrails-safety/SKILL.md`
# Guardrails & Safety Instrumentation
Instrument safety checks to catch issues before users see them.
## Core Principle
Guardrails run at two points (sketched below):
- Input guardrails: Before the LLM sees user input
- Output guardrails: Before the user sees LLM output
Both must be instrumented to:
- Know what was blocked and why
- Measure false positive rate (blocking good content)
- Track latency overhead of safety checks
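
A minimal sketch of that two-checkpoint flow — the stub functions here are placeholders for the concrete guardrails and model call defined in the rest of this skill:

```python
def check_input(text: str) -> dict:      # placeholder input guardrail
    return {"passed": True, "action": "pass"}

def call_llm(text: str) -> str:          # placeholder model call
    return f"echo: {text}"

def check_output(text: str) -> dict:     # placeholder output guardrail
    return {"passed": True, "action": "pass", "redacted_output": text}

def handle_request(user_input: str) -> str:
    # Input guardrail: runs before the LLM sees the user input
    if not check_input(user_input)["passed"]:
        return "Request could not be processed."

    output = call_llm(user_input)

    # Output guardrail: runs before the user sees the LLM output
    verdict = check_output(output)
    if not verdict["passed"]:
        return "Response could not be delivered."

    # Users receive the possibly-redacted version of the output
    return verdict.get("redacted_output", output)
```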
## Guardrail Span Attributes
```python
# P0 - Always capture
span.set_attribute("guardrail.name", "pii_filter")
span.set_attribute("guardrail.type", "output")  # or "input"
span.set_attribute("guardrail.triggered", True)
span.set_attribute("guardrail.action", "block")  # block, warn, redact, pass

# P1 - For analysis
span.set_attribute("guardrail.category", "pii")
span.set_attribute("guardrail.confidence", 0.95)
span.set_attribute("guardrail.latency_ms", 45)

# P2 - For debugging (be careful with PII)
span.set_attribute("guardrail.matched_pattern", "SSN")
span.set_attribute("guardrail.redacted_count", 2)
```
## Input Guardrails
### Prompt Injection Detection
```python
import re

from langfuse.decorators import observe, langfuse_context


@observe(name="guardrail.input.injection")
def check_prompt_injection(user_input: str) -> dict:
    """Detect prompt injection attempts."""
    # Simple heuristic checks
    injection_patterns = [
        r"ignore.*previous.*instructions",
        r"you are now",
        r"new instructions:",
        r"system prompt:",
        r"<\|.*\|>",  # Special tokens
    ]

    triggered = False
    matched = []
    for pattern in injection_patterns:
        if re.search(pattern, user_input, re.IGNORECASE):
            triggered = True
            matched.append(pattern)

    langfuse_context.update_current_observation(
        metadata={
            "guardrail_name": "prompt_injection",
            "guardrail_type": "input",
            "triggered": triggered,
            "patterns_matched": len(matched),
        }
    )

    return {
        "passed": not triggered,
        "action": "block" if triggered else "pass",
        "reason": "prompt_injection" if triggered else None,
    }
```
### Input Content Filter
@observe(name="guardrail.input.content") def check_input_content(user_input: str) -> dict: """Filter harmful input content.""" # Use moderation API or classifier result = moderation_api.check(user_input) triggered = result.flagged categories = [c for c, v in result.categories.items() if v] langfuse_context.update_current_observation( metadata={ "guardrail_name": "content_filter", "guardrail_type": "input", "triggered": triggered, "categories": categories, "scores": result.category_scores, } ) return { "passed": not triggered, "action": "block" if triggered else "pass", "categories": categories, }
## Output Guardrails
### PII Detection & Redaction
@observe(name="guardrail.output.pii") def check_pii(output: str) -> dict: """Detect and optionally redact PII.""" pii_patterns = { "ssn": r"\b\d{3}-\d{2}-\d{4}\b", "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "phone": r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b", "credit_card": r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b", } detections = {} redacted_output = output for pii_type, pattern in pii_patterns.items(): matches = re.findall(pattern, output) if matches: detections[pii_type] = len(matches) # Redact redacted_output = re.sub(pattern, f"[{pii_type.upper()}_REDACTED]", redacted_output) triggered = len(detections) > 0 langfuse_context.update_current_observation( metadata={ "guardrail_name": "pii_filter", "guardrail_type": "output", "triggered": triggered, "pii_types_found": list(detections.keys()), "total_redactions": sum(detections.values()), } ) return { "passed": not triggered, "action": "redact" if triggered else "pass", "redacted_output": redacted_output if triggered else output, "detections": detections, }
### Hallucination Check
@observe(name="guardrail.output.hallucination") def check_hallucination( output: str, context: list[str], threshold: float = 0.7, ) -> dict: """Check if output is grounded in provided context.""" # Use NLI model or LLM-as-judge grounding_score = check_grounding(output, context) triggered = grounding_score < threshold langfuse_context.update_current_observation( metadata={ "guardrail_name": "hallucination_check", "guardrail_type": "output", "triggered": triggered, "grounding_score": grounding_score, "threshold": threshold, "context_chunks": len(context), } ) return { "passed": not triggered, "action": "warn" if triggered else "pass", "grounding_score": grounding_score, }
### Output Safety Filter
@observe(name="guardrail.output.safety") def check_output_safety(output: str) -> dict: """Check output for harmful content.""" result = moderation_api.check(output) # Also check for refusals that shouldn't happen false_refusal = any(phrase in output.lower() for phrase in [ "i cannot help", "i'm not able to", "as an ai", ]) langfuse_context.update_current_observation( metadata={ "guardrail_name": "output_safety", "guardrail_type": "output", "triggered": result.flagged, "false_refusal_detected": false_refusal, "categories": [c for c, v in result.categories.items() if v], } ) return { "passed": not result.flagged, "action": "block" if result.flagged else "pass", "false_refusal": false_refusal, }
## Guardrail Pipeline
```python
import inspect

from langfuse.decorators import observe, langfuse_context


class GuardrailPipeline:
    """Run multiple guardrails in sequence."""

    def __init__(self, guardrails: list):
        self.guardrails = guardrails

    @observe(name="guardrails.run")
    def run(self, content: str, context=None) -> dict:
        results = []
        blocked = False
        final_content = content

        for guardrail in self.guardrails:
            # Only pass `context` to guardrails that accept it
            # (e.g. check_hallucination); the rest take content only
            if "context" in inspect.signature(guardrail).parameters:
                result = guardrail(final_content, context)
            else:
                result = guardrail(final_content)
            results.append({
                "name": guardrail.__name__,
                "passed": result["passed"],
                "action": result["action"],
            })

            if result["action"] == "block":
                blocked = True
                break
            elif result["action"] == "redact":
                final_content = result.get("redacted_output", final_content)

        langfuse_context.update_current_observation(
            metadata={
                "guardrails_run": len(results),
                "any_triggered": any(not r["passed"] for r in results),
                "blocked": blocked,
                "actions": [r["action"] for r in results],
            }
        )

        return {
            "passed": not blocked,
            "content": final_content,
            "results": results,
        }


# Setup pipeline
input_guardrails = GuardrailPipeline([
    check_prompt_injection,
    check_input_content,
])

output_guardrails = GuardrailPipeline([
    check_pii,
    check_hallucination,
    check_output_safety,
])
```
## Full Agent with Guardrails
@observe(name="agent.run") def run_agent_with_guardrails(task: str, user_id: str) -> dict: """Agent with full guardrail pipeline.""" # Input guardrails input_check = input_guardrails.run(task) if not input_check["passed"]: langfuse_context.update_current_observation( metadata={"blocked_at": "input", "reason": input_check["results"][-1]["name"]} ) return { "success": False, "blocked": True, "stage": "input", "message": "Request could not be processed.", } # Run agent result = agent.invoke(input_check["content"]) # Output guardrails output_check = output_guardrails.run( result.output, context={"sources": result.sources}, ) if not output_check["passed"]: langfuse_context.update_current_observation( metadata={"blocked_at": "output", "reason": output_check["results"][-1]["name"]} ) return { "success": False, "blocked": True, "stage": "output", "message": "Response could not be delivered.", } return { "success": True, "output": output_check["content"], # May be redacted "guardrails_triggered": any(not r["passed"] for r in output_check["results"]), }
## Guardrail Metrics Dashboard
```python
# Track guardrail performance
guardrail_metrics = {
    # Volume
    "total_checks": "Total guardrail runs",
    "triggered_count": "Times guardrail triggered",
    "trigger_rate": "% of checks that trigger",

    # Actions
    "block_rate": "% blocked completely",
    "redact_rate": "% with redactions",
    "warn_rate": "% with warnings only",

    # Performance
    "latency_p50_ms": "Median guardrail latency",
    "latency_p99_ms": "99th percentile latency",

    # Quality
    "false_positive_rate": "% incorrectly blocked (via feedback)",
    "false_negative_rate": "% that should have blocked",
}
```
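
As a rough sketch of how these could be computed offline — assuming guardrail spans have been exported as dicts with `triggered`, `action`, and `latency_ms` fields (that record shape is an assumption, not part of the skill):

```python
import statistics

def compute_guardrail_metrics(records: list[dict]) -> dict:
    """Aggregate exported guardrail spans into dashboard metrics.

    Assumed record shape (hypothetical export format):
    {"triggered": bool, "action": str, "latency_ms": float}
    """
    total = len(records)
    if total == 0:
        return {}

    triggered = sum(1 for r in records if r["triggered"])
    latencies = sorted(r["latency_ms"] for r in records)

    def rate(action: str) -> float:
        # Fraction of checks that ended with the given action
        return sum(1 for r in records if r["action"] == action) / total

    return {
        "total_checks": total,
        "triggered_count": triggered,
        "trigger_rate": triggered / total,
        "block_rate": rate("block"),
        "redact_rate": rate("redact"),
        "warn_rate": rate("warn"),
        "latency_p50_ms": statistics.median(latencies),
        "latency_p99_ms": latencies[min(int(0.99 * total), total - 1)],
    }
```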
## Guardrail Feedback Loop
@observe(name="guardrail.feedback") def record_guardrail_feedback( trace_id: str, guardrail_name: str, was_correct: bool, feedback_type: str, # "false_positive", "false_negative", "correct" ): """Record feedback on guardrail decisions.""" langfuse_context.update_current_observation( metadata={ "guardrail_name": guardrail_name, "feedback_type": feedback_type, "was_correct": was_correct, } ) # Score the original trace langfuse.score( trace_id=trace_id, name=f"guardrail_{guardrail_name}_accuracy", value=1.0 if was_correct else 0.0, comment=feedback_type, )
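
As a usage sketch, a review flow that overturns a block might call it like this (the trace ID and scenario are hypothetical):

```python
# Hypothetical review outcome: a human confirms the PII filter
# wrongly blocked this trace, so it counts as a false positive.
record_guardrail_feedback(
    trace_id="trace-abc123",
    guardrail_name="pii_filter",
    was_correct=False,
    feedback_type="false_positive",
)
```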
## Async Guardrails (Non-Blocking)
```python
import asyncio

from langfuse.decorators import observe


@observe(name="guardrails.async")
async def run_guardrails_async(content: str) -> dict:
    """Run expensive guardrails in parallel."""
    tasks = [
        asyncio.create_task(check_pii_async(content)),
        asyncio.create_task(check_toxicity_async(content)),
        asyncio.create_task(check_hallucination_async(content)),
    ]
    results = await asyncio.gather(*tasks)

    any_blocked = any(not r["passed"] for r in results)

    return {
        "passed": not any_blocked,
        "results": results,
        "parallel": True,
    }
```
## Anti-Patterns
| Anti-Pattern | Problem | Fix |
|---|---|---|
| No guardrail instrumentation | Can't measure false positives | Always log trigger/pass |
| Blocking without reason | Can't debug or improve | Log why it triggered |
| Sync guardrails in hot path | Latency impact | Use async or sample (see sketch below) |
| No feedback loop | Can't improve accuracy | Collect user feedback |
| Logging PII in guardrail logs | Defeats the purpose | Log metadata only |
| Same guardrails for all users | Over/under blocking | Tier by user trust |
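
One hedged sketch of the "use async or sample" fix from the table: run cheap checks on every request while sampling the expensive grounding check. The sample rate and check choices here are illustrative, not prescribed by the skill.

```python
import random

EXPENSIVE_SAMPLE_RATE = 0.1  # illustrative; tune per guardrail cost and risk

def run_sampled_output_checks(output: str, sources: list[str]) -> dict:
    # Cheap regex-based PII check runs on every request
    result = check_pii(output)
    if not result["passed"]:
        return result

    # Expensive LLM-as-judge grounding check runs on a sample of
    # traffic, keeping hot-path p99 latency bounded
    if random.random() < EXPENSIVE_SAMPLE_RATE:
        return check_hallucination(output, sources)

    return result
```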
## Related Skills
- Quality scoring: `evaluation-quality`
- Handling blocked requests: `error-retry-tracking`
- Escalation when blocked: `human-in-the-loop`