# claude-skill-registry · guardrails-safety
Instrument safety checks, content filters, and guardrails for agent outputs
## Install

Source · Clone the upstream repo:

```bash
git clone https://github.com/majiayu000/claude-skill-registry
```

Claude Code · Install into `~/.claude/skills/`:

```bash
T=$(mktemp -d) \
  && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" \
  && mkdir -p ~/.claude/skills \
  && cp -r "$T/skills/data/guardrails-safety" \
      ~/.claude/skills/majiayu000-claude-skill-registry-guardrails-safety \
  && rm -rf "$T"
```

Manifest: `skills/data/guardrails-safety/SKILL.md`
# Guardrails & Safety Instrumentation
Instrument safety checks to catch issues before users see them.
## Core Principle
Guardrails run at two points (sketched below):
- Input guardrails: Before the LLM sees user input
- Output guardrails: Before the user sees LLM output
Both must be instrumented to:
- Know what was blocked and why
- Measure false positive rate (blocking good content)
- Track latency overhead of safety checks
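
A minimal sketch of that two-checkpoint flow — the stub functions here are placeholders for the concrete guardrails and model call defined in the rest of this skill:

```python
def check_input(text: str) -> dict:      # placeholder input guardrail
    return {"passed": True, "action": "pass"}

def call_llm(text: str) -> str:          # placeholder model call
    return f"echo: {text}"

def check_output(text: str) -> dict:     # placeholder output guardrail
    return {"passed": True, "action": "pass", "redacted_output": text}

def handle_request(user_input: str) -> str:
    # Input guardrail: runs before the LLM sees the user input
    if not check_input(user_input)["passed"]:
        return "Request could not be processed."

    output = call_llm(user_input)

    # Output guardrail: runs before the user sees the LLM output
    verdict = check_output(output)
    if not verdict["passed"]:
        return "Response could not be delivered."

    # Users receive the possibly-redacted version of the output
    return verdict.get("redacted_output", output)
```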
## Guardrail Span Attributes
```python
# P0 - Always capture
span.set_attribute("guardrail.name", "pii_filter")
span.set_attribute("guardrail.type", "output")  # or "input"
span.set_attribute("guardrail.triggered", True)
span.set_attribute("guardrail.action", "block")  # block, warn, redact, pass

# P1 - For analysis
span.set_attribute("guardrail.category", "pii")
span.set_attribute("guardrail.confidence", 0.95)
span.set_attribute("guardrail.latency_ms", 45)

# P2 - For debugging (be careful with PII)
span.set_attribute("guardrail.matched_pattern", "SSN")
span.set_attribute("guardrail.redacted_count", 2)
```
## Input Guardrails
### Prompt Injection Detection
```python
import re

from langfuse.decorators import observe, langfuse_context


@observe(name="guardrail.input.injection")
def check_prompt_injection(user_input: str) -> dict:
    """Detect prompt injection attempts."""
    # Simple heuristic checks
    injection_patterns = [
        r"ignore.*previous.*instructions",
        r"you are now",
        r"new instructions:",
        r"system prompt:",
        r"<\|.*\|>",  # Special tokens
    ]

    triggered = False
    matched = []
    for pattern in injection_patterns:
        if re.search(pattern, user_input, re.IGNORECASE):
            triggered = True
            matched.append(pattern)

    langfuse_context.update_current_observation(
        metadata={
            "guardrail_name": "prompt_injection",
            "guardrail_type": "input",
            "triggered": triggered,
            "patterns_matched": len(matched),
        }
    )

    return {
        "passed": not triggered,
        "action": "block" if triggered else "pass",
        "reason": "prompt_injection" if triggered else None,
    }
```
### Input Content Filter
@observe(name="guardrail.input.content") def check_input_content(user_input: str) -> dict: """Filter harmful input content.""" # Use moderation API or classifier result = moderation_api.check(user_input) triggered = result.flagged categories = [c for c, v in result.categories.items() if v] langfuse_context.update_current_observation( metadata={ "guardrail_name": "content_filter", "guardrail_type": "input", "triggered": triggered, "categories": categories, "scores": result.category_scores, } ) return { "passed": not triggered, "action": "block" if triggered else "pass", "categories": categories, }
## Output Guardrails
### PII Detection & Redaction
@observe(name="guardrail.output.pii") def check_pii(output: str) -> dict: """Detect and optionally redact PII.""" pii_patterns = { "ssn": r"\b\d{3}-\d{2}-\d{4}\b", "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "phone": r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b", "credit_card": r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b", } detections = {} redacted_output = output for pii_type, pattern in pii_patterns.items(): matches = re.findall(pattern, output) if matches: detections[pii_type] = len(matches) # Redact redacted_output = re.sub(pattern, f"[{pii_type.upper()}_REDACTED]", redacted_output) triggered = len(detections) > 0 langfuse_context.update_current_observation( metadata={ "guardrail_name": "pii_filter", "guardrail_type": "output", "triggered": triggered, "pii_types_found": list(detections.keys()), "total_redactions": sum(detections.values()), } ) return { "passed": not triggered, "action": "redact" if triggered else "pass", "redacted_output": redacted_output if triggered else output, "detections": detections, }
### Hallucination Check
@observe(name="guardrail.output.hallucination") def check_hallucination( output: str, context: list[str], threshold: float = 0.7, ) -> dict: """Check if output is grounded in provided context.""" # Use NLI model or LLM-as-judge grounding_score = check_grounding(output, context) triggered = grounding_score < threshold langfuse_context.update_current_observation( metadata={ "guardrail_name": "hallucination_check", "guardrail_type": "output", "triggered": triggered, "grounding_score": grounding_score, "threshold": threshold, "context_chunks": len(context), } ) return { "passed": not triggered, "action": "warn" if triggered else "pass", "grounding_score": grounding_score, }
### Output Safety Filter
@observe(name="guardrail.output.safety") def check_output_safety(output: str) -> dict: """Check output for harmful content.""" result = moderation_api.check(output) # Also check for refusals that shouldn't happen false_refusal = any(phrase in output.lower() for phrase in [ "i cannot help", "i'm not able to", "as an ai", ]) langfuse_context.update_current_observation( metadata={ "guardrail_name": "output_safety", "guardrail_type": "output", "triggered": result.flagged, "false_refusal_detected": false_refusal, "categories": [c for c, v in result.categories.items() if v], } ) return { "passed": not result.flagged, "action": "block" if result.flagged else "pass", "false_refusal": false_refusal, }
## Guardrail Pipeline
```python
import inspect

from langfuse.decorators import observe, langfuse_context


class GuardrailPipeline:
    """Run multiple guardrails in sequence."""

    def __init__(self, guardrails: list):
        self.guardrails = guardrails

    @observe(name="guardrails.run")
    def run(self, content: str, context=None) -> dict:
        results = []
        blocked = False
        final_content = content

        for guardrail in self.guardrails:
            # Only pass `context` to guardrails that accept it
            # (e.g. check_hallucination); the rest take content only
            if "context" in inspect.signature(guardrail).parameters:
                result = guardrail(final_content, context)
            else:
                result = guardrail(final_content)
            results.append({
                "name": guardrail.__name__,
                "passed": result["passed"],
                "action": result["action"],
            })

            if result["action"] == "block":
                blocked = True
                break
            elif result["action"] == "redact":
                final_content = result.get("redacted_output", final_content)

        langfuse_context.update_current_observation(
            metadata={
                "guardrails_run": len(results),
                "any_triggered": any(not r["passed"] for r in results),
                "blocked": blocked,
                "actions": [r["action"] for r in results],
            }
        )

        return {
            "passed": not blocked,
            "content": final_content,
            "results": results,
        }


# Setup pipeline
input_guardrails = GuardrailPipeline([
    check_prompt_injection,
    check_input_content,
])

output_guardrails = GuardrailPipeline([
    check_pii,
    check_hallucination,
    check_output_safety,
])
```
## Full Agent with Guardrails
@observe(name="agent.run") def run_agent_with_guardrails(task: str, user_id: str) -> dict: """Agent with full guardrail pipeline.""" # Input guardrails input_check = input_guardrails.run(task) if not input_check["passed"]: langfuse_context.update_current_observation( metadata={"blocked_at": "input", "reason": input_check["results"][-1]["name"]} ) return { "success": False, "blocked": True, "stage": "input", "message": "Request could not be processed.", } # Run agent result = agent.invoke(input_check["content"]) # Output guardrails output_check = output_guardrails.run( result.output, context={"sources": result.sources}, ) if not output_check["passed"]: langfuse_context.update_current_observation( metadata={"blocked_at": "output", "reason": output_check["results"][-1]["name"]} ) return { "success": False, "blocked": True, "stage": "output", "message": "Response could not be delivered.", } return { "success": True, "output": output_check["content"], # May be redacted "guardrails_triggered": any(not r["passed"] for r in output_check["results"]), }
## Guardrail Metrics Dashboard
```python
# Track guardrail performance
guardrail_metrics = {
    # Volume
    "total_checks": "Total guardrail runs",
    "triggered_count": "Times guardrail triggered",
    "trigger_rate": "% of checks that trigger",

    # Actions
    "block_rate": "% blocked completely",
    "redact_rate": "% with redactions",
    "warn_rate": "% with warnings only",

    # Performance
    "latency_p50_ms": "Median guardrail latency",
    "latency_p99_ms": "99th percentile latency",

    # Quality
    "false_positive_rate": "% incorrectly blocked (via feedback)",
    "false_negative_rate": "% that should have blocked",
}
```
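
As a rough sketch of how these could be computed offline — assuming guardrail spans have been exported as dicts with `triggered`, `action`, and `latency_ms` fields (that record shape is an assumption, not part of the skill):

```python
import statistics

def compute_guardrail_metrics(records: list[dict]) -> dict:
    """Aggregate exported guardrail spans into dashboard metrics.

    Assumed record shape (hypothetical export format):
    {"triggered": bool, "action": str, "latency_ms": float}
    """
    total = len(records)
    if total == 0:
        return {}

    triggered = sum(1 for r in records if r["triggered"])
    latencies = sorted(r["latency_ms"] for r in records)

    def rate(action: str) -> float:
        # Fraction of checks that ended with the given action
        return sum(1 for r in records if r["action"] == action) / total

    return {
        "total_checks": total,
        "triggered_count": triggered,
        "trigger_rate": triggered / total,
        "block_rate": rate("block"),
        "redact_rate": rate("redact"),
        "warn_rate": rate("warn"),
        "latency_p50_ms": statistics.median(latencies),
        "latency_p99_ms": latencies[min(int(0.99 * total), total - 1)],
    }
```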
## Guardrail Feedback Loop
@observe(name="guardrail.feedback") def record_guardrail_feedback( trace_id: str, guardrail_name: str, was_correct: bool, feedback_type: str, # "false_positive", "false_negative", "correct" ): """Record feedback on guardrail decisions.""" langfuse_context.update_current_observation( metadata={ "guardrail_name": guardrail_name, "feedback_type": feedback_type, "was_correct": was_correct, } ) # Score the original trace langfuse.score( trace_id=trace_id, name=f"guardrail_{guardrail_name}_accuracy", value=1.0 if was_correct else 0.0, comment=feedback_type, )
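
As a usage sketch, a review flow that overturns a block might call it like this (the trace ID and scenario are hypothetical):

```python
# Hypothetical review outcome: a human confirms the PII filter
# wrongly blocked this trace, so it counts as a false positive.
record_guardrail_feedback(
    trace_id="trace-abc123",
    guardrail_name="pii_filter",
    was_correct=False,
    feedback_type="false_positive",
)
```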
## Async Guardrails (Non-Blocking)
```python
import asyncio

from langfuse.decorators import observe


@observe(name="guardrails.async")
async def run_guardrails_async(content: str) -> dict:
    """Run expensive guardrails in parallel."""
    tasks = [
        asyncio.create_task(check_pii_async(content)),
        asyncio.create_task(check_toxicity_async(content)),
        asyncio.create_task(check_hallucination_async(content)),
    ]
    results = await asyncio.gather(*tasks)

    any_blocked = any(not r["passed"] for r in results)

    return {
        "passed": not any_blocked,
        "results": results,
        "parallel": True,
    }
```
## Anti-Patterns
| Anti-Pattern | Problem | Fix |
|---|---|---|
| No guardrail instrumentation | Can't measure false positives | Always log trigger/pass |
| Blocking without reason | Can't debug or improve | Log why it triggered |
| Sync guardrails in hot path | Latency impact | Use async or sample (see sketch below) |
| No feedback loop | Can't improve accuracy | Collect user feedback |
| Logging PII in guardrail logs | Defeats the purpose | Log metadata only |
| Same guardrails for all users | Over/under blocking | Tier by user trust |
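
One hedged sketch of the "use async or sample" fix from the table: run cheap checks on every request while sampling the expensive grounding check. The sample rate and check choices here are illustrative, not prescribed by the skill.

```python
import random

EXPENSIVE_SAMPLE_RATE = 0.1  # illustrative; tune per guardrail cost and risk

def run_sampled_output_checks(output: str, sources: list[str]) -> dict:
    # Cheap regex-based PII check runs on every request
    result = check_pii(output)
    if not result["passed"]:
        return result

    # Expensive LLM-as-judge grounding check runs on a sample of
    # traffic, keeping hot-path p99 latency bounded
    if random.random() < EXPENSIVE_SAMPLE_RATE:
        return check_hallucination(output, sources)

    return result
```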
## Related Skills
- Quality scoring: `evaluation-quality`
- Handling blocked requests: `error-retry-tracking`
- Escalation when blocked: `human-in-the-loop`