Claude-skill-registry evaluation-quality
Instrument evaluation metrics, quality scores, and feedback loops
install

source · Clone the upstream repo

```bash
git clone https://github.com/majiayu000/claude-skill-registry
```

Claude Code · Install into ~/.claude/skills/

```bash
T=$(mktemp -d) && git clone --depth=1 https://github.com/majiayu000/claude-skill-registry "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/data/evaluation-quality" ~/.claude/skills/majiayu000-claude-skill-registry-evaluation-quality && rm -rf "$T"
```
manifest: skills/data/evaluation-quality/SKILL.md
Evaluation and Quality Instrumentation
Instrument evaluation systems to track agent quality and enable continuous improvement.
Core Principle
Quality observability answers:
- How good are agent responses?
- What types of errors occur (hallucination, refusal, etc.)?
- Is quality improving over time?
- What correlates with high/low quality?
- How does human feedback compare to automated evals?
Evaluation Types
Automated Evals
LLM-as-judge or heuristic scoring:
span.set_attribute("eval.type", "automated") span.set_attribute("eval.method", "llm_judge") span.set_attribute("eval.model", "claude-3-opus") span.set_attribute("eval.criteria", "helpfulness") span.set_attribute("eval.score", 0.85) span.set_attribute("eval.confidence", 0.92)
Human Feedback
User ratings and corrections:
span.set_attribute("eval.type", "human") span.set_attribute("eval.feedback_type", "thumbs") span.set_attribute("eval.score", 1) # 1 = thumbs up, 0 = thumbs down span.set_attribute("eval.user_id", "user_hash") span.set_attribute("eval.latency_to_feedback_ms", 45000)
Ground Truth Comparison
Compare to known correct answers:
span.set_attribute("eval.type", "ground_truth") span.set_attribute("eval.metric", "exact_match") span.set_attribute("eval.score", 1.0) span.set_attribute("eval.test_case_id", "test_123")
Quality Dimensions
Correctness
span.set_attribute("quality.factual_accuracy", 0.95) span.set_attribute("quality.hallucination_detected", False) span.set_attribute("quality.source_grounded", True) span.set_attribute("quality.citation_count", 3)
Helpfulness
span.set_attribute("quality.task_completion", 1.0) span.set_attribute("quality.answered_question", True) span.set_attribute("quality.actionable", True) span.set_attribute("quality.conciseness", 0.8)
Safety
span.set_attribute("quality.safety_score", 1.0) span.set_attribute("quality.refused", False) span.set_attribute("quality.pii_detected", False) span.set_attribute("quality.harmful_content", False)
Relevance
span.set_attribute("quality.relevance", 0.9) span.set_attribute("quality.on_topic", True) span.set_attribute("quality.context_used", 0.85)
Eval Span Attributes
```python
from datetime import datetime, timezone
from uuid import uuid4

# Eval metadata (P0)
span.set_attribute("eval.id", str(uuid4()))
span.set_attribute("eval.name", "helpfulness_v2")
span.set_attribute("eval.version", "2.1")
span.set_attribute("eval.timestamp", datetime.now(timezone.utc).isoformat())

# Input reference (P0)
span.set_attribute("eval.trace_id", original_trace_id)
span.set_attribute("eval.span_id", original_span_id)
span.set_attribute("eval.agent_name", "researcher")

# Scores (P0)
span.set_attribute("eval.score", 0.85)
span.set_attribute("eval.pass", True)
span.set_attribute("eval.threshold", 0.7)

# Details (P1)
span.set_attribute("eval.reasoning", "Response was accurate and helpful")
span.set_attribute("eval.issues", ["slightly_verbose"])
span.set_attribute("eval.latency_ms", 1500)
```
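These attributes assume the eval runs in its own span that points back at the evaluated trace. A hedged sketch with the OpenTelemetry API (the span name and helper name are illustrative):

```python
from opentelemetry import trace

tracer = trace.get_tracer("evals")

def record_eval(original_trace_id: str, original_span_id: str,
                score: float, threshold: float = 0.7) -> None:
    """Open a dedicated span for one eval and link it to the evaluated trace."""
    with tracer.start_as_current_span("eval.run") as span:
        span.set_attribute("eval.trace_id", original_trace_id)
        span.set_attribute("eval.span_id", original_span_id)
        span.set_attribute("eval.score", score)
        span.set_attribute("eval.pass", score >= threshold)
```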
Feedback Collection Pattern
```python
from langfuse import Langfuse
from langfuse.decorators import observe
from opentelemetry.trace import get_current_span

langfuse = Langfuse()

@observe(name="feedback.collect")
def collect_feedback(
    trace_id: str,
    score: int,
    feedback_type: str = "thumbs",
    comment: str | None = None,
):
    span = get_current_span()
    span.set_attribute("feedback.trace_id", trace_id)
    span.set_attribute("feedback.type", feedback_type)
    span.set_attribute("feedback.score", score)
    if comment:
        span.set_attribute("feedback.has_comment", True)
        span.set_attribute("feedback.comment_length", len(comment))

    # Store feedback as a Langfuse score attached to the original trace
    langfuse.score(
        trace_id=trace_id,
        name=feedback_type,
        value=score,
        comment=comment,
    )
```
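Called from a serving-layer handler, for example (the trace id and comment are hypothetical):

```python
# User clicked thumbs-up on a response rendered from trace "abc123"
collect_feedback(
    trace_id="abc123",
    score=1,
    feedback_type="thumbs",
    comment="Exactly what I needed",
)
```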
LLM-as-Judge Pattern
@observe(name="eval.llm_judge") def evaluate_response( question: str, response: str, criteria: str, ) -> float: span = get_current_span() span.set_attribute("eval.method", "llm_judge") span.set_attribute("eval.criteria", criteria) judge_prompt = f""" Evaluate this response on {criteria} (0-1 scale): Question: {question} Response: {response} Score: """ result = judge_llm.invoke(judge_prompt) score = parse_score(result) span.set_attribute("eval.score", score) span.set_attribute("eval.judge_tokens", result.usage.total_tokens) return score
Hallucination Detection
@observe(name="eval.hallucination_check") def check_hallucination( response: str, sources: list[str], ) -> dict: span = get_current_span() span.set_attribute("eval.type", "hallucination") # Check each claim against sources claims = extract_claims(response) span.set_attribute("eval.claims_count", len(claims)) grounded = 0 for claim in claims: if is_grounded(claim, sources): grounded += 1 score = grounded / len(claims) if claims else 1.0 span.set_attribute("eval.grounded_claims", grounded) span.set_attribute("eval.hallucination_score", 1 - score) span.set_attribute("eval.pass", score >= 0.9) return {"score": score, "grounded": grounded, "total": len(claims)}
Framework Integration
Langfuse Scores
```python
from langfuse import Langfuse

langfuse = Langfuse()

# Score a trace
langfuse.score(
    trace_id=trace_id,
    name="helpfulness",
    value=0.85,
    comment="Accurate and complete response",
)

# Score a specific generation
langfuse.score(
    trace_id=trace_id,
    observation_id=generation_id,
    name="factual_accuracy",
    value=1.0,
)
```
Braintrust Evals
```python
from braintrust import Eval

Eval(
    "agent_quality",
    data=[{"input": q, "expected": a} for q, a in test_cases],
    task=run_agent,
    scores=[
        Factuality(),
        Helpfulness(),
        SafetyCheck(),
    ],
)
```
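`Factuality` ships with Braintrust's autoevals package; `Helpfulness` and `SafetyCheck` stand in for custom scorers. Braintrust also accepts plain functions as scorers, so a sketch of a custom check (the keyword list is illustrative only) could be:

```python
def safety_check(input, output, expected=None):
    """Score 1.0 when no flagged term appears in the output, else 0.0."""
    flagged = ["ssn", "credit card", "password"]  # illustrative denylist
    return 0.0 if any(term in str(output).lower() for term in flagged) else 1.0
```

Pass `safety_check` directly in the `scores` list alongside class-based scorers.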
Aggregation & Trends
Track quality over time:
```python
# Per-eval run
span.set_attribute("eval.run_id", run_id)
span.set_attribute("eval.test_count", 100)
span.set_attribute("eval.pass_rate", 0.92)
span.set_attribute("eval.avg_score", 0.87)
span.set_attribute("eval.p50_score", 0.89)
span.set_attribute("eval.p10_score", 0.65)

# Comparison to baseline
span.set_attribute("eval.baseline_score", 0.82)
span.set_attribute("eval.improvement", 0.05)
span.set_attribute("eval.regression", False)
```
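A sketch of computing those aggregates from raw scores with the standard library (the helper name is illustrative; `statistics.quantiles` needs at least two scores):

```python
import statistics

def record_eval_run(span, scores: list[float], baseline: float,
                    threshold: float = 0.7) -> None:
    avg = statistics.mean(scores)
    deciles = statistics.quantiles(scores, n=10)  # 9 cut points: p10..p90
    span.set_attribute("eval.test_count", len(scores))
    span.set_attribute("eval.pass_rate", sum(s >= threshold for s in scores) / len(scores))
    span.set_attribute("eval.avg_score", avg)
    span.set_attribute("eval.p50_score", statistics.median(scores))
    span.set_attribute("eval.p10_score", deciles[0])
    span.set_attribute("eval.baseline_score", baseline)
    span.set_attribute("eval.improvement", round(avg - baseline, 4))
    span.set_attribute("eval.regression", avg < baseline)
```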
Anti-Patterns
- No eval on production data (lab-only testing)
- Missing baseline comparison (can't detect regression)
- Eval without trace linking (can't debug failures)
- Only thumbs up/down (no granular insight)
- No eval versioning (can't compare over time)
Related Skills
- Feedback collection: human-in-the-loop
- Generation tracking: llm-call-tracing