Claude-code-plugins-plus-skills langfuse-ci-integration
install
source · Clone the upstream repo
git clone https://github.com/jeremylongshore/claude-code-plugins-plus-skills
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/jeremylongshore/claude-code-plugins-plus-skills "$T" && mkdir -p ~/.claude/skills && cp -r "$T/plugins/saas-packs/langfuse-pack/skills/langfuse-ci-integration" ~/.claude/skills/jeremylongshore-claude-code-plugins-plus-skills-langfuse-ci-integration && rm -rf "$T"
manifest:
plugins/saas-packs/langfuse-pack/skills/langfuse-ci-integration/SKILL.md
Langfuse CI Integration
Overview
Integrate Langfuse into CI/CD pipelines: trace validation tests, prompt regression testing, experiment-driven quality gates, automated prompt deployment from version control, and score monitoring.
Prerequisites
- Langfuse API keys stored as GitHub secrets (`LANGFUSE_PUBLIC_KEY`, `LANGFUSE_SECRET_KEY`); one way to set them is sketched after this list
- Test framework (Vitest or Jest)
- OpenAI API key for LLM tests
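A possible way to register these secrets from a terminal; a sketch assuming the GitHub CLI (`gh`) is installed and authenticated, with placeholder key values:

```bash
# Store the Langfuse and OpenAI credentials as repository Actions secrets
gh secret set LANGFUSE_PUBLIC_KEY --body "pk-lf-..."
gh secret set LANGFUSE_SECRET_KEY --body "sk-lf-..."
gh secret set OPENAI_API_KEY --body "sk-..."
```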
Instructions
Step 1: GitHub Actions Workflow for AI Quality Tests
```yaml
# .github/workflows/langfuse-tests.yml
name: AI Quality Tests

on:
  pull_request:
    paths: ["src/ai/**", "src/prompts/**", "tests/ai/**"]

jobs:
  ai-quality:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with: { node-version: "20", cache: "npm" }
      - run: npm ci
      - name: Run AI quality tests with tracing
        env:
          LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }}
          LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
          LANGFUSE_BASE_URL: ${{ vars.LANGFUSE_BASE_URL || 'https://cloud.langfuse.com' }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: npx vitest run tests/ai/ --reporter=verbose
      - name: Langfuse connectivity check
        env:
          LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }}
          LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
        run: |
          node -e "
          const { LangfuseClient } = require('@langfuse/client');
          const lf = new LangfuseClient();
          lf.prompt.get('__ci-health__').catch(() => {});
          console.log('Langfuse SDK initialized OK');
          "
```
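The workflow runs the tests in a short-lived process, so spans buffered by `@langfuse/tracing` must be flushed before the runner exits (see the Error Handling table below). A minimal setup sketch, assuming the v4 SDK's `@langfuse/otel` package and a hypothetical `tests/ai/setup.ts` registered in Vitest's `setupFiles`:

```ts
// tests/ai/setup.ts -- hypothetical Vitest setup file (assumption, not part of the upstream skill)
import { NodeSDK } from "@opentelemetry/sdk-node";
import { LangfuseSpanProcessor } from "@langfuse/otel";
import { afterAll } from "vitest";

// Register the Langfuse span processor so @langfuse/tracing observations are exported
const langfuseSpanProcessor = new LangfuseSpanProcessor();
const sdk = new NodeSDK({ spanProcessors: [langfuseSpanProcessor] });
sdk.start();

afterAll(async () => {
  // Force-flush buffered spans before the short-lived CI process exits
  await langfuseSpanProcessor.forceFlush();
  await sdk.shutdown();
});
```

With something like this in place, the `afterAll` hook exports every CI trace before Vitest tears down.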
Step 2: Prompt Regression Tests
```ts
// tests/ai/prompt-quality.test.ts
import { describe, it, expect } from "vitest";
import { LangfuseClient } from "@langfuse/client";
import { startActiveObservation, updateActiveObservation } from "@langfuse/tracing";
import OpenAI from "openai";

const langfuse = new LangfuseClient();
const openai = new OpenAI();

describe("Prompt Quality Regression", () => {
  it("summarization prompt produces valid output", async () => {
    const prompt = await langfuse.prompt.get("summarize-article", { type: "text" });
    const compiled = prompt.compile({ maxLength: "100 words" });

    const result = await startActiveObservation(
      { name: "ci-test-summarize", asType: "generation" },
      async () => {
        updateActiveObservation({ model: "gpt-4o-mini", input: compiled });
        const response = await openai.chat.completions.create({
          model: "gpt-4o-mini",
          messages: [{ role: "user", content: compiled }],
          temperature: 0,
        });
        const output = response.choices[0].message.content || "";
        updateActiveObservation({
          output,
          usage: {
            promptTokens: response.usage?.prompt_tokens,
            completionTokens: response.usage?.completion_tokens,
          },
        });
        return output;
      }
    );

    expect(result.length).toBeGreaterThan(20);
    expect(result.length).toBeLessThan(600);
  });

  it("classification prompt returns valid intent", async () => {
    const prompt = await langfuse.prompt.get("classify-intent", { type: "text" });
    const compiled = prompt.compile({ userMessage: "I want to cancel my subscription" });

    const response = await openai.chat.completions.create({
      model: "gpt-4o-mini",
      messages: [{ role: "user", content: compiled }],
      temperature: 0,
    });

    const intent = response.choices[0].message.content?.trim().toLowerCase() || "";
    const validIntents = ["billing", "cancellation", "support", "feedback"];
    expect(validIntents).toContain(intent);
  });
});
```
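CI runs stay more reproducible when the test resolves a pinned prompt variant instead of whatever version is newest. A sketch, assuming `prompt.get` accepts a `label` option (an assumption here; verify against your SDK version):

```ts
// Resolve the prompt variant tagged "production" rather than the latest draft
const prompt = await langfuse.prompt.get("summarize-article", {
  type: "text",
  label: "production", // assumption: label pinning supported by prompt.get
});
```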
Step 3: Experiment-Driven Quality Gates
```ts
// tests/ai/experiment-gate.test.ts
import { describe, it, expect } from "vitest";
import { LangfuseClient } from "@langfuse/client";
import OpenAI from "openai";

const langfuse = new LangfuseClient();
const openai = new OpenAI();

describe("Quality Gate: Intent Classification", () => {
  it("scores above 80% accuracy on test dataset", async () => {
    async function classifyIntent(input: { query: string }) {
      const response = await openai.chat.completions.create({
        model: "gpt-4o-mini",
        messages: [
          { role: "system", content: "Classify intent. Return one word." },
          { role: "user", content: input.query },
        ],
        temperature: 0,
      });
      return response.choices[0].message.content?.trim() || "";
    }

    const result = await langfuse.runExperiment({
      datasetName: "intent-classification-test",
      runName: `ci-${process.env.GITHUB_SHA?.slice(0, 7) || "local"}`,
      task: classifyIntent,
      evaluators: [
        ({ output, expectedOutput }) => ({
          name: "exact-match",
          value: output.toLowerCase() === expectedOutput.intent.toLowerCase() ? 1 : 0,
          dataType: "BOOLEAN" as const,
        }),
      ],
    });

    // Calculate accuracy across all dataset items
    const scores = result.runs.flatMap((r) => r.scores || []);
    const accuracy = scores.filter((s) => s.value === 1).length / scores.length;
    console.log(`Accuracy: ${(accuracy * 100).toFixed(1)}%`);
    expect(accuracy).toBeGreaterThanOrEqual(0.8);
  });
});
```
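The gate presumes a Langfuse dataset named `intent-classification-test` whose items carry an `expectedOutput.intent` field for the exact-match evaluator. A one-off seeding sketch, assuming the client exposes `api.datasets.create` and `api.datasetItems.create` (both an assumption; the item contents are illustrative):

```ts
// scripts/seed-intent-dataset.mjs -- hypothetical one-off seeding script
import { LangfuseClient } from "@langfuse/client";

const langfuse = new LangfuseClient();

await langfuse.api.datasets.create({ name: "intent-classification-test" });

const items = [
  { query: "I want to cancel my subscription", intent: "cancellation" },
  { query: "Why was I charged twice?", intent: "billing" },
];

for (const { query, intent } of items) {
  // expectedOutput.intent is what the exact-match evaluator compares against
  await langfuse.api.datasetItems.create({
    datasetName: "intent-classification-test",
    input: { query },
    expectedOutput: { intent },
  });
}
```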
Step 4: Automated Prompt Deployment
```yaml
# .github/workflows/deploy-prompts.yml
name: Deploy Prompts to Langfuse

on:
  push:
    branches: [main]
    paths: ["src/prompts/**"]

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with: { node-version: "20", cache: "npm" }
      - run: npm ci
      - name: Deploy prompts
        env:
          LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }}
          LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
        run: node scripts/deploy-prompts.mjs
```
```js
// scripts/deploy-prompts.mjs
import { LangfuseClient } from "@langfuse/client";
import { readdirSync, readFileSync } from "fs";
import { join } from "path";

const langfuse = new LangfuseClient();
const promptDir = join(process.cwd(), "src/prompts");

for (const file of readdirSync(promptDir).filter((f) => f.endsWith(".json"))) {
  const config = JSON.parse(readFileSync(join(promptDir, file), "utf-8"));
  await langfuse.api.prompts.create({
    name: config.name,
    prompt: config.template,
    type: config.type || "text",
    config: config.config || {},
    labels: ["production", `deploy-${new Date().toISOString().split("T")[0]}`],
  });
  console.log(`Deployed: ${config.name}`);
}
```
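The script expects one JSON file per prompt under `src/prompts/`. A shape that satisfies the fields it reads (`name`, `template`, `type`, `config`); the file contents here are illustrative, not from the upstream repo:

```json
{
  "name": "summarize-article",
  "type": "text",
  "template": "Summarize the following article in {{maxLength}}:\n\n{{article}}",
  "config": { "model": "gpt-4o-mini", "temperature": 0 }
}
```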
Step 5: Score Regression Monitoring
```ts
// scripts/check-quality-regression.ts
import { LangfuseClient } from "@langfuse/client";

const langfuse = new LangfuseClient();

async function checkRegression() {
  // Pull the most recent "quality" scores from Langfuse
  const scores = await langfuse.api.scores.list({
    name: "quality",
    limit: 100,
  });

  const values = scores.data
    .map((s) => s.value)
    .filter((v): v is number => v !== null);
  const avg = values.reduce((a, b) => a + b, 0) / values.length;

  console.log(`Average quality score: ${avg.toFixed(3)} (n=${values.length})`);

  if (avg < 0.7) {
    console.error("QUALITY REGRESSION: Score below 0.7 threshold");
    process.exit(1);
  }
}

checkRegression();
```
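Nothing ties this check to a pull request, so one option is running it on a schedule. A sketch of a nightly workflow (file name and cron cadence are assumptions), using `tsx` to execute the TypeScript script directly:

```yaml
# .github/workflows/quality-regression.yml -- hypothetical nightly check
name: Quality Regression Check

on:
  schedule:
    - cron: "0 6 * * *" # daily at 06:00 UTC

jobs:
  check:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with: { node-version: "20", cache: "npm" }
      - run: npm ci
      - name: Check score regression
        env:
          LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }}
          LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }}
        run: npx tsx scripts/check-quality-regression.ts
```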
CI Best Practices
| Practice | Why |
|---|---|
| Use `temperature: 0` in CI tests | Deterministic outputs, fewer false failures |
| Separate CI API keys | Isolate test traces from production |
| Run experiments on dataset changes | Catch regressions before deploy |
| Assert on ranges, not exact strings | LLM output varies even at temp 0 |
| Flush/shutdown in `afterAll` | Ensure all traces reach Langfuse |
Error Handling
| Issue | Cause | Solution |
|---|---|---|
| Traces not in dashboard | No flush in CI | Flush/shutdown the Langfuse client before the process exits |
| Flaky quality tests | Non-deterministic LLM | Use `temperature: 0`, assert on ranges |
| Prompt not found | Not yet deployed | Deploy prompts before running tests |
| Missing secrets in CI | Not configured | Add to GitHub Settings > Secrets > Actions |