git clone https://github.com/vibeforge1111/vibeship-spawner-skills
ai/synthetic-data/skill.yaml

Synthetic Data Generation
Creating high-quality synthetic datasets for ML/AI
version: 1.0.0
skill_id: synthetic-data
id: synthetic-data
name: Synthetic Data Generation
category: ai
difficulty: intermediate
description: |
  Patterns for generating synthetic data for ML training, testing, and privacy.
  Covers LLM-based generation, tabular synthesis, and quality validation.
triggers:
- "synthetic data"
- "generate training data"
- "fake data generation"
- "data augmentation"
- "SDV"
- "Gretel"
- "test data"
- "privacy-preserving data"
technologies:
- SDV
- Gretel
- Faker
- OpenAI
- Anthropic
patterns:
  llm_synthetic_generation:
    description: "Generate synthetic data using LLMs"
    when: "Need diverse, contextual training examples"
    implementation: |
      import Anthropic from "@anthropic-ai/sdk";
      import { z } from "zod";
      import { zodToJsonSchema } from "zod-to-json-schema";
      const anthropic = new Anthropic();

      // Define the schema for your synthetic data
      const CustomerSupportTicket = z.object({
        id: z.string().describe("Unique ticket ID"),
        category: z.enum(["billing", "technical", "account", "shipping", "other"]),
        sentiment: z.enum(["positive", "neutral", "negative", "frustrated"]),
        query: z.string().describe("Customer's question or issue"),
        idealResponse: z.string().describe("Ideal support agent response"),
        difficulty: z.enum(["easy", "medium", "hard"]),
        metadata: z.object({
          requiresEscalation: z.boolean(),
          estimatedHandleTime: z.number().describe("Minutes to resolve"),
        }),
      });

      interface GenerationConfig {
        count: number;
        diversity: "low" | "medium" | "high";
        biasCheck?: boolean;
        temperature?: number; // explicit override for the diversity-based default
      }

      async function generateSyntheticData<T extends z.ZodType>(
        schema: T,
        context: string,
        config: GenerationConfig
      ): Promise<z.infer<T>[]> {
        const { count, diversity, biasCheck = true } = config;
        const results: z.infer<T>[] = [];
        const batchSize = 5;
        const seenExamples: string[] = [];

        for (let i = 0; i < count; i += batchSize) {
          const batchCount = Math.min(batchSize, count - i);

          const response = await anthropic.messages.create({
            model: "claude-sonnet-4-20250514",
            max_tokens: 4096,
            // An explicit temperature wins; otherwise map diversity to temperature
            temperature:
              config.temperature ??
              (diversity === "high" ? 1.0 : diversity === "medium" ? 0.8 : 0.5),
            system: `You are a synthetic data generator. Generate realistic, diverse data samples.

      Rules:
      - Each example must be unique and realistic
      - Vary the complexity and edge cases
      - Include both typical and edge-case scenarios
      - Do NOT copy exact phrases from examples - create new variations
      ${biasCheck ? "- Ensure demographic diversity in examples" : ""}
      ${seenExamples.length > 0 ? `- Avoid similarity to these already generated: ${seenExamples.slice(-10).join(", ")}` : ""}`,
            messages: [
              {
                role: "user",
                content: `Generate ${batchCount} synthetic examples for: ${context}

      Output as a JSON array matching this schema:
      ${JSON.stringify(zodToJsonSchema(schema), null, 2)}

      Generate diverse examples with varied difficulty and scenarios.`,
              },
            ],
          });

          const content = response.content[0];
          if (content.type !== "text") continue;

          // Parse and validate each generated item against the schema
          const parsed = JSON.parse(extractJSON(content.text));
          const items = Array.isArray(parsed) ? parsed : [parsed];

          for (const item of items) {
            const validated = schema.safeParse(item);
            if (validated.success) {
              results.push(validated.data);
              // Track a prefix of each example to steer later batches away from repeats
              seenExamples.push(JSON.stringify(item).slice(0, 100));
            }
          }
        }

        return results;
      }

      function extractJSON(text: string): string {
        const match = text.match(/\[[\s\S]*\]|\{[\s\S]*\}/);
        return match ? match[0] : text;
      }

      // Usage
      const syntheticTickets = await generateSyntheticData(
        CustomerSupportTicket,
        "Customer support tickets for an e-commerce platform. Include billing issues, technical problems, and shipping complaints.",
        { count: 100, diversity: "high", biasCheck: true }
      );
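      // The prompt above only discourages repeats; nothing removes them after the
      // fact. A minimal sketch of post-hoc near-duplicate filtering using token-set
      // Jaccard similarity - the 0.9 threshold is an assumption, not a tuned value:
      function jaccardSimilarity(a: string, b: string): number {
        const tokensA = new Set(a.toLowerCase().split(/\s+/));
        const tokensB = new Set(b.toLowerCase().split(/\s+/));
        const intersection = [...tokensA].filter((t) => tokensB.has(t)).length;
        const union = new Set([...tokensA, ...tokensB]).size;
        return union === 0 ? 0 : intersection / union;
      }

      function dropNearDuplicates<T>(items: T[], threshold = 0.9): T[] {
        const kept: T[] = [];
        for (const item of items) {
          const text = JSON.stringify(item);
          const isDuplicate = kept.some(
            (k) => jaccardSimilarity(text, JSON.stringify(k)) >= threshold
          );
          if (!isDuplicate) kept.push(item);
        }
        return kept;
      }

      const uniqueTickets = dropNearDuplicates(syntheticTickets);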
  sdv_tabular_synthesis:
    description: "Generate synthetic tabular data with SDV"
    when: "Need statistically accurate tabular data"
    implementation: |
      // Note: SDV is Python-based; here's a Node.js wrapper approach
      import { spawn } from "child_process";
      import { writeFile, readFile } from "fs/promises";
      import { z } from "zod";
      interface SDVConfig {
        synthesizer: "gaussian_copula" | "ctgan" | "tvae";
        sampleSize: number;
        constraints?: Array<{
          type: "unique" | "positive" | "range" | "custom";
          column: string;
          params?: Record<string, unknown>;
        }>;
      }

      async function generateWithSDV<T>(
        sourceData: T[],
        config: SDVConfig
      ): Promise<T[]> {
        // Write source data to temp files the Python process can read and write
        const inputPath = `/tmp/sdv_input_${Date.now()}.json`;
        const outputPath = `/tmp/sdv_output_${Date.now()}.json`;
        await writeFile(inputPath, JSON.stringify(sourceData));

        // Python script for SDV
        const pythonScript = `
      import json
      import sys
      from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer, TVAESynthesizer
      from sdv.metadata import SingleTableMetadata
      import pandas as pd

      # Load data
      with open('${inputPath}', 'r') as f:
          data = json.load(f)

      df = pd.DataFrame(data)

      # Detect metadata
      metadata = SingleTableMetadata()
      metadata.detect_from_dataframe(df)

      # Choose synthesizer
      synthesizer_map = {
          'gaussian_copula': GaussianCopulaSynthesizer,
          'ctgan': CTGANSynthesizer,
          'tvae': TVAESynthesizer,
      }
      Synthesizer = synthesizer_map['${config.synthesizer}']
      synthesizer = Synthesizer(metadata)

      # Fit and sample
      synthesizer.fit(df)
      synthetic = synthesizer.sample(${config.sampleSize})

      # Save output
      synthetic.to_json('${outputPath}', orient='records')
      `;
        // Run the Python script as a child process
        await new Promise((resolve, reject) => {
          const proc = spawn("python3", ["-c", pythonScript]);
          proc.on("close", (code) => {
            if (code === 0) resolve(null);
            else reject(new Error(`SDV failed with code ${code}`));
          });
        });

        // Read results
        const output = await readFile(outputPath, "utf-8");
        return JSON.parse(output);
      }

      // Validate synthetic data quality
      interface QualityMetrics {
        columnShapes: number; // 0-1, higher is better
        columnPairTrends: number;
        overallScore: number;
      }

      async function validateSyntheticQuality(
        real: unknown[],
        synthetic: unknown[]
      ): Promise<QualityMetrics> {
        const pythonScript = `
      import json
      from sdv.evaluation.single_table import evaluate_quality
      from sdv.metadata import SingleTableMetadata
      import pandas as pd

      real_df = pd.DataFrame(${JSON.stringify(real)})
      synthetic_df = pd.DataFrame(${JSON.stringify(synthetic)})

      # evaluate_quality requires table metadata; detect it from the real data
      metadata = SingleTableMetadata()
      metadata.detect_from_dataframe(real_df)

      report = evaluate_quality(real_df, synthetic_df, metadata)
      print(json.dumps({
          'columnShapes': report.get_property('Column Shapes'),
          'columnPairTrends': report.get_property('Column Pair Trends'),
          'overallScore': report.get_score()
      }))
      `;
        // Execute the script and parse its stdout
        return new Promise((resolve, reject) => {
          const proc = spawn("python3", ["-c", pythonScript]);
          let output = "";
          proc.stdout.on("data", (data) => (output += data));
          proc.on("close", (code) => {
            if (code === 0) resolve(JSON.parse(output));
            else reject(new Error("Quality check failed"));
          });
        });
      }
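      // Usage - a sketch; the rows and config values below are illustrative
      // assumptions, not real data
      const realRows = [
        { age: 34, income: 72000, churned: false },
        { age: 51, income: 58000, churned: true },
      ];
      const syntheticRows = await generateWithSDV(realRows, {
        synthesizer: "gaussian_copula",
        sampleSize: 1000,
      });
      const quality = await validateSyntheticQuality(realRows, syntheticRows);
      if (quality.overallScore < 0.8) {
        console.warn("Low-fidelity output; consider CTGAN or a larger source dataset");
      }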
  faker_test_data:
    description: "Generate test data with Faker"
    when: "Need simple mock data for testing"
    implementation: |
      import { faker } from "@faker-js/faker";
      // Seed for reproducibility
      faker.seed(12345);

      interface UserProfile {
        id: string;
        email: string;
        name: string;
        avatar: string;
        address: {
          street: string;
          city: string;
          country: string;
          zipCode: string;
        };
        company: string;
        createdAt: Date;
        subscription: "free" | "pro" | "enterprise";
      }

      function generateUser(overrides?: Partial<UserProfile>): UserProfile {
        return {
          id: faker.string.uuid(),
          email: faker.internet.email(),
          name: faker.person.fullName(),
          avatar: faker.image.avatar(),
          address: {
            street: faker.location.streetAddress(),
            city: faker.location.city(),
            country: faker.location.country(),
            zipCode: faker.location.zipCode(),
          },
          company: faker.company.name(),
          createdAt: faker.date.past({ years: 2 }),
          subscription: faker.helpers.arrayElement(["free", "pro", "enterprise"] as const),
          ...overrides,
        };
      }

      // Generate a batch with a realistic distribution
      function generateUsers(count: number): UserProfile[] {
        return Array.from({ length: count }, () => {
          // Realistic subscription distribution
          const subscriptionWeights = { free: 0.7, pro: 0.25, enterprise: 0.05 };
          const subscription = faker.helpers.weightedArrayElement(
            Object.entries(subscriptionWeights).map(([value, weight]) => ({ value, weight }))
          ) as UserProfile["subscription"];
          return generateUser({ subscription });
        });
      }

      // Generate related data (orders for users)
      interface Order {
        id: string;
        userId: string;
        total: number;
        status: "pending" | "shipped" | "delivered" | "cancelled";
        items: number;
        createdAt: Date;
      }

      function generateOrdersForUsers(users: UserProfile[], avgOrdersPerUser: number): Order[] {
        return users.flatMap((user) => {
          // Pro/enterprise users order more
          const multiplier =
            user.subscription === "enterprise" ? 3 : user.subscription === "pro" ? 2 : 1;
          const orderCount = faker.number.int({
            min: 0,
            max: avgOrdersPerUser * multiplier * 2,
          });
          return Array.from({ length: orderCount }, () => ({
            id: faker.string.uuid(),
            userId: user.id,
            total: parseFloat(faker.commerce.price({ min: 10, max: 500 })),
            status: faker.helpers.arrayElement(
              ["pending", "shipped", "delivered", "cancelled"] as const
            ),
            items: faker.number.int({ min: 1, max: 10 }),
            createdAt: faker.date.between({ from: user.createdAt, to: new Date() }),
          }));
        });
      }
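      // Usage - a sketch; the counts are illustrative assumptions
      const users = generateUsers(100);
      const orders = generateOrdersForUsers(users, 3);
      console.log(`Generated ${users.length} users and ${orders.length} orders`);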
  llm_data_augmentation:
    description: "Augment existing data with LLM variations"
    when: "Need more training examples from limited data"
    implementation: |
      import OpenAI from "openai";
      const openai = new OpenAI();

      interface AugmentationConfig {
        techniques: Array<"paraphrase" | "formalize" | "simplify" | "translate" | "noise">;
        preserveIntent: boolean;
        targetCount: number;
      }

      async function augmentTextData(
        examples: string[],
        config: AugmentationConfig
      ): Promise<string[]> {
        const augmented: string[] = [...examples]; // Keep originals

        const techniquesPrompt = {
          paraphrase: "Rephrase this text while keeping the exact same meaning",
          formalize: "Rewrite in a more formal, professional tone",
          simplify: "Simplify to a 6th grade reading level",
          translate: "Translate to Spanish then back to English (back-translation)",
          noise: "Add realistic typos and casual language",
        };

        for (const example of examples) {
          for (const technique of config.techniques) {
            if (augmented.length >= config.targetCount) break;

            const response = await openai.chat.completions.create({
              model: "gpt-4o-mini",
              messages: [
                {
                  role: "system",
                  content: `${techniquesPrompt[technique]}. ${
                    config.preserveIntent
                      ? "CRITICAL: The core meaning and intent must be preserved exactly."
                      : ""
                  } Output only the transformed text, nothing else.`,
                },
                { role: "user", content: example },
              ],
              temperature: 0.7,
            });

            const augmentedText = response.choices[0].message.content?.trim();
            if (augmentedText && augmentedText !== example) {
              augmented.push(augmentedText);
            }
          }
        }

        return augmented.slice(0, config.targetCount);
      }

      // Augment labeled data (preserves labels)
      interface LabeledExample {
        text: string;
        label: string;
        metadata?: Record<string, unknown>;
      }

      async function augmentLabeledData(
        examples: LabeledExample[],
        config: AugmentationConfig
      ): Promise<LabeledExample[]> {
        const augmented: LabeledExample[] = [...examples];

        for (const example of examples) {
          const variations = await augmentTextData([example.text], {
            ...config,
            targetCount: Math.ceil(config.targetCount / examples.length),
          });

          for (const variation of variations) {
            if (variation !== example.text) {
              augmented.push({
                text: variation,
                label: example.label, // Preserve label
                metadata: {
                  ...example.metadata,
                  augmented: true,
                  source: example.text.slice(0, 50),
                },
              });
            }
          }
        }

        return augmented;
      }
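      // Usage - a sketch; the seed examples and targetCount are illustrative
      // assumptions
      const seeds: LabeledExample[] = [
        { text: "I can't log into my account", label: "account" },
        { text: "My order arrived damaged", label: "shipping" },
      ];
      const expanded = await augmentLabeledData(seeds, {
        techniques: ["paraphrase", "noise"],
        preserveIntent: true,
        targetCount: 20,
      });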
  quality_validation:
    description: "Validate synthetic data quality"
    when: "After generating synthetic data"
    implementation: |
      import { z } from "zod";
      interface QualityReport {
        fidelity: {
          distributionMatch: number; // 0-1
          correlationMatch: number;
          schemaCompliance: number;
        };
        diversity: {
          uniqueness: number; // % unique examples
          coverageScore: number; // How well it covers the space
          duplicateCount: number;
        };
        utility: {
          modelPerformanceReal: number;
          modelPerformanceSynthetic: number;
          performanceGap: number;
        };
        privacy: {
          nearestNeighborDistance: number;
          membershipInferenceRisk: number;
          attributeInferenceRisk: number;
        };
        overall: number;
        recommendations: string[];
      }

      async function validateSyntheticData<T>(
        realData: T[],
        syntheticData: T[],
        schema: z.ZodType<T>
      ): Promise<QualityReport> {
        const recommendations: string[] = [];

        // 1. Schema compliance
        let schemaCompliance = 0;
        for (const item of syntheticData) {
          if (schema.safeParse(item).success) schemaCompliance++;
        }
        schemaCompliance /= syntheticData.length;
        if (schemaCompliance < 0.95) {
          recommendations.push("Schema compliance below 95%. Validate generation prompts.");
        }

        // 2. Uniqueness
        const serialized = syntheticData.map((d) => JSON.stringify(d));
        const unique = new Set(serialized);
        const uniqueness = unique.size / syntheticData.length;
        const duplicateCount = syntheticData.length - unique.size;
        if (uniqueness < 0.9) {
          recommendations.push("High duplicate rate. Increase diversity in generation.");
        }

        // 3. Distribution comparison (simplified)
        const numericFields = getNumericFields(realData[0] as Record<string, unknown>);
        let distributionMatch = 0;
        for (const field of numericFields) {
          const realValues = realData.map((d) => (d as Record<string, number>)[field]);
          const synthValues = syntheticData.map((d) => (d as Record<string, number>)[field]);
          const realMean = mean(realValues);
          const synthMean = mean(synthValues);
          const realStd = std(realValues);
          const synthStd = std(synthValues);

          // Compare means and standard deviations
          const meanDiff = Math.abs(realMean - synthMean) / (realMean || 1);
          const stdDiff = Math.abs(realStd - synthStd) / (realStd || 1);
          distributionMatch += 1 - (meanDiff + stdDiff) / 2;
        }
        distributionMatch /= numericFields.length || 1;
        if (distributionMatch < 0.8) {
          recommendations.push("Distribution mismatch detected. Review generation parameters.");
        }

        // 4. Privacy check (simplified - check for exact matches)
        const realSet = new Set(realData.map((d) => JSON.stringify(d)));
        let memorizedCount = 0;
        for (const item of syntheticData) {
          if (realSet.has(JSON.stringify(item))) memorizedCount++;
        }
        const memorizedRatio = memorizedCount / syntheticData.length;
        if (memorizedRatio > 0.01) {
          recommendations.push(
            `WARNING: ${(memorizedRatio * 100).toFixed(1)}% of synthetic data matches real data exactly. Privacy risk.`
          );
        }

        return {
          fidelity: {
            distributionMatch,
            correlationMatch: 0.85, // Would need proper calculation
            schemaCompliance,
          },
          diversity: {
            uniqueness,
            coverageScore: Math.min(1, syntheticData.length / realData.length),
            duplicateCount,
          },
          utility: {
            modelPerformanceReal: 0, // Requires training
            modelPerformanceSynthetic: 0,
            performanceGap: 0,
          },
          privacy: {
            nearestNeighborDistance: 0,
            membershipInferenceRisk: memorizedRatio,
            attributeInferenceRisk: 0,
          },
          overall: (schemaCompliance + uniqueness + distributionMatch) / 3,
          recommendations,
        };
      }

      function getNumericFields(obj: Record<string, unknown>): string[] {
        return Object.entries(obj)
          .filter(([_, v]) => typeof v === "number")
          .map(([k]) => k);
      }

      function mean(arr: number[]): number {
        return arr.reduce((a, b) => a + b, 0) / arr.length;
      }

      function std(arr: number[]): number {
        const m = mean(arr);
        return Math.sqrt(arr.reduce((sum, x) => sum + (x - m) ** 2, 0) / arr.length);
      }
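      // Usage - a sketch assuming realTickets, syntheticTickets, and the
      // CustomerSupportTicket schema from the llm_synthetic_generation pattern
      const report = await validateSyntheticData(realTickets, syntheticTickets, CustomerSupportTicket);
      if (report.overall < 0.8) {
        console.warn("Quality below threshold:", report.recommendations);
      }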
  instruction_tuning_data:
    description: "Generate instruction-tuning datasets"
    when: "Fine-tuning LLMs for specific tasks"
    implementation: |
      import Anthropic from "@anthropic-ai/sdk";
      import { z } from "zod";
      const anthropic = new Anthropic();

      const InstructionExample = z.object({
        instruction: z.string().describe("Clear task instruction"),
        input: z.string().optional().describe("Optional input context"),
        output: z.string().describe("Expected model response"),
        category: z.string().describe("Task category"),
        difficulty: z.enum(["easy", "medium", "hard"]),
      });
      type InstructionExample = z.infer<typeof InstructionExample>;

      interface InstructionDatasetConfig {
        taskDescription: string;
        categories: string[];
        examplesPerCategory: number;
        includeEdgeCases: boolean;
        format: "alpaca" | "sharegpt" | "openai";
      }

      async function generateInstructionDataset(
        config: InstructionDatasetConfig
      ): Promise<InstructionExample[]> {
        const allExamples: InstructionExample[] = [];

        for (const category of config.categories) {
          const response = await anthropic.messages.create({
            model: "claude-sonnet-4-20250514",
            max_tokens: 4096,
            temperature: 0.9,
            messages: [
              {
                role: "user",
                content: `Generate ${config.examplesPerCategory} instruction-tuning examples for:

      Task: ${config.taskDescription}
      Category: ${category}

      Requirements:
      - Each example needs: instruction, input (optional), output
      - Vary difficulty: easy, medium, hard
      - Instructions should be clear and specific
      - Outputs should be high-quality examples of ideal responses
      ${config.includeEdgeCases ? "- Include edge cases and tricky scenarios" : ""}

      Output as JSON array with keys: instruction, input, output, category, difficulty`,
              },
            ],
          });

          const content = response.content[0];
          if (content.type === "text") {
            const examples = JSON.parse(extractJSON(content.text));
            for (const ex of examples) {
              const validated = InstructionExample.safeParse({ ...ex, category });
              if (validated.success) {
                allExamples.push(validated.data);
              }
            }
          }
        }

        return formatDataset(allExamples, config.format);
      }

      function formatDataset(
        examples: InstructionExample[],
        format: "alpaca" | "sharegpt" | "openai"
      ): InstructionExample[] {
        // Alpaca format is our base format
        if (format === "alpaca") return examples;
        // Convert to other formats as needed (see the sketch below)
        // ShareGPT format: { conversations: [{ from: "human", value: "" }, { from: "gpt", value: "" }] }
        // OpenAI format: { messages: [{ role: "system", content: "" }, ...] }
        return examples;
      }

      function extractJSON(text: string): string {
        const match = text.match(/\[[\s\S]*\]|\{[\s\S]*\}/);
        return match ? match[0] : text;
      }
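      // A hedged sketch of the conversions the stub above leaves open. The record
      // shapes follow the comments in formatDataset; joining instruction and input
      // with a blank line is an assumption, not a fixed convention:
      function toShareGPT(ex: InstructionExample) {
        const prompt = ex.input ? `${ex.instruction}\n\n${ex.input}` : ex.instruction;
        return {
          conversations: [
            { from: "human", value: prompt },
            { from: "gpt", value: ex.output },
          ],
        };
      }

      function toOpenAI(ex: InstructionExample) {
        const prompt = ex.input ? `${ex.instruction}\n\n${ex.input}` : ex.instruction;
        return {
          messages: [
            { role: "user", content: prompt },
            { role: "assistant", content: ex.output },
          ],
        };
      }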
anti_patterns:
  - pattern: "Generate data without validation"
    problem: "Synthetic data may not match real distribution"
    solution: "Always validate with quality metrics before use"
  - pattern: "Using real data as few-shot examples"
    problem: "Risk of memorization and privacy leakage"
    solution: "Use separate seed examples that aren't from production"
  - pattern: "Single technique for all data"
    problem: "Different data types need different approaches"
    solution: "Use SDV for tabular, LLM for text, domain-specific for specialized data"
  - pattern: "No bias checking"
    problem: "Synthetic data can amplify existing biases"
    solution: "Audit for demographic representation and harmful patterns"
  - pattern: "Assuming privacy by default"
    problem: "Synthetic data can leak information about real data"
    solution: "Run privacy metrics and add differential privacy if needed"
handoffs:
  - skill: ai-observability
    when: "Track synthetic data generation metrics"
  - skill: llm-integration
    when: "Using LLMs for generation"
  - skill: data-pipelines
    when: "Automating data generation workflows"