# ai/ai-music-audio/skill.yaml
# AI Music & Audio Generation Skill
# Patterns for AI-powered music, sound effects, and voice synthesis
version: 1.0.0
skill_id: ai-music-audio
id: ai-music-audio
name: AI Music & Audio Generation
category: ai
description: |
  Comprehensive patterns for AI-powered audio generation including
  text-to-music, voice synthesis, text-to-speech, sound effects, and
  audio manipulation using MusicGen, Bark, ElevenLabs, and more.
triggers:
- "music generation"
- "text to music"
- "AI music"
- "voice cloning"
- "text to speech"
- "TTS API"
- "ElevenLabs"
- "MusicGen"
- "Bark"
- "audio synthesis"
- "sound effects generation"
- "voice synthesis"
- "AudioCraft"
provides:
- "Music generation via MusicGen and other models"
- "Voice cloning and synthesis with ElevenLabs"
- "Text-to-speech implementation patterns"
- "Sound effects generation"
- "Audio watermarking with AudioSeal"
- "Content safety and provenance"
- "Streaming audio responses"
patterns:
-
name: "Music Generation with MusicGen/Replicate" description: "Generate music from text descriptions using Meta's MusicGen" when: "User needs AI-generated background music or soundtracks" implementation: | import Replicate from "replicate";
const replicate = new Replicate({ auth: process.env.REPLICATE_API_TOKEN, });
interface MusicOptions { prompt: string; duration?: number; // seconds (max 30) modelVersion?: "stereo-large" | "melody-large" | "large"; inputAudio?: string; // for melody conditioning temperature?: number; // 0-1, higher = more random topK?: number; // token sampling topP?: number; // nucleus sampling cfgCoefficient?: number; // classifier-free guidance seed?: number; // for reproducibility }
async function generateMusic(options: MusicOptions): Promise<string> { const { prompt, duration = 8, modelVersion = "stereo-large", inputAudio, temperature = 1.0, topK = 250, topP = 0, cfgCoefficient = 3, seed, } = options;
// Validate duration (MusicGen max is 30 seconds) const safeDuration = Math.min(duration, 30); const input: Record<string, unknown> = { prompt, duration: safeDuration, model_version: modelVersion, output_format: "mp3", temperature, top_k: topK, top_p: topP, classifier_free_guidance: cfgCoefficient, }; if (inputAudio) { input.input_audio = inputAudio; input.continuation = false; // Use as melody reference } if (seed !== undefined) { input.seed = seed; } const output = await replicate.run( "meta/musicgen:671ac645ce5e552cc63a54a2bbff63fcf798043055f2b43c18a9f16e11e82434", { input } ); return output as string;}
// Usage const audioUrl = await generateMusic({ prompt: "upbeat electronic dance music with synths and drums, 120 BPM", duration: 15, modelVersion: "stereo-large", });
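    // Follow-up sketch (our addition, not from the Replicate docs): download
    // the returned URL to disk. Assumes Node 18+ global fetch; saveAudio and
    // the output path are hypothetical names.
    import { writeFile } from "node:fs/promises";

    async function saveAudio(url: string, outPath: string): Promise<void> {
      const res = await fetch(url);
      if (!res.ok) throw new Error(`Download failed: ${res.statusText}`);
      await writeFile(outPath, Buffer.from(await res.arrayBuffer()));
    }

    // await saveAudio(audioUrl, "output/track.mp3");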
-
name: "Text-to-Speech with ElevenLabs" description: "High-quality voice synthesis for narration and dialogue" when: "User needs natural-sounding speech from text" implementation: | import { ElevenLabsClient } from "elevenlabs"; import { Readable } from "stream";
const elevenlabs = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY, });
interface TTSOptions { text: string; voiceId: string; // Use premade or cloned voice modelId?: string; // eleven_multilingual_v2, eleven_turbo_v2 stability?: number; // 0-1, lower = more expressive similarityBoost?: number; // 0-1, higher = closer to original style?: number; // 0-1, style exaggeration useSpeakerBoost?: boolean; // enhance voice clarity }
async function textToSpeech(options: TTSOptions): Promise<Buffer> { const { text, voiceId, modelId = "eleven_multilingual_v2", stability = 0.5, similarityBoost = 0.75, style = 0, useSpeakerBoost = true, } = options;
const audioStream = await elevenlabs.textToSpeech.convert(voiceId, { text, model_id: modelId, voice_settings: { stability, similarity_boost: similarityBoost, style, use_speaker_boost: useSpeakerBoost, }, }); // Convert stream to buffer const chunks: Buffer[] = []; for await (const chunk of audioStream) { chunks.push(Buffer.from(chunk)); } return Buffer.concat(chunks);}
// Streaming version for real-time playback async function streamTextToSpeech( options: TTSOptions, onChunk: (chunk: Buffer) => void ): Promise<void> { const audioStream = await elevenlabs.textToSpeech.convertAsStream( options.voiceId, { text: options.text, model_id: options.modelId || "eleven_turbo_v2", voice_settings: { stability: options.stability || 0.5, similarity_boost: options.similarityBoost || 0.75, }, } );
for await (const chunk of audioStream) { onChunk(Buffer.from(chunk)); }}
// Get available voices async function getVoices() { const response = await elevenlabs.voices.getAll(); return response.voices.map((voice) => ({ id: voice.voice_id, name: voice.name, category: voice.category, description: voice.description, previewUrl: voice.preview_url, })); }
-
name: "Voice Cloning with ElevenLabs" description: "Create custom voice clones from audio samples" when: "User wants to clone a voice for consistent narration" implementation: | import { ElevenLabsClient } from "elevenlabs"; import * as fs from "fs";
const elevenlabs = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY, });
// Instant Voice Clone (from 1-3 minute sample) async function createInstantVoiceClone( name: string, samplePaths: string[], description?: string ) { // Read audio files const files = samplePaths.map((path) => fs.createReadStream(path));
const voice = await elevenlabs.voices.add({ name, description: description || `Instant clone of ${name}`, files, labels: { type: "instant_clone", }, }); return { voiceId: voice.voice_id, name: voice.name, };}
// Professional Voice Clone (requires 1-3 hours of audio) // This initiates the process - actual training takes time async function initiateProfessionalClone( name: string, sampleUrls: string[], // Links to high-quality audio description: string ) { // PVC requires verification - user must agree to terms // and may need to complete identity verification
const response = await fetch( "https://api.elevenlabs.io/v1/voices/add/professional", { method: "POST", headers: { "xi-api-key": process.env.ELEVENLABS_API_KEY!, "Content-Type": "application/json", }, body: JSON.stringify({ name, description, sample_urls: sampleUrls, }), } ); if (!response.ok) { throw new Error(`PVC initiation failed: ${response.statusText}`); } return response.json();}
// Best practices for voice samples: // - Use high-quality recording (XLR mic, treated room) // - Aim for -6dB to -3dB peak levels // - No background noise, reverb, or music // - For instant: 1-3 minutes // - For professional: 1-3 hours // - Include varied emotional range and speaking styles
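    // A hedged pre-upload check applying the guidance above. Assumption: the
    // "music-metadata" npm package (not part of the ElevenLabs SDK) reads
    // sample durations; validateInstantSamples is a hypothetical name.
    import { parseFile } from "music-metadata";

    async function validateInstantSamples(samplePaths: string[]): Promise<void> {
      let totalSeconds = 0;
      for (const path of samplePaths) {
        const meta = await parseFile(path);
        totalSeconds += meta.format.duration ?? 0;
      }
      // Instant clones want roughly 1-3 minutes of clean audio in total
      if (totalSeconds < 60 || totalSeconds > 180) {
        throw new Error(
          `Samples total ${Math.round(totalSeconds)}s; aim for 60-180s`
        );
      }
    }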
-
name: "Sound Effects Generation" description: "Generate sound effects using AudioGen" when: "User needs custom sound effects for games or videos" implementation: | import Replicate from "replicate";
const replicate = new Replicate({ auth: process.env.REPLICATE_API_TOKEN, });
interface SFXOptions { prompt: string; // Description of sound duration?: number; // seconds guidanceScale?: number; // 1-20, higher = closer to prompt temperature?: number; // randomness }
async function generateSoundEffect(options: SFXOptions): Promise<string> { const { prompt, duration = 5, guidanceScale = 3, temperature = 1.0, } = options;
// Using AudioGen via Replicate const output = await replicate.run( "meta/audiogen:f8a0bac6-4f97-4e62-ad89-5e5c89c5d5f2", { input: { prompt, duration: Math.min(duration, 10), // AudioGen max 10s guidance_scale: guidanceScale, temperature, }, } ); return output as string;}
// Alternative: Using Bark for short sound effects with speech async function generateSpeechWithEffects(text: string): Promise<string> { // Bark supports sound effects in text // [laughs], [sighs], [clears throat], [music], etc. const output = await replicate.run( "suno-ai/bark:b76242b40d67c76ab6742e987628a2a9ac019e11d56ab96c4e91ce03b79b2787", { input: { prompt: text, text_temp: 0.7, waveform_temp: 0.7, }, } );
return output.audio_out as string;}
// SFX prompting tips: // - Be specific: "footsteps on wooden floor" not "walking" // - Include environment: "rain on metal roof in warehouse" // - Describe qualities: "deep rumbling thunder, distant" // - Avoid music terms for non-music sounds
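    // Illustrative helper (hypothetical, not from any SDK) that applies the
    // tips above by composing qualities, subject, and environment into one
    // specific prompt.
    function buildSfxPrompt(opts: {
      subject: string; // "footsteps on wooden floor"
      environment?: string; // "in warehouse"
      qualities?: string[]; // ["deep", "distant"]
    }): string {
      return [opts.qualities?.join(", "), opts.subject, opts.environment]
        .filter(Boolean)
        .join(" ");
    }

    // buildSfxPrompt({ subject: "rain on metal roof", environment: "in warehouse" })
    // => "rain on metal roof in warehouse"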
-
name: "Audio Watermarking with AudioSeal" description: "Embed imperceptible watermarks for content provenance" when: "User needs to mark AI-generated audio for detection" implementation: | // AudioSeal is typically run server-side with PyTorch // This shows the integration pattern
import { spawn } from "child_process";
interface WatermarkResult { watermarkedAudioPath: string; secret: string; // The embedded message }
interface DetectionResult { isWatermarked: boolean; confidence: number; decodedMessage?: string; }
// Python script for AudioSeal (run on server) const AUDIOSEAL_SCRIPT = ` import torch import torchaudio from audioseal import AudioSeal import sys import json
def watermark_audio(input_path, output_path, message=None): # Load model model = AudioSeal.load_generator("audioseal_wm_16bits")
# Load audio audio, sr = torchaudio.load(input_path) if sr != 16000: resampler = torchaudio.transforms.Resample(sr, 16000) audio = resampler(audio) # Generate watermark if message: # Convert message to bits msg_bits = torch.tensor([int(b) for b in format(int(message, 16), '016b')]) watermarked = model.embed(audio.unsqueeze(0), msg_bits.unsqueeze(0)) else: watermarked = model.embed(audio.unsqueeze(0)) # Save torchaudio.save(output_path, watermarked.squeeze(0), 16000) return {"success": True}
def detect_watermark(audio_path): detector = AudioSeal.load_detector("audioseal_detector_16bits")
audio, sr = torchaudio.load(audio_path) if sr != 16000: resampler = torchaudio.transforms.Resample(sr, 16000) audio = resampler(audio) result = detector.detect(audio.unsqueeze(0)) return { "is_watermarked": result.score > 0.5, "confidence": float(result.score), "message": result.message if hasattr(result, 'message') else None }
if name == "main": action = sys.argv[1] if action == "watermark": result = watermark_audio(sys.argv[2], sys.argv[3], sys.argv[4] if len(sys.argv) > 4 else None) elif action == "detect": result = detect_watermark(sys.argv[2]) print(json.dumps(result)) `;
// Node.js wrapper async function watermarkAudio( inputPath: string, outputPath: string, message?: string ): Promise<WatermarkResult> { return new Promise((resolve, reject) => { const args = ["watermark", inputPath, outputPath]; if (message) args.push(message); const process = spawn("python", ["-c", AUDIOSEAL_SCRIPT, ...args]); let output = ""; process.stdout.on("data", (data) => (output += data)); process.stderr.on("data", (data) => console.error(data.toString())); process.on("close", (code) => { if (code === 0) { resolve({ watermarkedAudioPath: outputPath, secret: message || "default", }); } else { reject(new Error("Watermarking failed")); } }); }); } async function detectWatermark(audioPath: string): Promise<DetectionResult> { return new Promise((resolve, reject) => { const process = spawn("python", [ "-c", AUDIOSEAL_SCRIPT, "detect", audioPath, ]); let output = ""; process.stdout.on("data", (data) => (output += data)); process.on("close", (code) => { if (code === 0) { const result = JSON.parse(output); resolve({ isWatermarked: result.is_watermarked, confidence: result.confidence, decodedMessage: result.message, }); } else { reject(new Error("Detection failed")); } }); }); }
-
name: "Streaming Audio Response" description: "Stream generated audio for real-time playback" when: "User needs audio to start playing before generation completes" implementation: | import { NextRequest, NextResponse } from "next/server"; import { ElevenLabsClient } from "elevenlabs";
const elevenlabs = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY, });
// Next.js streaming TTS endpoint export async function POST(request: NextRequest) { const { text, voiceId } = await request.json();
// Get streaming audio const audioStream = await elevenlabs.textToSpeech.convertAsStream( voiceId, { text, model_id: "eleven_turbo_v2", // Faster for streaming voice_settings: { stability: 0.5, similarity_boost: 0.75, }, } ); // Create readable stream for response const stream = new ReadableStream({ async start(controller) { for await (const chunk of audioStream) { controller.enqueue(chunk); } controller.close(); }, }); return new NextResponse(stream, { headers: { "Content-Type": "audio/mpeg", "Transfer-Encoding": "chunked", }, });}
// Frontend: Play streaming audio async function playStreamingAudio(text: string, voiceId: string) { const response = await fetch("/api/tts/stream", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ text, voiceId }), });
// Use Media Source Extensions for streaming playback const mediaSource = new MediaSource(); const audio = new Audio(); audio.src = URL.createObjectURL(mediaSource); mediaSource.addEventListener("sourceopen", async () => { const sourceBuffer = mediaSource.addSourceBuffer("audio/mpeg"); const reader = response.body!.getReader(); while (true) { const { done, value } = await reader.read(); if (done) { mediaSource.endOfStream(); break; } // Wait for buffer to be ready await new Promise((resolve) => { if (sourceBuffer.updating) { sourceBuffer.addEventListener("updateend", resolve, { once: true }); } else { resolve(undefined); } }); sourceBuffer.appendBuffer(value); } }); await audio.play();}
-
name: "Lyria for High-Quality Music" description: "Use Fal.ai's Lyria model for production music" when: "User needs higher quality AI music than MusicGen" implementation: | // Using Fal.ai's Lyria model for music generation // Lyria2 provides higher quality output
interface LyriaOptions { prompt: string; duration?: number; negativePrompt?: string; }
async function generateMusicWithLyria(options: LyriaOptions) { const response = await fetch("https://fal.run/fal-ai/lyria2", { method: "POST", headers: { Authorization:
, "Content-Type": "application/json", }, body: JSON.stringify({ prompt: options.prompt, duration_seconds: options.duration || 30, negative_prompt: options.negativePrompt, }), });Key ${process.env.FAL_KEY}if (!response.ok) { throw new Error(`Lyria generation failed: ${response.statusText}`); } const result = await response.json(); return { audioUrl: result.audio.url, duration: result.audio.duration, };}
// Music prompting best practices: // - Include genre: "synthwave", "lo-fi hip hop", "orchestral" // - Specify tempo: "120 BPM", "slow tempo", "upbeat" // - Describe mood: "melancholic", "energetic", "peaceful" // - Add instruments: "piano and strings", "electric guitar solo" // - Reference styles: "in the style of 80s synth pop"
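    // Illustrative helper (hypothetical, field names are our own) that
    // composes the elements above into a single prompt string.
    function buildMusicPrompt(opts: {
      genre: string; // "synthwave", "lo-fi hip hop"
      instruments?: string; // "piano and strings"
      mood?: string; // "melancholic", "energetic"
      tempo?: string; // "120 BPM", "upbeat"
      styleRef?: string; // "in the style of 80s synth pop"
    }): string {
      return [opts.genre, opts.instruments, opts.mood, opts.tempo, opts.styleRef]
        .filter(Boolean)
        .join(", ");
    }

    // buildMusicPrompt({ genre: "synthwave", mood: "energetic", tempo: "120 BPM" })
    // => "synthwave, energetic, 120 BPM"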
anti_patterns:
-
name: "Exposing audio API keys client-side" why_bad: "Audio generation is expensive - leaked keys cause massive bills" example_bad: | // Frontend code with exposed key const elevenlabs = new ElevenLabsClient({ apiKey: "sk-..." // Exposed in browser! }); example_good: | // Server-side only // app/api/tts/route.ts const elevenlabs = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY, });
-
name: "No character/duration limits" why_bad: "Single long text can cost $10+ in TTS charges" example_bad: | async function synthesize(text: string) { return elevenlabs.textToSpeech.convert(voiceId, { text }); } example_good: | async function synthesize(text: string) { if (text.length > 5000) { throw new Error("Text exceeds 5000 character limit"); } // Also implement cost tracking await recordCost(userId, text.length * 0.00003); // ~$0.03/1000 chars return elevenlabs.textToSpeech.convert(voiceId, { text }); }
-
name: "Voice cloning without consent" why_bad: "Creates deepfakes, legal liability, platform bans" example_bad: | // Clone any uploaded voice await createVoiceClone(userUploadedAudio); example_good: | // Require explicit consent const consent = await db.voiceConsent.findUnique({ where: { voiceOwnerEmail, voiceId }, });
if (!consent || !consent.verified) { throw new Error("Voice consent not verified"); }
await createVoiceClone(userUploadedAudio);
-
name: "No content moderation on generated audio" why_bad: "AI can generate harmful content (hate speech, misinformation)" example_bad: | // Generate without checking const audio = await textToSpeech(userInput); return audio; example_good: | // Moderate text before synthesis const moderation = await openai.moderations.create({ input: userInput }); if (moderation.results[0].flagged) { throw new Error("Content violates policy"); } const audio = await textToSpeech(userInput);
-
name: "Synchronous long audio generation" why_bad: "Music generation takes 30-120+ seconds, blocks requests" example_bad: | // Blocks for 2+ minutes app.post("/api/music", async (req, res) => { const audio = await generateMusic(req.body.prompt); res.json({ audio }); }); example_good: | // Queue-based async app.post("/api/music", async (req, res) => { const job = await musicQueue.add("generate", req.body); res.json({ jobId: job.id, status: "pending" }); });
handoffs:
-
to: "text-to-video" when: "User wants to create video with AI soundtrack" context: "I've generated the audio. The text-to-video skill can create visuals."
-
to: "frontend" when: "User needs audio player UI" context: "Audio generation is handled. Frontend skill can build the player interface."
-
to: "ai-safety-alignment" when: "User needs deepfake detection or voice verification" context: "Audio is generated. Safety skill handles detection and verification."
references: