# ai/ai-music-audio/skill.yaml
# AI Music & Audio Generation Skill
# Patterns for AI-powered music, sound effects, and voice synthesis
version: 1.0.0
skill_id: ai-music-audio
id: ai-music-audio
name: AI Music & Audio Generation
category: ai
description: |
  Comprehensive patterns for AI-powered audio generation including
  text-to-music, voice synthesis, text-to-speech, sound effects, and
  audio manipulation using MusicGen, Bark, ElevenLabs, and more.
triggers:
- "music generation"
- "text to music"
- "AI music"
- "voice cloning"
- "text to speech"
- "TTS API"
- "ElevenLabs"
- "MusicGen"
- "Bark"
- "audio synthesis"
- "sound effects generation"
- "voice synthesis"
- "AudioCraft"
provides:
- "Music generation via MusicGen and other models"
- "Voice cloning and synthesis with ElevenLabs"
- "Text-to-speech implementation patterns"
- "Sound effects generation"
- "Audio watermarking with AudioSeal"
- "Content safety and provenance"
- "Streaming audio responses"
patterns:
-
name: "Music Generation with MusicGen/Replicate" description: "Generate music from text descriptions using Meta's MusicGen" when: "User needs AI-generated background music or soundtracks" implementation: | import Replicate from "replicate";
const replicate = new Replicate({ auth: process.env.REPLICATE_API_TOKEN, });
interface MusicOptions { prompt: string; duration?: number; // seconds (max 30) modelVersion?: "stereo-large" | "melody-large" | "large"; inputAudio?: string; // for melody conditioning temperature?: number; // 0-1, higher = more random topK?: number; // token sampling topP?: number; // nucleus sampling cfgCoefficient?: number; // classifier-free guidance seed?: number; // for reproducibility }
async function generateMusic(options: MusicOptions): Promise<string> { const { prompt, duration = 8, modelVersion = "stereo-large", inputAudio, temperature = 1.0, topK = 250, topP = 0, cfgCoefficient = 3, seed, } = options;
// Validate duration (MusicGen max is 30 seconds) const safeDuration = Math.min(duration, 30); const input: Record<string, unknown> = { prompt, duration: safeDuration, model_version: modelVersion, output_format: "mp3", temperature, top_k: topK, top_p: topP, classifier_free_guidance: cfgCoefficient, }; if (inputAudio) { input.input_audio = inputAudio; input.continuation = false; // Use as melody reference } if (seed !== undefined) { input.seed = seed; } const output = await replicate.run( "meta/musicgen:671ac645ce5e552cc63a54a2bbff63fcf798043055f2b43c18a9f16e11e82434", { input } ); return output as string;}
// Usage const audioUrl = await generateMusic({ prompt: "upbeat electronic dance music with synths and drums, 120 BPM", duration: 15, modelVersion: "stereo-large", });
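    // Follow-up sketch (our addition, not from the Replicate docs): download
    // the returned URL to disk. Assumes Node 18+ global fetch; saveAudio and
    // the output path are hypothetical names.
    import { writeFile } from "node:fs/promises";

    async function saveAudio(url: string, outPath: string): Promise<void> {
      const res = await fetch(url);
      if (!res.ok) throw new Error(`Download failed: ${res.statusText}`);
      await writeFile(outPath, Buffer.from(await res.arrayBuffer()));
    }

    // await saveAudio(audioUrl, "output/track.mp3");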
-
name: "Text-to-Speech with ElevenLabs" description: "High-quality voice synthesis for narration and dialogue" when: "User needs natural-sounding speech from text" implementation: | import { ElevenLabsClient } from "elevenlabs"; import { Readable } from "stream";
const elevenlabs = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY, });
interface TTSOptions { text: string; voiceId: string; // Use premade or cloned voice modelId?: string; // eleven_multilingual_v2, eleven_turbo_v2 stability?: number; // 0-1, lower = more expressive similarityBoost?: number; // 0-1, higher = closer to original style?: number; // 0-1, style exaggeration useSpeakerBoost?: boolean; // enhance voice clarity }
async function textToSpeech(options: TTSOptions): Promise<Buffer> { const { text, voiceId, modelId = "eleven_multilingual_v2", stability = 0.5, similarityBoost = 0.75, style = 0, useSpeakerBoost = true, } = options;
const audioStream = await elevenlabs.textToSpeech.convert(voiceId, { text, model_id: modelId, voice_settings: { stability, similarity_boost: similarityBoost, style, use_speaker_boost: useSpeakerBoost, }, }); // Convert stream to buffer const chunks: Buffer[] = []; for await (const chunk of audioStream) { chunks.push(Buffer.from(chunk)); } return Buffer.concat(chunks);}
// Streaming version for real-time playback async function streamTextToSpeech( options: TTSOptions, onChunk: (chunk: Buffer) => void ): Promise<void> { const audioStream = await elevenlabs.textToSpeech.convertAsStream( options.voiceId, { text: options.text, model_id: options.modelId || "eleven_turbo_v2", voice_settings: { stability: options.stability || 0.5, similarity_boost: options.similarityBoost || 0.75, }, } );
for await (const chunk of audioStream) { onChunk(Buffer.from(chunk)); }}
// Get available voices async function getVoices() { const response = await elevenlabs.voices.getAll(); return response.voices.map((voice) => ({ id: voice.voice_id, name: voice.name, category: voice.category, description: voice.description, previewUrl: voice.preview_url, })); }
-
name: "Voice Cloning with ElevenLabs" description: "Create custom voice clones from audio samples" when: "User wants to clone a voice for consistent narration" implementation: | import { ElevenLabsClient } from "elevenlabs"; import * as fs from "fs";
const elevenlabs = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY, });
// Instant Voice Clone (from 1-3 minute sample) async function createInstantVoiceClone( name: string, samplePaths: string[], description?: string ) { // Read audio files const files = samplePaths.map((path) => fs.createReadStream(path));
const voice = await elevenlabs.voices.add({ name, description: description || `Instant clone of ${name}`, files, labels: { type: "instant_clone", }, }); return { voiceId: voice.voice_id, name: voice.name, };}
// Professional Voice Clone (requires 1-3 hours of audio) // This initiates the process - actual training takes time async function initiateProfessionalClone( name: string, sampleUrls: string[], // Links to high-quality audio description: string ) { // PVC requires verification - user must agree to terms // and may need to complete identity verification
const response = await fetch( "https://api.elevenlabs.io/v1/voices/add/professional", { method: "POST", headers: { "xi-api-key": process.env.ELEVENLABS_API_KEY!, "Content-Type": "application/json", }, body: JSON.stringify({ name, description, sample_urls: sampleUrls, }), } ); if (!response.ok) { throw new Error(`PVC initiation failed: ${response.statusText}`); } return response.json();}
// Best practices for voice samples: // - Use high-quality recording (XLR mic, treated room) // - Aim for -6dB to -3dB peak levels // - No background noise, reverb, or music // - For instant: 1-3 minutes // - For professional: 1-3 hours // - Include varied emotional range and speaking styles
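    // A hedged pre-upload check applying the guidance above. Assumption: the
    // "music-metadata" npm package (not part of the ElevenLabs SDK) reads
    // sample durations; validateInstantSamples is a hypothetical name.
    import { parseFile } from "music-metadata";

    async function validateInstantSamples(samplePaths: string[]): Promise<void> {
      let totalSeconds = 0;
      for (const path of samplePaths) {
        const meta = await parseFile(path);
        totalSeconds += meta.format.duration ?? 0;
      }
      // Instant clones want roughly 1-3 minutes of clean audio in total
      if (totalSeconds < 60 || totalSeconds > 180) {
        throw new Error(
          `Samples total ${Math.round(totalSeconds)}s; aim for 60-180s`
        );
      }
    }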
-
name: "Sound Effects Generation" description: "Generate sound effects using AudioGen" when: "User needs custom sound effects for games or videos" implementation: | import Replicate from "replicate";
const replicate = new Replicate({ auth: process.env.REPLICATE_API_TOKEN, });
interface SFXOptions { prompt: string; // Description of sound duration?: number; // seconds guidanceScale?: number; // 1-20, higher = closer to prompt temperature?: number; // randomness }
async function generateSoundEffect(options: SFXOptions): Promise<string> { const { prompt, duration = 5, guidanceScale = 3, temperature = 1.0, } = options;
// Using AudioGen via Replicate const output = await replicate.run( "meta/audiogen:f8a0bac6-4f97-4e62-ad89-5e5c89c5d5f2", { input: { prompt, duration: Math.min(duration, 10), // AudioGen max 10s guidance_scale: guidanceScale, temperature, }, } ); return output as string;}
// Alternative: Using Bark for short sound effects with speech async function generateSpeechWithEffects(text: string): Promise<string> { // Bark supports sound effects in text // [laughs], [sighs], [clears throat], [music], etc. const output = await replicate.run( "suno-ai/bark:b76242b40d67c76ab6742e987628a2a9ac019e11d56ab96c4e91ce03b79b2787", { input: { prompt: text, text_temp: 0.7, waveform_temp: 0.7, }, } );
return output.audio_out as string;}
// SFX prompting tips: // - Be specific: "footsteps on wooden floor" not "walking" // - Include environment: "rain on metal roof in warehouse" // - Describe qualities: "deep rumbling thunder, distant" // - Avoid music terms for non-music sounds
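    // Illustrative helper (hypothetical, not from any SDK) that applies the
    // tips above by composing qualities, subject, and environment into one
    // specific prompt.
    function buildSfxPrompt(opts: {
      subject: string; // "footsteps on wooden floor"
      environment?: string; // "in warehouse"
      qualities?: string[]; // ["deep", "distant"]
    }): string {
      return [opts.qualities?.join(", "), opts.subject, opts.environment]
        .filter(Boolean)
        .join(" ");
    }

    // buildSfxPrompt({ subject: "rain on metal roof", environment: "in warehouse" })
    // => "rain on metal roof in warehouse"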
-
name: "Audio Watermarking with AudioSeal" description: "Embed imperceptible watermarks for content provenance" when: "User needs to mark AI-generated audio for detection" implementation: | // AudioSeal is typically run server-side with PyTorch // This shows the integration pattern
import { spawn } from "child_process";
interface WatermarkResult { watermarkedAudioPath: string; secret: string; // The embedded message }
interface DetectionResult { isWatermarked: boolean; confidence: number; decodedMessage?: string; }
// Python script for AudioSeal (run on server) const AUDIOSEAL_SCRIPT = ` import torch import torchaudio from audioseal import AudioSeal import sys import json
def watermark_audio(input_path, output_path, message=None): # Load model model = AudioSeal.load_generator("audioseal_wm_16bits")
# Load audio audio, sr = torchaudio.load(input_path) if sr != 16000: resampler = torchaudio.transforms.Resample(sr, 16000) audio = resampler(audio) # Generate watermark if message: # Convert message to bits msg_bits = torch.tensor([int(b) for b in format(int(message, 16), '016b')]) watermarked = model.embed(audio.unsqueeze(0), msg_bits.unsqueeze(0)) else: watermarked = model.embed(audio.unsqueeze(0)) # Save torchaudio.save(output_path, watermarked.squeeze(0), 16000) return {"success": True}
def detect_watermark(audio_path): detector = AudioSeal.load_detector("audioseal_detector_16bits")
audio, sr = torchaudio.load(audio_path) if sr != 16000: resampler = torchaudio.transforms.Resample(sr, 16000) audio = resampler(audio) result = detector.detect(audio.unsqueeze(0)) return { "is_watermarked": result.score > 0.5, "confidence": float(result.score), "message": result.message if hasattr(result, 'message') else None }
if name == "main": action = sys.argv[1] if action == "watermark": result = watermark_audio(sys.argv[2], sys.argv[3], sys.argv[4] if len(sys.argv) > 4 else None) elif action == "detect": result = detect_watermark(sys.argv[2]) print(json.dumps(result)) `;
// Node.js wrapper async function watermarkAudio( inputPath: string, outputPath: string, message?: string ): Promise<WatermarkResult> { return new Promise((resolve, reject) => { const args = ["watermark", inputPath, outputPath]; if (message) args.push(message); const process = spawn("python", ["-c", AUDIOSEAL_SCRIPT, ...args]); let output = ""; process.stdout.on("data", (data) => (output += data)); process.stderr.on("data", (data) => console.error(data.toString())); process.on("close", (code) => { if (code === 0) { resolve({ watermarkedAudioPath: outputPath, secret: message || "default", }); } else { reject(new Error("Watermarking failed")); } }); }); } async function detectWatermark(audioPath: string): Promise<DetectionResult> { return new Promise((resolve, reject) => { const process = spawn("python", [ "-c", AUDIOSEAL_SCRIPT, "detect", audioPath, ]); let output = ""; process.stdout.on("data", (data) => (output += data)); process.on("close", (code) => { if (code === 0) { const result = JSON.parse(output); resolve({ isWatermarked: result.is_watermarked, confidence: result.confidence, decodedMessage: result.message, }); } else { reject(new Error("Detection failed")); } }); }); }
-
name: "Streaming Audio Response" description: "Stream generated audio for real-time playback" when: "User needs audio to start playing before generation completes" implementation: | import { NextRequest, NextResponse } from "next/server"; import { ElevenLabsClient } from "elevenlabs";
const elevenlabs = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY, });
// Next.js streaming TTS endpoint export async function POST(request: NextRequest) { const { text, voiceId } = await request.json();
// Get streaming audio const audioStream = await elevenlabs.textToSpeech.convertAsStream( voiceId, { text, model_id: "eleven_turbo_v2", // Faster for streaming voice_settings: { stability: 0.5, similarity_boost: 0.75, }, } ); // Create readable stream for response const stream = new ReadableStream({ async start(controller) { for await (const chunk of audioStream) { controller.enqueue(chunk); } controller.close(); }, }); return new NextResponse(stream, { headers: { "Content-Type": "audio/mpeg", "Transfer-Encoding": "chunked", }, });}
// Frontend: Play streaming audio async function playStreamingAudio(text: string, voiceId: string) { const response = await fetch("/api/tts/stream", { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ text, voiceId }), });
// Use Media Source Extensions for streaming playback const mediaSource = new MediaSource(); const audio = new Audio(); audio.src = URL.createObjectURL(mediaSource); mediaSource.addEventListener("sourceopen", async () => { const sourceBuffer = mediaSource.addSourceBuffer("audio/mpeg"); const reader = response.body!.getReader(); while (true) { const { done, value } = await reader.read(); if (done) { mediaSource.endOfStream(); break; } // Wait for buffer to be ready await new Promise((resolve) => { if (sourceBuffer.updating) { sourceBuffer.addEventListener("updateend", resolve, { once: true }); } else { resolve(undefined); } }); sourceBuffer.appendBuffer(value); } }); await audio.play();}
-
name: "Lyria for High-Quality Music" description: "Use Fal.ai's Lyria model for production music" when: "User needs higher quality AI music than MusicGen" implementation: | // Using Fal.ai's Lyria model for music generation // Lyria2 provides higher quality output
interface LyriaOptions { prompt: string; duration?: number; negativePrompt?: string; }
async function generateMusicWithLyria(options: LyriaOptions) { const response = await fetch("https://fal.run/fal-ai/lyria2", { method: "POST", headers: { Authorization:
, "Content-Type": "application/json", }, body: JSON.stringify({ prompt: options.prompt, duration_seconds: options.duration || 30, negative_prompt: options.negativePrompt, }), });Key ${process.env.FAL_KEY}if (!response.ok) { throw new Error(`Lyria generation failed: ${response.statusText}`); } const result = await response.json(); return { audioUrl: result.audio.url, duration: result.audio.duration, };}
// Music prompting best practices: // - Include genre: "synthwave", "lo-fi hip hop", "orchestral" // - Specify tempo: "120 BPM", "slow tempo", "upbeat" // - Describe mood: "melancholic", "energetic", "peaceful" // - Add instruments: "piano and strings", "electric guitar solo" // - Reference styles: "in the style of 80s synth pop"
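    // Illustrative helper (hypothetical, field names are our own) that
    // composes the elements above into a single prompt string.
    function buildMusicPrompt(opts: {
      genre: string; // "synthwave", "lo-fi hip hop"
      instruments?: string; // "piano and strings"
      mood?: string; // "melancholic", "energetic"
      tempo?: string; // "120 BPM", "upbeat"
      styleRef?: string; // "in the style of 80s synth pop"
    }): string {
      return [opts.genre, opts.instruments, opts.mood, opts.tempo, opts.styleRef]
        .filter(Boolean)
        .join(", ");
    }

    // buildMusicPrompt({ genre: "synthwave", mood: "energetic", tempo: "120 BPM" })
    // => "synthwave, energetic, 120 BPM"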
anti_patterns:
-
name: "Exposing audio API keys client-side" why_bad: "Audio generation is expensive - leaked keys cause massive bills" example_bad: | // Frontend code with exposed key const elevenlabs = new ElevenLabsClient({ apiKey: "sk-..." // Exposed in browser! }); example_good: | // Server-side only // app/api/tts/route.ts const elevenlabs = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY, });
-
name: "No character/duration limits" why_bad: "Single long text can cost $10+ in TTS charges" example_bad: | async function synthesize(text: string) { return elevenlabs.textToSpeech.convert(voiceId, { text }); } example_good: | async function synthesize(text: string) { if (text.length > 5000) { throw new Error("Text exceeds 5000 character limit"); } // Also implement cost tracking await recordCost(userId, text.length * 0.00003); // ~$0.03/1000 chars return elevenlabs.textToSpeech.convert(voiceId, { text }); }
-
name: "Voice cloning without consent" why_bad: "Creates deepfakes, legal liability, platform bans" example_bad: | // Clone any uploaded voice await createVoiceClone(userUploadedAudio); example_good: | // Require explicit consent const consent = await db.voiceConsent.findUnique({ where: { voiceOwnerEmail, voiceId }, });
if (!consent || !consent.verified) { throw new Error("Voice consent not verified"); }
await createVoiceClone(userUploadedAudio);
-
name: "No content moderation on generated audio" why_bad: "AI can generate harmful content (hate speech, misinformation)" example_bad: | // Generate without checking const audio = await textToSpeech(userInput); return audio; example_good: | // Moderate text before synthesis const moderation = await openai.moderations.create({ input: userInput }); if (moderation.results[0].flagged) { throw new Error("Content violates policy"); } const audio = await textToSpeech(userInput);
-
name: "Synchronous long audio generation" why_bad: "Music generation takes 30-120+ seconds, blocks requests" example_bad: | // Blocks for 2+ minutes app.post("/api/music", async (req, res) => { const audio = await generateMusic(req.body.prompt); res.json({ audio }); }); example_good: | // Queue-based async app.post("/api/music", async (req, res) => { const job = await musicQueue.add("generate", req.body); res.json({ jobId: job.id, status: "pending" }); });
handoffs:
-
to: "text-to-video" when: "User wants to create video with AI soundtrack" context: "I've generated the audio. The text-to-video skill can create visuals."
-
to: "frontend" when: "User needs audio player UI" context: "Audio generation is handled. Frontend skill can build the player interface."
-
to: "ai-safety-alignment" when: "User needs deepfake detection or voice verification" context: "Audio is generated. Safety skill handles detection and verification."
references: