git clone https://github.com/vibeforge1111/vibeship-spawner-skills
ai/multimodal-ai/skill.yaml

Multimodal AI
Working with text, images, audio, and video in AI systems
version: 1.0.0
name: Multimodal AI
id: multimodal-ai
category: ai
difficulty: advanced
description: |
  Patterns for building multimodal AI applications that combine text, images,
  audio, and video. Covers vision APIs, audio transcription, and unified pipelines.
triggers:
- "multimodal AI"
- "vision API"
- "image understanding"
- "GPT-4V"
- "Claude vision"
- "audio transcription"
- "Whisper"
- "document extraction"
- "image to text"
technologies:
- OpenAI GPT-4o
- Claude Vision
- Whisper
- Gemini
- Google Cloud Vision
patterns:
  openai_vision:
    description: "Process images with GPT-4o vision"
    when: "Need to analyze, describe, or extract from images"
    implementation: |
      import OpenAI from "openai";
      import { readFile } from "fs/promises";
      import path from "path";

      const openai = new OpenAI();

      interface VisionOptions {
        detail?: "low" | "high" | "auto";
        maxTokens?: number;
      }

      // Process single image from URL
      async function analyzeImageUrl(
        imageUrl: string,
        prompt: string,
        options?: VisionOptions
      ): Promise<string> {
        const response = await openai.chat.completions.create({
          model: "gpt-4o",
          max_tokens: options?.maxTokens ?? 1024,
          messages: [
            {
              role: "user",
              content: [
                { type: "text", text: prompt },
                {
                  type: "image_url",
                  image_url: {
                    url: imageUrl,
                    detail: options?.detail ?? "auto",
                  },
                },
              ],
            },
          ],
        });

        return response.choices[0].message.content ?? "";
      }

      // Process image from file (base64)
      async function analyzeImageFile(
        filePath: string,
        prompt: string,
        options?: VisionOptions
      ): Promise<string> {
        const buffer = await readFile(filePath);
        const base64 = buffer.toString("base64");

        const ext = path.extname(filePath).toLowerCase();
        const mimeType = {
          ".jpg": "image/jpeg",
          ".jpeg": "image/jpeg",
          ".png": "image/png",
          ".gif": "image/gif",
          ".webp": "image/webp",
        }[ext] ?? "image/jpeg";

        const response = await openai.chat.completions.create({
          model: "gpt-4o",
          max_tokens: options?.maxTokens ?? 1024,
          messages: [
            {
              role: "user",
              content: [
                { type: "text", text: prompt },
                {
                  type: "image_url",
                  image_url: {
                    url: `data:${mimeType};base64,${base64}`,
                    detail: options?.detail ?? "auto",
                  },
                },
              ],
            },
          ],
        });

        return response.choices[0].message.content ?? "";
      }

      // Compare multiple images
      async function compareImages(
        images: string[],
        prompt: string
      ): Promise<string> {
        const imageContent = images.map((url) => ({
          type: "image_url" as const,
          image_url: { url, detail: "high" as const },
        }));

        const response = await openai.chat.completions.create({
          model: "gpt-4o",
          max_tokens: 2048,
          messages: [
            {
              role: "user",
              content: [{ type: "text", text: prompt }, ...imageContent],
            },
          ],
        });

        return response.choices[0].message.content ?? "";
      }

      // Structured extraction from images
      interface ExtractedData<T> {
        data: T;
        confidence: number;
      }

      async function extractStructuredData<T>(
        imageUrl: string,
        schema: string,
        example?: T
      ): Promise<ExtractedData<T>> {
        const prompt = `Extract structured data from this image.
      Output JSON matching this schema:
      ${schema}

      ${example ? `Example output:\n${JSON.stringify(example, null, 2)}` : ""}

      Respond ONLY with valid JSON. Include a "confidence" field (0-1).`;

        const response = await openai.chat.completions.create({
          model: "gpt-4o",
          max_tokens: 2048,
          response_format: { type: "json_object" },
          messages: [
            {
              role: "user",
              content: [
                { type: "text", text: prompt },
                { type: "image_url", image_url: { url: imageUrl, detail: "high" } },
              ],
            },
          ],
        });

        const result = JSON.parse(response.choices[0].message.content ?? "{}");
        return result as ExtractedData<T>;
      }
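      // Illustrative usage sketch: the URL and prompt below are placeholders.
      async function exampleDescribePhoto(): Promise<string> {
        return analyzeImageUrl(
          "https://example.com/photo.jpg",
          "Describe this image in one sentence",
          { detail: "low", maxTokens: 256 }
        );
      }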
  claude_vision:
    description: "Process images with Claude"
    when: "Need document extraction or long-context vision"
    implementation: |
      import Anthropic from "@anthropic-ai/sdk";
      import { readFile } from "fs/promises";

      const anthropic = new Anthropic();

      interface ClaudeVisionOptions {
        maxTokens?: number;
        thinking?: boolean;
      }

      // Process image with Claude
      async function analyzeWithClaude(
        imageSource: string | Buffer,
        prompt: string,
        options?: ClaudeVisionOptions
      ): Promise<string> {
        let imageData: Anthropic.ImageBlockParam;

        if (Buffer.isBuffer(imageSource)) {
          imageData = {
            type: "image",
            source: {
              type: "base64",
              media_type: "image/png",
              data: imageSource.toString("base64"),
            },
          };
        } else if (imageSource.startsWith("data:")) {
          // Base64 data URL
          const [header, data] = imageSource.split(",");
          const mediaType = header.match(/data:(.+);base64/)?.[1] ?? "image/png";
          imageData = {
            type: "image",
            source: {
              type: "base64",
              media_type: mediaType as "image/png",
              data,
            },
          };
        } else {
          // URL
          imageData = {
            type: "image",
            source: {
              type: "url",
              url: imageSource,
            },
          };
        }

        const response = await anthropic.messages.create({
          model: "claude-sonnet-4-20250514",
          max_tokens: options?.maxTokens ?? 2048,
          messages: [
            {
              role: "user",
              content: [imageData, { type: "text", text: prompt }],
            },
          ],
        });

        const textBlock = response.content.find((b) => b.type === "text");
        return textBlock?.type === "text" ? textBlock.text : "";
      }

      // Document extraction with Claude (superior for tables/forms)
      interface DocumentField {
        name: string;
        value: string;
        confidence: number;
        location?: string;
      }

      async function extractDocument(
        imageUrl: string,
        fieldNames: string[]
      ): Promise<DocumentField[]> {
        const prompt = `Extract the following fields from this document image:
      ${fieldNames.map((f) => `- ${f}`).join("\n")}

      For each field, provide:
      - name: The field name
      - value: The extracted value (exact text from document)
      - confidence: Your confidence (0-1)
      - location: Where on the document (e.g., "top-left", "table row 3")

      Output as JSON array. If a field is not found, set value to null.`;

        const response = await analyzeWithClaude(imageUrl, prompt);

        try {
          // Extract JSON from response
          const jsonMatch = response.match(/\[[\s\S]*\]/);
          return jsonMatch ? JSON.parse(jsonMatch[0]) : [];
        } catch {
          return [];
        }
      }

      // Multi-page document processing
      async function processMultiPageDocument(
        pages: string[],
        task: string
      ): Promise<string> {
        const imageContent = pages.map((url) => ({
          type: "image" as const,
          source: { type: "url" as const, url },
        }));

        const response = await anthropic.messages.create({
          model: "claude-sonnet-4-20250514",
          max_tokens: 4096,
          messages: [
            {
              role: "user",
              content: [
                ...imageContent,
                {
                  type: "text",
                  text: `These are pages of a document. ${task}`,
                },
              ],
            },
          ],
        });

        const textBlock = response.content.find((b) => b.type === "text");
        return textBlock?.type === "text" ? textBlock.text : "";
      }
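      // Illustrative usage sketch: the document URL and field names below are placeholders.
      async function exampleInvoiceFields(): Promise<DocumentField[]> {
        return extractDocument("https://example.com/invoice.png", [
          "invoice_number",
          "total_amount",
          "due_date",
        ]);
      }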
  audio_transcription:
    description: "Transcribe audio with Whisper"
    when: "Converting speech to text"
    implementation: |
      import OpenAI from "openai";
      import { createReadStream } from "fs";
      import { writeFile } from "fs/promises";

      const openai = new OpenAI();

      interface TranscriptionOptions {
        language?: string;
        prompt?: string;
        responseFormat?: "json" | "text" | "srt" | "vtt" | "verbose_json";
        temperature?: number;
      }

      interface TranscriptionResult {
        text: string;
        segments?: Array<{
          start: number;
          end: number;
          text: string;
        }>;
        language?: string;
        duration?: number;
      }

      // Basic transcription
      async function transcribeAudio(
        audioPath: string,
        options?: TranscriptionOptions
      ): Promise<TranscriptionResult> {
        const response = await openai.audio.transcriptions.create({
          file: createReadStream(audioPath),
          model: "whisper-1",
          language: options?.language,
          prompt: options?.prompt,
          response_format: options?.responseFormat ?? "verbose_json",
          temperature: options?.temperature ?? 0,
        });

        if (typeof response === "string") {
          return { text: response };
        }

        return {
          text: response.text,
          segments: response.segments?.map((s) => ({
            start: s.start,
            end: s.end,
            text: s.text,
          })),
          language: response.language,
          duration: response.duration,
        };
      }

      // Transcribe with speaker diarization (using external service or prompt)
      async function transcribeWithSpeakers(
        audioPath: string
      ): Promise<Array<{ speaker: string; start: number; end: number; text: string }>> {
        // Step 1: Basic transcription with timestamps
        const transcription = await transcribeAudio(audioPath, {
          responseFormat: "verbose_json",
        });

        // Step 2: Use GPT to identify speakers
        const segments = transcription.segments ?? [];
        const fullText = segments
          .map((s) => `[${s.start.toFixed(1)}s] ${s.text}`)
          .join("\n");

        const response = await openai.chat.completions.create({
          model: "gpt-4o",
          messages: [
            {
              role: "system",
              content: `Analyze this transcript and identify different speakers.
      Output JSON array with: speaker, start, end, text.
      Assign speakers as "Speaker 1", "Speaker 2", etc.`,
            },
            { role: "user", content: fullText },
          ],
          response_format: { type: "json_object" },
        });

        const result = JSON.parse(response.choices[0].message.content ?? "{}");
        return result.segments ?? [];
      }

      // Real-time transcription with streaming
      // Note: OpenAI Whisper doesn't support streaming, but you can chunk
      async function* streamTranscription(
        audioChunks: AsyncIterable<Buffer>,
        chunkDurationSec: number = 5
      ): AsyncGenerator<TranscriptionResult> {
        let buffer = Buffer.alloc(0);
        const sampleRate = 16000; // Assuming 16kHz mono
        const bytesPerChunk = sampleRate * 2 * chunkDurationSec; // 16-bit audio

        for await (const chunk of audioChunks) {
          buffer = Buffer.concat([buffer, chunk]);

          while (buffer.length >= bytesPerChunk) {
            const audioChunk = buffer.slice(0, bytesPerChunk);
            buffer = buffer.slice(bytesPerChunk);

            // Save temp file (Whisper requires file input)
            const tempPath = `/tmp/chunk_${Date.now()}.wav`;
            await writeFile(tempPath, createWavFile(audioChunk, sampleRate));

            const result = await transcribeAudio(tempPath);
            yield result;
          }
        }

        // Process remaining buffer
        if (buffer.length > 0) {
          const tempPath = `/tmp/chunk_final_${Date.now()}.wav`;
          await writeFile(tempPath, createWavFile(buffer, 16000));
          yield await transcribeAudio(tempPath);
        }
      }

      function createWavFile(pcmData: Buffer, sampleRate: number): Buffer {
        // WAV header creation (simplified)
        const header = Buffer.alloc(44);
        // ... WAV header bytes
        return Buffer.concat([header, pcmData]);
      }
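      // Illustrative usage sketch: the audio path below is a placeholder.
      async function exampleMeetingTranscript(): Promise<TranscriptionResult> {
        return transcribeAudio("./meeting.mp3", {
          language: "en",
          responseFormat: "verbose_json",
        });
      }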
  unified_multimodal:
    description: "Unified pipeline for multiple modalities"
    when: "Processing mixed input types"
    implementation: |
      import OpenAI from "openai";
      import Anthropic from "@anthropic-ai/sdk";
      import { createReadStream } from "fs";
      import { writeFile } from "fs/promises";

      const openai = new OpenAI();
      const anthropic = new Anthropic();

      type InputModality = "text" | "image" | "audio" | "video";

      interface MultimodalInput {
        type: InputModality;
        content: string | Buffer;
        metadata?: Record<string, unknown>;
      }

      interface MultimodalOutput {
        text: string;
        modalities: InputModality[];
        tokenUsage: {
          input: number;
          output: number;
        };
        processingTime: number;
      }

      class UnifiedMultimodalPipeline {
        private model: "gpt-4o" | "claude";

        constructor(model: "gpt-4o" | "claude" = "gpt-4o") {
          this.model = model;
        }

        async process(
          inputs: MultimodalInput[],
          instruction: string
        ): Promise<MultimodalOutput> {
          const startTime = Date.now();
          const modalities = inputs.map((i) => i.type);

          // Preprocess audio to text (neither GPT-4o nor Claude accept audio directly in chat)
          const processedInputs = await Promise.all(
            inputs.map(async (input) => {
              if (input.type === "audio") {
                const transcription = await this.transcribeAudio(input.content as Buffer);
                return {
                  type: "text" as const,
                  content: `[Transcribed audio]: ${transcription}`,
                  metadata: { originalType: "audio" },
                };
              }
              return input;
            })
          );

          // Build content array
          const content = processedInputs.map((input) => {
            if (input.type === "text") {
              return { type: "text" as const, text: input.content as string };
            }
            if (input.type === "image") {
              const imageData =
                typeof input.content === "string"
                  ? input.content
                  : `data:image/png;base64,${(input.content as Buffer).toString("base64")}`;
              return {
                type: "image_url" as const,
                image_url: { url: imageData, detail: "high" as const },
              };
            }
            throw new Error(`Unsupported type: ${input.type}`);
          });

          // Add instruction
          content.push({ type: "text" as const, text: instruction });

          // Call appropriate API
          let response: string;
          let usage = { input: 0, output: 0 };

          if (this.model === "gpt-4o") {
            const result = await openai.chat.completions.create({
              model: "gpt-4o",
              messages: [{ role: "user", content }],
              max_tokens: 2048,
            });
            response = result.choices[0].message.content ?? "";
            usage = {
              input: result.usage?.prompt_tokens ?? 0,
              output: result.usage?.completion_tokens ?? 0,
            };
          } else {
            // Convert to Claude format
            const claudeContent = content.map((c) => {
              if (c.type === "text") return c;
              if (c.type === "image_url") {
                return {
                  type: "image" as const,
                  source: { type: "url" as const, url: c.image_url.url },
                };
              }
              return c;
            });

            const result = await anthropic.messages.create({
              model: "claude-sonnet-4-20250514",
              messages: [{ role: "user", content: claudeContent }],
              max_tokens: 2048,
            });
            const textBlock = result.content.find((b) => b.type === "text");
            response = textBlock?.type === "text" ? textBlock.text : "";
            usage = {
              input: result.usage.input_tokens,
              output: result.usage.output_tokens,
            };
          }

          return {
            text: response,
            modalities,
            tokenUsage: usage,
            processingTime: Date.now() - startTime,
          };
        }

        private async transcribeAudio(audio: Buffer): Promise<string> {
          // Whisper requires file input, so write to a temp file first
          const tempPath = `/tmp/audio_${Date.now()}.mp3`;
          await writeFile(tempPath, audio);

          const result = await openai.audio.transcriptions.create({
            file: createReadStream(tempPath),
            model: "whisper-1",
          });
          return result.text;
        }
      }
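      // Illustrative usage sketch: the screenshot buffer and ticket text are placeholders.
      async function exampleMixedInputs(screenshot: Buffer): Promise<MultimodalOutput> {
        const pipeline = new UnifiedMultimodalPipeline("gpt-4o");
        return pipeline.process(
          [
            { type: "image", content: screenshot },
            { type: "text", content: "Support ticket: the dashboard chart looks wrong." },
          ],
          "Compare the ticket text with the screenshot and summarize the likely issue"
        );
      }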
  image_token_optimization:
    description: "Optimize image token usage"
    when: "Managing costs with vision APIs"
    implementation: |
      import OpenAI from "openai";
      import sharp from "sharp";

      const openai = new OpenAI();

      interface ImageOptimization {
        originalSize: { width: number; height: number };
        optimizedSize: { width: number; height: number };
        estimatedTokens: number;
        detailLevel: "low" | "high";
      }

      // GPT-4o token estimation for images
      // Low detail: 85 tokens fixed
      // High detail: 85 + 170 * ceil(width/512) * ceil(height/512)
      function estimateImageTokens(
        width: number,
        height: number,
        detail: "low" | "high"
      ): number {
        if (detail === "low") {
          return 85;
        }

        // Scale down if larger than 2048 on any side
        const maxDim = 2048;
        if (width > maxDim || height > maxDim) {
          const scale = maxDim / Math.max(width, height);
          width = Math.round(width * scale);
          height = Math.round(height * scale);
        }

        // Scale to fit in 768px on shortest side
        const shortSide = Math.min(width, height);
        if (shortSide > 768) {
          const scale = 768 / shortSide;
          width = Math.round(width * scale);
          height = Math.round(height * scale);
        }

        // Calculate tiles
        const tilesX = Math.ceil(width / 512);
        const tilesY = Math.ceil(height / 512);

        return 85 + 170 * tilesX * tilesY;
      }

      // Optimize image for minimal tokens while preserving quality
      async function optimizeImageForVision(
        imagePath: string,
        task: "ocr" | "describe" | "compare" | "analyze"
      ): Promise<{ buffer: Buffer; optimization: ImageOptimization }> {
        const image = sharp(imagePath);
        const metadata = await image.metadata();
        const originalWidth = metadata.width ?? 0;
        const originalHeight = metadata.height ?? 0;

        // Choose detail level based on task
        let detail: "low" | "high" = "high";
        let targetSize = { width: originalWidth, height: originalHeight };

        switch (task) {
          case "describe":
            // Low detail sufficient for general descriptions
            detail = "low";
            targetSize = { width: 512, height: 512 };
            break;
          case "ocr":
            // High detail needed for text extraction
            detail = "high";
            // Keep original size up to 2048
            break;
          case "compare":
            // Medium - resize to 1024 max
            detail = "high";
            if (originalWidth > 1024 || originalHeight > 1024) {
              const scale = 1024 / Math.max(originalWidth, originalHeight);
              targetSize = {
                width: Math.round(originalWidth * scale),
                height: Math.round(originalHeight * scale),
              };
            }
            break;
          case "analyze": {
            // High detail, but optimize size
            detail = "high";
            // Scale to 768 on shortest side
            const shortSide = Math.min(originalWidth, originalHeight);
            if (shortSide > 768) {
              const scale = 768 / shortSide;
              targetSize = {
                width: Math.round(originalWidth * scale),
                height: Math.round(originalHeight * scale),
              };
            }
            break;
          }
        }

        // Resize and compress
        const buffer = await image
          .resize(targetSize.width, targetSize.height, { fit: "inside" })
          .jpeg({ quality: 85 })
          .toBuffer();

        const estimatedTokens = estimateImageTokens(targetSize.width, targetSize.height, detail);

        return {
          buffer,
          optimization: {
            originalSize: { width: originalWidth, height: originalHeight },
            optimizedSize: targetSize,
            estimatedTokens,
            detailLevel: detail,
          },
        };
      }
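      // Worked example of the tile math above (assumes a 1920x1080 source image):
      // shortest side scales to 768 -> ~1365x768 -> ceil(1365/512) * ceil(768/512) = 3 * 2 tiles
      // high detail: 85 + 170 * 6 = 1105 tokens; low detail is a flat 85 tokens
      const exampleHighDetailTokens = estimateImageTokens(1920, 1080, "high"); // 1105
      const exampleLowDetailTokens = estimateImageTokens(1920, 1080, "low"); // 85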
anti_patterns:
- pattern: "Sending high-res images for simple tasks"
  problem: "Wastes tokens and money"
  solution: "Use 'low' detail for descriptions, 'high' for OCR"
- pattern: "Sequential multimodal processing"
  problem: "Slow - processes image, then audio, then text"
  solution: "Process modalities in parallel where possible"
- pattern: "Not validating image format"
  problem: "API errors on unsupported formats"
  solution: "Convert to supported formats (JPEG, PNG, WebP, GIF)"
- pattern: "Ignoring token costs for images"
  problem: "Vision costs 10-100x more than text"
  solution: "Estimate and optimize image tokens before sending"
- pattern: "Using vision for text-only tasks"
  problem: "Unnecessary complexity and cost"
  solution: "If text is available, prefer text over screenshots"
handoffs:
- skill: document-ai
  when: "Processing structured documents"
- skill: ai-observability
  when: "Tracking multimodal costs"
- skill: llm-integration
  when: "Building chat with multimodal support"