
Document AI Skill

install
Clone the upstream repo:
git clone https://github.com/vibeforge1111/vibeship-spawner-skills
manifest: ai/document-ai/skill.yaml

Patterns for AI-powered document parsing, OCR, and data extraction

version: 1.0.0
name: Document AI
id: document-ai
category: ai
description: |
  Comprehensive patterns for AI-powered document understanding including
  PDF parsing, OCR, invoice/receipt extraction, table extraction,
  multimodal RAG with vision models, and structured data output.

triggers:

  • "document parsing"
  • "PDF extraction"
  • "OCR"
  • "invoice processing"
  • "receipt extraction"
  • "document understanding"
  • "LlamaParse"
  • "Unstructured"
  • "vision document"
  • "table extraction"
  • "structured output from PDF"

provides:

  • "PDF parsing with Claude/GPT-4 vision"
  • "Invoice and receipt data extraction"
  • "Table extraction from documents"
  • "Multimodal RAG for document Q&A"
  • "Structured JSON output with schema validation"
  • "Layout-aware document processing"
  • "Batch document processing pipelines"

patterns:

  • name: "PDF Parsing with Claude Vision" description: "Extract structured data from PDFs using Claude's vision" when: "User needs to extract data from PDF documents" implementation: | import Anthropic from "@anthropic-ai/sdk"; import { pdf } from "pdf-to-img"; import * as fs from "fs";

    const anthropic = new Anthropic();

    interface ExtractionResult {
      pages: PageContent[];
      metadata: DocumentMetadata;
    }

    interface DocumentMetadata {
      totalPages: number;
      processedAt: string;
    }

    interface PageContent {
      pageNumber: number;
      text: string;
      tables: Table[];
      images: ImageDescription[];
    }

    // Convert PDF pages to base64 images
    async function pdfToImages(pdfPath: string): Promise<string[]> {
      const images: string[] = [];

    const document = await pdf(pdfPath, { scale: 2 }); // Higher scale for OCR
    
    for await (const image of document) {
      const base64 = image.toString("base64");
      images.push(base64);
    }
    
    return images;
    

    }

    // Extract structured data from a single page
    async function extractPageContent(
      imageBase64: string,
      pageNumber: number,
      schema?: string
    ): Promise<PageContent> {
      const systemPrompt = schema
        ? `Extract information according to this JSON schema: ${schema}`
        : "Extract all text, tables, and describe any images/charts.";

    const response = await anthropic.messages.create({
      model: "claude-sonnet-4-20250514",
      max_tokens: 4096,
      messages: [
        {
          role: "user",
          content: [
            {
              type: "image",
              source: {
                type: "base64",
                media_type: "image/png",
                data: imageBase64,
              },
            },
            {
              type: "text",
              text: `${systemPrompt}
    
              For tables, output as structured JSON arrays.
              For images/charts, describe what they show.
              Preserve the document's logical structure.`,
            },
          ],
        },
      ],
    });
    
    const content = response.content[0];
    if (content.type !== "text") {
      throw new Error("Unexpected response type");
    }
    
    return {
      pageNumber,
      text: content.text,
      tables: extractTablesFromText(content.text),
      images: extractImageDescriptions(content.text),
    };
    

    }
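
    // Helper sketch: the original snippet references extractTablesFromText and
    // extractImageDescriptions without defining them. These minimal versions
    // assume the model emits fenced ```json blocks for tables and lines of the
    // form "[Image: ...]" for figures; tighten the extraction prompt above
    // accordingly if you rely on them.
    interface Table {
      headers: string[];
      rows: string[][];
    }

    interface ImageDescription {
      description: string;
    }

    function extractTablesFromText(text: string): Table[] {
      const tables: Table[] = [];
      for (const match of text.matchAll(/```json\s*([\s\S]*?)```/g)) {
        try {
          const parsed = JSON.parse(match[1]);
          if (parsed.headers && parsed.rows) tables.push(parsed);
        } catch {
          // Skip blocks that are not valid table JSON
        }
      }
      return tables;
    }

    function extractImageDescriptions(text: string): ImageDescription[] {
      return [...text.matchAll(/\[Image:\s*([^\]]+)\]/g)].map((m) => ({
        description: m[1].trim(),
      }));
    }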

    // Process entire PDF
    async function extractFromPDF(
      pdfPath: string,
      options?: {
        maxPages?: number;
        schema?: string;
        concurrency?: number;
      }
    ): Promise<ExtractionResult> {
      const { maxPages = 100, schema, concurrency = 3 } = options || {};

    // Validate file size (Claude limit: 32MB)
    const stats = fs.statSync(pdfPath);
    if (stats.size > 32 * 1024 * 1024) {
      throw new Error("PDF exceeds 32MB limit. Split into smaller files.");
    }
    
    // Convert to images
    const images = await pdfToImages(pdfPath);
    
    if (images.length > maxPages) {
      throw new Error(`PDF has ${images.length} pages, max is ${maxPages}`);
    }
    
    // Process pages with controlled concurrency
    const pages: PageContent[] = [];
    
    for (let i = 0; i < images.length; i += concurrency) {
      const batch = images.slice(i, i + concurrency);
      const batchResults = await Promise.all(
        batch.map((img, idx) =>
          extractPageContent(img, i + idx + 1, schema)
        )
      );
      pages.push(...batchResults);
    }
    
    return {
      pages,
      metadata: {
        totalPages: images.length,
        processedAt: new Date().toISOString(),
      },
    };
    

    }
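
    // Usage sketch (illustrative path and options; run inside an async context)
    const doc = await extractFromPDF("./contracts/agreement.pdf", {
      maxPages: 20,
      concurrency: 2,
    });
    for (const page of doc.pages) {
      console.log(`Page ${page.pageNumber}: ${page.tables.length} table(s)`);
    }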

  • name: "Invoice Extraction with Schema Validation" description: "Extract structured invoice data with Zod schema enforcement" when: "User needs to process invoices or receipts" implementation: | import Anthropic from "@anthropic-ai/sdk"; import { z } from "zod"; import { zodToJsonSchema } from "zod-to-json-schema";

    const anthropic = new Anthropic();

    // Define strict schema for invoice data
    const InvoiceSchema = z.object({
      invoiceNumber: z.string().describe("Invoice ID or number"),
      invoiceDate: z.string().describe("Date in YYYY-MM-DD format"),
      dueDate: z.string().optional().describe("Payment due date"),
      vendor: z.object({
        name: z.string(),
        address: z.string().optional(),
        taxId: z.string().optional(),
      }),
      customer: z.object({
        name: z.string(),
        address: z.string().optional(),
      }),
      lineItems: z.array(
        z.object({
          description: z.string(),
          quantity: z.number(),
          unitPrice: z.number(),
          amount: z.number(),
        })
      ),
      subtotal: z.number(),
      taxAmount: z.number().optional(),
      total: z.number(),
      currency: z.string().default("USD"),
    });

    type Invoice = z.infer<typeof InvoiceSchema>;

    async function extractInvoice(imageBase64: string): Promise<Invoice> {
      const jsonSchema = zodToJsonSchema(InvoiceSchema);

    const response = await anthropic.messages.create({
      model: "claude-sonnet-4-20250514",
      max_tokens: 4096,
      messages: [
        {
          role: "user",
          content: [
            {
              type: "image",
              source: {
                type: "base64",
                media_type: "image/png",
                data: imageBase64,
              },
            },
            {
              type: "text",
              text: `Extract invoice data from this image.
    
              Return ONLY valid JSON matching this schema:
              ${JSON.stringify(jsonSchema, null, 2)}
    
              Rules:
              - All amounts should be numbers, not strings
              - Dates must be in YYYY-MM-DD format
              - If a field is unclear, use your best interpretation
              - Do not include any text outside the JSON object`,
            },
          ],
        },
      ],
    });
    
    const content = response.content[0];
    if (content.type !== "text") {
      throw new Error("Unexpected response type");
    }
    
    // Parse and validate with Zod
    const jsonMatch = content.text.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      throw new Error("No JSON found in response");
    }
    
    const parsed = JSON.parse(jsonMatch[0]);
    return InvoiceSchema.parse(parsed);
    

    }

    // Batch process multiple invoices
    async function processInvoiceBatch(
      imagePaths: string[],
      onProgress?: (completed: number, total: number) => void
    ): Promise<{ results: Invoice[]; errors: Error[] }> {
      const results: Invoice[] = [];
      const errors: Error[] = [];

    for (let i = 0; i < imagePaths.length; i++) {
      try {
        const imageBuffer = fs.readFileSync(imagePaths[i]);
        const base64 = imageBuffer.toString("base64");
        const invoice = await extractInvoice(base64);
        results.push(invoice);
      } catch (error) {
        errors.push(error as Error);
      }
    
      onProgress?.(i + 1, imagePaths.length);
    }
    
    return { results, errors };
    

    }
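
    // Usage sketch (paths are illustrative; run inside an async context)
    const { results, errors } = await processInvoiceBatch(
      ["./scans/inv-001.png", "./scans/inv-002.png"],
      (done, total) => console.log(`Processed ${done}/${total}`)
    );
    console.log(`${results.length} invoices extracted, ${errors.length} failed`);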

  • name: "Table Extraction from Documents" description: "Extract tables as structured data from PDFs/images" when: "User needs to extract tabular data from documents" implementation: | import OpenAI from "openai";

    const openai = new OpenAI();

    interface ExtractedTable {
      headers: string[];
      rows: string[][];
      title?: string;
      pageNumber?: number;
    }

    async function extractTables(
      imageBase64: string,
      options?: {
        format?: "json" | "csv" | "markdown";
        pageNumber?: number;
      }
    ): Promise<ExtractedTable[]> {
      const { format = "json", pageNumber } = options || {};

    const response = await openai.chat.completions.create({
      model: "gpt-4o",
      messages: [
        {
          role: "user",
          content: [
            {
              type: "image_url",
              image_url: {
                url: `data:image/png;base64,${imageBase64}`,
                detail: "high",
              },
            },
            {
              type: "text",
              text: `Extract ALL tables from this document image.
    
              For each table:
              1. Identify the table title/caption if present
              2. Extract all headers
              3. Extract all data rows
              4. Preserve the exact cell values
    
              Return as JSON array:
              [
                {
                  "title": "Optional table title",
                  "headers": ["Column1", "Column2", ...],
                  "rows": [
                    ["value1", "value2", ...],
                    ...
                  ]
                }
              ]
    
              If no tables found, return empty array [].
              Return ONLY the JSON, no other text.`,
            },
          ],
        },
      ],
      max_tokens: 4096,
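      // Note: json_object mode guarantees a JSON object, so the model may wrap
      // the array as { "tables": [...] }; both shapes are handled after parsing.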
      response_format: { type: "json_object" },
    });
    
    const content = response.choices[0].message.content;
    if (!content) return [];
    
    const parsed = JSON.parse(content);
    const tables = Array.isArray(parsed) ? parsed : parsed.tables || [];
    
    return tables.map((t: any) => ({
      ...t,
      pageNumber,
    }));
    

    }

    // Convert extracted table to different formats
    function tableToCSV(table: ExtractedTable): string {
      // Quote cells containing commas, quotes, or newlines (RFC 4180 style)
      const escape = (cell: string) =>
        /[",\n]/.test(cell) ? `"${cell.replace(/"/g, '""')}"` : cell;

    const headerRow = table.headers.map(escape).join(",");
    const dataRows = table.rows.map((row) =>
      row.map(escape).join(",")
    );
    
    return [headerRow, ...dataRows].join("\n");
    

    }

    function tableToMarkdown(table: ExtractedTable): string {
      const headerRow = `| ${table.headers.join(" | ")} |`;
      const separator = `| ${table.headers.map(() => "---").join(" | ")} |`;
      const dataRows = table.rows.map((row) => `| ${row.join(" | ")} |`);

    let md = "";
    if (table.title) md += `### ${table.title}\n\n`;
    md += [headerRow, separator, ...dataRows].join("\n");
    
    return md;
    

    }
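
    // Usage sketch: pageImageBase64 is assumed to come from a PDF-to-image
    // step like pdfToImages in the Claude vision pattern above
    const tables = await extractTables(pageImageBase64, { pageNumber: 1 });
    for (const table of tables) {
      console.log(tableToMarkdown(table));
    }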

  • name: "Multimodal RAG for Document Q&A" description: "Build Q&A over documents with vision-enhanced RAG" when: "User needs to query large document collections" implementation: | import OpenAI from "openai"; import { Index } from "@upstash/vector";

    const openai = new OpenAI();
    const vectorIndex = new Index(); // Reads UPSTASH_VECTOR_REST_URL/TOKEN from env

    interface DocumentChunk {
      id: string;
      pageNumber: number;
      content: string;
      imageBase64?: string; // Store for visual queries
      embedding?: number[];
    }

    // Index a document for RAG
    async function indexDocument(
      documentId: string,
      pages: { text: string; imageBase64: string }[]
    ) {
      const chunks: DocumentChunk[] = [];

    for (let i = 0; i < pages.length; i++) {
      const { text, imageBase64 } = pages[i];
    
      // Create text embedding
      const embeddingResponse = await openai.embeddings.create({
        model: "text-embedding-3-small",
        input: text,
      });
    
      const chunk: DocumentChunk = {
        id: `${documentId}-page-${i + 1}`,
        pageNumber: i + 1,
        content: text,
        imageBase64,
        embedding: embeddingResponse.data[0].embedding,
      };
    
      chunks.push(chunk);
    
      // Store in vector DB
      await vectorIndex.upsert({
        id: chunk.id,
        vector: chunk.embedding!,
        metadata: {
          documentId,
          pageNumber: chunk.pageNumber,
          content: chunk.content,
          hasImage: !!imageBase64,
        },
      });
    }
    
    return chunks;
    

    }

    // Query documents with multimodal understanding
    async function queryDocuments(
      query: string,
      options?: {
        documentIds?: string[];
        topK?: number;
        useVision?: boolean;
      }
    ): Promise<{ answer: string; sources: DocumentChunk[] }> {
      const { topK = 5, useVision = true } = options || {};

    // Get query embedding
    const queryEmbedding = await openai.embeddings.create({
      model: "text-embedding-3-small",
      input: query,
    });
    
    // Search vector DB
    const results = await vectorIndex.query({
      vector: queryEmbedding.data[0].embedding,
      topK,
      includeMetadata: true,
    });
    
    // Retrieve full chunks with images
    const chunks = await Promise.all(
      results.map(async (r) => {
        // Fetch full chunk from storage
        return getChunkById(r.id);
      })
    );
    
    // Build multimodal prompt
    const messages: any[] = [
      {
        role: "system",
        content: `Answer questions based on the provided document pages.
          Cite specific page numbers when referencing information.
          If the answer isn't in the documents, say so.`,
      },
    ];
    
    // Add retrieved pages as context
    const userContent: any[] = [];
    
    for (const chunk of chunks) {
      if (useVision && chunk.imageBase64) {
        // Include page image for visual understanding
        userContent.push({
          type: "image_url",
          image_url: {
            url: `data:image/png;base64,${chunk.imageBase64}`,
            detail: "low", // Use low for cost efficiency
          },
        });
      }
    
      userContent.push({
        type: "text",
        text: `[Page ${chunk.pageNumber}]:\n${chunk.content}`,
      });
    }
    
    userContent.push({
      type: "text",
      text: `\n\nQuestion: ${query}`,
    });
    
    messages.push({ role: "user", content: userContent });
    
    const response = await openai.chat.completions.create({
      model: "gpt-4o",
      messages,
      max_tokens: 1024,
    });
    
    return {
      answer: response.choices[0].message.content || "",
      sources: chunks,
    };
    

    }
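
    // getChunkById is referenced above but not defined. A minimal sketch that
    // rebuilds the chunk from vector-index metadata via @upstash/vector's
    // fetch(); a real system would keep page images in object storage keyed
    // by chunk id rather than re-reading anything from the index.
    async function getChunkById(id: string | number): Promise<DocumentChunk> {
      const [record] = await vectorIndex.fetch([String(id)], {
        includeMetadata: true,
      });
      const meta = (record?.metadata ?? {}) as Record<string, any>;
      return {
        id: String(id),
        pageNumber: meta.pageNumber ?? 0,
        content: meta.content ?? "",
        // imageBase64 omitted: load page images from your storage layer
      };
    }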

  • name: "Using LlamaParse for Complex Documents" description: "Parse complex documents with LlamaParse API" when: "Need specialized document parsing beyond basic vision" implementation: | import { LlamaParseReader } from "llamaindex";

    const reader = new LlamaParseReader({
      apiKey: process.env.LLAMA_CLOUD_API_KEY,
      resultType: "markdown", // or "text", "json"
      parsingInstruction: "Extract all tables and preserve layout",
    });

    interface ParsedDocument {
      content: string;
      metadata: Record<string, any>;
    }

    async function parseWithLlamaparse(
      filePath: string,
      options?: {
        outputFormat?: "markdown" | "text" | "json";
        parseInstruction?: string;
      }
    ): Promise<ParsedDocument[]> {
      const { outputFormat = "markdown", parseInstruction } = options || {};

    const customReader = new LlamaParseReader({
      apiKey: process.env.LLAMA_CLOUD_API_KEY,
      resultType: outputFormat,
      ...(parseInstruction && { parsingInstruction: parseInstruction }),
    });
    
    const documents = await customReader.loadData(filePath);
    
    return documents.map((doc) => ({
      content: doc.text,
      metadata: doc.metadata || {},
    }));
    

    }

    // Parse with specific extraction focus
    async function parseInvoicesWithLlamaparse(filePath: string) {
      return parseWithLlamaparse(filePath, {
        outputFormat: "json",
        parseInstruction: `
          Extract invoice data with the following structure:
          - Invoice number
          - Date
          - Vendor information
          - Line items with quantities and prices
          - Totals and taxes
          Return as structured JSON.
        `,
      });
    }
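
    // Usage sketch (path and instruction are illustrative)
    const docs = await parseWithLlamaparse("./reports/q3-review.pdf", {
      outputFormat: "markdown",
      parseInstruction: "Preserve section headings and tables",
    });
    console.log(docs[0]?.content.slice(0, 500));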

  • name: "Using Unstructured.io for Enterprise" description: "Process documents with Unstructured API" when: "Need enterprise-grade document processing" implementation: | import { UnstructuredClient } from "unstructured-client"; import { Strategy } from "unstructured-client/sdk/models/shared";

    const client = new UnstructuredClient({
      serverURL: "https://api.unstructured.io",
      security: {
        apiKeyAuth: process.env.UNSTRUCTURED_API_KEY,
      },
    });

    interface UnstructuredElement {
      type: string;
      text: string;
      metadata: {
        page_number?: number;
        coordinates?: any;
        parent_id?: string;
      };
    }

    async function processWithUnstructured(
      filePath: string,
      options?: {
        strategy?: "fast" | "hi_res" | "ocr_only";
        extractTables?: boolean;
        extractImages?: boolean;
      }
    ): Promise<UnstructuredElement[]> {
      const {
        strategy = "hi_res",
        extractTables = true,
        extractImages = false,
      } = options || {};

    const fileBuffer = fs.readFileSync(filePath);
    const fileName = path.basename(filePath);
    
    const response = await client.general.partition({
      partitionParameters: {
        files: {
          content: fileBuffer,
          fileName,
        },
        strategy: strategy as Strategy,
        extractImageBlockTypes: extractImages ? ["Image", "Table"] : [],
        includePageBreaks: true,
      },
    });
    
    return response.elements as UnstructuredElement[];
    

    }

    // Extract structured data from elements
    function extractStructuredData(elements: UnstructuredElement[]) {
      const tables = elements.filter((e) => e.type === "Table");
      const text = elements
        .filter((e) => e.type === "NarrativeText" || e.type === "Title")
        .map((e) => e.text)
        .join("\n\n");

    const byPage = elements.reduce((acc, el) => {
      const page = el.metadata.page_number || 1;
      if (!acc[page]) acc[page] = [];
      acc[page].push(el);
      return acc;
    }, {} as Record<number, UnstructuredElement[]>);
    
    return { tables, text, byPage };
    

    }
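
    // Usage sketch (path is illustrative)
    const elements = await processWithUnstructured("./filings/annual-report.pdf", {
      strategy: "hi_res",
    });
    const { tables, text, byPage } = extractStructuredData(elements);
    console.log(
      `${tables.length} table(s) across ${Object.keys(byPage).length} page(s)`
    );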

anti_patterns:

  • name: "Processing large PDFs without chunking" why_bad: "Exceeds token limits, causes timeouts, high costs" example_bad: | // BAD: Send entire 100-page PDF at once const result = await extractFromPDF(largePdf); example_good: | // GOOD: Process page by page with limits const pages = await pdfToImages(pdfPath); if (pages.length > 50) { throw new Error("Split PDF into smaller sections"); } for (const page of pages) { await extractPageContent(page); }

  • name: "No schema validation on extracted data" why_bad: "LLMs can hallucinate fields, produce invalid JSON" example_bad: | const data = JSON.parse(response); await db.invoices.create({ data }); // May be malformed example_good: | const data = InvoiceSchema.parse(JSON.parse(response)); await db.invoices.create({ data }); // Validated

  • name: "Ignoring low image quality" why_bad: "Poor scans produce garbage extraction" example_bad: | const result = await extract(anyImage); example_good: | // Check image quality first const quality = await assessImageQuality(image); if (quality.dpi < 150 || quality.blur > 0.5) { throw new Error("Image quality too low for reliable extraction"); }

  • name: "Not handling multi-column layouts" why_bad: "Text gets jumbled between columns" example_bad: | // Assume single-column layout const text = await extractText(pdf); example_good: | // Use layout-aware extraction const result = await extractWithLayout(pdf, { preserveLayout: true, detectColumns: true, });

handoffs:

  • to: "semantic-search" when: "Extracted documents need to be searchable" context: "Documents are parsed. Semantic search skill can index for retrieval."

  • to: "backend" when: "Need to store extracted data in database" context: "Data is structured. Backend skill can design storage schema."

  • to: "ai-observability" when: "Need to monitor extraction accuracy" context: "Track extraction quality, errors, and costs."

references: