git clone https://github.com/vibeforge1111/vibeship-spawner-skills
ai/on-device-ai/skill.yaml

On-Device AI
Running AI models in browsers and on edge devices
version: 1.0.0
name: On-Device AI
id: on-device-ai
category: ai
difficulty: advanced
description: |
  Patterns for running AI models locally in browsers using WebGPU,
  Transformers.js, WebLLM, and ONNX Runtime. Zero API costs, full privacy.
triggers:
- "on-device AI"
- "browser AI"
- "WebLLM"
- "Transformers.js"
- "WebGPU"
- "edge inference"
- "offline AI"
- "client-side ML"
- "ONNX web"
technologies:
- Transformers.js
- WebLLM
- ONNX Runtime Web
- WebGPU
- TensorFlow.js
patterns:
  transformers_js_setup:
    description: "Set up Transformers.js for browser inference"
    when: "Running HuggingFace models in browser"
    implementation: |
      // Transformers.js with WebGPU acceleration
      import { pipeline, env } from "@huggingface/transformers";

      // Configure for browser
      env.useBrowserCache = true;
      env.allowLocalModels = false;

      // Check WebGPU support
      async function checkWebGPUSupport(): Promise<{
        supported: boolean;
        adapter?: GPUAdapter;
        device?: GPUDevice;
      }> {
        if (!("gpu" in navigator)) {
          return { supported: false };
        }
        try {
          const adapter = await navigator.gpu.requestAdapter();
          if (!adapter) {
            return { supported: false };
          }
          const device = await adapter.requestDevice();
          return { supported: true, adapter, device };
        } catch {
          return { supported: false };
        }
      }

      // Initialize pipeline with WebGPU
      async function createPipeline<T extends "text-generation" | "feature-extraction" | "sentiment-analysis">(
        task: T,
        model: string,
        options?: {
          quantized?: boolean;
          dtype?: "fp32" | "fp16" | "q8" | "q4";
          onProgress?: (progress: { progress: number; status: string }) => void;
        }
      ) {
        const webgpu = await checkWebGPUSupport();

        const config = {
          dtype: options?.dtype ?? (webgpu.supported ? "q4" : "q8"),
          device: webgpu.supported ? "webgpu" : "wasm",
          progress_callback: options?.onProgress,
        };

        console.log(`Using ${config.device} backend with ${config.dtype} precision`);

        return pipeline(task, model, config);
      }

      // Text generation example
      async function generateText(prompt: string): Promise<string> {
        const generator = await createPipeline(
          "text-generation",
          "Xenova/Phi-3-mini-4k-instruct",
          {
            dtype: "q4",
            onProgress: ({ progress, status }) => {
              console.log(`Loading: ${(progress * 100).toFixed(0)}% - ${status}`);
            },
          }
        );

        const result = await generator(prompt, {
          max_new_tokens: 256,
          temperature: 0.7,
          do_sample: true,
        });

        return result[0].generated_text;
      }

      // Embeddings for semantic search
      async function getEmbeddings(texts: string[]): Promise<Float32Array[]> {
        const extractor = await createPipeline(
          "feature-extraction",
          "Xenova/all-MiniLM-L6-v2",
          { dtype: "fp32" }
        );

        const embeddings: Float32Array[] = [];

        for (const text of texts) {
          const result = await extractor(text, {
            pooling: "mean",
            normalize: true,
          });
          embeddings.push(result.data);
        }

        return embeddings;
      }
  webllm_chat:
    description: "Run LLMs in browser with WebLLM"
    when: "Need conversational AI without API calls"
    implementation: |
      import * as webllm from "@mlc-ai/web-llm";
      import { useState, useEffect, useCallback } from "react";

      interface ChatMessage {
        role: "system" | "user" | "assistant";
        content: string;
      }

      class BrowserChatEngine {
        private engine: webllm.MLCEngine | null = null;
        private isLoading = false;
        private loadProgress = 0;

        // Available models (sorted by size)
        static MODELS = {
          small: "Llama-3.2-1B-Instruct-q4f16_1-MLC",
          medium: "Llama-3.2-3B-Instruct-q4f16_1-MLC",
          large: "Phi-3.5-mini-instruct-q4f16_1-MLC",
          vision: "Llama-3.2-11B-Vision-Instruct-q4f16_1-MLC",
        };

        async initialize(
          model: keyof typeof BrowserChatEngine.MODELS = "small",
          onProgress?: (progress: number, status: string) => void
        ): Promise<void> {
          if (this.engine || this.isLoading) return;
          this.isLoading = true;

          const modelId = BrowserChatEngine.MODELS[model];

          this.engine = await webllm.CreateMLCEngine(modelId, {
            initProgressCallback: (report) => {
              this.loadProgress = report.progress;
              onProgress?.(report.progress, report.text);
            },
          });

          this.isLoading = false;
        }

        async chat(
          messages: ChatMessage[],
          options?: {
            temperature?: number;
            maxTokens?: number;
            stream?: boolean;
            onToken?: (token: string) => void;
          }
        ): Promise<string> {
          if (!this.engine) {
            throw new Error("Engine not initialized. Call initialize() first.");
          }

          const { temperature = 0.7, maxTokens = 512, stream = false, onToken } = options ?? {};

          if (stream && onToken) {
            // Streaming response
            let fullResponse = "";
            const asyncGenerator = await this.engine.chat.completions.create({
              messages,
              temperature,
              max_tokens: maxTokens,
              stream: true,
            });

            for await (const chunk of asyncGenerator) {
              const delta = chunk.choices[0]?.delta?.content ?? "";
              fullResponse += delta;
              onToken(delta);
            }

            return fullResponse;
          }

          // Non-streaming
          const response = await this.engine.chat.completions.create({
            messages,
            temperature,
            max_tokens: maxTokens,
          });

          return response.choices[0].message.content ?? "";
        }

        async unload(): Promise<void> {
          if (this.engine) {
            await this.engine.unload();
            this.engine = null;
          }
        }

        getProgress(): number {
          return this.loadProgress;
        }

        isReady(): boolean {
          return this.engine !== null && !this.isLoading;
        }
      }

      // React hook
      function useBrowserChat(model: keyof typeof BrowserChatEngine.MODELS = "small") {
        const [engine] = useState(() => new BrowserChatEngine());
        const [loading, setLoading] = useState(false);
        const [progress, setProgress] = useState(0);
        const [ready, setReady] = useState(false);

        useEffect(() => {
          setLoading(true);
          engine.initialize(model, (p, status) => {
            setProgress(p);
          }).then(() => {
            setReady(true);
            setLoading(false);
          });

          return () => {
            engine.unload();
          };
        }, [model]);

        const chat = useCallback(async (messages: ChatMessage[]) => {
          if (!ready) throw new Error("Not ready");
          return engine.chat(messages);
        }, [engine, ready]);

        return { chat, loading, progress, ready };
      }
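      // Usage sketch (outside React): stream a reply into the page. The model
      // key and the "#chat-output" element are illustrative assumptions.
      async function demoChat(): Promise<void> {
        const engine = new BrowserChatEngine();
        await engine.initialize("small", (p, status) =>
          console.log(`Loading model: ${(p * 100).toFixed(0)}% - ${status}`)
        );

        const outputEl = document.querySelector("#chat-output")!;
        const reply = await engine.chat(
          [{ role: "user", content: "Explain WebGPU in one sentence." }],
          { stream: true, onToken: (token) => { outputEl.textContent += token; } }
        );
        console.log("Full reply:", reply);
      }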
  onnx_runtime_inference:
    description: "Run ONNX models with WebGPU"
    when: "Using custom or fine-tuned models"
    implementation: |
      import * as ort from "onnxruntime-web";
      import { AutoTokenizer } from "@huggingface/transformers";

      // Configure ONNX Runtime for WebGPU
      async function setupONNXRuntime(): Promise<void> {
        // Check WebGPU availability
        if ("gpu" in navigator) {
          ort.env.wasm.numThreads = navigator.hardwareConcurrency;
          ort.env.webgpu.powerPreference = "high-performance";
        }

        // Set WASM paths
        ort.env.wasm.wasmPaths = "https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/";
      }

      interface ModelConfig {
        modelPath: string;
        executionProvider: "webgpu" | "wasm" | "webgl";
        inputNames: string[];
        outputNames: string[];
      }

      class ONNXInferenceEngine {
        private session: ort.InferenceSession | null = null;
        private config: ModelConfig;

        constructor(config: ModelConfig) {
          this.config = config;
        }

        async load(): Promise<void> {
          await setupONNXRuntime();

          const options: ort.InferenceSession.SessionOptions = {
            executionProviders: [this.config.executionProvider],
            graphOptimizationLevel: "all",
          };

          this.session = await ort.InferenceSession.create(
            this.config.modelPath,
            options
          );
        }

        async run(inputs: Record<string, ort.Tensor>): Promise<Record<string, ort.Tensor>> {
          if (!this.session) {
            throw new Error("Session not loaded");
          }
          return await this.session.run(inputs);
        }

        async dispose(): Promise<void> {
          if (this.session) {
            await this.session.release();
            this.session = null;
          }
        }
      }

      // Label set for the example classifier (adjust to match your model's outputs)
      const LABELS = ["negative", "positive"];

      // Example: Text classification with ONNX
      async function classifyText(text: string): Promise<{ label: string; score: number }[]> {
        const tokenizer = await AutoTokenizer.from_pretrained("Xenova/bert-base-uncased");
        const encoded = await tokenizer(text, {
          padding: true,
          truncation: true,
          max_length: 128,
        });

        const engine = new ONNXInferenceEngine({
          modelPath: "/models/classifier.onnx",
          executionProvider: "webgpu",
          inputNames: ["input_ids", "attention_mask"],
          outputNames: ["logits"],
        });

        await engine.load();

        const inputIds = new ort.Tensor("int64", encoded.input_ids.data, encoded.input_ids.dims);
        const attentionMask = new ort.Tensor("int64", encoded.attention_mask.data, encoded.attention_mask.dims);

        const outputs = await engine.run({
          input_ids: inputIds,
          attention_mask: attentionMask,
        });

        const logits = outputs.logits.data as Float32Array;
        const probs = softmax(Array.from(logits));

        await engine.dispose();

        return probs.map((score, idx) => ({
          label: LABELS[idx],
          score,
        })).sort((a, b) => b.score - a.score);
      }

      function softmax(arr: number[]): number[] {
        const max = Math.max(...arr);
        const exp = arr.map((x) => Math.exp(x - max));
        const sum = exp.reduce((a, b) => a + b, 0);
        return exp.map((x) => x / sum);
      }
  local_rag_pipeline:
    description: "Build RAG entirely in browser"
    when: "Need retrieval without server calls"
    implementation: |
      import { pipeline } from "@huggingface/transformers";
      import * as webllm from "@mlc-ai/web-llm";

      interface Document {
        id: string;
        content: string;
        embedding?: Float32Array;
      }

      class BrowserRAG {
        private embedder: Awaited<ReturnType<typeof pipeline>> | null = null;
        private llm: webllm.MLCEngine | null = null;
        private documents: Document[] = [];

        async initialize(
          onProgress?: (stage: string, progress: number) => void
        ): Promise<void> {
          // Load embedder
          onProgress?.("embedder", 0);
          this.embedder = await pipeline(
            "feature-extraction",
            "Xenova/all-MiniLM-L6-v2",
            {
              device: "webgpu",
              dtype: "fp32",
              progress_callback: (p) => onProgress?.("embedder", p.progress),
            }
          );

          // Load LLM
          onProgress?.("llm", 0);
          this.llm = await webllm.CreateMLCEngine(
            "Llama-3.2-1B-Instruct-q4f16_1-MLC",
            {
              initProgressCallback: (r) => onProgress?.("llm", r.progress),
            }
          );
        }

        async addDocuments(docs: Array<{ id: string; content: string }>): Promise<void> {
          if (!this.embedder) throw new Error("Not initialized");

          for (const doc of docs) {
            const embedding = await this.embed(doc.content);
            this.documents.push({
              id: doc.id,
              content: doc.content,
              embedding,
            });
          }
        }

        async query(
          question: string,
          options?: { topK?: number; stream?: boolean; onToken?: (token: string) => void }
        ): Promise<string> {
          const { topK = 3, stream = false, onToken } = options ?? {};

          if (!this.embedder || !this.llm) throw new Error("Not initialized");

          // Get question embedding
          const queryEmbedding = await this.embed(question);

          // Find similar documents
          const scored = this.documents.map((doc) => ({
            doc,
            score: this.cosineSimilarity(queryEmbedding, doc.embedding!),
          }));
          scored.sort((a, b) => b.score - a.score);
          const topDocs = scored.slice(0, topK);

          // Build context
          const context = topDocs
            .map((s) => s.doc.content)
            .join("\n\n---\n\n");

          // Generate answer
          const messages = [
            {
              role: "system" as const,
              content: "Answer questions based on the provided context. Be concise and accurate.",
            },
            {
              role: "user" as const,
              content: `Context:\n${context}\n\nQuestion: ${question}`,
            },
          ];

          if (stream && onToken) {
            let response = "";
            const chunks = await this.llm.chat.completions.create({
              messages,
              stream: true,
              max_tokens: 256,
            });

            for await (const chunk of chunks) {
              const delta = chunk.choices[0]?.delta?.content ?? "";
              response += delta;
              onToken(delta);
            }

            return response;
          }

          const result = await this.llm.chat.completions.create({
            messages,
            max_tokens: 256,
          });

          return result.choices[0].message.content ?? "";
        }

        private async embed(text: string): Promise<Float32Array> {
          const result = await this.embedder!(text, {
            pooling: "mean",
            normalize: true,
          });
          return result.data as Float32Array;
        }

        private cosineSimilarity(a: Float32Array, b: Float32Array): number {
          let dot = 0;
          let normA = 0;
          let normB = 0;
          for (let i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
          }
          return dot / (Math.sqrt(normA) * Math.sqrt(normB));
        }
      }
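      // Usage sketch: wire the pieces together. The document ids and contents
      // below are placeholders; loading real documents is up to the app.
      async function demoRAG(): Promise<void> {
        const rag = new BrowserRAG();
        await rag.initialize((stage, p) => console.log(`${stage}: ${p}`));

        await rag.addDocuments([
          { id: "1", content: "WebGPU exposes modern GPU compute to the browser." },
          { id: "2", content: "Transformers.js runs HuggingFace models client-side." },
        ]);

        const answer = await rag.query("What does WebGPU do?", { topK: 2 });
        console.log(answer);
      }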
  model_caching:
    description: "Cache models for fast subsequent loads"
    when: "Optimizing repeat visits"
    implementation: |
      // Models are automatically cached in browser storage,
      // but we can optimize with preloading and cache management.
      import { pipeline, env } from "@huggingface/transformers";

      // Enable browser caching
      env.useBrowserCache = true;
      env.cacheDir = "transformers-cache";

      interface CacheInfo {
        modelId: string;
        size: number;
        lastAccessed: Date;
      }

      async function getCacheInfo(): Promise<CacheInfo[]> {
        const cacheInfo: CacheInfo[] = [];

        if ("caches" in window) {
          const cacheNames = await caches.keys();

          for (const name of cacheNames) {
            if (name.includes("transformers") || name.includes("onnx")) {
              const cache = await caches.open(name);
              const keys = await cache.keys();

              let totalSize = 0;
              for (const request of keys) {
                const response = await cache.match(request);
                if (response) {
                  const blob = await response.blob();
                  totalSize += blob.size;
                }
              }

              cacheInfo.push({
                modelId: name,
                size: totalSize,
                lastAccessed: new Date(),
              });
            }
          }
        }

        return cacheInfo;
      }

      async function clearModelCache(modelId?: string): Promise<void> {
        if ("caches" in window) {
          const cacheNames = await caches.keys();
          for (const name of cacheNames) {
            if (!modelId || name.includes(modelId)) {
              await caches.delete(name);
            }
          }
        }

        // Also clear IndexedDB if used
        if ("indexedDB" in window) {
          const databases = await indexedDB.databases();
          for (const db of databases) {
            if (db.name && (!modelId || db.name.includes(modelId))) {
              indexedDB.deleteDatabase(db.name);
            }
          }
        }
      }

      // Preload models in background
      async function preloadModels(
        models: string[],
        onProgress?: (model: string, progress: number) => void
      ): Promise<void> {
        for (const model of models) {
          try {
            // Use pipeline to trigger download without running inference
            await pipeline("feature-extraction", model, {
              progress_callback: (p) => onProgress?.(model, p.progress),
            });
            console.log(`Preloaded: ${model}`);
          } catch (error) {
            console.warn(`Failed to preload ${model}:`, error);
          }
        }
      }

      // Service worker for offline support
      const SW_SCRIPT = `
      self.addEventListener('fetch', (event) => {
        // Cache model files
        if (event.request.url.includes('huggingface.co') || event.request.url.includes('onnx')) {
          event.respondWith(
            caches.match(event.request).then((response) => {
              return response || fetch(event.request).then((fetchResponse) => {
                return caches.open('models').then((cache) => {
                  cache.put(event.request, fetchResponse.clone());
                  return fetchResponse;
                });
              });
            })
          );
        }
      });
      `;
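      // Registering the service worker above: blob: URLs are not accepted for
      // service workers, so the script has to be served from the app's own origin.
      // The "/sw.js" path is an assumption about the build setup (e.g. SW_SCRIPT
      // written out to public/sw.js at build time).
      async function registerModelCacheWorker(): Promise<void> {
        if (!("serviceWorker" in navigator)) return;
        await navigator.serviceWorker.register("/sw.js");
      }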
  react_integration:
    description: "React hooks for on-device AI"
    when: "Building React apps with local inference"
    implementation: |
      import { useState, useEffect, useCallback, useRef } from "react";
      import { pipeline } from "@huggingface/transformers";

      interface UseModelOptions {
        device?: "webgpu" | "wasm";
        dtype?: "fp32" | "fp16" | "q8" | "q4";
        onProgress?: (progress: number) => void;
      }

      interface UseModelResult<T> {
        model: T | null;
        loading: boolean;
        progress: number;
        error: Error | null;
        ready: boolean;
      }

      function useModel<T>(
        task: string,
        modelId: string,
        options?: UseModelOptions
      ): UseModelResult<T> {
        const [model, setModel] = useState<T | null>(null);
        const [loading, setLoading] = useState(true);
        const [progress, setProgress] = useState(0);
        const [error, setError] = useState<Error | null>(null);

        useEffect(() => {
          let cancelled = false;

          async function loadModel() {
            try {
              setLoading(true);
              setError(null);

              const pipe = await pipeline(task, modelId, {
                device: options?.device ?? "webgpu",
                dtype: options?.dtype ?? "q4",
                progress_callback: (p) => {
                  if (!cancelled) {
                    setProgress(p.progress);
                    options?.onProgress?.(p.progress);
                  }
                },
              });

              if (!cancelled) {
                setModel(pipe as T);
                setLoading(false);
              }
            } catch (e) {
              if (!cancelled) {
                setError(e instanceof Error ? e : new Error(String(e)));
                setLoading(false);
              }
            }
          }

          loadModel();

          return () => {
            cancelled = true;
          };
        }, [task, modelId, options?.device, options?.dtype]);

        return {
          model,
          loading,
          progress,
          error,
          ready: model !== null && !loading,
        };
      }

      // Specialized hooks
      function useTextGeneration(modelId: string = "Xenova/Phi-3-mini-4k-instruct") {
        const result = useModel("text-generation", modelId, { dtype: "q4" });

        const generate = useCallback(
          async (prompt: string, options?: { maxTokens?: number; temperature?: number }) => {
            if (!result.model) throw new Error("Model not loaded");

            const output = await (result.model as any)(prompt, {
              max_new_tokens: options?.maxTokens ?? 256,
              temperature: options?.temperature ?? 0.7,
              do_sample: true,
            });

            return output[0].generated_text;
          },
          [result.model]
        );

        return { ...result, generate };
      }

      function useEmbeddings(modelId: string = "Xenova/all-MiniLM-L6-v2") {
        const result = useModel("feature-extraction", modelId, { dtype: "fp32" });

        const embed = useCallback(
          async (texts: string | string[]) => {
            if (!result.model) throw new Error("Model not loaded");

            const input = Array.isArray(texts) ? texts : [texts];
            const embeddings: Float32Array[] = [];

            for (const text of input) {
              const output = await (result.model as any)(text, {
                pooling: "mean",
                normalize: true,
              });
              embeddings.push(output.data);
            }

            return embeddings;
          },
          [result.model]
        );

        return { ...result, embed };
      }

      function useSentiment(modelId: string = "Xenova/distilbert-base-uncased-finetuned-sst-2-english") {
        const result = useModel("sentiment-analysis", modelId);

        const analyze = useCallback(
          async (text: string) => {
            if (!result.model) throw new Error("Model not loaded");
            return (result.model as any)(text);
          },
          [result.model]
        );

        return { ...result, analyze };
      }

      // Example component
      function ChatInterface() {
        const { generate, loading, progress, ready } = useTextGeneration();
        const [input, setInput] = useState("");
        const [output, setOutput] = useState("");
        const [generating, setGenerating] = useState(false);

        const handleSubmit = async () => {
          if (!ready || generating) return;
          setGenerating(true);
          try {
            const response = await generate(input);
            setOutput(response);
          } finally {
            setGenerating(false);
          }
        };

        if (loading) {
          return <div>Loading model... {(progress * 100).toFixed(0)}%</div>;
        }

        return (
          <div>
            <textarea value={input} onChange={(e) => setInput(e.target.value)} />
            <button onClick={handleSubmit} disabled={generating}>
              {generating ? "Generating..." : "Generate"}
            </button>
            <div>{output}</div>
          </div>
        );
      }
anti_patterns:
-
pattern: "Loading large models on mobile" problem: "Mobile devices have limited memory" solution: "Use smaller quantized models (1B-3B params) on mobile"
-
pattern: "Blocking main thread during inference" problem: "UI freezes during model operations" solution: "Use Web Workers for heavy inference tasks"
-
pattern: "Assuming WebGPU is available" problem: "Not all browsers/devices support WebGPU" solution: "Check support and fallback to WASM"
-
pattern: "Not caching models" problem: "Users re-download models every visit" solution: "Enable browser caching and preload models"
-
pattern: "Using server-sized models" problem: "8B+ models are too large for most browsers" solution: "Use 1B-3B quantized models designed for edge"
handoffs:
-
  skill: semantic-search
  when: "Building local RAG with embeddings"
-
  skill: react-patterns
  when: "Integrating with React applications"
-
  skill: performance-optimization
  when: "Optimizing inference speed"