git clone https://github.com/vibeforge1111/vibeship-spawner-skills
ai/semantic-search/skill.yaml

Semantic Search Skill
Vector databases, embeddings, RAG, and hybrid search patterns
id: semantic-search
name: Semantic Search
category: ai
description: |
  Build production-ready semantic search systems using vector databases,
  embeddings, and retrieval-augmented generation (RAG). Covers vector DB
  selection (Pinecone/Qdrant/Weaviate), embedding models (OpenAI/Voyage/Cohere),
  chunking strategies, hybrid search, and reranking for high-quality retrieval.
version: 1.0.0
author: vibeship
tags:
- vector-search
- embeddings
- rag
- pinecone
- qdrant
- weaviate
- llama-index
- langchain
- hybrid-search
- reranking
principles:
  -
    name: "Hybrid Search by Default"
    description: |
      Pure vector search misses exact matches. Combine dense (vector) and
      sparse (BM25/keyword) retrieval with reciprocal rank fusion for
      production-ready search that handles both semantic and exact queries.
  -
    name: "Chunking Determines Quality"
    description: |
      Bad chunking = bad retrieval. Use semantic chunking that preserves
      context (200-300 words), keeps sections intact, and maintains
      hierarchical structure. Too small loses context; too large dilutes
      relevance.
  -
    name: "Rerank for Precision"
    description: |
      First-stage retrieval casts a wide net. Use cross-encoder rerankers
      (Cohere Rerank, Jina, Pinecone) as a second stage to boost relevance
      by up to 48% before feeding results to the LLM.
  -
    name: "Match Embedding to Use Case"
    description: |
      Voyage-3 beats OpenAI on retrieval benchmarks by an average of 9.74%.
      text-embedding-3-small is reliable and cheap ($0.02/1M tokens). Use
      specialized embeddings for code (e.g. voyage-code-2) or multilingual
      content.
patterns:
  -
    name: "Vector Database Setup with Upstash"
    description: "Serverless vector search for edge and Vercel deployments"
    when_to_use: "Need vector search without managing infrastructure; Vercel/edge deployment"
    implementation: |
      // lib/vector-store.ts
      import { Index } from "@upstash/vector";
      import OpenAI from "openai";

      const vectorIndex = new Index({
        url: process.env.UPSTASH_VECTOR_REST_URL!,
        token: process.env.UPSTASH_VECTOR_REST_TOKEN!,
      });

      const openai = new OpenAI();

      interface Document {
        id: string;
        content: string;
        metadata: Record<string, unknown>;
      }

      // Generate an embedding for one text
      async function embed(text: string): Promise<number[]> {
        const response = await openai.embeddings.create({
          model: "text-embedding-3-small",
          input: text,
          dimensions: 1536, // Must match your Upstash index dimension
        });
        return response.data[0].embedding;
      }

      // Index documents
      async function indexDocuments(documents: Document[]) {
        const vectors = await Promise.all(
          documents.map(async (doc) => ({
            id: doc.id,
            vector: await embed(doc.content),
            metadata: {
              ...doc.metadata,
              content: doc.content.slice(0, 1000), // Store a snippet for retrieval
            },
          }))
        );

        // Batch upsert (max 1000 vectors per call)
        for (let i = 0; i < vectors.length; i += 1000) {
          await vectorIndex.upsert(vectors.slice(i, i + 1000));
        }
        return { indexed: vectors.length };
      }

      // Query with an optional metadata filter
      async function search(
        query: string,
        options?: {
          topK?: number;
          filter?: string; // Upstash filter expression, e.g. "category = 'auth'"
          includeMetadata?: boolean;
        }
      ) {
        const { topK = 5, filter, includeMetadata = true } = options || {};
        const queryVector = await embed(query);
        const results = await vectorIndex.query({
          vector: queryVector,
          topK,
          filter,
          includeMetadata,
        });
        return results.map((r) => ({
          id: r.id,
          score: r.score,
          content: r.metadata?.content,
          metadata: r.metadata,
        }));
      }
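      // Example usage (a hypothetical smoke test; the document, query, and
      // filter values are made up for illustration):
      async function demo() {
        await indexDocuments([
          {
            id: "doc-1",
            content: "Reset your password from Settings > Security.",
            metadata: { category: "auth" },
          },
        ]);
        const hits = await search("how do I reset my password?", {
          topK: 3,
          filter: "category = 'auth'",
        });
        console.log(hits);
      }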
  -
    name: "Pinecone with Serverless"
    description: "Enterprise-grade vector search with Pinecone serverless"
    when_to_use: "Need billion-scale vectors, multi-region, enterprise SLA"
    implementation: |
      // lib/pinecone-search.ts
      import { Pinecone } from "@pinecone-database/pinecone";
      import OpenAI from "openai";

      const pinecone = new Pinecone({
        apiKey: process.env.PINECONE_API_KEY!,
      });

      const openai = new OpenAI();
      const index = pinecone.index("your-index-name");

      interface UpsertRecord {
        id: string;
        text: string;
        metadata?: Record<string, string | number | boolean>;
      }

      // Embed a batch of texts
      async function embed(texts: string[]): Promise<number[][]> {
        const response = await openai.embeddings.create({
          model: "text-embedding-3-small",
          input: texts,
        });
        return response.data.map((d) => d.embedding);
      }

      // Batch upsert into a namespace
      async function upsertDocuments(
        records: UpsertRecord[],
        namespace?: string
      ) {
        const batchSize = 100;
        const ns = index.namespace(namespace || "");

        for (let i = 0; i < records.length; i += batchSize) {
          const batch = records.slice(i, i + batchSize);
          const embeddings = await embed(batch.map((r) => r.text));
          await ns.upsert(
            batch.map((record, idx) => ({
              id: record.id,
              values: embeddings[idx],
              metadata: {
                ...record.metadata,
                text: record.text.slice(0, 40000), // Stay under Pinecone's ~40 KB metadata limit
              },
            }))
          );
        }
        return { upserted: records.length };
      }

      // Query with optional metadata filters
      async function query(
        queryText: string,
        options?: {
          topK?: number;
          namespace?: string;
          filter?: Record<string, unknown>;
        }
      ) {
        const { topK = 10, namespace, filter } = options || {};
        const [queryEmbedding] = await embed([queryText]);
        const ns = index.namespace(namespace || "");
        const results = await ns.query({
          vector: queryEmbedding,
          topK,
          filter,
          includeMetadata: true,
        });
        return (
          results.matches?.map((m) => ({
            id: m.id,
            score: m.score,
            text: m.metadata?.text,
            metadata: m.metadata,
          })) || []
        );
      }

      // Delete records matching a metadata filter
      async function deleteByMetadata(
        filter: Record<string, unknown>,
        namespace?: string
      ) {
        const ns = index.namespace(namespace || "");
        await ns.deleteMany({ filter });
      }
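      // Example usage (illustrative; the namespace, record, and filter
      // values are made up — Pinecone filters use operators like $eq):
      async function demo() {
        await upsertDocuments(
          [
            {
              id: "faq-1",
              text: "Refunds are issued within 14 days.",
              metadata: { category: "billing" },
            },
          ],
          "support"
        );
        const hits = await query("refund policy", {
          topK: 5,
          namespace: "support",
          filter: { category: { $eq: "billing" } },
        });
        console.log(hits);
      }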
  -
    name: "Hybrid Search with Qdrant"
    description: "Combined vector + keyword search with Qdrant"
    when_to_use: "Need complex filtering, hybrid search, self-hosted option"
    implementation: |
      // lib/qdrant-hybrid.ts
      import { QdrantClient } from "@qdrant/js-client-rest";
      import OpenAI from "openai";

      const qdrant = new QdrantClient({
        url: process.env.QDRANT_URL || "http://localhost:6333",
        apiKey: process.env.QDRANT_API_KEY,
      });

      const openai = new OpenAI();
      const COLLECTION_NAME = "documents";

      // Create a collection with a named dense vector plus a sparse vector
      // for BM25-style keyword matching
      async function createCollection() {
        await qdrant.createCollection(COLLECTION_NAME, {
          vectors: {
            dense: { size: 1536, distance: "Cosine" },
          },
          sparse_vectors: {
            text: {},
          },
          optimizers_config: {
            default_segment_number: 2,
          },
        });

        // Create a payload index for filtering
        await qdrant.createPayloadIndex(COLLECTION_NAME, {
          field_name: "category",
          field_schema: "keyword",
        });
      }

      // Generate a sparse vector from text (toy TF encoding — use a real
      // BM25/SPLADE encoder in production)
      function sparseEncode(text: string): { indices: number[]; values: number[] } {
        const words = text.toLowerCase().split(/\W+/).filter(Boolean);
        const termFreq: Record<string, number> = {};
        for (const word of words) {
          termFreq[word] = (termFreq[word] || 0) + 1;
        }
        // Simple hashing for indices (collisions are possible)
        const indices: number[] = [];
        const values: number[] = [];
        for (const [term, freq] of Object.entries(termFreq)) {
          const hash =
            term.split("").reduce((a, c) => a + c.charCodeAt(0), 0) % 30000;
          indices.push(hash);
          values.push(Math.log(1 + freq)); // Dampened term frequency
        }
        return { indices, values };
      }

      // Upsert with both dense and sparse vectors
      async function upsertHybrid(
        documents: Array<{ id: string; text: string; metadata?: Record<string, unknown> }>
      ) {
        const embeddings = await Promise.all(
          documents.map(async (doc) => {
            const response = await openai.embeddings.create({
              model: "text-embedding-3-small",
              input: doc.text,
            });
            return response.data[0].embedding;
          })
        );

        const points = documents.map((doc, i) => ({
          id: doc.id, // Qdrant ids must be unsigned integers or UUIDs
          vector: {
            dense: embeddings[i],
            text: sparseEncode(doc.text),
          },
          payload: {
            text: doc.text,
            ...doc.metadata,
          },
        }));
        await qdrant.upsert(COLLECTION_NAME, { points, wait: true });
      }

      // Hybrid search with RRF fusion
      async function hybridSearch(
        query: string,
        options?: {
          topK?: number;
          filter?: Record<string, unknown>;
        }
      ) {
        const { topK = 10, filter } = options || {};

        // Get the dense embedding and sparse encoding of the query
        const response = await openai.embeddings.create({
          model: "text-embedding-3-small",
          input: query,
        });
        const denseVector = response.data[0].embedding;
        const sparseVector = sparseEncode(query);

        // Hybrid query: prefetch both branches, fuse with Reciprocal Rank Fusion
        const results = await qdrant.query(COLLECTION_NAME, {
          prefetch: [
            {
              query: denseVector,
              using: "dense",
              limit: topK * 2,
            },
            {
              query: {
                indices: sparseVector.indices,
                values: sparseVector.values,
              },
              using: "text",
              limit: topK * 2,
            },
          ],
          query: { fusion: "rrf" }, // RRF is rank-based, so no weights are needed
          limit: topK,
          filter: filter
            ? {
                must: Object.entries(filter).map(([key, value]) => ({
                  key,
                  match: { value },
                })),
              }
            : undefined,
          with_payload: true,
        });
        return results.points.map((r) => ({
          id: r.id,
          score: r.score,
          text: r.payload?.text,
          metadata: r.payload,
        }));
      }
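      // What `fusion: "rrf"` computes, sketched client-side for reference
      // (k = 60 is the conventional RRF constant; this is illustrative, not
      // Qdrant's internal implementation):
      function rrfFuse(
        rankings: Array<Array<{ id: string | number }>>,
        k = 60
      ): Array<{ id: string | number; score: number }> {
        const scores = new Map<string | number, number>();
        for (const ranking of rankings) {
          ranking.forEach((item, rank) => {
            // Each result list contributes 1 / (k + rank); items ranked
            // highly in several lists accumulate the largest scores.
            scores.set(item.id, (scores.get(item.id) ?? 0) + 1 / (k + rank + 1));
          });
        }
        return [...scores.entries()]
          .map(([id, score]) => ({ id, score }))
          .sort((a, b) => b.score - a.score);
      }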
  -
    name: "Semantic Chunking"
    description: "Smart document chunking that preserves context"
    when_to_use: "Indexing documents for RAG, need coherent chunks"
    implementation: |
      // lib/chunking.ts
      import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
      // (exported from "@langchain/textsplitters" in newer LangChain versions)

      interface Chunk {
        content: string;
        metadata: {
          pageNumber?: number;
          section?: string;
          chunkIndex: number;
          startChar: number;
          endChar: number;
        };
      }

      // Basic recursive chunking with overlap
      async function chunkDocument(
        text: string,
        options?: {
          chunkSize?: number;
          chunkOverlap?: number;
          separators?: string[];
        }
      ): Promise<Chunk[]> {
        const {
          chunkSize = 1000,
          chunkOverlap = 200,
          separators = ["\n\n", "\n", ". ", " ", ""],
        } = options || {};

        const splitter = new RecursiveCharacterTextSplitter({
          chunkSize,
          chunkOverlap,
          separators,
        });
        const docs = await splitter.createDocuments([text]);
        return docs.map((doc, i) => ({
          content: doc.pageContent,
          metadata: {
            chunkIndex: i,
            // LangChain reports line-based locations rather than character
            // offsets, so fall back to chunk-local bounds here
            startChar: 0,
            endChar: doc.pageContent.length,
          },
        }));
      }

      // Semantic chunking with sentence boundaries and header preservation
      function semanticChunk(
        text: string,
        options?: {
          targetSize?: number;
          maxSize?: number;
          preserveHeaders?: boolean;
        }
      ): Chunk[] {
        const { targetSize = 800, maxSize = 1200, preserveHeaders = true } =
          options || {};

        // Naive sentence split: runs ending in .!? or a newline (so markdown
        // headers become their own segment)
        const sentences = text.match(/[^.!?\n]+[.!?\n]+\s*/g) || [text];
        const chunks: Chunk[] = [];
        let currentChunk = "";
        let currentSection = "";
        let chunkStart = 0;
        let charPos = 0;

        // Flush the accumulated sentences as one chunk
        const pushChunk = () => {
          if (!currentChunk.trim()) return;
          chunks.push({
            content: currentSection
              ? `## ${currentSection}\n\n${currentChunk.trim()}`
              : currentChunk.trim(),
            metadata: {
              section: currentSection,
              chunkIndex: chunks.length,
              startChar: chunkStart,
              endChar: charPos,
            },
          });
        };

        for (const sentence of sentences) {
          if (preserveHeaders && /^#+\s+/.test(sentence.trim())) {
            // Section header: close the current chunk, start a new section
            pushChunk();
            currentSection = sentence.replace(/^#+\s+/, "").trim();
            currentChunk = "";
            chunkStart = charPos;
          } else if (
            currentChunk.length + sentence.length > maxSize ||
            currentChunk.length >= targetSize
          ) {
            // Past the target (or would exceed the max): flush at the
            // sentence boundary
            pushChunk();
            currentChunk = sentence;
            chunkStart = charPos;
          } else {
            currentChunk += sentence;
          }
          charPos += sentence.length;
        }
        pushChunk(); // Save the last chunk
        return chunks;
      }

      // Estimate tokens (rough heuristic: ~4 characters per token in English)
      function estimateTokens(text: string): number {
        return Math.ceil(text.length / 4);
      }
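      // Example usage (a made-up markdown snippet):
      function demoChunking() {
        const sample =
          "## Setup\nInstall the CLI. Run the init command.\n## Usage\nQueries accept filters. Results are ranked.";
        for (const chunk of semanticChunk(sample, { targetSize: 200, maxSize: 400 })) {
          console.log(chunk.metadata.section, estimateTokens(chunk.content));
        }
      }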
  -
    name: "Reranking with Cohere"
    description: "Second-stage reranking for precision"
    when_to_use: "Need high-precision retrieval, have latency budget for reranking"
    implementation: |
      // lib/reranker.ts
      import { CohereClient } from "cohere-ai";

      const cohere = new CohereClient({
        token: process.env.COHERE_API_KEY!,
      });

      interface RerankResult {
        id: string;
        text: string;
        originalScore: number;
        rerankScore: number;
        metadata?: Record<string, unknown>;
      }

      // Rerank search results with a cross-encoder
      async function rerankResults(
        query: string,
        documents: Array<{
          id: string;
          text: string;
          score: number;
          metadata?: Record<string, unknown>;
        }>,
        options?: {
          topN?: number;
          model?: "rerank-english-v3.0" | "rerank-multilingual-v3.0" | "rerank-v3.5";
        }
      ): Promise<RerankResult[]> {
        const { topN = 5, model = "rerank-v3.5" } = options || {};
        if (documents.length === 0) return [];

        const response = await cohere.rerank({
          model,
          query,
          documents: documents.map((d) => d.text),
          topN,
          returnDocuments: false,
        });
        return response.results.map((result) => {
          const original = documents[result.index];
          return {
            id: original.id,
            text: original.text,
            originalScore: original.score,
            rerankScore: result.relevanceScore,
            metadata: original.metadata,
          };
        });
      }

      // Full retrieval pipeline with reranking
      async function retrieveAndRerank(
        query: string,
        vectorSearch: (
          query: string,
          topK: number
        ) => Promise<
          Array<{ id: string; text: string; score: number; metadata?: Record<string, unknown> }>
        >,
        options?: {
          retrieveK?: number; // How many to retrieve
          rerankK?: number; // How many to keep after reranking
        }
      ): Promise<RerankResult[]> {
        const { retrieveK = 20, rerankK = 5 } = options || {};

        // Stage 1: vector retrieval (cast a wide net)
        const candidates = await vectorSearch(query, retrieveK);
        if (candidates.length === 0) return [];

        // Stage 2: rerank for precision
        return rerankResults(query, candidates, { topN: rerankK });
      }
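      // Example usage (illustrative; `pineconeQuery` stands in for any of
      // the vector search functions from the patterns above):
      //
      // const results = await retrieveAndRerank(
      //   "how does billing work?",
      //   async (q, topK) => {
      //     const hits = await pineconeQuery(q, { topK });
      //     return hits.map((h) => ({
      //       id: String(h.id),
      //       text: String(h.text ?? ""),
      //       score: h.score ?? 0,
      //     }));
      //   },
      //   { retrieveK: 20, rerankK: 5 }
      // );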
  -
    name: "LlamaIndex RAG Pipeline"
    description: "Complete RAG pipeline with LlamaIndex TypeScript"
    when_to_use: "Building RAG app in TypeScript, need full pipeline"
    implementation: |
      // lib/rag-pipeline.ts
      // Note: the llamaindex API moves quickly; these imports match recent
      // releases but may need adjusting for yours.
      import {
        VectorStoreIndex,
        SimpleDirectoryReader,
        OpenAIEmbedding,
        Settings,
      } from "llamaindex";
      import { PineconeVectorStore } from "@llamaindex/pinecone";
      import { Pinecone } from "@pinecone-database/pinecone";

      // Configure LlamaIndex settings
      Settings.embedModel = new OpenAIEmbedding({
        model: "text-embedding-3-small",
      });

      // Initialize the Pinecone client
      const pinecone = new Pinecone({
        apiKey: process.env.PINECONE_API_KEY!,
      });

      // Create a vector store backed by a Pinecone index
      async function createVectorStore(indexName: string) {
        const pineconeIndex = pinecone.index(indexName);
        return new PineconeVectorStore({
          pineconeIndex,
        });
      }

      // Load documents from a directory and index them
      async function indexDocuments(documentsPath: string, indexName: string) {
        const reader = new SimpleDirectoryReader();
        const documents = await reader.loadData(documentsPath);

        const vectorStore = await createVectorStore(indexName);
        const index = await VectorStoreIndex.fromDocuments(documents, {
          vectorStore,
        });
        return index;
      }

      // Query with retrieved context
      async function queryRAG(
        query: string,
        indexName: string,
        options?: {
          topK?: number;
        }
      ) {
        const { topK = 3 } = options || {};
        const vectorStore = await createVectorStore(indexName);
        const index = await VectorStoreIndex.fromVectorStore(vectorStore);
        const queryEngine = index.asQueryEngine({
          similarityTopK: topK,
        });
        const response = await queryEngine.query({ query });
        return {
          answer: response.response,
          sourceNodes: response.sourceNodes?.map((node) => ({
            text: node.node.text,
            score: node.score,
            metadata: node.node.metadata,
          })),
        };
      }

      // Chat with history
      async function chatRAG(
        message: string,
        indexName: string,
        history: Array<{ role: "user" | "assistant"; content: string }>
      ) {
        const vectorStore = await createVectorStore(indexName);
        const index = await VectorStoreIndex.fromVectorStore(vectorStore);
        const chatEngine = index.asChatEngine({
          similarityTopK: 3,
        });

        // Prior turns are passed as chat history alongside the new message
        // (the exact chat-history shape varies across llamaindex releases)
        const response = await chatEngine.chat({
          message,
          chatHistory: history,
        });
        return {
          answer: response.response,
          sources: response.sourceNodes?.map((n) => n.node.text),
        };
      }
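      // Example usage (hypothetical paths and index names):
      //
      // await indexDocuments("./docs", "my-rag-index");
      // const { answer, sourceNodes } = await queryRAG(
      //   "What does the setup guide say about API keys?",
      //   "my-rag-index",
      //   { topK: 3 }
      // );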
  -
    name: "Voyage AI Embeddings"
    description: "High-performance embeddings with Voyage AI"
    when_to_use: "Need best-in-class retrieval accuracy, budget allows"
    implementation: |
      // lib/voyage-embeddings.ts
      interface VoyageResponse {
        data: Array<{ embedding: number[] }>;
        usage: { total_tokens: number };
      }

      const VOYAGE_API_URL = "https://api.voyageai.com/v1/embeddings";

      async function voyageEmbed(
        texts: string[],
        options?: {
          model?: "voyage-3" | "voyage-3-lite" | "voyage-large-2" | "voyage-code-2";
          inputType?: "query" | "document";
        }
      ): Promise<{ embeddings: number[][]; totalTokens: number }> {
        const {
          model = "voyage-3-lite", // Good balance of cost and quality
          inputType = "document",
        } = options || {};

        const response = await fetch(VOYAGE_API_URL, {
          method: "POST",
          headers: {
            "Content-Type": "application/json",
            Authorization: `Bearer ${process.env.VOYAGE_API_KEY}`,
          },
          body: JSON.stringify({
            input: texts,
            model,
            input_type: inputType,
          }),
        });
        if (!response.ok) {
          throw new Error(`Voyage API error: ${response.status} ${response.statusText}`);
        }
        const data: VoyageResponse = await response.json();
        return {
          embeddings: data.data.map((d) => d.embedding),
          totalTokens: data.usage.total_tokens,
        };
      }

      // Batch embed with simple rate limiting
      async function batchEmbed(
        texts: string[],
        options?: {
          model?: "voyage-3" | "voyage-3-lite";
          batchSize?: number;
          delayMs?: number;
        }
      ): Promise<number[][]> {
        const { model = "voyage-3-lite", batchSize = 128, delayMs = 100 } =
          options || {};

        const allEmbeddings: number[][] = [];
        for (let i = 0; i < texts.length; i += batchSize) {
          const batch = texts.slice(i, i + batchSize);
          const { embeddings } = await voyageEmbed(batch, { model });
          allEmbeddings.push(...embeddings);
          // Pause between batches to stay under rate limits
          if (i + batchSize < texts.length) {
            await new Promise((r) => setTimeout(r, delayMs));
          }
        }
        return allEmbeddings;
      }
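      // Example usage (made-up corpus; assumes VOYAGE_API_KEY is set — note
      // the asymmetric input types for documents vs. queries):
      //
      // const docVectors = await batchEmbed(["First doc...", "Second doc..."]);
      // const {
      //   embeddings: [queryVector],
      // } = await voyageEmbed(["search query"], { inputType: "query" });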
anti_patterns:
  -
    name: "Single-Stage Retrieval"
    description: "Using only vector search without reranking"
    why_bad: |
      Vector search is recall-optimized, not precision-optimized. The first
      5 results may not be the best 5. Reranking boosts precision by up to
      48% (Databricks research).
    instead: "Add a Cohere/Jina reranker as a second stage for important queries"
  -
    name: "Fixed-Size Chunking"
    description: "Blindly splitting at 512 or 1000 characters"
    why_bad: |
      Cuts sentences mid-thought, breaks context, and separates related
      information. Results in incoherent chunks that embed poorly.
    instead: "Use semantic chunking with sentence boundaries and header preservation"
  -
    name: "Pure Vector Search for Exact Matches"
    description: "Expecting vector search to find exact identifiers"
    why_bad: |
      Vector search captures semantic meaning, not exact strings. A query
      for 'E-404' may return conceptually similar errors but miss the exact
      error code in documents.
    instead: "Use hybrid search (vector + BM25 keyword) for production systems"
  -
    name: "Ignoring Embedding Model Choice"
    description: "Defaulting to text-embedding-ada-002"
    why_bad: |
      ada-002 is outdated. text-embedding-3-small costs the same with better
      performance, and Voyage-3 outperforms OpenAI by 9.74% on retrieval
      benchmarks.
    instead: "Evaluate voyage-3-lite ($0.02/1M tokens) or text-embedding-3-small for your use case"
  -
    name: "No Metadata Filtering"
    description: "Searching the entire vector space for every query"
    why_bad: |
      Slower and less relevant. If the user is asking about 'React hooks',
      searching through Python and Go documentation wastes resources.
    instead: "Add metadata (category, date, source) and filter before vector search"
  -
    name: "Embedding in Request Path"
    description: "Generating embeddings on every search request"
    why_bad: |
      Adds 100-300ms of latency per query, and API rate limits become a
      bottleneck under load.
    instead: "Pre-embed documents at index time, cache query embeddings"
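# A minimal query-embedding cache sketch for the "Embedding in Request Path"
# fix above (illustrative TypeScript in a comment, not part of the skill
# schema; the cache size and `embed` helper are assumptions):
#
#   const cache = new Map<string, number[]>();
#   async function cachedEmbed(query: string): Promise<number[]> {
#     const key = query.trim().toLowerCase();
#     const hit = cache.get(key);
#     if (hit) return hit;
#     const vector = await embed(key); // any embedding call from the patterns above
#     if (cache.size >= 1000) {
#       const oldest = cache.keys().next().value; // evict insertion-order oldest
#       if (oldest !== undefined) cache.delete(oldest);
#     }
#     cache.set(key, vector);
#     return vector;
#   }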
handoffs:
  -
    to: "document-ai"
    when: "Need to parse PDFs/documents before indexing"
    context: "Use document-ai to extract text, then semantic-search to index"
  -
    to: "ai-observability"
    when: "Need to monitor retrieval quality"
    context: "Track recall@k, MRR, latency, embedding costs"
  -
    to: "backend"
    when: "Need to build search API endpoints"
    context: "Backend skill for API design, this skill for search logic"
  -
    to: "ai-personalization"
    when: "Need personalized search results"
    context: "Combine with user embeddings for personalized retrieval"
references:
  -
    title: "Vector Database Comparison 2025"
    url: "https://www.firecrawl.dev/blog/best-vector-databases-2025"
  -
    title: "Chunking Strategies for RAG"
    url: "https://medium.com/@adnanmasood/chunking-strategies-for-retrieval-augmented-generation-rag-a-comprehensive-guide"
  -
    title: "Voyage AI Embedding Benchmarks"
    url: "https://blog.voyageai.com/2025/01/07/voyage-3-large/"
  -
    title: "Top Rerankers for RAG"
    url: "https://www.analyticsvidhya.com/blog/2025/06/top-rerankers-for-rag/"
  -
    title: "LlamaIndex TypeScript Guide"
    url: "https://www.analyticsvidhya.com/blog/2025/03/llamaindex-typescript/"