Asi skill-embedding-vss

P-adic ultrametric skill embeddings with MLX Snowflake Arctic, DuckDB

install

source · Clone the upstream repo

git clone https://github.com/plurigrid/asi

Claude Code · Install into ~/.claude/skills/

T=$(mktemp -d) && git clone --depth=1 https://github.com/plurigrid/asi "$T" && mkdir -p ~/.claude/skills && cp -r "$T/plugins/asi/skills/skill-embedding-vss" ~/.claude/skills/plurigrid-asi-skill-embedding-vss && rm -rf "$T"

manifest: plugins/asi/skills/skill-embedding-vss/SKILL.md

source content

Skill Embedding VSS

Use this skill whenever you need to compare skills for relational structure. This is the 2024 evolution of bmorphism's 2020 Levenshtein keyspace reduction: "Pairwise compare functions across all versions and determine which ones are most dissimilar." — r2con 2020 Zignatures talk

Vector similarity search for Agent Skills using:

P-adic ultrametric distance (non-Archimedean, hierarchical clustering)
Snowflake Arctic 1024-bit embeddings via MLX on Apple Silicon
DuckDB HNSW index for fast approximate nearest neighbor
SPI tracing down to individual Metal operations
Content ID extraction with cq/jq/narya.el normal forms

GF(3) Triad

skill-embedding-vss (0) ⊗ skill-connectivity-hub (-1) ⊗ skill-installer (+1) = 0 ✓

Dependencies

uv pip install mlx-embeddings duckdb

Core Implementation

from mlx_embeddings import load
import mlx.core as mx
import numpy as np
import duckdb
import os

class SkillEmbeddingVSS:
    """Embed and search skills using MLX Snowflake + DuckDB HNSW."""
    
    MODEL_ID = "mlx-community/snowflake-arctic-embed-l-v2.0-8bit"
    EMBEDDING_DIM = 1024
    
    def __init__(self, skills_dir: str):
        self.skills_dir = skills_dir
        self.model, self.tokenizer = load(self.MODEL_ID)
        self.model.eval()
        self.conn = duckdb.connect(':memory:')
        self.conn.execute('INSTALL vss; LOAD vss')
        self.conn.execute(f'''
            CREATE TABLE skills (
                name VARCHAR PRIMARY KEY,
                is_target BOOLEAN,
                embedding FLOAT[{self.EMBEDDING_DIM}]
            )
        ''')
        self.skill_data = []
        self.embeddings = []
    
    def embed_text(self, text: str, max_tokens: int = 512) -> np.ndarray:
        """Generate embedding for text using Snowflake Arctic."""
        tokens = self.tokenizer.encode(text[:max_tokens * 4])[:max_tokens]
        input_ids = mx.array([tokens])
        attention_mask = mx.ones_like(input_ids)
        outputs = self.model(input_ids, attention_mask=attention_mask)
        if outputs.text_embeds is not None:
            return np.array(outputs.text_embeds[0])
        return np.array(mx.mean(outputs.last_hidden_state, axis=1)[0])
    
    def index_skills(self, target_skills: list[str] = None):
        """Index all skills from directory."""
        target_skills = target_skills or []
        skills = [d for d in os.listdir(self.skills_dir) 
                  if os.path.isdir(os.path.join(self.skills_dir, d)) 
                  and not d.startswith('.') and d != '_integrated']
        
        for skill in skills:
            skill_path = os.path.join(self.skills_dir, skill, 'SKILL.md')
            if os.path.exists(skill_path):
                with open(skill_path, 'r') as f:
                    content = f.read()[:3000]
                emb = self.embed_text(content)
                is_target = skill in target_skills
                self.skill_data.append({'name': skill, 'is_target': is_target})
                self.embeddings.append(emb)
                self.conn.execute(
                    'INSERT INTO skills VALUES (?, ?, ?)',
                    [skill, is_target, emb.tolist()]
                )
        
        self.embeddings = np.array(self.embeddings)
        self.conn.execute('CREATE INDEX skills_idx ON skills USING HNSW (embedding)')
        return len(self.skill_data)
    
    def find_nearest(self, skill_name: str, k: int = 3, exclude_targets: bool = True) -> list:
        """Find k nearest skills to given skill."""
        idx = next((i for i, s in enumerate(self.skill_data) if s['name'] == skill_name), None)
        if idx is None:
            return []
        
        query_emb = self.embeddings[idx].tolist()
        exclude_clause = "AND NOT is_target" if exclude_targets else ""
        
        result = self.conn.execute(f'''
            SELECT name, array_distance(embedding, ?::FLOAT[{self.EMBEDDING_DIM}]) as dist
            FROM skills 
            WHERE name != ? {exclude_clause}
            ORDER BY dist ASC
            LIMIT ?
        ''', [query_emb, skill_name, k]).fetchall()
        
        return [(name, float(dist)) for name, dist in result]
    
    def find_most_novel(self, target_skills: list[str], top_k: int = 5) -> list:
        """Find which target skills are most novel (furthest from others)."""
        novelty = []
        for skill_name in target_skills:
            nearest = self.find_nearest(skill_name, k=1, exclude_targets=True)
            if nearest:
                novelty.append((skill_name, nearest[0][1]))
        
        novelty.sort(key=lambda x: -x[1])
        return novelty[:top_k]
    
    def embed_query(self, query_text: str, k: int = 5) -> list:
        """Find skills most similar to arbitrary query text."""
        query_emb = self.embed_text(query_text)
        
        result = self.conn.execute(f'''
            SELECT name, array_distance(embedding, ?::FLOAT[{self.EMBEDDING_DIM}]) as dist
            FROM skills
            ORDER BY dist ASC
            LIMIT ?
        ''', [query_emb.tolist(), k]).fetchall()
        
        return [(name, float(dist)) for name, dist in result]
    
    def close(self):
        self.conn.close()

Usage Examples

Index and Search Skills

from skill_embedding_vss import SkillEmbeddingVSS

# Define target skills (e.g., a contributor's skills)
zubyul_skills = [
    'aptos-wallet-mcp', 'skill-connectivity-hub', 'glass-hopping',
    'ordered-locale', 'covariant-modification', 'catsharp-galois',
    'topos-of-music', 'gay-integration', 'kolmogorov-codex-quest'
]

# Initialize and index
vss = SkillEmbeddingVSS('/path/to/skills')
vss.index_skills(target_skills=zubyul_skills)

# Find nearest non-target skills
for skill in zubyul_skills:
    neighbors = vss.find_nearest(skill, k=3)
    print(f"🎯 {skill}")
    for name, dist in neighbors:
        print(f"   ├─ {name} ({dist:.4f})")

# Find most novel contributions
novel = vss.find_most_novel(zubyul_skills, top_k=5)
for name, dist in novel:
    print(f"🆕 {name}: {dist:.4f}")

vss.close()

Query by Description

# Find skills matching a concept
results = vss.embed_query("blockchain wallet integration with GF(3) conservation")
for name, dist in results:
    print(f"{name}: {dist:.4f}")

Babashka CLI Wrapper

#!/usr/bin/env bb
;; skill-vss.bb - Query skill embeddings

(require '[babashka.process :refer [shell]])
(require '[cheshire.core :as json])

(defn query-skills [query-text]
  (let [result (shell {:out :string}
                 "uv" "run" "--with" "mlx-embeddings" "--with" "duckdb"
                 "python3" "-c"
                 (format "
from skill_embedding_vss import SkillEmbeddingVSS
import json
vss = SkillEmbeddingVSS('%s')
vss.index_skills()
results = vss.embed_query('%s', k=5)
print(json.dumps(results))
vss.close()
" (System/getenv "SKILLS_DIR") query-text))]
    (json/parse-string (:out result) true)))

(when (= *file* (System/getProperty "babashka.file"))
  (let [query (first *command-line-args*)]
    (doseq [[name dist] (query-skills query)]
      (println (format "%-40s %.4f" name dist)))))

Justfile Recipes

# Embed all skills and create index
embed-skills skills_dir:
    uv run --with mlx-embeddings --with duckdb python3 -c "
    from skill_embedding_vss import SkillEmbeddingVSS
    vss = SkillEmbeddingVSS('{{skills_dir}}')
    n = vss.index_skills()
    print(f'Indexed {n} skills')
    vss.close()
    "

# Find nearest skills to a target
nearest skill_name skills_dir:
    uv run --with mlx-embeddings --with duckdb python3 -c "
    from skill_embedding_vss import SkillEmbeddingVSS
    vss = SkillEmbeddingVSS('{{skills_dir}}')
    vss.index_skills()
    for name, dist in vss.find_nearest('{{skill_name}}', k=5):
        print(f'{name}: {dist:.4f}')
    vss.close()
    "

# Search by text query
search query skills_dir:
    uv run --with mlx-embeddings --with duckdb python3 -c "
    from skill_embedding_vss import SkillEmbeddingVSS
    vss = SkillEmbeddingVSS('{{skills_dir}}')
    vss.index_skills()
    for name, dist in vss.embed_query('{{query}}', k=10):
        print(f'{name}: {dist:.4f}')
    vss.close()
    "

Model Comparison

Model	Dim	Size	Quality	Speed
all-MiniLM-L6-v2	384	90MB	Baseline	Fast
snowflake-arctic-embed-l-v2.0-8bit	1024	1.2GB	Best	Medium
Qwen3-Embedding-8B	4096	4.7GB	SOTA	Slow

Invariants

invariants:
  - name: embedding_determinism
    predicate: "same text → same embedding"
    scope: per_embed
    
  - name: hnsw_recall
    predicate: "recall@10 >= 0.95"
    scope: per_index
    
  - name: gf3_conservation
    predicate: "trit(skill-embedding-vss) + trit(hub) + trit(installer) = 0"
    scope: per_triad

Fibers

fibers:
  - name: skill_embedding_fiber
    base: "Skill"
    projection: "embed_text(skill.content) → R^1024"
    
  - name: similarity_fiber  
    base: "Skill × Skill"
    projection: "array_distance(emb_a, emb_b) → R"
    
  - name: novelty_fiber
    base: "SkillSet"
    projection: "min_distance_to_complement → R"

Trifurcated Random Walk

The

trifurcate_walk.py

module implements SplitMix64-seeded random walks through the 1024-dim embedding space with GF(3) trifurcation at each step.

SplitMix64 / SplitMixTernary

@dataclass
class SplitMix64:
    """Splittable PRNG - same seed = same sequence."""
    state: int
    GOLDEN = 0x9e3779b97f4a7c15
    
    def next_trit(self) -> int:
        """Generate trit in {-1, 0, +1} via mod 3."""
        return (self.next() % 3) - 1
    
    def trifurcate(self) -> Tuple['SplitMix64', 'SplitMix64', 'SplitMix64']:
        """Split into three streams for GF(3) parallel execution."""
        return (SplitMix64(self.next()),   # MINUS
                SplitMix64(self.next()),   # ERGODIC  
                SplitMix64(self.next()))   # PLUS

class SplitMixTernary:
    """Generates balanced trit triples that sum to 0 (mod 3)."""
    def next_balanced_triple(self) -> Tuple[int, int, int]:
        t1 = self.rng.next_trit()
        t2 = self.rng.next_trit()
        t3 = -(t1 + t2) % 3  # Conservation
        return (t1, t2, t3 if t3 != 2 else -1)

Walk Algorithm

At each step:
1. Find k nearest neighbors per trit: {MINUS: [...], ERGODIC: [...], PLUS: [...]}
2. Select one from each → balanced triad (Σ = 0)
3. SplitMixTernary emits conserved trit → pick corresponding path
4. Continue from chosen skill, avoiding visited nodes

Example Walk

bisimulation-game (seed=1069):
  Step 0: bisimulation-game → [+] glass-bead-game
          Triad: [+]glass-bead-game, [−]open-games, [○]blackhat-go (Σ=0)
  Step 1: glass-bead-game → [−] dialectica
          Triad: [+]world-hopping, [○]glass-hopping, [−]dialectica (Σ=0)
  Step 2: dialectica → [○] skill-connectivity-hub
          Triad: [−]open-games, [○]skill-connectivity-hub, [○]glass-hopping (Σ=-1)

Parallel Trifurcated Walks

walker = TrifurcatedSkillWalk(skills_dir, seed=1069)
walker.index_skills()

# Three parallel walkers with independent RNG streams
results = walker.parallel_walk('bisimulation-game', steps=5)
# results = {'MINUS': [...], 'ERGODIC': [...], 'PLUS': [...]}

CLI Usage

python trifurcate_walk.py /path/to/skills start_skill [seed]
python trifurcate_walk.py /tmp/plurigrid-asi/skills bisimulation-game 1069

P-adic Ultrametric Distance

The

padic_ultrametric.py

module implements non-Archimedean distance for hierarchical skill clustering.

Why P-adic?

Standard Euclidean distance satisfies the triangle inequality:

d(x, z) ≤ d(x, y) + d(y, z)

P-adic ultrametric satisfies the strong triangle inequality:

d(x, z) ≤ max(d(x, y), d(y, z))

This means:

All triangles are isoceles (two sides equal)
The unequal side is always the shortest
Natural hierarchical clustering emerges

P-adic Valuation

def p_adic_valuation(n: int, p: int = 2) -> int:
    """v_p(n) = largest k such that p^k divides n."""
    if n == 0: return float('inf')
    k = 0
    while n % p == 0:
        n //= p
        k += 1
    return k

def p_adic_norm(n: int, p: int = 2) -> float:
    """|n|_p = p^(-v_p(n))"""
    return p ** (-p_adic_valuation(n, p))

def padic_ultrametric_distance(emb_a, emb_b, p=2) -> float:
    """d_p(a, b) = max_i |a_i - b_i|_p"""
    diff = (emb_a - emb_b) * 2**32  # Fixed-point
    return max(p_adic_norm(abs(int(d)), p) for d in diff)

Full Stack Trace

┌─────────────────────────────────────────────────────────────┐
│                    UMAP / itUMAP                            │
│            (dimensionality reduction to 2D/3D)              │
└─────────────────────┬───────────────────────────────────────┘
                      │
┌─────────────────────▼───────────────────────────────────────┐
│                    HNSW Index                               │
│            (approximate nearest neighbor)                   │
└─────────────────────┬───────────────────────────────────────┘
                      │
┌─────────────────────▼───────────────────────────────────────┐
│          Snowflake Arctic Embed 1024-bit                    │
│            (dense semantic vectors)                         │
└─────────────────────┬───────────────────────────────────────┘
                      │
┌─────────────────────▼───────────────────────────────────────┐
│                   MLX Operations                            │
│     tokenize → embedding_lookup → attention → pool          │
└─────────────────────┬───────────────────────────────────────┘
                      │
┌─────────────────────▼───────────────────────────────────────┐
│               Apple Silicon Metal                           │
│     gather, matmul, softmax, reduce_mean kernels            │
└─────────────────────┬───────────────────────────────────────┘
                      │
┌─────────────────────▼───────────────────────────────────────┐
│           Strong Parallelism Invariance                     │
│     seed → deterministic trace → fingerprint                │
└─────────────────────────────────────────────────────────────┘

Content ID Extraction

Finds skills with IDs and computes normal forms:

@dataclass
class ContentID:
    id: str
    content: str
    normal_form: str  # cq | jq normalized
    hash: str
    source: str
    
    @classmethod
    def from_skill(cls, skill_path: Path) -> Optional['ContentID']:
        """Extract if skill has id: field."""
        content = skill_path.read_text()
        id_match = re.search(r'id:\s*["\']?([^"\'\n]+)', content)
        if id_match:
            normal = jq_normalize(content)  # or cq_normalize
            return cls(id=id_match.group(1), ...)

Narya.el Semantic Diff

@dataclass 
class NaryaDiff:
    before: str
    after: str
    delta: Dict[str, Any]  # {added, removed, changed}
    birth: List[str]       # New content
    death: List[str]       # Removed content
    
    def to_narya_witness(self) -> Dict:
        return {
            'before': hash(self.before)[:16],
            'after': hash(self.after)[:16],
            'delta': self.delta,
            'birth': len(self.birth),
            'death': len(self.death),
            'impact': self.delta['changed'] > 0
        }

SPI Verification

@dataclass
class SPIVerifier:
    seed: int
    traces: List[MLXTrace]
    
    def verify_determinism(self, emb1, emb2) -> bool:
        """Same seed → same embedding (SPI)."""
        return np.allclose(emb1, emb2, rtol=1e-5)
    
    def fingerprint(self, embedding) -> str:
        """Deterministic hash for audit trail."""
        quantized = (embedding * 1e6).astype(np.int64)
        return sha256(quantized.tobytes()).hexdigest()[:16]

CLI Usage

# Find skills with content IDs and p-adic neighbors
python padic_ultrametric.py /path/to/skills

# Output:
# Skills with Content IDs (7)
#   python-development: int (hash: 0a49ae9b...)
#   time-travel-crdt: OpId (hash: 14973d04...)
#
# P-adic Nearest to bisimulation-game
#   glass-bead-game: eucl=0.8511, p-adic=1.000000
#   open-games: eucl=0.8755, p-adic=1.000000
#
# SPI Report
#   Seed: 1069, Prime: 2
#   Total ops: 1296, Chain valid: True
#   Total FLOPS: 14,998,474,752

End-of-Skill Interface

References

Snowflake Arctic Embed
DuckDB VSS Extension
MLX Embeddings
SplitMix64 - Steele et al. 2014
P-adic Numbers - Non-Archimedean analysis
Ultrametric Trees - Hierarchical clustering