Vibeship-spawner-skills transformer-architecture

id: transformer-architecture

install
source · Clone the upstream repo
git clone https://github.com/vibeforge1111/vibeship-spawner-skills
manifest: ai/transformer-architecture/skill.yaml
source content

id: transformer-architecture
name: Transformer Architecture
category: ai
description: Use when implementing attention mechanisms, building custom transformer models, understanding positional encoding, or optimizing transformer inference - covers self-attention, multi-head attention, RoPE, ALiBi, and architecture variants

patterns: golden_rules:

  • rule: "Attention is O(n^2) in sequence length" reason: "Long contexts need efficient attention (FlashAttention, sparse)"
  • rule: "Position info must be injected" reason: "Transformers have no inherent order awareness"
  • rule: "Layer norm placement matters" reason: "Pre-norm (before attention) is more stable for training"
  • rule: "Masking determines causality" reason: "Wrong masks cause data leakage in training"
  • rule: "Mixed precision is free performance" reason: "bf16/fp16 on modern GPUs with negligible quality loss"
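
A minimal sketch of the first four rules in PyTorch (a naive reference, not the manifest's implementation): scores are scaled by sqrt(d_k), causality is controlled by an explicit mask, and the full n x n score matrix makes the O(n^2) cost visible.

```python
import math
import torch
import torch.nn.functional as F

def naive_attention(q, k, v, causal=False):
    """q, k, v: (batch, heads, seq, d_k). Materializes the full (seq x seq)
    score matrix, which is the O(n^2) memory cost FlashAttention avoids."""
    d_k = q.size(-1)
    # Golden rule: scale by sqrt(d_k) so softmax inputs stay well-conditioned
    scores = q @ k.transpose(-2, -1) / math.sqrt(d_k)
    if causal:
        # Golden rule: the mask determines causality; hide the strict upper triangle
        n = scores.size(-1)
        mask = torch.triu(torch.ones(n, n, dtype=torch.bool, device=q.device), diagonal=1)
        scores = scores.masked_fill(mask, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return weights @ v
```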

architecture_variants:

  • encoder_only: models: ["BERT", "RoBERTa"] attention: "Bidirectional self-attention" use_cases: ["Classification", "NER/tagging", "Embeddings", "Retrieval"]
  • decoder_only: models: ["GPT", "LLaMA"] attention: "Causal (left-to-right) self-attention" use_cases: ["Text generation", "Code completion", "Chat/dialogue", "Reasoning"]
  • encoder_decoder: models: ["T5", "BART"] attention: "Cross-attention between enc/dec" use_cases: ["Translation", "Summarization", "Seq2seq tasks"]
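
The encoder-only vs decoder-only split maps directly onto the HuggingFace auto-classes; a quick illustration (assumes the `transformers` package and the public `bert-base-uncased` / `gpt2` checkpoints):

```python
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

# Encoder-only (bidirectional attention): embeddings, classification, retrieval
bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased")
embeddings = bert(**bert_tok("transformers encode text", return_tensors="pt")).last_hidden_state

# Decoder-only (causal attention): text generation
gpt_tok = AutoTokenizer.from_pretrained("gpt2")
gpt = AutoModelForCausalLM.from_pretrained("gpt2")
out = gpt.generate(**gpt_tok("Attention is", return_tensors="pt"), max_new_tokens=20)
print(gpt_tok.decode(out[0]))
```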

positional_encoding:

  • sinusoidal: pros: ["Deterministic", "Generalizes to longer sequences"] cons: ["Moderate length extrapolation"] best_for: "Short sequences"
  • learned: pros: ["Simple", "Stable training"] cons: ["Poor length generalization"] best_for: "Fixed-length tasks"
  • rope: pros: ["Excellent length extrapolation", "Relative position encoded naturally"] cons: ["Slightly more compute"] best_for: "Long-context LLMs"
  • alibi: pros: ["Excellent extrapolation", "Very low memory"] cons: ["May underperform on short contexts"] best_for: "Very long sequences"
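
A compact RoPE sketch in the "rotate-half" form from the RoFormer paper (an illustration assuming an even head dimension; `apply_rope` is a hypothetical helper, not an API from the skill):

```python
import torch

def apply_rope(x, base=10000.0):
    """Rotary position embedding for x of shape (batch, heads, seq, head_dim), head_dim even.

    Pairs of channels are rotated by a position-dependent angle, so the q/k dot
    product ends up depending only on the relative offset between positions."""
    b, h, n, d = x.shape
    half = d // 2
    inv_freq = base ** (-torch.arange(half, device=x.device, dtype=torch.float32) / half)
    angles = torch.arange(n, device=x.device, dtype=torch.float32)[:, None] * inv_freq[None, :]
    cos, sin = angles.cos().to(x.dtype), angles.sin().to(x.dtype)
    x1, x2 = x[..., :half], x[..., half:]
    # rotate-half: (x1, x2) -> (x1*cos - x2*sin, x2*cos + x1*sin)
    return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1)

# Rotate queries and keys before computing attention scores; values stay unrotated:
#   q, k = apply_rope(q), apply_rope(k)
```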

efficient_attention:

  • flash_attention: memory: "O(n) instead of O(n^2)" speedup: "2-4x for long sequences" note: "Exact same results, not approximate"
  • grouped_query_attention: description: "Fewer KV heads than query heads" models: ["LLaMA 2", "Mistral"] benefit: "Memory efficiency during inference"
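
With PyTorch 2.0+, `torch.nn.functional.scaled_dot_product_attention` dispatches to a fused FlashAttention-style kernel when the hardware and dtypes allow it, and GQA can be emulated by sharing each KV head across a group of query heads. A sketch with made-up shapes:

```python
import torch
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if device == "cuda" else torch.float32

batch, n_q_heads, n_kv_heads, seq, d_head = 2, 8, 2, 1024, 64
q = torch.randn(batch, n_q_heads, seq, d_head, device=device, dtype=dtype)
k = torch.randn(batch, n_kv_heads, seq, d_head, device=device, dtype=dtype)
v = torch.randn(batch, n_kv_heads, seq, d_head, device=device, dtype=dtype)

# Grouped-query attention: every group of 4 query heads shares one KV head,
# shrinking the KV cache by a factor of n_q_heads / n_kv_heads at inference time
k = k.repeat_interleave(n_q_heads // n_kv_heads, dim=1)
v = v.repeat_interleave(n_q_heads // n_kv_heads, dim=1)

# Fused kernel when available; the result is exact, not an approximation
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
```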

anti_patterns:

  • pattern: "Wrong mask type" problem: "Data leakage in training" solution: "Causal for decoders, bidirectional for encoders"
  • pattern: "No scaling in attention" problem: "Gradients vanish/explode" solution: "Always divide by sqrt(d_k)"
  • pattern: "Post-norm for deep nets" problem: "Training instability" solution: "Use pre-norm (LayerNorm before attention)"
  • pattern: "Ignoring mixed precision" problem: "2x slower training" solution: "Use bf16/fp16 on modern GPUs"
  • pattern: "O(n^2) attention for long seqs" problem: "OOM errors" solution: "FlashAttention, sparse attention"
  • pattern: "Fixed positional embeddings" problem: "Poor length generalization" solution: "RoPE or ALiBi for long contexts"

implementation_checklist:

  from_scratch:
  • "Attention scaling by sqrt(d_k)"
  • "Correct masking (causal vs bidirectional)"
  • "Pre-norm vs post-norm decision"
  • "Positional encoding choice (RoPE for LLMs)"
  • "Residual connections around attention and FFN"
  • "Dropout in attention and FFN"

  optimization:
  • "Enable FlashAttention (PyTorch 2.0+)"
  • "Use mixed precision (bf16 preferred)"
  • "Consider GQA for inference efficiency"
  • "KV cache for autoregressive generation"
  • "Gradient checkpointing for memory"

  training:
  • "AdamW optimizer with weight decay"
  • "Learning rate warmup + decay"
  • "Gradient clipping (typically 1.0)"
  • "Proper initialization (Xavier/He)"
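
A hedged training-loop skeleton tying the checklist above together; `model`, `dataloader`, and `total_steps` are placeholders, and the model is assumed to return logits for a token-level cross-entropy loss. It shows AdamW with weight decay, linear warmup followed by cosine decay, gradient clipping at 1.0, and bf16 autocast.

```python
import math
import torch
import torch.nn.functional as F

def train(model, dataloader, total_steps, warmup_steps=1000, peak_lr=3e-4, device="cuda"):
    model.to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=peak_lr, weight_decay=0.1)

    def lr_lambda(step):
        # Linear warmup, then cosine decay toward zero
        if step < warmup_steps:
            return step / max(1, warmup_steps)
        progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
        return 0.5 * (1.0 + math.cos(math.pi * progress))

    sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)

    for step, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to(device), targets.to(device)
        # Mixed precision: bf16 needs no loss scaling on recent GPUs
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            logits = model(inputs)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # clip at 1.0
        opt.step()
        sched.step()
        opt.zero_grad(set_to_none=True)
        if step + 1 >= total_steps:
            break
```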

handoffs:

  • skill: distributed-training trigger: "training across multiple GPUs"
  • skill: model-optimization trigger: "quantization or inference optimization"
  • skill: llm-fine-tuning trigger: "adapting transformer to specific task"
  • skill: reinforcement-learning trigger: "RLHF or alignment"

ecosystem:

  frameworks:
  • "PyTorch - Native transformer modules"
  • "HuggingFace Transformers - Pre-trained models"
  • "FlashAttention - Efficient attention kernel"

  libraries:
  • "xformers - Memory-efficient transformers"
  • "triton - Custom GPU kernels"

  models:
  • "LLaMA - Meta's open LLM"
  • "Mistral - Efficient transformer"
  • "BERT - Encoder-only classic"

sources:

  papers:
  • "Vaswani et al. (2017). Attention Is All You Need"
  • "Su et al. (2021). RoFormer: Enhanced Transformer with Rotary Position Embedding"
  • "Press et al. (2022). ALiBi: Train Short, Test Long"

  tutorials:
  • "The Illustrated Transformer - jalammar.github.io"
  • "ICLR 2025: Positional Embeddings in Transformer Models"