
Julia GPU Kernels Skill

Install

Source · Clone the upstream repo:

git clone https://github.com/plurigrid/asi

Claude Code · Install into ~/.claude/skills/:

T=$(mktemp -d) && git clone --depth=1 https://github.com/plurigrid/asi "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/julia-gpu-kernels" ~/.claude/skills/plurigrid-asi-julia-gpu-kernels && rm -rf "$T"

Manifest: skills/julia-gpu-kernels/SKILL.md

Source content

Julia GPU Kernels Skill

KernelAbstractions.jl: Backend-agnostic GPU kernel programming for Julia.

Core Kernel Macros

@kernel - Define a kernel function

using KernelAbstractions

@kernel function vecadd!(A, @Const(B))
    I = @index(Global, Linear)
    @inbounds A[I] += B[I]
end

# Launch on a backend (e.g. backend = get_backend(A))
kernel = vecadd!(backend)
kernel(A, B, ndrange=size(A))
KernelAbstractions.synchronize(backend)
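
A minimal end-to-end run on the CPU backend (a sketch; plain Arrays map to CPU() via get_backend):

A = ones(Float32, 1024)
B = fill(2.0f0, 1024)
backend = get_backend(A)                     # CPU() for plain Arrays
vecadd!(backend)(A, B, ndrange = length(A))
KernelAbstractions.synchronize(backend)
@assert all(A .== 3.0f0)                     # 1 + 2 everywhere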

@Const - Read-only array annotation

Marks array as read-only and non-aliasing for compiler optimizations:

@kernel function copy_kernel(out, @Const(input))
    i = @index(Global, Linear)
    out[i] = input[i]
end

@index - Query work item indices

@index(Global, Linear)     # Flat global index
@index(Global, Cartesian)  # CartesianIndex in global space
@index(Local, Linear)      # Index within workgroup
@index(Group, Linear)      # Which workgroup
@index(Global, NTuple)     # As tuple
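
For multidimensional data, the Cartesian form yields an index into the full ndrange. A minimal sketch (the name transpose_kernel! is illustrative, not from the source):

@kernel function transpose_kernel!(out, @Const(input))
    I = @index(Global, Cartesian)   # CartesianIndex over the 2D ndrange
    i, j = Tuple(I)
    @inbounds out[j, i] = input[i, j]
end

# Launch over the shape of the input:
# transpose_kernel!(backend)(out, input, ndrange = size(input))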

@groupsize / @ndrange

@kernel function info_kernel(out)
    gs = @groupsize()       # Workgroup dimensions (tuple)
    nr = @ndrange()         # Total computation range (tuple)
    N = @uniform prod(@groupsize())  # Total work items per workgroup
    out[@index(Global, Linear)] = N  # write something so `out` is used
end

@localmem - Shared memory within workgroup

@kernel function reduce_kernel(out, @Const(input))
    lid = @index(Local, Linear)
    gid = @index(Global, Linear)
    
    shared = @localmem Float32 (256,)  # Shared across workgroup
    shared[lid] = input[gid]
    @synchronize
    
    # Now all threads can read shared
end
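
One way to finish the reduction above is a tree sum over the shared buffer. A sketch, assuming a workgroup size of exactly 256 and an input length that is a multiple of 256 (block_sum! and partials are illustrative names):

@kernel function block_sum!(out, @Const(input))
    lid = @index(Local, Linear)
    gid = @index(Global, Linear)
    grp = @index(Group, Linear)

    shared = @localmem Float32 (256,)
    shared[lid] = input[gid]
    @synchronize

    # Tree reduction: halve the number of active work items each step.
    for stride in (128, 64, 32, 16, 8, 4, 2, 1)
        if lid <= stride
            shared[lid] += shared[lid + stride]
        end
        @synchronize        # uniform loop, so all work items reach this
    end

    if lid == 1
        out[grp] = shared[1]   # one partial sum per workgroup
    end
end

# block_sum!(backend, 256)(partials, data, ndrange = length(data))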

@private - Per-work-item memory

@kernel function accumulate_kernel(out, @Const(data))
    i = @index(Global, Linear)
    acc = @private Float32 (1,)  # Private to this work item
    acc[1] = 0.0f0
    # ... accumulate into acc
    out[i] = acc[1]
end

@uniform - Evaluate once per workgroup

@kernel function batched_kernel(out, @Const(input))
    @uniform begin
        groupsize = @groupsize()[1]
        scale = 2.0f0
    end
    # groupsize and scale shared across work items
end

@synchronize - Memory barrier

@synchronize  # All work items in workgroup must reach this point

Backend System

Backend Types

using KernelAbstractions

# Abstract hierarchy
Backend        # All backends
├── GPU        # GPU backends (deprecated in 1.0)
│   ├── CUDABackend
│   ├── ROCBackend  
│   └── oneAPIBackend
└── CPU        # CPU backend

# Get backend from array
backend = get_backend(A)
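
get_backend makes allocation backend-agnostic too; a sketch using the KernelAbstractions.zeros / ones allocators and the vecadd! kernel defined earlier:

function run_anywhere(backend, n)
    A = KernelAbstractions.zeros(backend, Float32, n)
    B = KernelAbstractions.ones(backend, Float32, n)
    vecadd!(backend)(A, B, ndrange = n)
    KernelAbstractions.synchronize(backend)
    return A    # lives on whatever device the backend targets
end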

Kernel Type

A Kernel contains:

  • Backend reference
  • Workgroup size
  • NDRange
  • Transformed function
@kernel function my_kernel(A)
    # ...
end

# Create kernel for specific backend
kernel = my_kernel(CUDABackend())
kernel(A, ndrange=size(A), workgroupsize=256)
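
The workgroup size can also be fixed when the kernel is instantiated (second positional argument) instead of at each launch:

kernel_fixed = my_kernel(CUDABackend(), 256)   # static workgroup size
kernel_fixed(A, ndrange = size(A))             # A: array on the same backend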

CUDA.jl Integration

using CUDA, KernelAbstractions

# Get CUDABackend
backend = CUDABackend()
# Or from existing array
A_gpu = CUDA.rand(1024)
backend = get_backend(A_gpu)

@kernel function saxpy!(y, a, @Const(x))  # scalar a is passed by value; @Const is for arrays
    i = @index(Global, Linear)
    @inbounds y[i] += a * x[i]
end

kernel = saxpy!(backend)
kernel(y_gpu, 2.0f0, x_gpu, ndrange=length(y_gpu))
KernelAbstractions.synchronize(backend)
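
Because the kernel is backend-agnostic, the same definition runs unchanged on the CPU:

y = rand(Float32, 1024)
x = rand(Float32, 1024)
saxpy!(CPU())(y, 2.0f0, x, ndrange = length(y))
KernelAbstractions.synchronize(CPU())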

Memory Model

| Memory Type | Scope           | Lifetime  | Speed   | Macro          |
|-------------|-----------------|-----------|---------|----------------|
| Global      | All workgroups  | Kernel    | Slowest | Regular arrays |
| Local       | Workgroup       | Workgroup | Fast    | @localmem      |
| Private     | Work item       | Work item | Fastest | @private       |
| Constant    | All (read-only) | Kernel    | Fast    | @Const         |

Synchronization Rules

  • @localmem requires @synchronize for visibility across work items
  • @synchronize must be reached by ALL work items in a workgroup, or by NONE (see the sketch below)
  • No synchronization is needed for @private memory
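
A sketch of the all-or-none rule (illustrative kernel, assuming a workgroup size of 32):

@kernel function sync_demo!(A)
    lid = @index(Local, Linear)
    gid = @index(Global, Linear)
    shared = @localmem Float32 (32,)

    shared[lid] = Float32(lid)
    # WRONG: `if lid == 1; @synchronize; end` is a divergent barrier:
    # some work items never reach it (undefined behavior).
    @synchronize                 # RIGHT: every work item hits the barrier
    A[gid] = shared[33 - lid]    # writes above are now visible to all
end

# sync_demo!(backend, 32)(A, ndrange = length(A))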

Enzyme Autodiff Integration

using KernelAbstractions, Enzyme

@kernel function square!(A)
    I = @index(Global, Linear)
    @inbounds A[I] *= A[I]
end

function square_caller(A, backend)
    kernel = square!(backend)
    kernel(A, ndrange=size(A))
    KernelAbstractions.synchronize(backend)
    return
end

# Differentiate with Enzyme
A = rand(Float32, 1024)
dA = ones(Float32, 1024)  # Seed gradient

Enzyme.autodiff(Reverse, square_caller, 
    Duplicated(A, dA),    # Primal + tangent
    Const(CPU()))         # Backend is constant
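
A sanity check on the result (a sketch: the kernel squares A in place, so save a copy first; with out = A.^2 and seed dA = 1, reverse mode should leave dA ≈ 2 .* A0):

A  = rand(Float32, 1024)
dA = ones(Float32, 1024)
A0 = copy(A)                  # primal values before in-place squaring

Enzyme.autodiff(Reverse, square_caller,
    Duplicated(A, dA),
    Const(CPU()))

@assert dA ≈ 2 .* A0          # ∂(A0[i]^2)/∂A0[i] = 2A0[i]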

Enzyme Annotations

  • Duplicated(primal, tangent) - Active array argument
  • Const(x) - Constant, not differentiated
  • Active(x) - Active scalar (not supported on GPU)

MaxEnt Triad Testing Protocol

Three agents maximize mutual information through complementary verification:

| Agent             | Role              | Verifies                                 |
|-------------------|-------------------|------------------------------------------|
| julia-gpu-kernels | Kernel definition | Correct @kernel syntax, backend dispatch |
| enzyme-autodiff   | Differentiation   | autodiff(Reverse, kernel, ...) works     |
| julia-tempering   | RNG injection     | SplittableRandom integrates with kernel  |

Test: Differentiable GPU Kernel with Splittable RNG

using KernelAbstractions, Enzyme, SplittableRandoms

# Agent A (julia-gpu-kernels): Define kernel with RNG state
@kernel function monte_carlo_kernel(out, rng_states, @Const(params))
    i = @index(Global)
    rng = rng_states[i]
    
    # Sample and accumulate
    acc = @private Float32 (1,)
    acc[1] = 0.0f0
    for _ in 1:100
        u = rand(rng, Float32)
        acc[1] += params[1] * u
    end
    out[i] = acc[1]
end

# Launcher for Enzyme compatibility
function mc_launcher(out, rng_states, params, backend)
    kernel = monte_carlo_kernel(backend)
    kernel(out, rng_states, params, ndrange=length(out))
    KernelAbstractions.synchronize(backend)
    return
end

Agent B (enzyme-autodiff): Differentiate the kernel

# Enzyme differentiates w.r.t. params
# (rng_states and backend come from Agent C's setup below)
out = zeros(Float32, 1024)
dout = ones(Float32, 1024)
params = Float32[1.0]
dparams = Float32[0.0]

Enzyme.autodiff(Reverse, mc_launcher,
    Duplicated(out, dout),
    Const(rng_states),       # RNG is not differentiated
    Duplicated(params, dparams),
    Const(backend))

# dparams now contains ∂loss/∂params

Agent C (julia-tempering): Provide splittable RNG

using SplittableRandoms

# Create splittable RNG hierarchy for parallel kernel
master_rng = SplittableRandom(42)
n_threads = 1024

# Split into independent streams per work item
rng_states = [split(master_rng) for _ in 1:n_threads]

# Property: Any permutation of splits yields same distribution
# Property: Parent-child independence for parallel safety
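
A quick sanity check of both properties (a sketch; equality of two Float32 draws from independent streams is overwhelmingly unlikely, not impossible):

# Independence: sibling streams produce different draws
r1, r2 = rng_states[1], rng_states[2]
@assert rand(r1, Float32) != rand(r2, Float32)

# Reproducibility: re-seeding and re-splitting yields identical streams
master2 = SplittableRandom(42)
rng_states2 = [split(master2) for _ in 1:n_threads]
@assert rand(rng_states[3], Float32) == rand(rng_states2[3], Float32)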

Verification Matrix

| Property           | A Provides       | B Verifies             | C Provides           |
|--------------------|------------------|------------------------|----------------------|
| Kernel correctness | @kernel syntax   | Launches without error | -                    |
| Memory safety      | @Const, @private | No aliasing violations | -                    |
| Differentiability  | synchronize call | Gradients are correct  | -                    |
| RNG independence   | -                | dRNG/dparams = 0       | Split semantics      |
| Reproducibility    | -                | -                      | Deterministic splits |

Integration Test

function test_triad_integration()
    backend = CPU()
    n = 256
    
    # C: Setup RNG
    master = SplittableRandom(12345)
    rngs = [split(master) for _ in 1:n]
    
    # A: Define arrays
    out = zeros(Float32, n)
    dout = ones(Float32, n)
    params = Float32[2.0]
    dparams = Float32[0.0]
    
    # B: Differentiate
    Enzyme.autodiff(Reverse, mc_launcher,
        Duplicated(out, dout),
        Const(rngs),
        Duplicated(params, dparams),
        Const(backend))
    
    @assert isfinite(dparams[1]) "Gradient should be finite"
    @assert dparams[1] != 0.0 "Gradient should be non-zero"
    
    println("✓ Triad integration verified")
    println("  ∂loss/∂params = $(dparams[1])")
end
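
Running the check (assumes the definitions above and using KernelAbstractions, Enzyme, SplittableRandoms are loaded):

test_triad_integration()
# ✓ Triad integration verified
#   ∂loss/∂params = ...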

Quick Reference

| Macro               | Purpose                 |
|---------------------|-------------------------|
| @kernel             | Define kernel function  |
| @Const(x)           | Read-only array         |
| @index(scope, kind) | Get work item index     |
| @groupsize()        | Workgroup dimensions    |
| @ndrange()          | Total range             |
| @localmem T dims    | Shared memory           |
| @private T dims     | Per-item memory         |
| @uniform expr       | Evaluate once per group |
| @synchronize        | Memory barrier          |

Autopoietic Marginalia

The interaction IS the skill improving itself.

Every use of this skill is an opportunity for worlding:

  • MEMORY (-1): Record what was learned
  • REMEMBERING (0): Connect patterns to other skills
  • WORLDING (+1): Evolve the skill based on use

Add Interaction Exemplars here as the skill is used.