This Is the Whole Inner Loop

kernel:pairwise matches hand-optimized C

· 8 min read

This Is the Whole Inner Loop

std.kernel:pairwise {
    const dx = k.x - k.other.x;
    const dy = k.y - k.other.y;
    const dz = k.z - k.other.z;
    const dsq = dx*dx + dy*dy + dz*dz;
    const mag = DT / (dsq * @sqrt(dsq));
    k.vx -= dx * k.other.mass * mag;
    k.vy -= dy * k.other.mass * mag;
    k.vz -= dz * k.other.mass * mag;
    k.other.vx += dx * k.mass * mag;
    k.other.vy += dy * k.mass * mag;
    k.other.vz += dz * k.mass * mag;
}

Twelve lines. This is the gravitational force calculation for the n-body simulation — a standard benchmark for data-parallel computation. Five bodies, all unique pairs interact every timestep, 50 million iterations.

You write the physics. The compiler handles the rest.

Results

MeanMinMax
Koru (kernel fused)1.179s1.163s1.215s
C (clang -O3)1.322s1.314s1.331s
Zig (-O ReleaseFast)1.328s1.314s1.379s
Rust1.358s1.350s1.364s
SBCL2.399s2.391s2.407s
GHC3.034s3.016s3.058s

Koru is faster than standard C. The only thing that beats it is scalarized C without fast-math (1.162s) — a version that required manual restructuring of the data layout.

What You Didn’t Have to Write

To match Koru’s performance in C, you need to:

  1. Write the nested loop structure yourselffor (i = 0; i < 5; i++) { for (j = i + 1; j < 5; j++) { ... } }
  2. Add restrict to convince the compiler pointers don’t alias — and be right about it
  3. Enable -ffast-math — accepting the tradeoffs that come with it
  4. Use dsq * sqrt(dsq) instead of pow(distance, 3) — a micro-optimization you have to know about

In Zig, you need noalias, @setFloatMode(.optimized), and fixed *[5]Body so the compiler knows the iteration count.

In Koru, you write kernel:pairwise { ... } and the physics inside it. The compiler:

  • Generates the i < j loop structure
  • Emits noalias unconditionally — it’s always correct because pairwise guarantees distinct elements
  • Knows the kernel size from kernel:init

The proof obligation belongs on the side that has the information. pairwise has it by construction.

The Full Program

Here’s the complete Koru implementation using the fused kernel operations:

const std = @import("std");
~import "$std/kernel"
~import "$std/io"
~import "$std/control"

const PI: f64 = 3.141592653589793;
const SOLAR_MASS: f64 = 4.0 * PI * PI;
const DAYS_PER_YEAR: f64 = 365.24;
const DT: f64 = 0.01;

var iterations: usize = 50000000;

~event parse_args {}
~proc parse_args {
    const args = std.process.argsAlloc(std.heap.page_allocator) catch unreachable;
    defer std.process.argsFree(std.heap.page_allocator, args);
    if (args.len >= 2) {
        iterations = std.fmt.parseInt(usize, args[1], 10) catch unreachable;
    }
}

~std.kernel:shape(Body) {
    x: f64, y: f64, z: f64,
    vx: f64, vy: f64, vz: f64,
    mass: f64,
}

~parse_args()

~std.kernel:init(Body) {
    { x: 0, y: 0, z: 0, vx: 0, vy: 0, vz: 0, mass: SOLAR_MASS },
    { x: 4.84143144246472090e+00, y: -1.16032004402742839e+00, z: -1.03622044471123109e-01,
      vx: 1.66007664274403694e-03 * DAYS_PER_YEAR, vy: 7.69901118419740425e-03 * DAYS_PER_YEAR,
      vz: -6.90460016972063023e-05 * DAYS_PER_YEAR, mass: 9.54791938424326609e-04 * SOLAR_MASS },
    { x: 8.34336671824457987e+00, y: 4.12479856412430479e+00, z: -4.03523417114321381e-01,
      vx: -2.76742510726862411e-03 * DAYS_PER_YEAR, vy: 4.99852801234917238e-03 * DAYS_PER_YEAR,
      vz: 2.30417297573763929e-05 * DAYS_PER_YEAR, mass: 2.85885980666130812e-04 * SOLAR_MASS },
    { x: 1.28943695621391310e+01, y: -1.51111514016986312e+01, z: -2.23307578892655734e-01,
      vx: 2.96460137564761618e-03 * DAYS_PER_YEAR, vy: 2.37847173959480950e-03 * DAYS_PER_YEAR,
      vz: -2.96589568540237556e-05 * DAYS_PER_YEAR, mass: 4.36624404335156298e-05 * SOLAR_MASS },
    { x: 1.53796971148509165e+01, y: -2.59193146099879641e+01, z: 1.79258772950371181e-01,
      vx: 2.68067772490389322e-03 * DAYS_PER_YEAR, vy: 1.62824170038242295e-03 * DAYS_PER_YEAR,
      vz: -9.51592254519715870e-05 * DAYS_PER_YEAR, mass: 5.15138902046611451e-05 * SOLAR_MASS },
}
| kernel k |>
    std.kernel:step(0..iterations)
    | step |>
        std.kernel:pairwise {
            const dx = k.x - k.other.x;
            const dy = k.y - k.other.y;
            const dz = k.z - k.other.z;
            const dsq = dx*dx + dy*dy + dz*dz;
            const mag = DT / (dsq * @sqrt(dsq));
            k.vx -= dx * k.other.mass * mag;
            k.vy -= dy * k.other.mass * mag;
            k.vz -= dz * k.other.mass * mag;
            k.other.vx += dx * k.mass * mag;
            k.other.vy += dy * k.mass * mag;
            k.other.vz += dz * k.mass * mag;
        }
        |> std.kernel:self {
            k.x += DT * k.vx;
            k.y += DT * k.vy;
            k.z += DT * k.vz;
        }
| computed c |>
    capture({ energy: @as(f64, 0) })
    | as acc |>
        for(0..5)
        | each i |>
            captured { energy: acc.energy + 0.5*c[i].mass*(c[i].vx*c[i].vx+c[i].vy*c[i].vy+c[i].vz*c[i].vz) }
            |> for(i+1..5)
                | each j |>
                    captured { energy: acc.energy - c[i].mass*c[j].mass / @sqrt((c[i].x-c[j].x)*(c[i].x-c[j].x)+(c[i].y-c[j].y)*(c[i].y-c[j].y)+(c[i].z-c[j].z)*(c[i].z-c[j].z)) }
    | captured final |>
        std.io:print.blk {
            {{ final.energy:d:.9 }}
        }

The key constructs:

  • kernel:shape(Body) — declares the data layout
  • kernel:init(Body) — initializes the kernel with 5 bodies, returns a handle k
  • kernel:step(0..iterations) — runs the simulation loop, fused with the kernel ops
  • | step |> — the step branch, no binding (GPU-portable, no iteration identity)
  • kernel:pairwise { ... } — the force calculation, applied to all unique pairs
  • kernel:self { ... } — position updates, applied to each body
  • | each i |> — for regular for loops, iteration with the loop variable bound to i

What the Compiler Generates

The kernel:pairwise transform emits this:

const __koru_pair = struct {
    inline fn call(
        noalias __koru_pair_self: *Body,
        noalias __koru_pair_other: *Body,
    ) void {
        // your pairwise body, unchanged
    }
};
for (0..N) |i| {
    for (i + 1..N) |j| {
        __koru_pair.call(&ptr[i], &ptr[j]);
    }
}

noalias is unconditional because the loop structure guarantees i < j — the pointers are always distinct. In hand-written code, you add restrict after convincing yourself the invariant holds. Here, the transform has the proof by construction.


Appendix: Reference Implementations

For transparency, here’s exactly what we benchmarked against.

C (clang -O3 -ffast-math -march=native)

From the Computer Language Benchmarks Game, contributed by Christoph Bauer. License: Revised BSD.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define pi 3.141592653589793
#define solar_mass (4 * pi * pi)
#define days_per_year 365.24

struct planet {
  double x, y, z;
  double vx, vy, vz;
  double mass;
};

static inline void advance(struct planet * restrict bodies, double dt)
{
  int i, j;

  for (i = 0; i < 5; i++) {
    for (j = i + 1; j < 5; j++) {
      double dx = bodies[i].x - bodies[j].x;
      double dy = bodies[i].y - bodies[j].y;
      double dz = bodies[i].z - bodies[j].z;
      double dsq = dx * dx + dy * dy + dz * dz;
      double mag = dt / (dsq * sqrt(dsq));
      bodies[i].vx -= dx * bodies[j].mass * mag;
      bodies[i].vy -= dy * bodies[j].mass * mag;
      bodies[i].vz -= dz * bodies[j].mass * mag;
      bodies[j].vx += dx * bodies[i].mass * mag;
      bodies[j].vy += dy * bodies[i].mass * mag;
      bodies[j].vz += dz * bodies[i].mass * mag;
    }
  }

  for (i = 0; i < 5; i++) {
    bodies[i].x += dt * bodies[i].vx;
    bodies[i].y += dt * bodies[i].vy;
    bodies[i].z += dt * bodies[i].vz;
  }
}

double energy(int nbodies, struct planet * bodies)
{
  double e;
  int i, j;

  e = 0.0;
  for (i = 0; i < nbodies; i++) {
    struct planet * b = &(bodies[i]);
    e += 0.5 * b->mass * (b->vx * b->vx + b->vy * b->vy + b->vz * b->vz);
    for (j = i + 1; j < nbodies; j++) {
      struct planet * b2 = &(bodies[j]);
      double dx = b->x - b2->x;
      double dy = b->y - b2->y;
      double dz = b->z - b2->z;
      double distance = sqrt(dx * dx + dy * dy + dz * dz);
      e -= (b->mass * b2->mass) / distance;
    }
  }
  return e;
}

void offset_momentum(int nbodies, struct planet * bodies)
{
  double px = 0.0, py = 0.0, pz = 0.0;
  int i;
  for (i = 0; i < nbodies; i++) {
    px += bodies[i].vx * bodies[i].mass;
    py += bodies[i].vy * bodies[i].mass;
    pz += bodies[i].vz * bodies[i].mass;
  }
  bodies[0].vx = - px / solar_mass;
  bodies[0].vy = - py / solar_mass;
  bodies[0].vz = - pz / solar_mass;
}

#define NBODIES 5
struct planet bodies[NBODIES] = {
  { 0, 0, 0, 0, 0, 0, solar_mass },
  {
    4.84143144246472090e+00,
    -1.16032004402742839e+00,
    -1.03622044471123109e-01,
    1.66007664274403694e-03 * days_per_year,
    7.69901118419740425e-03 * days_per_year,
    -6.90460016972063023e-05 * days_per_year,
    9.54791938424326609e-04 * solar_mass
  },
  {
    8.34336671824457987e+00,
    4.12479856412430479e+00,
    -4.03523417114321381e-01,
    -2.76742510726862411e-03 * days_per_year,
    4.99852801234917238e-03 * days_per_year,
    2.30417297573763929e-05 * days_per_year,
    2.85885980666130812e-04 * solar_mass
  },
  {
    1.28943695621391310e+01,
    -1.51111514016986312e+01,
    -2.23307578892655734e-01,
    2.96460137564761618e-03 * days_per_year,
    2.37847173959480950e-03 * days_per_year,
    -2.96589568540237556e-05 * days_per_year,
    4.36624404335156298e-05 * solar_mass
  },
  {
    1.53796971148509165e+01,
    -2.59193146099879641e+01,
    1.79258772950371181e-01,
    2.68067772490389322e-03 * days_per_year,
    1.62824170038242295e-03 * days_per_year,
    -9.51592254519715870e-05 * days_per_year,
    5.15138902046611451e-05 * solar_mass
  }
};

int main(int argc, char ** argv)
{
  int n = atoi(argv[1]);
  int i;

  offset_momentum(NBODIES, bodies);
  printf ("%.9f\n", energy(NBODIES, bodies));
  for (i = 1; i <= n; i++)
    advance(bodies, 0.01);
  printf ("%.9f\n", energy(NBODIES, bodies));
  return 0;
}

Zig (noalias *[5]Body, -O ReleaseFast)

Ported from the Benchmarks Game. Every optimization applied: @setFloatMode(.optimized) for fast-math, dsq * sqrt(dsq) instead of distance³, fixed *[5]Body so the compiler knows the iteration count, and noalias to match C’s restrict.

comptime {
    @setFloatMode(.optimized);
}

const std = @import("std");

const PI = 3.141592653589793;
const SOLAR_MASS = 4 * PI * PI;
const DAYS_PER_YEAR = 365.24;

const Body = struct {
    x: f64, y: f64, z: f64,
    vx: f64, vy: f64, vz: f64,
    mass: f64,
};

fn advance(noalias bodies: *[5]Body, dt: f64) void {
    for (0..5) |i| {
        for (i + 1..5) |j| {
            const dx = bodies[i].x - bodies[j].x;
            const dy = bodies[i].y - bodies[j].y;
            const dz = bodies[i].z - bodies[j].z;
            const dsq = dx * dx + dy * dy + dz * dz;
            const mag = dt / (dsq * @sqrt(dsq));

            bodies[i].vx -= dx * bodies[j].mass * mag;
            bodies[i].vy -= dy * bodies[j].mass * mag;
            bodies[i].vz -= dz * bodies[j].mass * mag;

            bodies[j].vx += dx * bodies[i].mass * mag;
            bodies[j].vy += dy * bodies[i].mass * mag;
            bodies[j].vz += dz * bodies[i].mass * mag;
        }
    }

    for (bodies) |*body| {
        body.x += dt * body.vx;
        body.y += dt * body.vy;
        body.z += dt * body.vz;
    }
}

fn energy(bodies: *const [5]Body) f64 {
    var e: f64 = 0.0;
    for (bodies, 0..) |body, i| {
        e += 0.5 * body.mass * (body.vx * body.vx + body.vy * body.vy + body.vz * body.vz);
        for (i + 1..5) |j| {
            const dx = body.x - bodies[j].x;
            const dy = body.y - bodies[j].y;
            const dz = body.z - bodies[j].z;
            e -= (body.mass * bodies[j].mass) / @sqrt(dx * dx + dy * dy + dz * dz);
        }
    }
    return e;
}

fn offsetMomentum(bodies: *[5]Body) void {
    var px: f64 = 0.0;
    var py: f64 = 0.0;
    var pz: f64 = 0.0;
    for (bodies) |body| {
        px += body.vx * body.mass;
        py += body.vy * body.mass;
        pz += body.vz * body.mass;
    }
    bodies[0].vx = -px / SOLAR_MASS;
    bodies[0].vy = -py / SOLAR_MASS;
    bodies[0].vz = -pz / SOLAR_MASS;
}

pub fn main() !void {
    const args = try std.process.argsAlloc(std.heap.page_allocator);
    defer std.process.argsFree(std.heap.page_allocator, args);
    const n = try std.fmt.parseInt(u32, args[1], 10);

    var bodies = [_]Body{
        .{ .x = 0, .y = 0, .z = 0, .vx = 0, .vy = 0, .vz = 0, .mass = SOLAR_MASS },
        .{ .x = 4.84143144246472090e+00, .y = -1.16032004402742839e+00, .z = -1.03622044471123109e-01,
           .vx = 1.66007664274403694e-03 * DAYS_PER_YEAR, .vy = 7.69901118419740425e-03 * DAYS_PER_YEAR,
           .vz = -6.90460016972063023e-05 * DAYS_PER_YEAR, .mass = 9.54791938424326609e-04 * SOLAR_MASS },
        .{ .x = 8.34336671824457987e+00, .y = 4.12479856412430479e+00, .z = -4.03523417114321381e-01,
           .vx = -2.76742510726862411e-03 * DAYS_PER_YEAR, .vy = 4.99852801234917238e-03 * DAYS_PER_YEAR,
           .vz = 2.30417297573763929e-05 * DAYS_PER_YEAR, .mass = 2.85885980666130812e-04 * SOLAR_MASS },
        .{ .x = 1.28943695621391310e+01, .y = -1.51111514016986312e+01, .z = -2.23307578892655734e-01,
           .vx = 2.96460137564761618e-03 * DAYS_PER_YEAR, .vy = 2.37847173959480950e-03 * DAYS_PER_YEAR,
           .vz = -2.96589568540237556e-05 * DAYS_PER_YEAR, .mass = 4.36624404335156298e-05 * SOLAR_MASS },
        .{ .x = 1.53796971148509165e+01, .y = -2.59193146099879641e+01, .z = 1.79258772950371181e-01,
           .vx = 2.68067772490389322e-03 * DAYS_PER_YEAR, .vy = 1.62824170038242295e-03 * DAYS_PER_YEAR,
           .vz = -9.51592254519715870e-05 * DAYS_PER_YEAR, .mass = 5.15138902046611451e-05 * SOLAR_MASS },
    };

    offsetMomentum(&bodies);
    std.debug.print("{d:.9}\n", .{energy(&bodies)});
    for (0..n) |_| advance(&bodies, 0.01);
    std.debug.print("{d:.9}\n", .{energy(&bodies)});
}

Rust

use std::f64::consts::PI;

const SOLAR_MASS: f64 = 4.0 * PI * PI;
const DAYS_PER_YEAR: f64 = 365.24;

#[derive(Clone, Copy)]
struct Body {
    x: f64, y: f64, z: f64,
    vx: f64, vy: f64, vz: f64,
    mass: f64,
}

fn advance(bodies: &mut [Body; 5], dt: f64) {
    for i in 0..5 {
        for j in (i + 1)..5 {
            let dx = bodies[i].x - bodies[j].x;
            let dy = bodies[i].y - bodies[j].y;
            let dz = bodies[i].z - bodies[j].z;
            let dsq = dx * dx + dy * dy + dz * dz;
            let mag = dt / (dsq * dsq.sqrt());

            bodies[i].vx -= dx * bodies[j].mass * mag;
            bodies[i].vy -= dy * bodies[j].mass * mag;
            bodies[i].vz -= dz * bodies[j].mass * mag;

            bodies[j].vx += dx * bodies[i].mass * mag;
            bodies[j].vy += dy * bodies[i].mass * mag;
            bodies[j].vz += dz * bodies[i].mass * mag;
        }
    }

    for body in bodies.iter_mut() {
        body.x += dt * body.vx;
        body.y += dt * body.vy;
        body.z += dt * body.vz;
    }
}

fn energy(bodies: &[Body; 5]) -> f64 {
    let mut e = 0.0;
    for (i, body) in bodies.iter().enumerate() {
        e += 0.5 * body.mass * (body.vx * body.vx + body.vy * body.vy + body.vz * body.vz);
        for j in (i + 1)..5 {
            let dx = body.x - bodies[j].x;
            let dy = body.y - bodies[j].y;
            let dz = body.z - bodies[j].z;
            e -= (body.mass * bodies[j].mass) / (dx * dx + dy * dy + dz * dz).sqrt();
        }
    }
    e
}

fn offset_momentum(bodies: &mut [Body; 5]) {
    let (mut px, mut py, mut pz) = (0.0, 0.0, 0.0);
    for body in bodies.iter() {
        px += body.vx * body.mass;
        py += body.vy * body.mass;
        pz += body.vz * body.mass;
    }
    bodies[0].vx = -px / SOLAR_MASS;
    bodies[0].vy = -py / SOLAR_MASS;
    bodies[0].vz = -pz / SOLAR_MASS;
}

fn main() {
    let n: u32 = std::env::args().nth(1).unwrap().parse().unwrap();

    let mut bodies = [
        Body { x: 0.0, y: 0.0, z: 0.0, vx: 0.0, vy: 0.0, vz: 0.0, mass: SOLAR_MASS },
        Body { x: 4.84143144246472090e+00, y: -1.16032004402742839e+00, z: -1.03622044471123109e-01,
               vx: 1.66007664274403694e-03 * DAYS_PER_YEAR, vy: 7.69901118419740425e-03 * DAYS_PER_YEAR,
               vz: -6.90460016972063023e-05 * DAYS_PER_YEAR, mass: 9.54791938424326609e-04 * SOLAR_MASS },
        Body { x: 8.34336671824457987e+00, y: 4.12479856412430479e+00, z: -4.03523417114321381e-01,
               vx: -2.76742510726862411e-03 * DAYS_PER_YEAR, vy: 4.99852801234917238e-03 * DAYS_PER_YEAR,
               vz: 2.30417297573763929e-05 * DAYS_PER_YEAR, mass: 2.85885980666130812e-04 * SOLAR_MASS },
        Body { x: 1.28943695621391310e+01, y: -1.51111514016986312e+01, z: -2.23307578892655734e-01,
               vx: 2.96460137564761618e-03 * DAYS_PER_YEAR, vy: 2.37847173959480950e-03 * DAYS_PER_YEAR,
               vz: -2.96589568540237556e-05 * DAYS_PER_YEAR, mass: 4.36624404335156298e-05 * SOLAR_MASS },
        Body { x: 1.53796971148509165e+01, y: -2.59193146099879641e+01, z: 1.79258772950371181e-01,
               vx: 2.68067772490389322e-03 * DAYS_PER_YEAR, vy: 1.62824170038242295e-03 * DAYS_PER_YEAR,
               vz: -9.51592254519715870e-05 * DAYS_PER_YEAR, mass: 5.15138902046611451e-05 * SOLAR_MASS },
    ];

    offset_momentum(&mut bodies);
    println!("{:.9}", energy(&bodies));
    for _ in 0..n {
        advance(&mut bodies, 0.01);
    }
    println!("{:.9}", energy(&bodies));
}

The benchmark is in the repository at tests/regression/900_EXAMPLES_SHOWCASE/910_LANGUAGE_SHOOTOUT/nbody/.