This Is the Whole Inner Loop
kernel:pairwise matches hand-optimized C
This Is the Whole Inner Loop
std.kernel:pairwise {
const dx = k.x - k.other.x;
const dy = k.y - k.other.y;
const dz = k.z - k.other.z;
const dsq = dx*dx + dy*dy + dz*dz;
const mag = DT / (dsq * @sqrt(dsq));
k.vx -= dx * k.other.mass * mag;
k.vy -= dy * k.other.mass * mag;
k.vz -= dz * k.other.mass * mag;
k.other.vx += dx * k.mass * mag;
k.other.vy += dy * k.mass * mag;
k.other.vz += dz * k.mass * mag;
} Twelve lines. This is the gravitational force calculation for the n-body simulation — a standard benchmark for data-parallel computation. Five bodies, all unique pairs interact every timestep, 50 million iterations.
You write the physics. The compiler handles the rest.
Results
| Mean | Min | Max | |
|---|---|---|---|
| Koru (kernel fused) | 1.179s | 1.163s | 1.215s |
| C (clang -O3) | 1.322s | 1.314s | 1.331s |
| Zig (-O ReleaseFast) | 1.328s | 1.314s | 1.379s |
| Rust | 1.358s | 1.350s | 1.364s |
| SBCL | 2.399s | 2.391s | 2.407s |
| GHC | 3.034s | 3.016s | 3.058s |
Koru is faster than standard C. The only thing that beats it is scalarized C without fast-math (1.162s) — a version that required manual restructuring of the data layout.
What You Didn’t Have to Write
To match Koru’s performance in C, you need to:
- Write the nested loop structure yourself —
for (i = 0; i < 5; i++) { for (j = i + 1; j < 5; j++) { ... } } - Add
restrictto convince the compiler pointers don’t alias — and be right about it - Enable
-ffast-math— accepting the tradeoffs that come with it - Use
dsq * sqrt(dsq)instead ofpow(distance, 3)— a micro-optimization you have to know about
In Zig, you need noalias, @setFloatMode(.optimized), and fixed *[5]Body so the compiler knows the iteration count.
In Koru, you write kernel:pairwise { ... } and the physics inside it. The compiler:
- Generates the
i < jloop structure - Emits
noaliasunconditionally — it’s always correct becausepairwiseguarantees distinct elements - Knows the kernel size from
kernel:init
The proof obligation belongs on the side that has the information. pairwise has it by construction.
The Full Program
Here’s the complete Koru implementation using the fused kernel operations:
const std = @import("std");
~import "$std/kernel"
~import "$std/io"
~import "$std/control"
const PI: f64 = 3.141592653589793;
const SOLAR_MASS: f64 = 4.0 * PI * PI;
const DAYS_PER_YEAR: f64 = 365.24;
const DT: f64 = 0.01;
var iterations: usize = 50000000;
~event parse_args {}
~proc parse_args {
const args = std.process.argsAlloc(std.heap.page_allocator) catch unreachable;
defer std.process.argsFree(std.heap.page_allocator, args);
if (args.len >= 2) {
iterations = std.fmt.parseInt(usize, args[1], 10) catch unreachable;
}
}
~std.kernel:shape(Body) {
x: f64, y: f64, z: f64,
vx: f64, vy: f64, vz: f64,
mass: f64,
}
~parse_args()
~std.kernel:init(Body) {
{ x: 0, y: 0, z: 0, vx: 0, vy: 0, vz: 0, mass: SOLAR_MASS },
{ x: 4.84143144246472090e+00, y: -1.16032004402742839e+00, z: -1.03622044471123109e-01,
vx: 1.66007664274403694e-03 * DAYS_PER_YEAR, vy: 7.69901118419740425e-03 * DAYS_PER_YEAR,
vz: -6.90460016972063023e-05 * DAYS_PER_YEAR, mass: 9.54791938424326609e-04 * SOLAR_MASS },
{ x: 8.34336671824457987e+00, y: 4.12479856412430479e+00, z: -4.03523417114321381e-01,
vx: -2.76742510726862411e-03 * DAYS_PER_YEAR, vy: 4.99852801234917238e-03 * DAYS_PER_YEAR,
vz: 2.30417297573763929e-05 * DAYS_PER_YEAR, mass: 2.85885980666130812e-04 * SOLAR_MASS },
{ x: 1.28943695621391310e+01, y: -1.51111514016986312e+01, z: -2.23307578892655734e-01,
vx: 2.96460137564761618e-03 * DAYS_PER_YEAR, vy: 2.37847173959480950e-03 * DAYS_PER_YEAR,
vz: -2.96589568540237556e-05 * DAYS_PER_YEAR, mass: 4.36624404335156298e-05 * SOLAR_MASS },
{ x: 1.53796971148509165e+01, y: -2.59193146099879641e+01, z: 1.79258772950371181e-01,
vx: 2.68067772490389322e-03 * DAYS_PER_YEAR, vy: 1.62824170038242295e-03 * DAYS_PER_YEAR,
vz: -9.51592254519715870e-05 * DAYS_PER_YEAR, mass: 5.15138902046611451e-05 * SOLAR_MASS },
}
| kernel k |>
std.kernel:step(0..iterations)
| step |>
std.kernel:pairwise {
const dx = k.x - k.other.x;
const dy = k.y - k.other.y;
const dz = k.z - k.other.z;
const dsq = dx*dx + dy*dy + dz*dz;
const mag = DT / (dsq * @sqrt(dsq));
k.vx -= dx * k.other.mass * mag;
k.vy -= dy * k.other.mass * mag;
k.vz -= dz * k.other.mass * mag;
k.other.vx += dx * k.mass * mag;
k.other.vy += dy * k.mass * mag;
k.other.vz += dz * k.mass * mag;
}
|> std.kernel:self {
k.x += DT * k.vx;
k.y += DT * k.vy;
k.z += DT * k.vz;
}
| computed c |>
capture({ energy: @as(f64, 0) })
| as acc |>
for(0..5)
| each i |>
captured { energy: acc.energy + 0.5*c[i].mass*(c[i].vx*c[i].vx+c[i].vy*c[i].vy+c[i].vz*c[i].vz) }
|> for(i+1..5)
| each j |>
captured { energy: acc.energy - c[i].mass*c[j].mass / @sqrt((c[i].x-c[j].x)*(c[i].x-c[j].x)+(c[i].y-c[j].y)*(c[i].y-c[j].y)+(c[i].z-c[j].z)*(c[i].z-c[j].z)) }
| captured final |>
std.io:print.blk {
{{ final.energy:d:.9 }}
} The key constructs:
kernel:shape(Body)— declares the data layoutkernel:init(Body)— initializes the kernel with 5 bodies, returns a handlekkernel:step(0..iterations)— runs the simulation loop, fused with the kernel ops| step |>— the step branch, no binding (GPU-portable, no iteration identity)kernel:pairwise { ... }— the force calculation, applied to all unique pairskernel:self { ... }— position updates, applied to each body| each i |>— for regularforloops, iteration with the loop variable bound toi
What the Compiler Generates
The kernel:pairwise transform emits this:
const __koru_pair = struct {
inline fn call(
noalias __koru_pair_self: *Body,
noalias __koru_pair_other: *Body,
) void {
// your pairwise body, unchanged
}
};
for (0..N) |i| {
for (i + 1..N) |j| {
__koru_pair.call(&ptr[i], &ptr[j]);
}
} noalias is unconditional because the loop structure guarantees i < j — the pointers are always distinct. In hand-written code, you add restrict after convincing yourself the invariant holds. Here, the transform has the proof by construction.
Appendix: Reference Implementations
For transparency, here’s exactly what we benchmarked against.
C (clang -O3 -ffast-math -march=native)
From the Computer Language Benchmarks Game, contributed by Christoph Bauer. License: Revised BSD.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define pi 3.141592653589793
#define solar_mass (4 * pi * pi)
#define days_per_year 365.24
struct planet {
double x, y, z;
double vx, vy, vz;
double mass;
};
static inline void advance(struct planet * restrict bodies, double dt)
{
int i, j;
for (i = 0; i < 5; i++) {
for (j = i + 1; j < 5; j++) {
double dx = bodies[i].x - bodies[j].x;
double dy = bodies[i].y - bodies[j].y;
double dz = bodies[i].z - bodies[j].z;
double dsq = dx * dx + dy * dy + dz * dz;
double mag = dt / (dsq * sqrt(dsq));
bodies[i].vx -= dx * bodies[j].mass * mag;
bodies[i].vy -= dy * bodies[j].mass * mag;
bodies[i].vz -= dz * bodies[j].mass * mag;
bodies[j].vx += dx * bodies[i].mass * mag;
bodies[j].vy += dy * bodies[i].mass * mag;
bodies[j].vz += dz * bodies[i].mass * mag;
}
}
for (i = 0; i < 5; i++) {
bodies[i].x += dt * bodies[i].vx;
bodies[i].y += dt * bodies[i].vy;
bodies[i].z += dt * bodies[i].vz;
}
}
double energy(int nbodies, struct planet * bodies)
{
double e;
int i, j;
e = 0.0;
for (i = 0; i < nbodies; i++) {
struct planet * b = &(bodies[i]);
e += 0.5 * b->mass * (b->vx * b->vx + b->vy * b->vy + b->vz * b->vz);
for (j = i + 1; j < nbodies; j++) {
struct planet * b2 = &(bodies[j]);
double dx = b->x - b2->x;
double dy = b->y - b2->y;
double dz = b->z - b2->z;
double distance = sqrt(dx * dx + dy * dy + dz * dz);
e -= (b->mass * b2->mass) / distance;
}
}
return e;
}
void offset_momentum(int nbodies, struct planet * bodies)
{
double px = 0.0, py = 0.0, pz = 0.0;
int i;
for (i = 0; i < nbodies; i++) {
px += bodies[i].vx * bodies[i].mass;
py += bodies[i].vy * bodies[i].mass;
pz += bodies[i].vz * bodies[i].mass;
}
bodies[0].vx = - px / solar_mass;
bodies[0].vy = - py / solar_mass;
bodies[0].vz = - pz / solar_mass;
}
#define NBODIES 5
struct planet bodies[NBODIES] = {
{ 0, 0, 0, 0, 0, 0, solar_mass },
{
4.84143144246472090e+00,
-1.16032004402742839e+00,
-1.03622044471123109e-01,
1.66007664274403694e-03 * days_per_year,
7.69901118419740425e-03 * days_per_year,
-6.90460016972063023e-05 * days_per_year,
9.54791938424326609e-04 * solar_mass
},
{
8.34336671824457987e+00,
4.12479856412430479e+00,
-4.03523417114321381e-01,
-2.76742510726862411e-03 * days_per_year,
4.99852801234917238e-03 * days_per_year,
2.30417297573763929e-05 * days_per_year,
2.85885980666130812e-04 * solar_mass
},
{
1.28943695621391310e+01,
-1.51111514016986312e+01,
-2.23307578892655734e-01,
2.96460137564761618e-03 * days_per_year,
2.37847173959480950e-03 * days_per_year,
-2.96589568540237556e-05 * days_per_year,
4.36624404335156298e-05 * solar_mass
},
{
1.53796971148509165e+01,
-2.59193146099879641e+01,
1.79258772950371181e-01,
2.68067772490389322e-03 * days_per_year,
1.62824170038242295e-03 * days_per_year,
-9.51592254519715870e-05 * days_per_year,
5.15138902046611451e-05 * solar_mass
}
};
int main(int argc, char ** argv)
{
int n = atoi(argv[1]);
int i;
offset_momentum(NBODIES, bodies);
printf ("%.9f\n", energy(NBODIES, bodies));
for (i = 1; i <= n; i++)
advance(bodies, 0.01);
printf ("%.9f\n", energy(NBODIES, bodies));
return 0;
} Zig (noalias *[5]Body, -O ReleaseFast)
Ported from the Benchmarks Game. Every optimization applied: @setFloatMode(.optimized) for fast-math, dsq * sqrt(dsq) instead of distance³, fixed *[5]Body so the compiler knows the iteration count, and noalias to match C’s restrict.
comptime {
@setFloatMode(.optimized);
}
const std = @import("std");
const PI = 3.141592653589793;
const SOLAR_MASS = 4 * PI * PI;
const DAYS_PER_YEAR = 365.24;
const Body = struct {
x: f64, y: f64, z: f64,
vx: f64, vy: f64, vz: f64,
mass: f64,
};
fn advance(noalias bodies: *[5]Body, dt: f64) void {
for (0..5) |i| {
for (i + 1..5) |j| {
const dx = bodies[i].x - bodies[j].x;
const dy = bodies[i].y - bodies[j].y;
const dz = bodies[i].z - bodies[j].z;
const dsq = dx * dx + dy * dy + dz * dz;
const mag = dt / (dsq * @sqrt(dsq));
bodies[i].vx -= dx * bodies[j].mass * mag;
bodies[i].vy -= dy * bodies[j].mass * mag;
bodies[i].vz -= dz * bodies[j].mass * mag;
bodies[j].vx += dx * bodies[i].mass * mag;
bodies[j].vy += dy * bodies[i].mass * mag;
bodies[j].vz += dz * bodies[i].mass * mag;
}
}
for (bodies) |*body| {
body.x += dt * body.vx;
body.y += dt * body.vy;
body.z += dt * body.vz;
}
}
fn energy(bodies: *const [5]Body) f64 {
var e: f64 = 0.0;
for (bodies, 0..) |body, i| {
e += 0.5 * body.mass * (body.vx * body.vx + body.vy * body.vy + body.vz * body.vz);
for (i + 1..5) |j| {
const dx = body.x - bodies[j].x;
const dy = body.y - bodies[j].y;
const dz = body.z - bodies[j].z;
e -= (body.mass * bodies[j].mass) / @sqrt(dx * dx + dy * dy + dz * dz);
}
}
return e;
}
fn offsetMomentum(bodies: *[5]Body) void {
var px: f64 = 0.0;
var py: f64 = 0.0;
var pz: f64 = 0.0;
for (bodies) |body| {
px += body.vx * body.mass;
py += body.vy * body.mass;
pz += body.vz * body.mass;
}
bodies[0].vx = -px / SOLAR_MASS;
bodies[0].vy = -py / SOLAR_MASS;
bodies[0].vz = -pz / SOLAR_MASS;
}
pub fn main() !void {
const args = try std.process.argsAlloc(std.heap.page_allocator);
defer std.process.argsFree(std.heap.page_allocator, args);
const n = try std.fmt.parseInt(u32, args[1], 10);
var bodies = [_]Body{
.{ .x = 0, .y = 0, .z = 0, .vx = 0, .vy = 0, .vz = 0, .mass = SOLAR_MASS },
.{ .x = 4.84143144246472090e+00, .y = -1.16032004402742839e+00, .z = -1.03622044471123109e-01,
.vx = 1.66007664274403694e-03 * DAYS_PER_YEAR, .vy = 7.69901118419740425e-03 * DAYS_PER_YEAR,
.vz = -6.90460016972063023e-05 * DAYS_PER_YEAR, .mass = 9.54791938424326609e-04 * SOLAR_MASS },
.{ .x = 8.34336671824457987e+00, .y = 4.12479856412430479e+00, .z = -4.03523417114321381e-01,
.vx = -2.76742510726862411e-03 * DAYS_PER_YEAR, .vy = 4.99852801234917238e-03 * DAYS_PER_YEAR,
.vz = 2.30417297573763929e-05 * DAYS_PER_YEAR, .mass = 2.85885980666130812e-04 * SOLAR_MASS },
.{ .x = 1.28943695621391310e+01, .y = -1.51111514016986312e+01, .z = -2.23307578892655734e-01,
.vx = 2.96460137564761618e-03 * DAYS_PER_YEAR, .vy = 2.37847173959480950e-03 * DAYS_PER_YEAR,
.vz = -2.96589568540237556e-05 * DAYS_PER_YEAR, .mass = 4.36624404335156298e-05 * SOLAR_MASS },
.{ .x = 1.53796971148509165e+01, .y = -2.59193146099879641e+01, .z = 1.79258772950371181e-01,
.vx = 2.68067772490389322e-03 * DAYS_PER_YEAR, .vy = 1.62824170038242295e-03 * DAYS_PER_YEAR,
.vz = -9.51592254519715870e-05 * DAYS_PER_YEAR, .mass = 5.15138902046611451e-05 * SOLAR_MASS },
};
offsetMomentum(&bodies);
std.debug.print("{d:.9}\n", .{energy(&bodies)});
for (0..n) |_| advance(&bodies, 0.01);
std.debug.print("{d:.9}\n", .{energy(&bodies)});
} Rust
use std::f64::consts::PI;
const SOLAR_MASS: f64 = 4.0 * PI * PI;
const DAYS_PER_YEAR: f64 = 365.24;
#[derive(Clone, Copy)]
struct Body {
x: f64, y: f64, z: f64,
vx: f64, vy: f64, vz: f64,
mass: f64,
}
fn advance(bodies: &mut [Body; 5], dt: f64) {
for i in 0..5 {
for j in (i + 1)..5 {
let dx = bodies[i].x - bodies[j].x;
let dy = bodies[i].y - bodies[j].y;
let dz = bodies[i].z - bodies[j].z;
let dsq = dx * dx + dy * dy + dz * dz;
let mag = dt / (dsq * dsq.sqrt());
bodies[i].vx -= dx * bodies[j].mass * mag;
bodies[i].vy -= dy * bodies[j].mass * mag;
bodies[i].vz -= dz * bodies[j].mass * mag;
bodies[j].vx += dx * bodies[i].mass * mag;
bodies[j].vy += dy * bodies[i].mass * mag;
bodies[j].vz += dz * bodies[i].mass * mag;
}
}
for body in bodies.iter_mut() {
body.x += dt * body.vx;
body.y += dt * body.vy;
body.z += dt * body.vz;
}
}
fn energy(bodies: &[Body; 5]) -> f64 {
let mut e = 0.0;
for (i, body) in bodies.iter().enumerate() {
e += 0.5 * body.mass * (body.vx * body.vx + body.vy * body.vy + body.vz * body.vz);
for j in (i + 1)..5 {
let dx = body.x - bodies[j].x;
let dy = body.y - bodies[j].y;
let dz = body.z - bodies[j].z;
e -= (body.mass * bodies[j].mass) / (dx * dx + dy * dy + dz * dz).sqrt();
}
}
e
}
fn offset_momentum(bodies: &mut [Body; 5]) {
let (mut px, mut py, mut pz) = (0.0, 0.0, 0.0);
for body in bodies.iter() {
px += body.vx * body.mass;
py += body.vy * body.mass;
pz += body.vz * body.mass;
}
bodies[0].vx = -px / SOLAR_MASS;
bodies[0].vy = -py / SOLAR_MASS;
bodies[0].vz = -pz / SOLAR_MASS;
}
fn main() {
let n: u32 = std::env::args().nth(1).unwrap().parse().unwrap();
let mut bodies = [
Body { x: 0.0, y: 0.0, z: 0.0, vx: 0.0, vy: 0.0, vz: 0.0, mass: SOLAR_MASS },
Body { x: 4.84143144246472090e+00, y: -1.16032004402742839e+00, z: -1.03622044471123109e-01,
vx: 1.66007664274403694e-03 * DAYS_PER_YEAR, vy: 7.69901118419740425e-03 * DAYS_PER_YEAR,
vz: -6.90460016972063023e-05 * DAYS_PER_YEAR, mass: 9.54791938424326609e-04 * SOLAR_MASS },
Body { x: 8.34336671824457987e+00, y: 4.12479856412430479e+00, z: -4.03523417114321381e-01,
vx: -2.76742510726862411e-03 * DAYS_PER_YEAR, vy: 4.99852801234917238e-03 * DAYS_PER_YEAR,
vz: 2.30417297573763929e-05 * DAYS_PER_YEAR, mass: 2.85885980666130812e-04 * SOLAR_MASS },
Body { x: 1.28943695621391310e+01, y: -1.51111514016986312e+01, z: -2.23307578892655734e-01,
vx: 2.96460137564761618e-03 * DAYS_PER_YEAR, vy: 2.37847173959480950e-03 * DAYS_PER_YEAR,
vz: -2.96589568540237556e-05 * DAYS_PER_YEAR, mass: 4.36624404335156298e-05 * SOLAR_MASS },
Body { x: 1.53796971148509165e+01, y: -2.59193146099879641e+01, z: 1.79258772950371181e-01,
vx: 2.68067772490389322e-03 * DAYS_PER_YEAR, vy: 1.62824170038242295e-03 * DAYS_PER_YEAR,
vz: -9.51592254519715870e-05 * DAYS_PER_YEAR, mass: 5.15138902046611451e-05 * SOLAR_MASS },
];
offset_momentum(&mut bodies);
println!("{:.9}", energy(&bodies));
for _ in 0..n {
advance(&mut bodies, 0.01);
}
println!("{:.9}", energy(&bodies));
} The benchmark is in the repository at tests/regression/900_EXAMPLES_SHOWCASE/910_LANGUAGE_SHOOTOUT/nbody/.