Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.zig.zon
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
.name = .zmath,
.fingerprint = 0xfd23d422bd223cc2,
.version = "0.11.0-dev",
.minimum_zig_version = "0.15.1",
.minimum_zig_version = "0.16.0",
.paths = .{
"build.zig",
"build.zig.zon",
Expand Down
111 changes: 51 additions & 60 deletions src/benchmark.zig
Original file line number Diff line number Diff line change
Expand Up @@ -49,43 +49,41 @@
/// wave benchmark (SOA) - scalar version: 3.7832s, zmath version: 0.3642s
///
/// -------------------------------------------------------------------------------------------------
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer _ = gpa.deinit();
const allocator = gpa.allocator();
pub fn main(init: std.process.Init) !void {
const allocator = init.gpa;
const io = init.io;

// m = mul(ma, mb); data set fits in L1 cache; AOS data layout.
try mat4MulBenchmark(allocator, 100_000);
try mat4MulBenchmark(allocator, io, 100_000);

// v = 0.01 * cross3(va, vb) + vec3(1.0); data set fits in L1 cache; AOS data layout.
try cross3ScaleBiasBenchmark(allocator, 10_000);
try cross3ScaleBiasBenchmark(allocator, io, 10_000);

// v = dot3(va, vb) * (0.1 * cross3(va, vb) + vec3(1.0)); data set fits in L1 cache; AOS data layout.
try cross3Dot3ScaleBiasBenchmark(allocator, 10_000);
try cross3Dot3ScaleBiasBenchmark(allocator, io, 10_000);

// q = qmul(qa, qb); data set fits in L1 cache; AOS data layout.
try quatBenchmark(allocator, 10_000);
try quatBenchmark(allocator, io, 10_000);

// d = sqrt(x * x + z * z); y = sin(d - t); SOA layout.
try waveBenchmark(allocator, 1_000);
try waveBenchmark(allocator, io, 1_000);
}

const std = @import("std");
const time = std.time;
const Timer = time.Timer;
const Clock = std.Io.Clock;
const zm = @import("zmath");

var prng = std.Random.DefaultPrng.init(0);
const random = prng.random();

noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, io: std.Io, comptime count: comptime_int) !void {
std.debug.print("\n", .{});
std.debug.print("{s:>42} - ", .{"matrix mul benchmark (AOS)"});

var data0 = try std.ArrayList([16]f32).initCapacity(allocator, 64);
defer data0.deinit();
defer data0.deinit(allocator);
var data1 = try std.ArrayList([16]f32).initCapacity(allocator, 64);
defer data1.deinit();
defer data1.deinit(allocator);

var i: usize = 0;
while (i < 64) : (i += 1) {
Expand Down Expand Up @@ -118,8 +116,7 @@ noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: compt

{
i = 0;
var timer = try Timer.start();
const start = timer.lap();
const start = Clock.now(.awake, io);
while (i < count) : (i += 1) {
for (data1.items) |b| {
for (data0.items) |a| {
Expand All @@ -145,16 +142,15 @@ noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: compt
}
}
}
const end = timer.read();
const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
const end = Clock.now(.awake, io);
const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s;

std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
}

{
i = 0;
var timer = try Timer.start();
const start = timer.lap();
const start = Clock.now(.awake, io);
while (i < count) : (i += 1) {
for (data1.items) |b| {
for (data0.items) |a| {
Expand All @@ -165,20 +161,23 @@ noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: compt
}
}
}
const end = timer.read();
const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
const end = Clock.now(
.awake,
io,
);
const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s;

std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
}
}

noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, io: std.Io, comptime count: comptime_int) !void {
std.debug.print("{s:>42} - ", .{"cross3, scale, bias benchmark (AOS)"});

var data0 = try std.ArrayList([3]f32).initCapacity(allocator, 256);
defer data0.deinit();
defer data0.deinit(allocator);
var data1 = try std.ArrayList([3]f32).initCapacity(allocator, 256);
defer data1.deinit();
defer data1.deinit(allocator);

var i: usize = 0;
while (i < 256) : (i += 1) {
Expand All @@ -201,8 +200,7 @@ noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime coun

{
i = 0;
var timer = try Timer.start();
const start = timer.lap();
const start = Clock.now(.awake, io);
while (i < count) : (i += 1) {
for (data1.items) |b| {
for (data0.items) |a| {
Expand All @@ -215,16 +213,15 @@ noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime coun
}
}
}
const end = timer.read();
const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
const end = Clock.now(.awake, io);
const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s;

std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
}

{
i = 0;
var timer = try Timer.start();
const start = timer.lap();
const start = Clock.now(.awake, io);
while (i < count) : (i += 1) {
for (data1.items) |b| {
for (data0.items) |a| {
Expand All @@ -235,14 +232,14 @@ noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime coun
}
}
}
const end = timer.read();
const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
const end = Clock.now(.awake, io);
const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s;

std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
}
}

noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, io: std.Io, comptime count: comptime_int) !void {
std.debug.print("{s:>42} - ", .{"cross3, dot3, scale, bias benchmark (AOS)"});

var data0 = try std.ArrayList([3]f32).initCapacity(allocator, 256);
Expand Down Expand Up @@ -271,8 +268,7 @@ noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime

{
i = 0;
var timer = try Timer.start();
const start = timer.lap();
const start = Clock.now(.awake, io);
while (i < count) : (i += 1) {
for (data1.items) |b| {
for (data0.items) |a| {
Expand All @@ -286,16 +282,15 @@ noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime
}
}
}
const end = timer.read();
const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
const end = Clock.now(.awake, io);
const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s;

std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
}

{
i = 0;
var timer = try Timer.start();
const start = timer.lap();
const start = Clock.now(.awake, io);
while (i < count) : (i += 1) {
for (data1.items) |b| {
for (data0.items) |a| {
Expand All @@ -306,14 +301,14 @@ noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime
}
}
}
const end = timer.read();
const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
const end = Clock.now(.awake, io);
const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s;

std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
}
}

noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
noinline fn quatBenchmark(allocator: std.mem.Allocator, io: std.Io, comptime count: comptime_int) !void {
std.debug.print("{s:>42} - ", .{"quaternion mul benchmark (AOS)"});

var data0 = try std.ArrayList([4]f32).initCapacity(allocator, 256);
Expand Down Expand Up @@ -342,8 +337,7 @@ noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime

{
i = 0;
var timer = try Timer.start();
const start = timer.lap();
const start = Clock.now(.awake, io);
while (i < count) : (i += 1) {
for (data1.items) |b| {
for (data0.items) |a| {
Expand All @@ -357,16 +351,15 @@ noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime
}
}
}
const end = timer.read();
const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
const end = Clock.now(.awake, io);
const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s;

std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
}

{
i = 0;
var timer = try Timer.start();
const start = timer.lap();
const start = Clock.now(.awake, io);
while (i < count) : (i += 1) {
for (data1.items) |b| {
for (data0.items) |a| {
Expand All @@ -377,14 +370,14 @@ noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime
}
}
}
const end = timer.read();
const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
const end = Clock.now(.awake, io);
const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s;

std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
}
}

noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void {
noinline fn waveBenchmark(allocator: std.mem.Allocator, io: std.Io, comptime count: comptime_int) !void {
_ = allocator;
std.debug.print("{s:>42} - ", .{"wave benchmark (SOA)"});

Expand All @@ -394,8 +387,7 @@ noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime

const scale: f32 = 0.05;

var timer = try Timer.start();
const start = timer.lap();
const start = Clock.now(.awake, io);

var iter: usize = 0;
while (iter < count) : (iter += 1) {
Expand Down Expand Up @@ -428,8 +420,8 @@ noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime
}
t += 0.001;
}
const end = timer.read();
const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
const end = Clock.now(.awake, io);
const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s;

std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s});
}
Expand All @@ -445,8 +437,7 @@ noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime

const scale: f32 = 0.05;

var timer = try Timer.start();
const start = timer.lap();
const start = Clock.now(.awake, io);

var iter: usize = 0;
while (iter < count) : (iter += 1) {
Expand All @@ -469,8 +460,8 @@ noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime
}
vt += zm.splat(T, 0.001);
}
const end = timer.read();
const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
const end = Clock.now(.awake, io);
const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s;

std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s});
}
Expand Down
28 changes: 22 additions & 6 deletions src/root.zig
Original file line number Diff line number Diff line change
Expand Up @@ -787,8 +787,15 @@ test "zmath.maxFast" {
}

pub inline fn min(v0: anytype, v1: anytype) @TypeOf(v0, v1) {
// This will handle inf & nan
return @min(v0, v1); // minps, cmpunordps, andps, andnps, orps
const T = @TypeOf(v0, v1);
const Child = std.meta.Child(T);
// v != v is true only when v is NaN
const nan0 = v0 != v0;
const nan1 = v1 != v1;
// if v0 is NaN, pick v1
// else if v1 is NaN, pick v0
// else pick normal @min
return @select(Child, nan0, v1, @select(Child, nan1, v0, @min(v0, v1)));
Comment on lines +791 to +798
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

std.meta.Child(T) will fail to compile when T is a scalar type (e.g. f32, i32), even though this function’s anytype signature previously allowed scalar @min usage. Consider deriving the element type only for vectors (e.g. via @typeInfo(T)), and otherwise using T itself as the @select type parameter so scalar callers still compile.

Suggested change
const Child = std.meta.Child(T);
// v != v is true only when v is NaN
const nan0 = v0 != v0;
const nan1 = v1 != v1;
// if v0 is NaN, pick v1
// else if v1 is NaN, pick v0
// else pick normal @min
return @select(Child, nan0, v1, @select(Child, nan1, v0, @min(v0, v1)));
const SelectT = switch (@typeInfo(T)) {
.Vector => |info| info.child,
else => T,
};
// v != v is true only when v is NaN
const nan0 = v0 != v0;
const nan1 = v1 != v1;
// if v0 is NaN, pick v1
// else if v1 is NaN, pick v0
// else pick normal @min
return @select(SelectT, nan0, v1, @select(SelectT, nan1, v0, @min(v0, v1)));

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think supporting scalar types here is actually important.

}
test "zmath.min" {
// Calling math.inf causes test to fail!
Expand Down Expand Up @@ -831,8 +838,15 @@ test "zmath.min" {
}

pub inline fn max(v0: anytype, v1: anytype) @TypeOf(v0, v1) {
// This will handle inf & nan
return @max(v0, v1); // maxps, cmpunordps, andps, andnps, orps
const T = @TypeOf(v0, v1);
const Child = std.meta.Child(T);
// v != v is true only when v is NaN
Comment on lines +841 to +843
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same issue as min: using std.meta.Child(T) will not compile for scalar T, turning max into a vector-only API despite its generic signature. Use a conditional element-type derivation for vectors and fall back to T for scalar types when calling @select.

Copilot uses AI. Check for mistakes.
const nan0 = v0 != v0;
const nan1 = v1 != v1;
// if v0 is NaN, pick v1
// else if v1 is NaN, pick v0
// else pick normal @max
return @select(Child, nan0, v1, @select(Child, nan1, v0, @max(v0, v1)));
}
test "zmath.max" {
// Calling math.inf causes test to fail!
Expand Down Expand Up @@ -4122,7 +4136,8 @@ test "zmath.fftN" {
-77.254834, 0.000000, -105.489863, 0.000000, -160.874864, 0.000000, -324.901452, 0.000000,
};
for (expected, 0..) |e, ie| {
try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon));
const v: [4]f32 = im[ie / 4];
try expect(std.math.approxEqAbs(f32, e, v[ie % 4], epsilon));
}
}

Expand Down Expand Up @@ -4185,7 +4200,8 @@ test "zmath.fftN" {
-321.749727, 0.000000, 0.000000, 0.000000, -649.802905, 0.000000, 0.000000, 0.000000,
};
for (expected, 0..) |e, ie| {
try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon));
const v: [4]f32 = im[ie / 4];
try expect(std.math.approxEqAbs(f32, e, v[ie % 4], epsilon));
}
}
}
Expand Down
Loading