diff --git a/build.zig.zon b/build.zig.zon index e06308d..6790c53 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -2,7 +2,7 @@ .name = .zmath, .fingerprint = 0xfd23d422bd223cc2, .version = "0.11.0-dev", - .minimum_zig_version = "0.15.1", + .minimum_zig_version = "0.16.0", .paths = .{ "build.zig", "build.zig.zon", diff --git a/src/benchmark.zig b/src/benchmark.zig index 775a1fc..1027f35 100644 --- a/src/benchmark.zig +++ b/src/benchmark.zig @@ -49,43 +49,41 @@ /// wave benchmark (SOA) - scalar version: 3.7832s, zmath version: 0.3642s /// /// ------------------------------------------------------------------------------------------------- -pub fn main() !void { - var gpa = std.heap.GeneralPurposeAllocator(.{}){}; - defer _ = gpa.deinit(); - const allocator = gpa.allocator(); +pub fn main(init: std.process.Init) !void { + const allocator = init.gpa; + const io = init.io; // m = mul(ma, mb); data set fits in L1 cache; AOS data layout. - try mat4MulBenchmark(allocator, 100_000); + try mat4MulBenchmark(allocator, io, 100_000); // v = 0.01 * cross3(va, vb) + vec3(1.0); data set fits in L1 cache; AOS data layout. - try cross3ScaleBiasBenchmark(allocator, 10_000); + try cross3ScaleBiasBenchmark(allocator, io, 10_000); // v = dot3(va, vb) * (0.1 * cross3(va, vb) + vec3(1.0)); data set fits in L1 cache; AOS data layout. - try cross3Dot3ScaleBiasBenchmark(allocator, 10_000); + try cross3Dot3ScaleBiasBenchmark(allocator, io, 10_000); // q = qmul(qa, qb); data set fits in L1 cache; AOS data layout. - try quatBenchmark(allocator, 10_000); + try quatBenchmark(allocator, io, 10_000); // d = sqrt(x * x + z * z); y = sin(d - t); SOA layout. - try waveBenchmark(allocator, 1_000); + try waveBenchmark(allocator, io, 1_000); } const std = @import("std"); -const time = std.time; -const Timer = time.Timer; +const Clock = std.Io.Clock; const zm = @import("zmath"); var prng = std.Random.DefaultPrng.init(0); const random = prng.random(); -noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { +noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, io: std.Io, comptime count: comptime_int) !void { std.debug.print("\n", .{}); std.debug.print("{s:>42} - ", .{"matrix mul benchmark (AOS)"}); var data0 = try std.ArrayList([16]f32).initCapacity(allocator, 64); - defer data0.deinit(); + defer data0.deinit(allocator); var data1 = try std.ArrayList([16]f32).initCapacity(allocator, 64); - defer data1.deinit(); + defer data1.deinit(allocator); var i: usize = 0; while (i < 64) : (i += 1) { @@ -118,8 +116,7 @@ noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: compt { i = 0; - var timer = try Timer.start(); - const start = timer.lap(); + const start = Clock.now(.awake, io); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { @@ -145,16 +142,15 @@ noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: compt } } } - const end = timer.read(); - const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + const end = Clock.now(.awake, io); + const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s; std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); } { i = 0; - var timer = try Timer.start(); - const start = timer.lap(); + const start = Clock.now(.awake, io); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { @@ -165,20 +161,23 @@ noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: compt } } } - const end = timer.read(); - const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + const end = Clock.now( + .awake, + io, + ); + const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s; std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); } } -noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { +noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, io: std.Io, comptime count: comptime_int) !void { std.debug.print("{s:>42} - ", .{"cross3, scale, bias benchmark (AOS)"}); var data0 = try std.ArrayList([3]f32).initCapacity(allocator, 256); - defer data0.deinit(); + defer data0.deinit(allocator); var data1 = try std.ArrayList([3]f32).initCapacity(allocator, 256); - defer data1.deinit(); + defer data1.deinit(allocator); var i: usize = 0; while (i < 256) : (i += 1) { @@ -201,8 +200,7 @@ noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime coun { i = 0; - var timer = try Timer.start(); - const start = timer.lap(); + const start = Clock.now(.awake, io); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { @@ -215,16 +213,15 @@ noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime coun } } } - const end = timer.read(); - const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + const end = Clock.now(.awake, io); + const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s; std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); } { i = 0; - var timer = try Timer.start(); - const start = timer.lap(); + const start = Clock.now(.awake, io); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { @@ -235,14 +232,14 @@ noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime coun } } } - const end = timer.read(); - const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + const end = Clock.now(.awake, io); + const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s; std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); } } -noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { +noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, io: std.Io, comptime count: comptime_int) !void { std.debug.print("{s:>42} - ", .{"cross3, dot3, scale, bias benchmark (AOS)"}); var data0 = try std.ArrayList([3]f32).initCapacity(allocator, 256); @@ -271,8 +268,7 @@ noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime { i = 0; - var timer = try Timer.start(); - const start = timer.lap(); + const start = Clock.now(.awake, io); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { @@ -286,16 +282,15 @@ noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime } } } - const end = timer.read(); - const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + const end = Clock.now(.awake, io); + const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s; std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); } { i = 0; - var timer = try Timer.start(); - const start = timer.lap(); + const start = Clock.now(.awake, io); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { @@ -306,14 +301,14 @@ noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime } } } - const end = timer.read(); - const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + const end = Clock.now(.awake, io); + const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s; std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); } } -noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { +noinline fn quatBenchmark(allocator: std.mem.Allocator, io: std.Io, comptime count: comptime_int) !void { std.debug.print("{s:>42} - ", .{"quaternion mul benchmark (AOS)"}); var data0 = try std.ArrayList([4]f32).initCapacity(allocator, 256); @@ -342,8 +337,7 @@ noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime { i = 0; - var timer = try Timer.start(); - const start = timer.lap(); + const start = Clock.now(.awake, io); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { @@ -357,16 +351,15 @@ noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime } } } - const end = timer.read(); - const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + const end = Clock.now(.awake, io); + const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s; std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); } { i = 0; - var timer = try Timer.start(); - const start = timer.lap(); + const start = Clock.now(.awake, io); while (i < count) : (i += 1) { for (data1.items) |b| { for (data0.items) |a| { @@ -377,14 +370,14 @@ noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime } } } - const end = timer.read(); - const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + const end = Clock.now(.awake, io); + const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s; std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); } } -noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { +noinline fn waveBenchmark(allocator: std.mem.Allocator, io: std.Io, comptime count: comptime_int) !void { _ = allocator; std.debug.print("{s:>42} - ", .{"wave benchmark (SOA)"}); @@ -394,8 +387,7 @@ noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime const scale: f32 = 0.05; - var timer = try Timer.start(); - const start = timer.lap(); + const start = Clock.now(.awake, io); var iter: usize = 0; while (iter < count) : (iter += 1) { @@ -428,8 +420,8 @@ noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime } t += 0.001; } - const end = timer.read(); - const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + const end = Clock.now(.awake, io); + const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s; std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); } @@ -445,8 +437,7 @@ noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime const scale: f32 = 0.05; - var timer = try Timer.start(); - const start = timer.lap(); + const start = Clock.now(.awake, io); var iter: usize = 0; while (iter < count) : (iter += 1) { @@ -469,8 +460,8 @@ noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime } vt += zm.splat(T, 0.001); } - const end = timer.read(); - const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; + const end = Clock.now(.awake, io); + const elapsed_s = @as(f64, @floatFromInt(start.durationTo(end).toNanoseconds())) / std.time.ns_per_s; std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); } diff --git a/src/root.zig b/src/root.zig index d02c7dd..7110646 100644 --- a/src/root.zig +++ b/src/root.zig @@ -787,8 +787,15 @@ test "zmath.maxFast" { } pub inline fn min(v0: anytype, v1: anytype) @TypeOf(v0, v1) { - // This will handle inf & nan - return @min(v0, v1); // minps, cmpunordps, andps, andnps, orps + const T = @TypeOf(v0, v1); + const Child = std.meta.Child(T); + // v != v is true only when v is NaN + const nan0 = v0 != v0; + const nan1 = v1 != v1; + // if v0 is NaN, pick v1 + // else if v1 is NaN, pick v0 + // else pick normal @min + return @select(Child, nan0, v1, @select(Child, nan1, v0, @min(v0, v1))); } test "zmath.min" { // Calling math.inf causes test to fail! @@ -831,8 +838,15 @@ test "zmath.min" { } pub inline fn max(v0: anytype, v1: anytype) @TypeOf(v0, v1) { - // This will handle inf & nan - return @max(v0, v1); // maxps, cmpunordps, andps, andnps, orps + const T = @TypeOf(v0, v1); + const Child = std.meta.Child(T); + // v != v is true only when v is NaN + const nan0 = v0 != v0; + const nan1 = v1 != v1; + // if v0 is NaN, pick v1 + // else if v1 is NaN, pick v0 + // else pick normal @max + return @select(Child, nan0, v1, @select(Child, nan1, v0, @max(v0, v1))); } test "zmath.max" { // Calling math.inf causes test to fail! @@ -4122,7 +4136,8 @@ test "zmath.fftN" { -77.254834, 0.000000, -105.489863, 0.000000, -160.874864, 0.000000, -324.901452, 0.000000, }; for (expected, 0..) |e, ie| { - try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon)); + const v: [4]f32 = im[ie / 4]; + try expect(std.math.approxEqAbs(f32, e, v[ie % 4], epsilon)); } } @@ -4185,7 +4200,8 @@ test "zmath.fftN" { -321.749727, 0.000000, 0.000000, 0.000000, -649.802905, 0.000000, 0.000000, 0.000000, }; for (expected, 0..) |e, ie| { - try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon)); + const v: [4]f32 = im[ie / 4]; + try expect(std.math.approxEqAbs(f32, e, v[ie % 4], epsilon)); } } }