Skip to content

Commit e88710c

Browse files
committed
fix: IL MatMul - declare locals before executable code
IL requires all locals to be declared before any executable IL code is emitted. The EmitSimdBodyFloat and EmitSimdBodyDouble helper methods were declaring locCAddr inside the helper itself, but those helpers are called while the loop bodies are being emitted, i.e. after IL code has already been emitted.

Fix: declare locCAddr at the start of the method alongside the other locals, and pass it to the helper as a parameter.

Performance (single-threaded):
- 128x128: 6.3 GFLOPS
- 256x256: 4.6 GFLOPS
- 512x512: 5.8 GFLOPS
- 1024x1024: 5.5 GFLOPS

There is room for improvement with cache blocking (~14 GFLOPS achievable).
1 parent 71d4cb3 commit e88710c

1 file changed

Lines changed: 8 additions & 8 deletions

File tree

src/NumSharp.Core/Backends/Kernels/ILKernelGenerator.MatMul.cs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ private static void EmitMatMulFloat(ILGenerator il)
144144
var locCRow = il.DeclareLocal(typeof(float*)); // 5: pointer to C[i,:]
145145
var locARow = il.DeclareLocal(typeof(float*)); // 6: pointer to A[i,:]
146146
var locBRow = il.DeclareLocal(typeof(float*)); // 7: pointer to B[k,:]
147+
var locCAddr = il.DeclareLocal(typeof(float*)); // 8: temp C address for SIMD store
147148

148149
const int vectorCount = 8; // Vector256<float>.Count
149150
const int elementSize = 4; // sizeof(float)
@@ -280,7 +281,7 @@ private static void EmitMatMulFloat(ILGenerator il)
280281
il.Emit(OpCodes.Bgt, lblInnerSimdEnd);
281282

282283
// Emit SIMD body: C[i,j:j+8] += aik * B[k,j:j+8]
283-
EmitSimdBodyFloat(il, locCRow, locBRow, locJ, locAik);
284+
EmitSimdBodyFloat(il, locCRow, locBRow, locJ, locAik, locCAddr);
284285

285286
// j += 8
286287
il.Emit(OpCodes.Ldloc, locJ);
@@ -360,7 +361,7 @@ private static void EmitMatMulFloat(ILGenerator il)
360361
/// Emit SIMD body for float: C[i,j:j+8] += aik * B[k,j:j+8]
361362
/// Uses Vector256 with FMA when available.
362363
/// </summary>
363-
private static void EmitSimdBodyFloat(ILGenerator il, LocalBuilder locCRow, LocalBuilder locBRow, LocalBuilder locJ, LocalBuilder locAik)
364+
private static void EmitSimdBodyFloat(ILGenerator il, LocalBuilder locCRow, LocalBuilder locBRow, LocalBuilder locJ, LocalBuilder locAik, LocalBuilder locCAddr)
364365
{
365366
const int elementSize = 4;
366367

@@ -395,8 +396,7 @@ private static void EmitSimdBodyFloat(ILGenerator il, LocalBuilder locCRow, Loca
395396
// Clean stack management for SIMD body
396397
// Store signature: Store(Vector256<T> source, T* destination)
397398

398-
// Save C address for later
399-
var locCAddr = il.DeclareLocal(typeof(float*));
399+
// Compute C address: cRow + j * elementSize
400400
il.Emit(OpCodes.Ldloc, locCRow);
401401
il.Emit(OpCodes.Ldloc, locJ);
402402
il.Emit(OpCodes.Conv_I);
@@ -450,6 +450,7 @@ private static void EmitMatMulDouble(ILGenerator il)
450450
var locCRow = il.DeclareLocal(typeof(double*));
451451
var locARow = il.DeclareLocal(typeof(double*));
452452
var locBRow = il.DeclareLocal(typeof(double*));
453+
var locCAddr = il.DeclareLocal(typeof(double*)); // temp C address for SIMD store
453454

454455
const int vectorCount = 4; // Vector256<double>.Count
455456
const int elementSize = 8; // sizeof(double)
@@ -575,7 +576,7 @@ private static void EmitMatMulDouble(ILGenerator il)
575576
il.Emit(OpCodes.Ldloc, locJEnd);
576577
il.Emit(OpCodes.Bgt, lblInnerSimdEnd);
577578

578-
EmitSimdBodyDouble(il, locCRow, locBRow, locJ, locAik);
579+
EmitSimdBodyDouble(il, locCRow, locBRow, locJ, locAik, locCAddr);
579580

580581
il.Emit(OpCodes.Ldloc, locJ);
581582
il.Emit(OpCodes.Ldc_I4, vectorCount);
@@ -643,7 +644,7 @@ private static void EmitMatMulDouble(ILGenerator il)
643644
/// <summary>
644645
/// Emit SIMD body for double: C[i,j:j+4] += aik * B[k,j:j+4]
645646
/// </summary>
646-
private static void EmitSimdBodyDouble(ILGenerator il, LocalBuilder locCRow, LocalBuilder locBRow, LocalBuilder locJ, LocalBuilder locAik)
647+
private static void EmitSimdBodyDouble(ILGenerator il, LocalBuilder locCRow, LocalBuilder locBRow, LocalBuilder locJ, LocalBuilder locAik, LocalBuilder locCAddr)
647648
{
648649
const int elementSize = 8;
649650

@@ -677,8 +678,7 @@ private static void EmitSimdBodyDouble(ILGenerator il, LocalBuilder locCRow, Loc
677678
// Clean stack management for SIMD body
678679
// Store signature: Store(Vector256<T> source, T* destination)
679680

680-
// Save C address for later
681-
var locCAddr = il.DeclareLocal(typeof(double*));
681+
// Compute C address: cRow + j * elementSize
682682
il.Emit(OpCodes.Ldloc, locCRow);
683683
il.Emit(OpCodes.Ldloc, locJ);
684684
il.Emit(OpCodes.Conv_I);

0 commit comments

Comments
 (0)