@@ -144,6 +144,7 @@ private static void EmitMatMulFloat(ILGenerator il)
144144 var locCRow = il . DeclareLocal ( typeof ( float * ) ) ; // 5: pointer to C[i,:]
145145 var locARow = il . DeclareLocal ( typeof ( float * ) ) ; // 6: pointer to A[i,:]
146146 var locBRow = il . DeclareLocal ( typeof ( float * ) ) ; // 7: pointer to B[k,:]
147+ var locCAddr = il . DeclareLocal ( typeof ( float * ) ) ; // 8: temp C address for SIMD store
147148
148149 const int vectorCount = 8 ; // Vector256<float>.Count
149150 const int elementSize = 4 ; // sizeof(float)
@@ -280,7 +281,7 @@ private static void EmitMatMulFloat(ILGenerator il)
280281 il . Emit ( OpCodes . Bgt , lblInnerSimdEnd ) ;
281282
282283 // Emit SIMD body: C[i,j:j+8] += aik * B[k,j:j+8]
283- EmitSimdBodyFloat ( il , locCRow , locBRow , locJ , locAik ) ;
284+ EmitSimdBodyFloat ( il , locCRow , locBRow , locJ , locAik , locCAddr ) ;
284285
285286 // j += 8
286287 il . Emit ( OpCodes . Ldloc , locJ ) ;
@@ -360,7 +361,7 @@ private static void EmitMatMulFloat(ILGenerator il)
360361 /// Emit SIMD body for float: C[i,j:j+8] += aik * B[k,j:j+8]
361362 /// Uses Vector256 with FMA when available.
362363 /// </summary>
363- private static void EmitSimdBodyFloat ( ILGenerator il , LocalBuilder locCRow , LocalBuilder locBRow , LocalBuilder locJ , LocalBuilder locAik )
364+ private static void EmitSimdBodyFloat ( ILGenerator il , LocalBuilder locCRow , LocalBuilder locBRow , LocalBuilder locJ , LocalBuilder locAik , LocalBuilder locCAddr )
364365 {
365366 const int elementSize = 4 ;
366367
@@ -395,8 +396,7 @@ private static void EmitSimdBodyFloat(ILGenerator il, LocalBuilder locCRow, Loca
395396 // Clean stack management for SIMD body
396397 // Store signature: Store(Vector256<T> source, T* destination)
397398
398- // Save C address for later
399- var locCAddr = il . DeclareLocal ( typeof ( float * ) ) ;
399+ // Compute C address: cRow + j * elementSize
400400 il . Emit ( OpCodes . Ldloc , locCRow ) ;
401401 il . Emit ( OpCodes . Ldloc , locJ ) ;
402402 il . Emit ( OpCodes . Conv_I ) ;
@@ -450,6 +450,7 @@ private static void EmitMatMulDouble(ILGenerator il)
450450 var locCRow = il . DeclareLocal ( typeof ( double * ) ) ;
451451 var locARow = il . DeclareLocal ( typeof ( double * ) ) ;
452452 var locBRow = il . DeclareLocal ( typeof ( double * ) ) ;
453+ var locCAddr = il . DeclareLocal ( typeof ( double * ) ) ; // temp C address for SIMD store
453454
454455 const int vectorCount = 4 ; // Vector256<double>.Count
455456 const int elementSize = 8 ; // sizeof(double)
@@ -575,7 +576,7 @@ private static void EmitMatMulDouble(ILGenerator il)
575576 il . Emit ( OpCodes . Ldloc , locJEnd ) ;
576577 il . Emit ( OpCodes . Bgt , lblInnerSimdEnd ) ;
577578
578- EmitSimdBodyDouble ( il , locCRow , locBRow , locJ , locAik ) ;
579+ EmitSimdBodyDouble ( il , locCRow , locBRow , locJ , locAik , locCAddr ) ;
579580
580581 il . Emit ( OpCodes . Ldloc , locJ ) ;
581582 il . Emit ( OpCodes . Ldc_I4 , vectorCount ) ;
@@ -643,7 +644,7 @@ private static void EmitMatMulDouble(ILGenerator il)
643644 /// <summary>
644645 /// Emit SIMD body for double: C[i,j:j+4] += aik * B[k,j:j+4]
645646 /// </summary>
646- private static void EmitSimdBodyDouble ( ILGenerator il , LocalBuilder locCRow , LocalBuilder locBRow , LocalBuilder locJ , LocalBuilder locAik )
647+ private static void EmitSimdBodyDouble ( ILGenerator il , LocalBuilder locCRow , LocalBuilder locBRow , LocalBuilder locJ , LocalBuilder locAik , LocalBuilder locCAddr )
647648 {
648649 const int elementSize = 8 ;
649650
@@ -677,8 +678,7 @@ private static void EmitSimdBodyDouble(ILGenerator il, LocalBuilder locCRow, Loc
677678 // Clean stack management for SIMD body
678679 // Store signature: Store(Vector256<T> source, T* destination)
679680
680- // Save C address for later
681- var locCAddr = il . DeclareLocal ( typeof ( double * ) ) ;
681+ // Compute C address: cRow + j * elementSize
682682 il . Emit ( OpCodes . Ldloc , locCRow ) ;
683683 il . Emit ( OpCodes . Ldloc , locJ ) ;
684684 il . Emit ( OpCodes . Conv_I ) ;
0 commit comments