@@ -306,12 +306,14 @@ private static ComparisonKernel GenerateComparisonGeneralKernel(ComparisonKernel
306306 #region Comparison Loop Emission
307307
308308 /// <summary>
309- /// Emit a SIMD loop for contiguous comparison (adapts to V128/V256/V512).
309+ /// Emit a SIMD loop for contiguous comparison with 4x unrolling (adapts to V128/V256/V512).
310310 /// </summary>
311311 private static void EmitComparisonSimdLoop ( ILGenerator il , ComparisonKernelKey key ,
312312 int lhsSize , int rhsSize , NPTypeCode comparisonType )
313313 {
314314 int vectorCount = GetVectorCount ( comparisonType ) ;
315+ int unrollFactor = 4 ;
316+ int unrollStep = vectorCount * unrollFactor ;
315317 var clrType = GetClrType ( comparisonType ) ;
316318 var vectorType = GetVectorType ( clrType ) ;
317319
@@ -320,8 +322,21 @@ private static void EmitComparisonSimdLoop(ILGenerator il, ComparisonKernelKey k
320322 // int ndim (6), int totalSize (7)
321323
322324 var locI = il . DeclareLocal ( typeof ( int ) ) ;
325+ var locUnrollEnd = il . DeclareLocal ( typeof ( int ) ) ;
323326 var locVectorEnd = il . DeclareLocal ( typeof ( int ) ) ;
324- var locMask = il . DeclareLocal ( vectorType ) ;
327+
328+ // Declare mask locals for 4x unrolling
329+ var locMask0 = il . DeclareLocal ( vectorType ) ;
330+ var locMask1 = il . DeclareLocal ( vectorType ) ;
331+ var locMask2 = il . DeclareLocal ( vectorType ) ;
332+ var locMask3 = il . DeclareLocal ( vectorType ) ;
333+ var maskLocals = new [ ] { locMask0 , locMask1 , locMask2 , locMask3 } ;
334+
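335+ // unrollEnd = totalSize - unrollStep + 1 (loop below exits only when i > unrollEnd). NOTE(review): the last in-bounds 4x start is totalSize - unrollStep, so an iteration at i == unrollEnd would touch index totalSize - confirm this bound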
336+ il . Emit ( OpCodes . Ldarg_S , ( byte ) 7 ) ; // totalSize
337+ il . Emit ( OpCodes . Ldc_I4 , unrollStep - 1 ) ;
338+ il . Emit ( OpCodes . Sub ) ;
339+ il . Emit ( OpCodes . Stloc , locUnrollEnd ) ;
325340
326341 // vectorEnd = totalSize - vectorCount + 1 (last valid SIMD start position)
327342 il . Emit ( OpCodes . Ldarg_S , ( byte ) 7 ) ; // totalSize
@@ -333,16 +348,88 @@ private static void EmitComparisonSimdLoop(ILGenerator il, ComparisonKernelKey k
333348 il . Emit ( OpCodes . Ldc_I4_0 ) ;
334349 il . Emit ( OpCodes . Stloc , locI ) ;
335350
336- var lblSimdLoop = il . DefineLabel ( ) ;
337- var lblSimdEnd = il . DefineLabel ( ) ;
351+ var lblUnrollLoop = il . DefineLabel ( ) ;
352+ var lblUnrollEnd = il . DefineLabel ( ) ;
353+ var lblRemainderLoop = il . DefineLabel ( ) ;
354+ var lblRemainderEnd = il . DefineLabel ( ) ;
338355 var lblTailLoop = il . DefineLabel ( ) ;
339356 var lblTailEnd = il . DefineLabel ( ) ;
340357
341- // === SIMD Loop ===
342- il . MarkLabel ( lblSimdLoop ) ;
358+ // === 4x UNROLLED SIMD LOOP ===
359+ il . MarkLabel ( lblUnrollLoop ) ;
360+ il . Emit ( OpCodes . Ldloc , locI ) ;
361+ il . Emit ( OpCodes . Ldloc , locUnrollEnd ) ;
362+ il . Emit ( OpCodes . Bgt , lblUnrollEnd ) ;
363+
364+ // Load 4 lhs vectors, 4 rhs vectors, compare, store masks
365+ for ( int n = 0 ; n < unrollFactor ; n ++ )
366+ {
367+ int offset = n * vectorCount ;
368+
369+ // Load lhs vector at (i + offset) * lhsSize
370+ il . Emit ( OpCodes . Ldarg_0 ) ; // lhs
371+ il . Emit ( OpCodes . Ldloc , locI ) ;
372+ if ( offset > 0 )
373+ {
374+ il . Emit ( OpCodes . Ldc_I4 , offset ) ;
375+ il . Emit ( OpCodes . Add ) ;
376+ }
377+ il . Emit ( OpCodes . Conv_I ) ;
378+ il . Emit ( OpCodes . Ldc_I4 , lhsSize ) ;
379+ il . Emit ( OpCodes . Mul ) ;
380+ il . Emit ( OpCodes . Add ) ;
381+ EmitVectorLoad ( il , comparisonType ) ;
382+
383+ // Load rhs vector at (i + offset) * rhsSize
384+ il . Emit ( OpCodes . Ldarg_1 ) ; // rhs
385+ il . Emit ( OpCodes . Ldloc , locI ) ;
386+ if ( offset > 0 )
387+ {
388+ il . Emit ( OpCodes . Ldc_I4 , offset ) ;
389+ il . Emit ( OpCodes . Add ) ;
390+ }
391+ il . Emit ( OpCodes . Conv_I ) ;
392+ il . Emit ( OpCodes . Ldc_I4 , rhsSize ) ;
393+ il . Emit ( OpCodes . Mul ) ;
394+ il . Emit ( OpCodes . Add ) ;
395+ EmitVectorLoad ( il , comparisonType ) ;
396+
397+ // Compare: produces mask vector
398+ EmitVectorComparison ( il , key . Op , comparisonType ) ;
399+ il . Emit ( OpCodes . Stloc , maskLocals [ n ] ) ;
400+ }
401+
402+ // Extract all 4 masks to booleans
403+ for ( int n = 0 ; n < unrollFactor ; n ++ )
404+ {
405+ int offset = n * vectorCount ;
406+
407+ // Create a temporary local to hold (i + offset) for extraction
408+ var locIOffset = il . DeclareLocal ( typeof ( int ) ) ;
409+ il . Emit ( OpCodes . Ldloc , locI ) ;
410+ if ( offset > 0 )
411+ {
412+ il . Emit ( OpCodes . Ldc_I4 , offset ) ;
413+ il . Emit ( OpCodes . Add ) ;
414+ }
415+ il . Emit ( OpCodes . Stloc , locIOffset ) ;
416+
417+ EmitMaskToBoolExtraction ( il , comparisonType , vectorCount , locIOffset , maskLocals [ n ] ) ;
418+ }
419+
420+ // i += unrollStep
421+ il . Emit ( OpCodes . Ldloc , locI ) ;
422+ il . Emit ( OpCodes . Ldc_I4 , unrollStep ) ;
423+ il . Emit ( OpCodes . Add ) ;
424+ il . Emit ( OpCodes . Stloc , locI ) ;
425+ il . Emit ( OpCodes . Br , lblUnrollLoop ) ;
426+
427+ // === REMAINDER SIMD LOOP (0-3 vectors) ===
428+ il . MarkLabel ( lblUnrollEnd ) ;
429+ il . MarkLabel ( lblRemainderLoop ) ;
343430 il . Emit ( OpCodes . Ldloc , locI ) ;
344431 il . Emit ( OpCodes . Ldloc , locVectorEnd ) ;
345- il . Emit ( OpCodes . Bgt , lblSimdEnd ) ;
432+ il . Emit ( OpCodes . Bgt , lblRemainderEnd ) ;
346433
347434 // Load lhs vector: lhs + i * elemSize
348435 il . Emit ( OpCodes . Ldarg_0 ) ; // lhs
@@ -364,20 +451,20 @@ private static void EmitComparisonSimdLoop(ILGenerator il, ComparisonKernelKey k
364451
365452 // Compare: produces mask vector
366453 EmitVectorComparison ( il , key . Op , comparisonType ) ;
367- il . Emit ( OpCodes . Stloc , locMask ) ;
454+ il . Emit ( OpCodes . Stloc , locMask0 ) ;
368455
369456 // Extract mask to booleans
370- EmitMaskToBoolExtraction ( il , comparisonType , vectorCount , locI , locMask ) ;
457+ EmitMaskToBoolExtraction ( il , comparisonType , vectorCount , locI , locMask0 ) ;
371458
372459 // i += vectorCount
373460 il . Emit ( OpCodes . Ldloc , locI ) ;
374461 il . Emit ( OpCodes . Ldc_I4 , vectorCount ) ;
375462 il . Emit ( OpCodes . Add ) ;
376463 il . Emit ( OpCodes . Stloc , locI ) ;
377- il . Emit ( OpCodes . Br , lblSimdLoop ) ;
464+ il . Emit ( OpCodes . Br , lblRemainderLoop ) ;
378465
379- // === Tail Loop (scalar) ===
380- il . MarkLabel ( lblSimdEnd ) ;
466+ // === SCALAR TAIL LOOP ===
467+ il . MarkLabel ( lblRemainderEnd ) ;
381468 il . MarkLabel ( lblTailLoop ) ;
382469
383470 // if (i >= totalSize) goto end
0 commit comments