@@ -252,9 +252,6 @@ private static void EmitWhereSIMDLoop<T>(ILGenerator il, LocalBuilder locI) wher
252252
253253 private static void EmitWhereV256BodyWithOffset < T > ( ILGenerator il , LocalBuilder locI , long elementSize , long offset ) where T : unmanaged
254254 {
255- // Get the appropriate mask creation method based on element size
256- var maskMethod = GetMaskCreationMethod256 ( ( int ) elementSize ) ;
257-
258255 // Get Vector256 methods via reflection - need to find generic method definitions first
259256 var loadMethod = Array . Find ( typeof ( Vector256 ) . GetMethods ( ) ,
260257 m => m . Name == "Load" && m . IsGenericMethodDefinition && m . GetParameters ( ) . Length == 1 ) !
@@ -277,8 +274,8 @@ private static void EmitWhereV256BodyWithOffset<T>(ILGenerator il, LocalBuilder
277274 il . Emit ( OpCodes . Conv_I ) ;
278275 il . Emit ( OpCodes . Add ) ;
279276
280- // Call mask creation: returns Vector256<T> on stack
281- il . Emit ( OpCodes . Call , maskMethod ) ;
277+ // Inline mask creation - emit AVX2 instructions directly instead of calling helper
278+ EmitInlineMaskCreationV256 ( il , ( int ) elementSize ) ;
282279
283280 // Load x vector: x + (i + offset) * elementSize
284281 il . Emit ( OpCodes . Ldarg_1 ) ; // x
@@ -329,8 +326,6 @@ private static void EmitWhereV256BodyWithOffset<T>(ILGenerator il, LocalBuilder
329326
330327 private static void EmitWhereV128BodyWithOffset < T > ( ILGenerator il , LocalBuilder locI , long elementSize , long offset ) where T : unmanaged
331328 {
332- var maskMethod = GetMaskCreationMethod128 ( ( int ) elementSize ) ;
333-
334329 // Get Vector128 methods via reflection - need to find generic method definitions first
335330 var loadMethod = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
336331 m => m . Name == "Load" && m . IsGenericMethodDefinition && m . GetParameters ( ) . Length == 1 ) !
@@ -352,7 +347,9 @@ private static void EmitWhereV128BodyWithOffset<T>(ILGenerator il, LocalBuilder
352347 }
353348 il . Emit ( OpCodes . Conv_I ) ;
354349 il . Emit ( OpCodes . Add ) ;
355- il . Emit ( OpCodes . Call , maskMethod ) ;
350+
351+ // Inline mask creation - emit SSE4.1 instructions directly
352+ EmitInlineMaskCreationV128 ( il , ( int ) elementSize ) ;
356353
357354 // Load x vector
358355 il . Emit ( OpCodes . Ldarg_1 ) ;
@@ -497,6 +494,230 @@ private static MethodInfo GetMaskCreationMethod128(int elementSize)
497494 } ;
498495 }
499496
497+ #endregion
498+
499+ #region Inline Mask IL Emission
500+
501+ // Cache reflection lookups for inline emission
502+ private static readonly MethodInfo _v128LoadByte = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
503+ m => m . Name == "Load" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( byte ) ) ;
504+ private static readonly MethodInfo _v256LoadByte = Array . Find ( typeof ( Vector256 ) . GetMethods ( ) ,
505+ m => m . Name == "Load" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( byte ) ) ;
506+
507+ private static readonly MethodInfo _v128CreateScalarUInt = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
508+ m => m . Name == "CreateScalar" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( uint ) ) ;
509+ private static readonly MethodInfo _v128CreateScalarULong = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
510+ m => m . Name == "CreateScalar" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( ulong ) ) ;
511+ private static readonly MethodInfo _v128CreateScalarUShort = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
512+ m => m . Name == "CreateScalar" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( ushort ) ) ;
513+
514+ // AsByte is an extension method on Vector128 static class, not instance method
515+ private static readonly MethodInfo _v128UIntAsByte = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
516+ m => m . Name == "AsByte" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( uint ) ) ;
517+ private static readonly MethodInfo _v128ULongAsByte = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
518+ m => m . Name == "AsByte" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( ulong ) ) ;
519+ private static readonly MethodInfo _v128UShortAsByte = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
520+ m => m . Name == "AsByte" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( ushort ) ) ;
521+
522+ private static readonly MethodInfo _avx2ConvertToV256Int64 = typeof ( Avx2 ) . GetMethod ( "ConvertToVector256Int64" , new [ ] { typeof ( Vector128 < byte > ) } ) ! ;
523+ private static readonly MethodInfo _avx2ConvertToV256Int32 = typeof ( Avx2 ) . GetMethod ( "ConvertToVector256Int32" , new [ ] { typeof ( Vector128 < byte > ) } ) ! ;
524+ private static readonly MethodInfo _avx2ConvertToV256Int16 = typeof ( Avx2 ) . GetMethod ( "ConvertToVector256Int16" , new [ ] { typeof ( Vector128 < byte > ) } ) ! ;
525+
526+ private static readonly MethodInfo _sse41ConvertToV128Int64 = typeof ( Sse41 ) . GetMethod ( "ConvertToVector128Int64" , new [ ] { typeof ( Vector128 < byte > ) } ) ! ;
527+ private static readonly MethodInfo _sse41ConvertToV128Int32 = typeof ( Sse41 ) . GetMethod ( "ConvertToVector128Int32" , new [ ] { typeof ( Vector128 < byte > ) } ) ! ;
528+ private static readonly MethodInfo _sse41ConvertToV128Int16 = typeof ( Sse41 ) . GetMethod ( "ConvertToVector128Int16" , new [ ] { typeof ( Vector128 < byte > ) } ) ! ;
529+
530+ // As* methods are extension methods on Vector256/Vector128 static classes
531+ private static readonly MethodInfo _v256LongAsULong = Array . Find ( typeof ( Vector256 ) . GetMethods ( ) ,
532+ m => m . Name == "AsUInt64" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( long ) ) ;
533+ private static readonly MethodInfo _v256IntAsUInt = Array . Find ( typeof ( Vector256 ) . GetMethods ( ) ,
534+ m => m . Name == "AsUInt32" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( int ) ) ;
535+ private static readonly MethodInfo _v256ShortAsUShort = Array . Find ( typeof ( Vector256 ) . GetMethods ( ) ,
536+ m => m . Name == "AsUInt16" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( short ) ) ;
537+
538+ private static readonly MethodInfo _v128LongAsULong = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
539+ m => m . Name == "AsUInt64" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( long ) ) ;
540+ private static readonly MethodInfo _v128IntAsUInt = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
541+ m => m . Name == "AsUInt32" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( int ) ) ;
542+ private static readonly MethodInfo _v128ShortAsUShort = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
543+ m => m . Name == "AsUInt16" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( short ) ) ;
544+
545+ private static readonly MethodInfo _v256GreaterThanULong = Array . Find ( typeof ( Vector256 ) . GetMethods ( ) ,
546+ m => m . Name == "GreaterThan" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( ulong ) ) ;
547+ private static readonly MethodInfo _v256GreaterThanUInt = Array . Find ( typeof ( Vector256 ) . GetMethods ( ) ,
548+ m => m . Name == "GreaterThan" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( uint ) ) ;
549+ private static readonly MethodInfo _v256GreaterThanUShort = Array . Find ( typeof ( Vector256 ) . GetMethods ( ) ,
550+ m => m . Name == "GreaterThan" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( ushort ) ) ;
551+ private static readonly MethodInfo _v256GreaterThanByte = Array . Find ( typeof ( Vector256 ) . GetMethods ( ) ,
552+ m => m . Name == "GreaterThan" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( byte ) ) ;
553+
554+ private static readonly MethodInfo _v128GreaterThanULong = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
555+ m => m . Name == "GreaterThan" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( ulong ) ) ;
556+ private static readonly MethodInfo _v128GreaterThanUInt = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
557+ m => m . Name == "GreaterThan" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( uint ) ) ;
558+ private static readonly MethodInfo _v128GreaterThanUShort = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
559+ m => m . Name == "GreaterThan" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( ushort ) ) ;
560+ private static readonly MethodInfo _v128GreaterThanByte = Array . Find ( typeof ( Vector128 ) . GetMethods ( ) ,
561+ m => m . Name == "GreaterThan" && m . IsGenericMethodDefinition ) ! . MakeGenericMethod ( typeof ( byte ) ) ;
562+
563+ private static readonly FieldInfo _v256ZeroULong = typeof ( Vector256 < ulong > ) . GetProperty ( "Zero" ) ! . GetMethod ! . IsStatic
564+ ? null ! : null ! ; // Use GetMethod call instead
565+ private static readonly MethodInfo _v256GetZeroULong = typeof ( Vector256 < ulong > ) . GetProperty ( "Zero" ) ! . GetMethod ! ;
566+ private static readonly MethodInfo _v256GetZeroUInt = typeof ( Vector256 < uint > ) . GetProperty ( "Zero" ) ! . GetMethod ! ;
567+ private static readonly MethodInfo _v256GetZeroUShort = typeof ( Vector256 < ushort > ) . GetProperty ( "Zero" ) ! . GetMethod ! ;
568+ private static readonly MethodInfo _v256GetZeroByte = typeof ( Vector256 < byte > ) . GetProperty ( "Zero" ) ! . GetMethod ! ;
569+
570+ private static readonly MethodInfo _v128GetZeroULong = typeof ( Vector128 < ulong > ) . GetProperty ( "Zero" ) ! . GetMethod ! ;
571+ private static readonly MethodInfo _v128GetZeroUInt = typeof ( Vector128 < uint > ) . GetProperty ( "Zero" ) ! . GetMethod ! ;
572+ private static readonly MethodInfo _v128GetZeroUShort = typeof ( Vector128 < ushort > ) . GetProperty ( "Zero" ) ! . GetMethod ! ;
573+ private static readonly MethodInfo _v128GetZeroByte = typeof ( Vector128 < byte > ) . GetProperty ( "Zero" ) ! . GetMethod ! ;
574+
575+ /// <summary>
576+ /// Emit inline V256 mask creation. Stack: byte* -> Vector256{T} (as mask)
577+ /// </summary>
578+ private static void EmitInlineMaskCreationV256 ( ILGenerator il , int elementSize )
579+ {
580+ // Stack has: byte* pointing to condition bools
581+
582+ switch ( elementSize )
583+ {
584+ case 8 : // double/long: load 4 bytes, expand to 4 qwords
585+ // *(uint*)ptr
586+ il . Emit ( OpCodes . Ldind_U4 ) ;
587+ // Vector128.CreateScalar<uint>(value)
588+ il . Emit ( OpCodes . Call , _v128CreateScalarUInt ) ;
589+ // .AsByte()
590+ il . Emit ( OpCodes . Call , _v128UIntAsByte ) ;
591+ // Avx2.ConvertToVector256Int64(bytes)
592+ il . Emit ( OpCodes . Call , _avx2ConvertToV256Int64 ) ;
593+ // .AsUInt64()
594+ il . Emit ( OpCodes . Call , _v256LongAsULong ) ;
595+ // Vector256<ulong>.Zero
596+ il . Emit ( OpCodes . Call , _v256GetZeroULong ) ;
597+ // Vector256.GreaterThan(expanded, zero)
598+ il . Emit ( OpCodes . Call , _v256GreaterThanULong ) ;
599+ break ;
600+
601+ case 4 : // float/int: load 8 bytes, expand to 8 dwords
602+ // *(ulong*)ptr
603+ il . Emit ( OpCodes . Ldind_I8 ) ;
604+ // Vector128.CreateScalar<ulong>(value)
605+ il . Emit ( OpCodes . Call , _v128CreateScalarULong ) ;
606+ // .AsByte()
607+ il . Emit ( OpCodes . Call , _v128ULongAsByte ) ;
608+ // Avx2.ConvertToVector256Int32(bytes)
609+ il . Emit ( OpCodes . Call , _avx2ConvertToV256Int32 ) ;
610+ // .AsUInt32()
611+ il . Emit ( OpCodes . Call , _v256IntAsUInt ) ;
612+ // Vector256<uint>.Zero
613+ il . Emit ( OpCodes . Call , _v256GetZeroUInt ) ;
614+ // Vector256.GreaterThan(expanded, zero)
615+ il . Emit ( OpCodes . Call , _v256GreaterThanUInt ) ;
616+ break ;
617+
618+ case 2 : // short/char: load 16 bytes, expand to 16 words
619+ // Vector128.Load<byte>(ptr)
620+ il . Emit ( OpCodes . Call , _v128LoadByte ) ;
621+ // Avx2.ConvertToVector256Int16(bytes)
622+ il . Emit ( OpCodes . Call , _avx2ConvertToV256Int16 ) ;
623+ // .AsUInt16()
624+ il . Emit ( OpCodes . Call , _v256ShortAsUShort ) ;
625+ // Vector256<ushort>.Zero
626+ il . Emit ( OpCodes . Call , _v256GetZeroUShort ) ;
627+ // Vector256.GreaterThan(expanded, zero)
628+ il . Emit ( OpCodes . Call , _v256GreaterThanUShort ) ;
629+ break ;
630+
631+ case 1 : // byte/bool: load 32 bytes, compare directly
632+ // Vector256.Load<byte>(ptr)
633+ il . Emit ( OpCodes . Call , _v256LoadByte ) ;
634+ // Vector256<byte>.Zero
635+ il . Emit ( OpCodes . Call , _v256GetZeroByte ) ;
636+ // Vector256.GreaterThan(vec, zero)
637+ il . Emit ( OpCodes . Call , _v256GreaterThanByte ) ;
638+ break ;
639+
640+ default :
641+ throw new NotSupportedException ( $ "Element size { elementSize } not supported") ;
642+ }
643+ }
644+
645+ /// <summary>
646+ /// Emit inline V128 mask creation. Stack: byte* -> Vector128{T} (as mask)
647+ /// </summary>
648+ private static void EmitInlineMaskCreationV128 ( ILGenerator il , int elementSize )
649+ {
650+ switch ( elementSize )
651+ {
652+ case 8 : // double/long: load 2 bytes, expand to 2 qwords
653+ // *(ushort*)ptr
654+ il . Emit ( OpCodes . Ldind_U2 ) ;
655+ // Vector128.CreateScalar<ushort>(value)
656+ il . Emit ( OpCodes . Call , _v128CreateScalarUShort ) ;
657+ // .AsByte()
658+ il . Emit ( OpCodes . Call , _v128UShortAsByte ) ;
659+ // Sse41.ConvertToVector128Int64(bytes)
660+ il . Emit ( OpCodes . Call , _sse41ConvertToV128Int64 ) ;
661+ // .AsUInt64()
662+ il . Emit ( OpCodes . Call , _v128LongAsULong ) ;
663+ // Vector128<ulong>.Zero
664+ il . Emit ( OpCodes . Call , _v128GetZeroULong ) ;
665+ // Vector128.GreaterThan(expanded, zero)
666+ il . Emit ( OpCodes . Call , _v128GreaterThanULong ) ;
667+ break ;
668+
669+ case 4 : // float/int: load 4 bytes, expand to 4 dwords
670+ // *(uint*)ptr
671+ il . Emit ( OpCodes . Ldind_U4 ) ;
672+ // Vector128.CreateScalar<uint>(value)
673+ il . Emit ( OpCodes . Call , _v128CreateScalarUInt ) ;
674+ // .AsByte()
675+ il . Emit ( OpCodes . Call , _v128UIntAsByte ) ;
676+ // Sse41.ConvertToVector128Int32(bytes)
677+ il . Emit ( OpCodes . Call , _sse41ConvertToV128Int32 ) ;
678+ // .AsUInt32()
679+ il . Emit ( OpCodes . Call , _v128IntAsUInt ) ;
680+ // Vector128<uint>.Zero
681+ il . Emit ( OpCodes . Call , _v128GetZeroUInt ) ;
682+ // Vector128.GreaterThan(expanded, zero)
683+ il . Emit ( OpCodes . Call , _v128GreaterThanUInt ) ;
684+ break ;
685+
686+ case 2 : // short/char: load 8 bytes, expand to 8 words
687+ // *(ulong*)ptr
688+ il . Emit ( OpCodes . Ldind_I8 ) ;
689+ // Vector128.CreateScalar<ulong>(value)
690+ il . Emit ( OpCodes . Call , _v128CreateScalarULong ) ;
691+ // .AsByte()
692+ il . Emit ( OpCodes . Call , _v128ULongAsByte ) ;
693+ // Sse41.ConvertToVector128Int16(bytes)
694+ il . Emit ( OpCodes . Call , _sse41ConvertToV128Int16 ) ;
695+ // .AsUInt16()
696+ il . Emit ( OpCodes . Call , _v128ShortAsUShort ) ;
697+ // Vector128<ushort>.Zero
698+ il . Emit ( OpCodes . Call , _v128GetZeroUShort ) ;
699+ // Vector128.GreaterThan(expanded, zero)
700+ il . Emit ( OpCodes . Call , _v128GreaterThanUShort ) ;
701+ break ;
702+
703+ case 1 : // byte/bool: load 16 bytes, compare directly
704+ // Vector128.Load<byte>(ptr)
705+ il . Emit ( OpCodes . Call , _v128LoadByte ) ;
706+ // Vector128<byte>.Zero
707+ il . Emit ( OpCodes . Call , _v128GetZeroByte ) ;
708+ // Vector128.GreaterThan(vec, zero)
709+ il . Emit ( OpCodes . Call , _v128GreaterThanByte ) ;
710+ break ;
711+
712+ default :
713+ throw new NotSupportedException ( $ "Element size { elementSize } not supported") ;
714+ }
715+ }
716+
717+ #endregion
718+
719+ #region Static Mask Creation Methods (fallback)
720+
500721 /// <summary>
501722 /// Create V256 mask from 32 bools for 1-byte elements.
502723 /// </summary>
0 commit comments