11using System ;
22using System . Collections . Generic ;
3+ #if NET8_0_OR_GREATER
4+ using System . Runtime . CompilerServices ;
5+ using System . Runtime . InteropServices ;
6+ using System . Runtime . Intrinsics ;
7+ #endif
38using System . Threading . Tasks ;
49
510namespace SIPSorceryMedia . Abstractions
@@ -506,6 +511,10 @@ public static byte[] NV12toI420(byte[] nv12, int width, int height, int dop = 1)
506511 int i420UOffset = ySize ;
507512 int i420VOffset = ySize + uvWidth * uvHeight ;
508513
514+ #if NET8_0_OR_GREATER
515+ // Use SIMD for de-interleaving UV plane when available
516+ DeinterleaveUVSimd ( nv12 , nv12UvOffset , i420 , i420UOffset , i420VOffset , uvWidth , uvHeight ) ;
517+ #else
509518 if ( ! _optDOP . ContainsKey ( dop ) )
510519 _optDOP [ dop ] = new ParallelOptions ( ) { MaxDegreeOfParallelism = dop } ;
511520
@@ -522,10 +531,120 @@ public static byte[] NV12toI420(byte[] nv12, int width, int height, int dop = 1)
522531 i420 [ i420VPosn ] = nv12 [ nv12Posn + 1 ] ; // V
523532 }
524533 } ) ;
534+ #endif
525535
526536 return i420 ;
527537 }
528538
539+ #if NET8_0_OR_GREATER
/// <summary>
/// SIMD-optimized de-interleave of a UV plane in NV12 layout (UVUVUV...) into the separate
/// U and V planes used by I420.
/// </summary>
/// <param name="src">Buffer containing the interleaved UV bytes.</param>
/// <param name="srcOffset">Index in <paramref name="src"/> of the first interleaved byte (U0).</param>
/// <param name="dst">Buffer that receives the planar U and V bytes.</param>
/// <param name="dstUOffset">Index in <paramref name="dst"/> where the U plane starts.</param>
/// <param name="dstVOffset">Index in <paramref name="dst"/> where the V plane starts.</param>
/// <param name="uvWidth">Width of the chroma plane, in samples.</param>
/// <param name="uvHeight">Height of the chroma plane, in samples.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="src"/> does not hold <c>uvWidth * uvHeight * 2</c> bytes from
/// <paramref name="srcOffset"/>, or <paramref name="dst"/> does not hold <c>uvWidth * uvHeight</c>
/// bytes from <paramref name="dstUOffset"/> or from <paramref name="dstVOffset"/>.
/// </exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void DeinterleaveUVSimd(byte[] src, int srcOffset, byte[] dst, int dstUOffset, int dstVOffset, int uvWidth, int uvHeight)
{
    int totalUV = uvWidth * uvHeight;
    if (totalUV <= 0)
    {
        // Nothing to convert; none of the loops below would run.
        return;
    }

    // The loops below use unchecked loads/stores. Slicing first makes the runtime validate
    // every region once, so a short buffer raises an exception instead of touching memory
    // outside the arrays.
    ReadOnlySpan<byte> srcUV = src.AsSpan(srcOffset, totalUV * 2);
    Span<byte> dstU = dst.AsSpan(dstUOffset, totalUV);
    Span<byte> dstV = dst.AsSpan(dstVOffset, totalUV);

    ref byte srcRef = ref MemoryMarshal.GetReference(srcUV);
    ref byte dstURef = ref MemoryMarshal.GetReference(dstU);
    ref byte dstVRef = ref MemoryMarshal.GetReference(dstV);

    int i = 0;

    // Process 32 UV pairs (64 source bytes) per iteration using Vector256.
    if (Vector256.IsHardwareAccelerated)
    {
        for (; i <= totalUV - 32; i += 32)
        {
            // Two 32-byte loads cover 32 interleaved UV pairs.
            var uv0 = Vector256.LoadUnsafe(ref Unsafe.Add(ref srcRef, i * 2));
            var uv1 = Vector256.LoadUnsafe(ref Unsafe.Add(ref srcRef, i * 2 + 32));

            // Each call yields 16 U bytes and 16 V bytes.
            var (u0, v0) = DeinterleaveVector256(uv0);
            var (u1, v1) = DeinterleaveVector256(uv1);

            // Recombine into 32 contiguous U bytes and 32 contiguous V bytes.
            var u = Vector256.Create(u0, u1);
            var v = Vector256.Create(v0, v1);

            u.StoreUnsafe(ref Unsafe.Add(ref dstURef, i));
            v.StoreUnsafe(ref Unsafe.Add(ref dstVRef, i));
        }
    }

    // Process 16 UV pairs (32 source bytes) per iteration using Vector128.
    if (Vector128.IsHardwareAccelerated)
    {
        for (; i <= totalUV - 16; i += 16)
        {
            // Two 16-byte loads cover 16 interleaved UV pairs.
            var uv0 = Vector128.LoadUnsafe(ref Unsafe.Add(ref srcRef, i * 2));
            var uv1 = Vector128.LoadUnsafe(ref Unsafe.Add(ref srcRef, i * 2 + 16));

            // Each call yields 8 U bytes and 8 V bytes.
            var (u0, v0) = DeinterleaveVector128(uv0);
            var (u1, v1) = DeinterleaveVector128(uv1);

            var u = Vector128.Create(u0, u1);
            var v = Vector128.Create(v0, v1);

            u.StoreUnsafe(ref Unsafe.Add(ref dstURef, i));
            v.StoreUnsafe(ref Unsafe.Add(ref dstVRef, i));
        }
    }

    // Scalar tail: whatever the vector loops did not consume.
    for (; i < totalUV; i++)
    {
        Unsafe.Add(ref dstURef, i) = Unsafe.Add(ref srcRef, i * 2);
        Unsafe.Add(ref dstVRef, i) = Unsafe.Add(ref srcRef, i * 2 + 1);
    }
}
604+
/// <summary>
/// Splits the 16 interleaved UV byte pairs held in a Vector256 into a Vector128 of U bytes
/// and a Vector128 of V bytes.
/// Input:  [U0,V0,U1,V1,...,U15,V15]
/// Output: U=[U0,U1,...,U15], V=[V0,V1,...,V15]
/// </summary>
/// <param name="uv">32 bytes of interleaved U/V samples.</param>
/// <returns>The 16 U bytes and the 16 V bytes, each in source order.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector128<byte> u, Vector128<byte> v) DeinterleaveVector256(Vector256<byte> uv)
{
    // Lower 128 bits carry pairs 0-7, upper 128 bits carry pairs 8-15.
    (Vector64<byte> uFront, Vector64<byte> vFront) = DeinterleaveVector128(uv.GetLower());
    (Vector64<byte> uBack, Vector64<byte> vBack) = DeinterleaveVector128(uv.GetUpper());

    // Stitch the two 8-byte halves of each plane back together.
    return (Vector128.Create(uFront, uBack), Vector128.Create(vFront, vBack));
}
626+
/// <summary>
/// Splits the 8 interleaved UV byte pairs held in a Vector128 into a Vector64 of U bytes
/// and a Vector64 of V bytes.
/// Input:  [U0,V0,U1,V1,U2,V2,U3,V3,U4,V4,U5,V5,U6,V6,U7,V7]
/// Output: U=[U0,U1,U2,U3,U4,U5,U6,U7], V=[V0,V1,V2,V3,V4,V5,V6,V7]
/// </summary>
/// <param name="uv">16 bytes of interleaved U/V samples.</param>
/// <returns>The 8 U bytes and the 8 V bytes, each in source order.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector64<byte> u, Vector64<byte> v) DeinterleaveVector128(Vector128<byte> uv)
{
    // Even source positions (U) go to bytes 0-7, odd source positions (V) go to bytes 8-15.
    Vector128<byte> evenThenOdd = Vector128.Create(
        (byte)0, 2, 4, 6, 8, 10, 12, 14,
        1, 3, 5, 7, 9, 11, 13, 15);

    Vector128<byte> planar = Vector128.Shuffle(uv, evenThenOdd);

    Vector64<byte> uPlane = planar.GetLower();
    Vector64<byte> vPlane = planar.GetUpper();
    return (uPlane, vPlane);
}
646+ #endif
647+
529648 /// <summary>
530649 /// Converts an I420 sample to an NV12 formatted sample.
531650 /// I420: Y plane followed by U plane, then V plane (planar format).
@@ -557,6 +676,10 @@ public static byte[] I420toNV12(byte[] i420, int width, int height, int dop = 1)
557676 int i420VOffset = ySize + uvWidth * uvHeight ;
558677 int nv12UvOffset = ySize ;
559678
679+ #if NET8_0_OR_GREATER
680+ // Use SIMD for interleaving U and V planes when available
681+ InterleaveUVSimd ( i420 , i420UOffset , i420VOffset , nv12 , nv12UvOffset , uvWidth , uvHeight ) ;
682+ #else
560683 if ( ! _optDOP . ContainsKey ( dop ) )
561684 _optDOP [ dop ] = new ParallelOptions ( ) { MaxDegreeOfParallelism = dop } ;
562685
@@ -573,8 +696,129 @@ public static byte[] I420toNV12(byte[] i420, int width, int height, int dop = 1)
573696 nv12 [ nv12Posn + 1 ] = i420 [ i420VPosn ] ; // V
574697 }
575698 } ) ;
699+ #endif
576700
577701 return nv12 ;
578702 }
703+
704+ #if NET8_0_OR_GREATER
/// <summary>
/// SIMD-optimized interleave of the separate U and V planes of an I420 buffer into the
/// single UV plane used by NV12 (UVUVUV...).
/// </summary>
/// <param name="src">Buffer containing the planar U and V bytes.</param>
/// <param name="srcUOffset">Index in <paramref name="src"/> where the U plane starts.</param>
/// <param name="srcVOffset">Index in <paramref name="src"/> where the V plane starts.</param>
/// <param name="dst">Buffer that receives the interleaved UV bytes.</param>
/// <param name="dstOffset">Index in <paramref name="dst"/> of the first interleaved byte (U0).</param>
/// <param name="uvWidth">Width of the chroma plane, in samples.</param>
/// <param name="uvHeight">Height of the chroma plane, in samples.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="src"/> does not hold <c>uvWidth * uvHeight</c> bytes from
/// <paramref name="srcUOffset"/> or from <paramref name="srcVOffset"/>, or <paramref name="dst"/>
/// does not hold <c>uvWidth * uvHeight * 2</c> bytes from <paramref name="dstOffset"/>.
/// </exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void InterleaveUVSimd(byte[] src, int srcUOffset, int srcVOffset, byte[] dst, int dstOffset, int uvWidth, int uvHeight)
{
    int totalUV = uvWidth * uvHeight;
    if (totalUV <= 0)
    {
        // Nothing to convert; none of the loops below would run.
        return;
    }

    // The loops below use unchecked loads/stores. Slicing first makes the runtime validate
    // every region once, so a short buffer raises an exception instead of touching memory
    // outside the arrays.
    ReadOnlySpan<byte> srcU = src.AsSpan(srcUOffset, totalUV);
    ReadOnlySpan<byte> srcV = src.AsSpan(srcVOffset, totalUV);
    Span<byte> dstUV = dst.AsSpan(dstOffset, totalUV * 2);

    ref byte srcURef = ref MemoryMarshal.GetReference(srcU);
    ref byte srcVRef = ref MemoryMarshal.GetReference(srcV);
    ref byte dstRef = ref MemoryMarshal.GetReference(dstUV);

    int i = 0;

    // Process 32 U and 32 V bytes per iteration using Vector256.
    if (Vector256.IsHardwareAccelerated)
    {
        for (; i <= totalUV - 32; i += 32)
        {
            var u = Vector256.LoadUnsafe(ref Unsafe.Add(ref srcURef, i));
            var v = Vector256.LoadUnsafe(ref Unsafe.Add(ref srcVRef, i));

            // uv0 holds pairs 0-15, uv1 holds pairs 16-31.
            var (uv0, uv1) = InterleaveVector256(u, v);

            // 64 output bytes (32 UV pairs).
            uv0.StoreUnsafe(ref Unsafe.Add(ref dstRef, i * 2));
            uv1.StoreUnsafe(ref Unsafe.Add(ref dstRef, i * 2 + 32));
        }
    }

    // Process 16 U and 16 V bytes per iteration using Vector128.
    if (Vector128.IsHardwareAccelerated)
    {
        for (; i <= totalUV - 16; i += 16)
        {
            var u = Vector128.LoadUnsafe(ref Unsafe.Add(ref srcURef, i));
            var v = Vector128.LoadUnsafe(ref Unsafe.Add(ref srcVRef, i));

            // uv0 holds pairs 0-7, uv1 holds pairs 8-15.
            var (uv0, uv1) = InterleaveVector128(u, v);

            // 32 output bytes (16 UV pairs).
            uv0.StoreUnsafe(ref Unsafe.Add(ref dstRef, i * 2));
            uv1.StoreUnsafe(ref Unsafe.Add(ref dstRef, i * 2 + 16));
        }
    }

    // Scalar tail: whatever the vector loops did not consume.
    for (; i < totalUV; i++)
    {
        Unsafe.Add(ref dstRef, i * 2) = Unsafe.Add(ref srcURef, i);
        Unsafe.Add(ref dstRef, i * 2 + 1) = Unsafe.Add(ref srcVRef, i);
    }
}
761+
/// <summary>
/// Weaves 32 U bytes and 32 V bytes into two Vector256 of interleaved UV pairs.
/// Input:  U=[U0,U1,...,U31], V=[V0,V1,...,V31]
/// Output: UV0=[U0,V0,U1,V1,...,U15,V15], UV1=[U16,V16,...,U31,V31]
/// </summary>
/// <param name="u">32 U samples.</param>
/// <param name="v">32 V samples.</param>
/// <returns>Pairs 0-15 in the first vector and pairs 16-31 in the second.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector256<byte> uv0, Vector256<byte> uv1) InterleaveVector256(Vector256<byte> u, Vector256<byte> v)
{
    // Samples 0-15 of each plane produce the first 32 output bytes.
    (Vector128<byte> firstLo, Vector128<byte> firstHi) = InterleaveVector128ToTwo(u.GetLower(), v.GetLower());

    // Samples 16-31 of each plane produce the second 32 output bytes.
    (Vector128<byte> secondLo, Vector128<byte> secondHi) = InterleaveVector128ToTwo(u.GetUpper(), v.GetUpper());

    return (Vector256.Create(firstLo, firstHi), Vector256.Create(secondLo, secondHi));
}
786+
/// <summary>
/// Weaves 16 U bytes and 16 V bytes into two Vector128 of interleaved UV pairs by
/// forwarding to <c>InterleaveVector128ToTwo</c>.
/// Input:  U=[U0,U1,...,U15], V=[V0,V1,...,V15]
/// Output: UV0=[U0,V0,U1,V1,...,U7,V7], UV1=[U8,V8,...,U15,V15]
/// </summary>
/// <param name="u">16 U samples.</param>
/// <param name="v">16 V samples.</param>
/// <returns>Pairs 0-7 in the first vector and pairs 8-15 in the second.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector128<byte> uv0, Vector128<byte> uv1) InterleaveVector128(Vector128<byte> u, Vector128<byte> v)
    => InterleaveVector128ToTwo(u, v);
797+
/// <summary>
/// Weaves two 16-byte vectors into two 16-byte vectors of alternating elements.
/// Input:  A=[A0,A1,...,A15], B=[B0,B1,...,B15]
/// Output: Out0=[A0,B0,A1,B1,...,A7,B7], Out1=[A8,B8,...,A15,B15]
/// </summary>
/// <param name="a">Source of the bytes placed at even output positions.</param>
/// <param name="b">Source of the bytes placed at odd output positions.</param>
/// <returns>The first eight pairs and the last eight pairs.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector128<byte> out0, Vector128<byte> out1) InterleaveVector128ToTwo(Vector128<byte> a, Vector128<byte> b)
{
    // Index 255 is outside 0..15, so Shuffle writes a zero byte there; each shuffled vector
    // therefore has gaps exactly where the other one has data, and OR merges the two.

    // A0..A7 spread over the even slots, B0..B7 spread over the odd slots.
    Vector128<byte> evenSlotsFromLowA = Vector128.Create((byte)0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255);
    Vector128<byte> oddSlotsFromLowB = Vector128.Create((byte)255, 0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7);

    // A8..A15 spread over the even slots, B8..B15 spread over the odd slots.
    Vector128<byte> evenSlotsFromHighA = Vector128.Create((byte)8, 255, 9, 255, 10, 255, 11, 255, 12, 255, 13, 255, 14, 255, 15, 255);
    Vector128<byte> oddSlotsFromHighB = Vector128.Create((byte)255, 8, 255, 9, 255, 10, 255, 11, 255, 12, 255, 13, 255, 14, 255, 15);

    Vector128<byte> firstEightPairs = Vector128.BitwiseOr(
        Vector128.Shuffle(a, evenSlotsFromLowA),
        Vector128.Shuffle(b, oddSlotsFromLowB));

    Vector128<byte> lastEightPairs = Vector128.BitwiseOr(
        Vector128.Shuffle(a, evenSlotsFromHighA),
        Vector128.Shuffle(b, oddSlotsFromHighB));

    return (firstEightPairs, lastEightPairs);
}
822+ #endif
579823 }
580824}
0 commit comments