Skip to content

Commit 80c7963

Browse files
Copilot and sipsorcery authored
Add SIMD-optimized NV12 to/from I420 pixel format conversion (#1599)
* Add SIMD-optimized NV12 to/from I420 conversion methods Agent-Logs-Url: https://github.com/sipsorcery-org/sipsorcery/sessions/4429b44c-6870-4d08-bbaf-1313168d1c86 Co-authored-by: sipsorcery <197660+sipsorcery@users.noreply.github.com> * Fix misleading comment in SIMD de-interleave code Agent-Logs-Url: https://github.com/sipsorcery-org/sipsorcery/sessions/4429b44c-6870-4d08-bbaf-1313168d1c86 Co-authored-by: sipsorcery <197660+sipsorcery@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: sipsorcery <197660+sipsorcery@users.noreply.github.com>
1 parent 5b49a95 commit 80c7963

1 file changed

Lines changed: 244 additions & 0 deletions

File tree

src/SIPSorceryMedia.Abstractions/PixelConverter.cs

Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
using System;
22
using System.Collections.Generic;
3+
#if NET8_0_OR_GREATER
4+
using System.Runtime.CompilerServices;
5+
using System.Runtime.InteropServices;
6+
using System.Runtime.Intrinsics;
7+
#endif
38
using System.Threading.Tasks;
49

510
namespace SIPSorceryMedia.Abstractions
@@ -506,6 +511,10 @@ public static byte[] NV12toI420(byte[] nv12, int width, int height, int dop = 1)
506511
int i420UOffset = ySize;
507512
int i420VOffset = ySize + uvWidth * uvHeight;
508513

514+
#if NET8_0_OR_GREATER
515+
// Use SIMD for de-interleaving UV plane when available
516+
DeinterleaveUVSimd(nv12, nv12UvOffset, i420, i420UOffset, i420VOffset, uvWidth, uvHeight);
517+
#else
509518
if (!_optDOP.ContainsKey(dop))
510519
_optDOP[dop] = new ParallelOptions() { MaxDegreeOfParallelism = dop };
511520

@@ -522,10 +531,120 @@ public static byte[] NV12toI420(byte[] nv12, int width, int height, int dop = 1)
522531
i420[i420VPosn] = nv12[nv12Posn + 1]; // V
523532
}
524533
});
534+
#endif
525535

526536
return i420;
527537
}
528538

539+
#if NET8_0_OR_GREATER
540+
/// <summary>
/// De-interleaves an NV12 chroma plane (U0,V0,U1,V1,...) into the two separate, contiguous
/// U and V planes used by I420, using Vector256/Vector128 operations when they are hardware
/// accelerated and a scalar loop for whatever is left over.
/// </summary>
/// <param name="src">Buffer containing the interleaved UV bytes.</param>
/// <param name="srcOffset">Index in <paramref name="src"/> of the first interleaved byte.</param>
/// <param name="dst">Buffer that receives the planar U and V bytes.</param>
/// <param name="dstUOffset">Index in <paramref name="dst"/> at which the U plane is written.</param>
/// <param name="dstVOffset">Index in <paramref name="dst"/> at which the V plane is written.</param>
/// <param name="uvWidth">Chroma plane width; multiplied by <paramref name="uvHeight"/> to get the number of UV pairs.</param>
/// <param name="uvHeight">Chroma plane height.</param>
/// <exception cref="ArgumentNullException"><paramref name="src"/> or <paramref name="dst"/> is null.</exception>
/// <exception cref="ArgumentException">
/// An offset is negative, or a buffer is too small to hold the requested number of UV pairs.
/// </exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void DeinterleaveUVSimd(byte[] src, int srcOffset, byte[] dst, int dstUOffset, int dstVOffset, int uvWidth, int uvHeight)
{
    ArgumentNullException.ThrowIfNull(src);
    ArgumentNullException.ThrowIfNull(dst);

    // Multiply in 64 bits so that oversized dimensions cannot wrap around to a small count.
    long pairCount = (long)uvWidth * uvHeight;
    if (pairCount <= 0)
    {
        return;
    }

    // Everything below uses unchecked ref arithmetic, so the array bounds checks that the
    // indexer-based implementation relied on have to be performed explicitly up front.
    if (srcOffset < 0 || src.Length - (long)srcOffset < pairCount * 2)
    {
        throw new ArgumentException("Source buffer is too small for the requested UV plane.", nameof(src));
    }

    if (dstUOffset < 0 || dstVOffset < 0
        || dst.Length - (long)dstUOffset < pairCount
        || dst.Length - (long)dstVOffset < pairCount)
    {
        throw new ArgumentException("Destination buffer is too small for the requested U and V planes.", nameof(dst));
    }

    // Safe: pairCount * 2 fits inside src.Length, which is an int.
    int totalUV = (int)pairCount;
    int i = 0;

    ref byte srcRef = ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(src), srcOffset);
    ref byte dstURef = ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(dst), dstUOffset);
    ref byte dstVRef = ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(dst), dstVOffset);

    // 32 UV pairs (64 source bytes) per iteration.
    if (Vector256.IsHardwareAccelerated)
    {
        for (; i <= totalUV - 32; i += 32)
        {
            var uv0 = Vector256.LoadUnsafe(ref Unsafe.Add(ref srcRef, i * 2));
            var uv1 = Vector256.LoadUnsafe(ref Unsafe.Add(ref srcRef, i * 2 + 32));

            // Each 256-bit load yields 16 U bytes and 16 V bytes.
            var (u0, v0) = DeinterleaveVector256(uv0);
            var (u1, v1) = DeinterleaveVector256(uv1);

            Vector256.Create(u0, u1).StoreUnsafe(ref Unsafe.Add(ref dstURef, i));
            Vector256.Create(v0, v1).StoreUnsafe(ref Unsafe.Add(ref dstVRef, i));
        }
    }

    // 16 UV pairs (32 source bytes) per iteration; also picks up a 16..31 pair tail
    // left behind by the Vector256 loop.
    if (Vector128.IsHardwareAccelerated)
    {
        for (; i <= totalUV - 16; i += 16)
        {
            var uv0 = Vector128.LoadUnsafe(ref Unsafe.Add(ref srcRef, i * 2));
            var uv1 = Vector128.LoadUnsafe(ref Unsafe.Add(ref srcRef, i * 2 + 16));

            // Each 128-bit load yields 8 U bytes and 8 V bytes.
            var (u0, v0) = DeinterleaveVector128(uv0);
            var (u1, v1) = DeinterleaveVector128(uv1);

            Vector128.Create(u0, u1).StoreUnsafe(ref Unsafe.Add(ref dstURef, i));
            Vector128.Create(v0, v1).StoreUnsafe(ref Unsafe.Add(ref dstVRef, i));
        }
    }

    // Scalar tail for the pairs that did not fill a whole vector.
    for (; i < totalUV; i++)
    {
        Unsafe.Add(ref dstURef, i) = Unsafe.Add(ref srcRef, i * 2);
        Unsafe.Add(ref dstVRef, i) = Unsafe.Add(ref srcRef, i * 2 + 1);
    }
}
604+
605+
/// <summary>
/// Splits the 16 interleaved byte pairs held in a Vector256 into one Vector128 of U bytes
/// and one Vector128 of V bytes.
/// Input:  [U0,V0,U1,V1,...,U15,V15]
/// Output: U=[U0,U1,...,U15], V=[V0,V1,...,V15]
/// </summary>
/// <param name="uv">32 bytes of interleaved U/V samples.</param>
/// <returns>The U bytes and the V bytes, each in source order.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector128<byte> u, Vector128<byte> v) DeinterleaveVector256(Vector256<byte> uv)
{
    // Pairs 0-7 sit in the lower 128 bits, pairs 8-15 in the upper 128 bits;
    // each half is de-interleaved on its own.
    (Vector64<byte> u, Vector64<byte> v) firstEight = DeinterleaveVector128(uv.GetLower());
    (Vector64<byte> u, Vector64<byte> v) lastEight = DeinterleaveVector128(uv.GetUpper());

    // Re-join the matching components, lower half first, to keep source order.
    return (
        Vector128.Create(firstEight.u, lastEight.u),
        Vector128.Create(firstEight.v, lastEight.v));
}
626+
627+
/// <summary>
/// Splits the 8 interleaved byte pairs held in a Vector128 into one Vector64 of U bytes
/// and one Vector64 of V bytes.
/// Input:  [U0,V0,U1,V1,U2,V2,U3,V3,U4,V4,U5,V5,U6,V6,U7,V7]
/// Output: U=[U0,...,U7], V=[V0,...,V7]
/// </summary>
/// <param name="uv">16 bytes of interleaved U/V samples.</param>
/// <returns>The bytes from even positions (U) and the bytes from odd positions (V).</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector64<byte> u, Vector64<byte> v) DeinterleaveVector128(Vector128<byte> uv)
{
    // One shuffle moves the even-indexed bytes into lanes 0-7 and the odd-indexed
    // bytes into lanes 8-15, so the two planes end up in separate 64-bit halves.
    Vector128<byte> evensThenOdds = Vector128.Shuffle(
        uv,
        Vector128.Create((byte)0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15));

    Vector64<byte> uBytes = evensThenOdds.GetLower();
    Vector64<byte> vBytes = evensThenOdds.GetUpper();
    return (uBytes, vBytes);
}
646+
#endif
647+
529648
/// <summary>
530649
/// Converts an I420 sample to an NV12 formatted sample.
531650
/// I420: Y plane followed by U plane, then V plane (planar format).
@@ -557,6 +676,10 @@ public static byte[] I420toNV12(byte[] i420, int width, int height, int dop = 1)
557676
int i420VOffset = ySize + uvWidth * uvHeight;
558677
int nv12UvOffset = ySize;
559678

679+
#if NET8_0_OR_GREATER
680+
// Use SIMD for interleaving U and V planes when available
681+
InterleaveUVSimd(i420, i420UOffset, i420VOffset, nv12, nv12UvOffset, uvWidth, uvHeight);
682+
#else
560683
if (!_optDOP.ContainsKey(dop))
561684
_optDOP[dop] = new ParallelOptions() { MaxDegreeOfParallelism = dop };
562685

@@ -573,8 +696,129 @@ public static byte[] I420toNV12(byte[] i420, int width, int height, int dop = 1)
573696
nv12[nv12Posn + 1] = i420[i420VPosn]; // V
574697
}
575698
});
699+
#endif
576700

577701
return nv12;
578702
}
703+
704+
#if NET8_0_OR_GREATER
705+
/// <summary>
/// Interleaves the separate U and V planes of an I420 frame into the single UVUVUV... chroma
/// plane used by NV12, using Vector256/Vector128 operations when they are hardware accelerated
/// and a scalar loop for whatever is left over.
/// </summary>
/// <param name="src">Buffer containing both planar inputs.</param>
/// <param name="srcUOffset">Index in <paramref name="src"/> of the first U byte.</param>
/// <param name="srcVOffset">Index in <paramref name="src"/> of the first V byte.</param>
/// <param name="dst">Buffer that receives the interleaved UV bytes.</param>
/// <param name="dstOffset">Index in <paramref name="dst"/> at which the interleaved plane is written.</param>
/// <param name="uvWidth">Chroma plane width; multiplied by <paramref name="uvHeight"/> to get the number of UV pairs.</param>
/// <param name="uvHeight">Chroma plane height.</param>
/// <exception cref="ArgumentNullException"><paramref name="src"/> or <paramref name="dst"/> is null.</exception>
/// <exception cref="ArgumentException">
/// An offset is negative, or a buffer is too small to hold the requested number of UV pairs.
/// </exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void InterleaveUVSimd(byte[] src, int srcUOffset, int srcVOffset, byte[] dst, int dstOffset, int uvWidth, int uvHeight)
{
    ArgumentNullException.ThrowIfNull(src);
    ArgumentNullException.ThrowIfNull(dst);

    // Multiply in 64 bits so that oversized dimensions cannot wrap around to a small count.
    long pairCount = (long)uvWidth * uvHeight;
    if (pairCount <= 0)
    {
        return;
    }

    // Everything below uses unchecked ref arithmetic, so the array bounds checks that the
    // indexer-based implementation relied on have to be performed explicitly up front.
    if (srcUOffset < 0 || srcVOffset < 0
        || src.Length - (long)srcUOffset < pairCount
        || src.Length - (long)srcVOffset < pairCount)
    {
        throw new ArgumentException("Source buffer is too small for the requested U and V planes.", nameof(src));
    }

    if (dstOffset < 0 || dst.Length - (long)dstOffset < pairCount * 2)
    {
        throw new ArgumentException("Destination buffer is too small for the requested UV plane.", nameof(dst));
    }

    // Safe: pairCount * 2 fits inside dst.Length, which is an int.
    int totalUV = (int)pairCount;
    int i = 0;

    ref byte srcURef = ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(src), srcUOffset);
    ref byte srcVRef = ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(src), srcVOffset);
    ref byte dstRef = ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(dst), dstOffset);

    // 32 U and 32 V bytes in, 64 interleaved bytes out per iteration.
    if (Vector256.IsHardwareAccelerated)
    {
        for (; i <= totalUV - 32; i += 32)
        {
            var u = Vector256.LoadUnsafe(ref Unsafe.Add(ref srcURef, i));
            var v = Vector256.LoadUnsafe(ref Unsafe.Add(ref srcVRef, i));

            var (uv0, uv1) = InterleaveVector256(u, v);

            uv0.StoreUnsafe(ref Unsafe.Add(ref dstRef, i * 2));
            uv1.StoreUnsafe(ref Unsafe.Add(ref dstRef, i * 2 + 32));
        }
    }

    // 16 U and 16 V bytes in, 32 interleaved bytes out per iteration; also picks up a
    // 16..31 pair tail left behind by the Vector256 loop.
    if (Vector128.IsHardwareAccelerated)
    {
        for (; i <= totalUV - 16; i += 16)
        {
            var u = Vector128.LoadUnsafe(ref Unsafe.Add(ref srcURef, i));
            var v = Vector128.LoadUnsafe(ref Unsafe.Add(ref srcVRef, i));

            var (uv0, uv1) = InterleaveVector128(u, v);

            uv0.StoreUnsafe(ref Unsafe.Add(ref dstRef, i * 2));
            uv1.StoreUnsafe(ref Unsafe.Add(ref dstRef, i * 2 + 16));
        }
    }

    // Scalar tail for the pairs that did not fill a whole vector.
    for (; i < totalUV; i++)
    {
        Unsafe.Add(ref dstRef, i * 2) = Unsafe.Add(ref srcURef, i);
        Unsafe.Add(ref dstRef, i * 2 + 1) = Unsafe.Add(ref srcVRef, i);
    }
}
761+
762+
/// <summary>
/// Interleaves 32 U bytes with 32 V bytes, producing 64 bytes of UV pairs split across two Vector256.
/// Input:  U=[U0,U1,...,U31], V=[V0,V1,...,V31]
/// Output: UV0=[U0,V0,...,U15,V15], UV1=[U16,V16,...,U31,V31]
/// </summary>
/// <param name="u">The U samples.</param>
/// <param name="v">The V samples.</param>
/// <returns>The first 16 pairs and the last 16 pairs, in source order.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector256<byte> uv0, Vector256<byte> uv1) InterleaveVector256(Vector256<byte> u, Vector256<byte> v)
{
    // Samples 0-15 live in the lower 128-bit halves; interleaving them fills the first 32 output bytes.
    (Vector128<byte> out0, Vector128<byte> out1) front = InterleaveVector128ToTwo(u.GetLower(), v.GetLower());

    // Samples 16-31 live in the upper 128-bit halves; interleaving them fills the second 32 output bytes.
    (Vector128<byte> out0, Vector128<byte> out1) back = InterleaveVector128ToTwo(u.GetUpper(), v.GetUpper());

    return (
        Vector256.Create(front.out0, front.out1),
        Vector256.Create(back.out0, back.out1));
}
786+
787+
/// <summary>
/// Interleaves 16 U bytes with 16 V bytes, producing 32 bytes of UV pairs split across two Vector128.
/// Input:  U=[U0,U1,...,U15], V=[V0,V1,...,V15]
/// Output: UV0=[U0,V0,...,U7,V7], UV1=[U8,V8,...,U15,V15]
/// </summary>
/// <param name="u">The U samples.</param>
/// <param name="v">The V samples.</param>
/// <returns>The pair of vectors produced by <c>InterleaveVector128ToTwo</c> for the same arguments.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector128<byte> uv0, Vector128<byte> uv1) InterleaveVector128(Vector128<byte> u, Vector128<byte> v)
    => InterleaveVector128ToTwo(u, v); // U/V-named front end; the generic helper does the work.
797+
798+
/// <summary>
/// Interleaves two 16-byte vectors element by element, yielding 32 bytes across two Vector128.
/// Input:  A=[A0,A1,...,A15], B=[B0,B1,...,B15]
/// Output: Out0=[A0,B0,A1,B1,...,A7,B7], Out1=[A8,B8,...,A15,B15]
/// </summary>
/// <param name="a">Vector whose bytes land on the even output positions.</param>
/// <param name="b">Vector whose bytes land on the odd output positions.</param>
/// <returns>The interleaved first eight pairs and the interleaved last eight pairs.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (Vector128<byte> out0, Vector128<byte> out1) InterleaveVector128ToTwo(Vector128<byte> a, Vector128<byte> b)
{
    // Index 255 is out of range for a 16-lane shuffle; those lanes are relied on to come
    // back as zero so that OR-ing the A and B shuffles merges them without interference.

    // First eight pairs: A0..A7 on even lanes, B0..B7 on odd lanes.
    Vector128<byte> spreadFrontA = Vector128.Shuffle(
        a, Vector128.Create((byte)0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7, 255));
    Vector128<byte> spreadFrontB = Vector128.Shuffle(
        b, Vector128.Create((byte)255, 0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 7));

    // Last eight pairs: A8..A15 on even lanes, B8..B15 on odd lanes.
    Vector128<byte> spreadBackA = Vector128.Shuffle(
        a, Vector128.Create((byte)8, 255, 9, 255, 10, 255, 11, 255, 12, 255, 13, 255, 14, 255, 15, 255));
    Vector128<byte> spreadBackB = Vector128.Shuffle(
        b, Vector128.Create((byte)255, 8, 255, 9, 255, 10, 255, 11, 255, 12, 255, 13, 255, 14, 255, 15));

    return (spreadFrontA | spreadFrontB, spreadBackA | spreadBackB);
}
822+
#endif
579823
}
580824
}

0 commit comments

Comments
 (0)