2323#include " qmltypes/qmlfilter.h"
2424#include " qmltypes/qmlutilities.h"
2525#include " settings.h"
26+ #include " util.h"
2627
2728#include < Mlt.h>
2829#include < QOffscreenSurface>
3637#include < arm_neon.h>
3738#endif
3839
40+ #if defined(__x86_64__) || defined(_M_AMD64)
41+ #include < immintrin.h>
42+ #endif
43+
3944using namespace Mlt ;
4045
4146VideoWidget::VideoWidget (QObject *parent)
@@ -557,6 +562,79 @@ void VideoWidget::setVideoSink(QVideoSink *sink)
557562 pushFrameToSink (m_sharedFrame);
558563}
559564
565+ #if defined(__x86_64__) || defined(_M_AMD64)
/**
 * Copies @p n 16-bit samples from @p src to @p dst, shifting each one left
 * by 6 bits (bits shifted past bit 15 are discarded).
 *
 * Presumably used to turn 10-bit samples into MSB-aligned 16-bit samples --
 * TODO confirm against the caller.
 *
 * Compiled for AVX2 via a per-function target attribute on GCC/Clang, so it
 * must only be called after a runtime check that the CPU supports AVX2.
 *
 * @param src Source samples; at least @p n elements are read.
 * @param dst Destination samples; at least @p n elements are written.
 * @param n   Number of samples to process; nothing is done when n <= 0.
 */
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("avx2")))
#endif
static void
shiftYPlane_AVX2(const uint16_t *src, uint16_t *dst, int n)
{
    int i = 0;
    // Main loop: 16 samples (one 256-bit register) per iteration, using
    // unaligned loads/stores so no alignment is required of src or dst.
    for (; i + 16 <= n; i += 16) {
        __m256i y = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i), _mm256_slli_epi16(y, 6));
    }
    // Scalar tail for the remaining (n % 16) samples.
    for (; i < n; ++i)
        dst[i] = static_cast<uint16_t>(src[i] << 6);
}
580+
/**
 * Interleaves two planes of 16-bit samples into one buffer, shifting every
 * sample left by 6 bits on the way: dst = u0 v0 u1 v1 ... (each << 6).
 *
 * Compiled for AVX2 via a per-function target attribute on GCC/Clang, so it
 * must only be called after a runtime check that the CPU supports AVX2.
 *
 * @param srcU First source plane; at least @p n elements are read.
 * @param srcV Second source plane; at least @p n elements are read.
 * @param dst  Destination; at least 2 * @p n elements are written.
 * @param n    Number of samples per source plane; nothing is done when n <= 0.
 */
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("avx2")))
#endif
static void
interleaveUVPlanes_AVX2(const uint16_t *srcU, const uint16_t *srcV, uint16_t *dst, int n)
{
    // AVX2 unpack operates within 128-bit lanes; permute to restore linear order.
    //   unpacklo(u,v): lane0 = u0v0u1v1u2v2u3v3, lane1 = u8v8...u11v11
    //   unpackhi(u,v): lane0 = u4v4...u7v7,      lane1 = u12v12...u15v15
    //   permute2x128 0x20 -> [lo.lane0 | hi.lane0] = u0v0..u7v7
    //   permute2x128 0x31 -> [lo.lane1 | hi.lane1] = u8v8..u15v15
    int j = 0;
    // Main loop: 16 samples from each plane in, 32 interleaved samples out.
    for (; j + 16 <= n; j += 16) {
        __m256i u
            = _mm256_slli_epi16(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(srcU + j)), 6);
        __m256i v
            = _mm256_slli_epi16(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(srcV + j)), 6);
        __m256i lo = _mm256_unpacklo_epi16(u, v);
        __m256i hi = _mm256_unpackhi_epi16(u, v);
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + j * 2),
                            _mm256_permute2x128_si256(lo, hi, 0x20));
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + j * 2 + 16),
                            _mm256_permute2x128_si256(lo, hi, 0x31));
    }
    // Scalar tail for the remaining (n % 16) sample pairs.
    for (; j < n; ++j) {
        dst[2 * j] = static_cast<uint16_t>(srcU[j] << 6);
        dst[2 * j + 1] = static_cast<uint16_t>(srcV[j] << 6);
    }
}
610+
/**
 * Copies @p n 16-bit samples from @p src to @p dst, shifting each one left
 * by 6 bits (bits shifted past bit 15 are discarded). Uses 128-bit SSE2
 * operations, 8 samples per iteration.
 *
 * @param src Source samples; at least @p n elements are read.
 * @param dst Destination samples; at least @p n elements are written.
 * @param n   Number of samples to process; nothing is done when n <= 0.
 */
static void shiftYPlane_SSE2(const uint16_t *src, uint16_t *dst, int n)
{
    int i = 0;
    // Main loop: unaligned 128-bit load, shift, unaligned store.
    for (; i + 8 <= n; i += 8) {
        __m128i y = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i));
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + i), _mm_slli_epi16(y, 6));
    }
    // Scalar tail for the remaining (n % 8) samples.
    for (; i < n; ++i)
        dst[i] = static_cast<uint16_t>(src[i] << 6);
}
621+
/**
 * Interleaves two planes of 16-bit samples into one buffer, shifting every
 * sample left by 6 bits on the way: dst = u0 v0 u1 v1 ... (each << 6).
 * Uses 128-bit SSE2 operations, 8 samples from each plane per iteration.
 *
 * @param srcU First source plane; at least @p n elements are read.
 * @param srcV Second source plane; at least @p n elements are read.
 * @param dst  Destination; at least 2 * @p n elements are written.
 * @param n    Number of samples per source plane; nothing is done when n <= 0.
 */
static void interleaveUVPlanes_SSE2(const uint16_t *srcU, const uint16_t *srcV, uint16_t *dst, int n)
{
    int j = 0;
    for (; j + 8 <= n; j += 8) {
        __m128i u = _mm_slli_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i *>(srcU + j)), 6);
        __m128i v = _mm_slli_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i *>(srcV + j)), 6);
        // unpacklo yields u0v0..u3v3, unpackhi yields u4v4..u7v7; storing them
        // back to back gives 16 interleaved samples in linear order.
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + j * 2), _mm_unpacklo_epi16(u, v));
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + j * 2 + 8), _mm_unpackhi_epi16(u, v));
    }
    // Scalar tail for the remaining (n % 8) sample pairs.
    for (; j < n; ++j) {
        dst[2 * j] = static_cast<uint16_t>(srcU[j] << 6);
        dst[2 * j + 1] = static_cast<uint16_t>(srcV[j] << 6);
    }
}
636+ #endif // defined(__x86_64__) || defined(_M_AMD64)
637+
560638void VideoWidget::pushFrameToSink (const SharedFrame &frame)
561639{
562640 if (!m_videoSink)
@@ -621,6 +699,11 @@ void VideoWidget::pushFrameToSink(const SharedFrame &frame)
621699 }
622700 for (; i < ySamples; ++i)
623701 dstY[i] = srcY[i] << 6 ;
702+ #elif defined(__x86_64__) || defined(_M_AMD64)
703+ if (Util::cpuHasAVX2 ())
704+ shiftYPlane_AVX2 (srcY, dstY, ySamples);
705+ else
706+ shiftYPlane_SSE2 (srcY, dstY, ySamples);
624707#else
625708 for (int i = 0 ; i < ySamples; ++i) {
626709 dstY[i] = srcY[i] << 6 ;
@@ -644,6 +727,11 @@ void VideoWidget::pushFrameToSink(const SharedFrame &frame)
644727 dstUV[2 * j] = srcU[j] << 6 ;
645728 dstUV[2 * j + 1 ] = srcV[j] << 6 ;
646729 }
730+ #elif defined(__x86_64__) || defined(_M_AMD64)
731+ if (Util::cpuHasAVX2 ())
732+ interleaveUVPlanes_AVX2 (srcU, srcV, dstUV, uvSamples);
733+ else
734+ interleaveUVPlanes_SSE2 (srcU, srcV, dstUV, uvSamples);
647735#else
648736 for (int i = 0 ; i < uvSamples; ++i) {
649737 dstUV[2 * i] = srcU[i] << 6 ;
0 commit comments