Skip to content

Commit 64ccba4

Browse files
committed
AVX2/SSE2 optimizations in VideoWidget
1 parent 3093e0f commit 64ccba4

3 files changed

Lines changed: 120 additions & 0 deletions

File tree

src/util.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@
5959
#include <windows.h>
6060
#endif
6161

#if defined(__x86_64__) || defined(_M_AMD64)
#include <immintrin.h>
#if defined(_MSC_VER)
// MSVC declares __cpuid/__cpuidex in <intrin.h>; <immintrin.h> alone is not
// guaranteed to provide them.
#include <intrin.h>
#endif
#endif
6266
#ifdef Q_OS_MAC
6367
static constexpr unsigned int kLowMemoryThresholdPercent = 10U;
6468
#else
@@ -1260,3 +1264,30 @@ bool Util::openUrl(const QUrl &url)
12601264
#endif
12611265
return success;
12621266
}
1267+
1268+
/// Returns true when both the CPU and the OS support AVX2.
/// The probe runs once; the result is cached in a function-local static.
bool Util::cpuHasAVX2()
{
#if defined(__GNUC__) || defined(__clang__)
    // The compiler builtin performs the full cpuid/xgetbv probe for us.
    static const bool supported = __builtin_cpu_supports("avx2") != 0;
    return supported;
#elif defined(_MSC_VER)
    static const bool supported = []() {
        int regs[4]; // EAX, EBX, ECX, EDX
        // Leaf 1, ECX: bit 27 = OSXSAVE, bit 28 = AVX. Both must be set
        // before _xgetbv may be executed (it faults without OSXSAVE).
        __cpuid(regs, 1);
        const bool osxsave = (regs[2] & (1 << 27)) != 0;
        const bool avx = (regs[2] & (1 << 28)) != 0;
        if (!osxsave || !avx)
            return false;
        // XCR0 bits 1-2: OS saves/restores XMM and YMM state on context switch.
        if ((_xgetbv(0) & 0x6) != 0x6)
            return false;
        // The structured extended feature leaf (7) must exist before querying it.
        __cpuid(regs, 0);
        if (regs[0] < 7)
            return false;
        // Leaf 7 subleaf 0, EBX bit 5 = AVX2.
        __cpuidex(regs, 7, 0);
        return (regs[1] & (1 << 5)) != 0;
    }();
    return supported;
#else
    return false;
#endif
}

src/util.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ class Util
109109
static bool isChromiumAvailable();
110110
static bool startDetached(const QString &program, const QStringList &arguments);
111111
static bool openUrl(const QUrl &url);
112+
static bool cpuHasAVX2();
112113
};
113114

114115
#endif // UTIL_H

src/videowidget.cpp

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "qmltypes/qmlfilter.h"
2424
#include "qmltypes/qmlutilities.h"
2525
#include "settings.h"
26+
#include "util.h"
2627

2728
#include <Mlt.h>
2829
#include <QOffscreenSurface>
@@ -36,6 +37,10 @@
3637
#include <arm_neon.h>
3738
#endif
3839

40+
#if defined(__x86_64__) || defined(_M_AMD64)
41+
#include <immintrin.h>
42+
#endif
43+
3944
using namespace Mlt;
4045

4146
VideoWidget::VideoWidget(QObject *parent)
@@ -557,6 +562,79 @@ void VideoWidget::setVideoSink(QVideoSink *sink)
557562
pushFrameToSink(m_sharedFrame);
558563
}
559564

565+
#if defined(__x86_64__) || defined(_M_AMD64)
566+
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("avx2")))
#endif
static void
shiftYPlane_AVX2(const uint16_t *src, uint16_t *dst, int n)
{
    // Left-shift every 16-bit luma sample by 6 (presumably aligning 10-bit
    // data to the MSBs of 16-bit storage — matches the scalar fallback),
    // 16 samples per 256-bit register.
    int idx = 0;
    const int vectorEnd = n - 15;
    while (idx < vectorEnd) {
        const __m256i samples = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + idx));
        const __m256i shifted = _mm256_slli_epi16(samples, 6);
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + idx), shifted);
        idx += 16;
    }
    // Scalar tail for the remaining < 16 samples.
    while (idx < n) {
        dst[idx] = src[idx] << 6;
        ++idx;
    }
}
580+
581+
#if defined(__GNUC__) || defined(__clang__)
__attribute__((target("avx2")))
#endif
static void
interleaveUVPlanes_AVX2(const uint16_t *srcU, const uint16_t *srcV, uint16_t *dst, int n)
{
    // Merge separate U and V planes into interleaved UV pairs, left-shifting
    // each sample by 6 on the way (same transform as the scalar tail).
    //
    // The AVX2 unpack instructions interleave within each 128-bit lane, so a
    // cross-lane permute restores linear order:
    //   unpacklo(u,v): lane0 = u0v0..u3v3,  lane1 = u8v8..u11v11
    //   unpackhi(u,v): lane0 = u4v4..u7v7,  lane1 = u12v12..u15v15
    //   permute2x128(lo, hi, 0x20) -> u0v0..u7v7
    //   permute2x128(lo, hi, 0x31) -> u8v8..u15v15
    int idx = 0;
    for (; idx + 16 <= n; idx += 16) {
        const __m256i uVec
            = _mm256_slli_epi16(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(srcU + idx)),
                                6);
        const __m256i vVec
            = _mm256_slli_epi16(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(srcV + idx)),
                                6);
        const __m256i lowPairs = _mm256_unpacklo_epi16(uVec, vVec);
        const __m256i highPairs = _mm256_unpackhi_epi16(uVec, vVec);
        uint16_t *out = dst + 2 * idx;
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
                            _mm256_permute2x128_si256(lowPairs, highPairs, 0x20));
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 16),
                            _mm256_permute2x128_si256(lowPairs, highPairs, 0x31));
    }
    // Scalar tail for the remaining < 16 sample pairs.
    for (; idx < n; ++idx) {
        dst[2 * idx] = srcU[idx] << 6;
        dst[2 * idx + 1] = srcV[idx] << 6;
    }
}
610+
611+
// Left-shift every 16-bit luma sample by 6, eight samples (one 128-bit
// register) per iteration. SSE2 is part of the x86-64 baseline, so this
// fallback needs no runtime CPU check.
static void shiftYPlane_SSE2(const uint16_t *src, uint16_t *dst, int n)
{
    int idx = 0;
    while (idx + 8 <= n) {
        const __m128i samples = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + idx));
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + idx), _mm_slli_epi16(samples, 6));
        idx += 8;
    }
    // Scalar tail for the remaining < 8 samples.
    while (idx < n) {
        dst[idx] = src[idx] << 6;
        ++idx;
    }
}
621+
622+
// Merge separate U and V planes into interleaved UV pairs, left-shifting each
// sample by 6 on the way. Unlike the AVX2 variant, the SSE2 unpack
// instructions already produce linear order, so no permute is needed.
static void interleaveUVPlanes_SSE2(const uint16_t *srcU, const uint16_t *srcV, uint16_t *dst, int n)
{
    int idx = 0;
    while (idx + 8 <= n) {
        const __m128i uVec
            = _mm_slli_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i *>(srcU + idx)), 6);
        const __m128i vVec
            = _mm_slli_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i *>(srcV + idx)), 6);
        uint16_t *out = dst + 2 * idx;
        _mm_storeu_si128(reinterpret_cast<__m128i *>(out), _mm_unpacklo_epi16(uVec, vVec));
        _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 8), _mm_unpackhi_epi16(uVec, vVec));
        idx += 8;
    }
    // Scalar tail for the remaining < 8 sample pairs.
    while (idx < n) {
        dst[2 * idx] = srcU[idx] << 6;
        dst[2 * idx + 1] = srcV[idx] << 6;
        ++idx;
    }
}
636+
#endif // defined(__x86_64__) || defined(_M_AMD64)
637+
560638
void VideoWidget::pushFrameToSink(const SharedFrame &frame)
561639
{
562640
if (!m_videoSink)
@@ -621,6 +699,11 @@ void VideoWidget::pushFrameToSink(const SharedFrame &frame)
621699
}
622700
for (; i < ySamples; ++i)
623701
dstY[i] = srcY[i] << 6;
702+
#elif defined(__x86_64__) || defined(_M_AMD64)
703+
if (Util::cpuHasAVX2())
704+
shiftYPlane_AVX2(srcY, dstY, ySamples);
705+
else
706+
shiftYPlane_SSE2(srcY, dstY, ySamples);
624707
#else
625708
for (int i = 0; i < ySamples; ++i) {
626709
dstY[i] = srcY[i] << 6;
@@ -644,6 +727,11 @@ void VideoWidget::pushFrameToSink(const SharedFrame &frame)
644727
dstUV[2 * j] = srcU[j] << 6;
645728
dstUV[2 * j + 1] = srcV[j] << 6;
646729
}
730+
#elif defined(__x86_64__) || defined(_M_AMD64)
731+
if (Util::cpuHasAVX2())
732+
interleaveUVPlanes_AVX2(srcU, srcV, dstUV, uvSamples);
733+
else
734+
interleaveUVPlanes_SSE2(srcU, srcV, dstUV, uvSamples);
647735
#else
648736
for (int i = 0; i < uvSamples; ++i) {
649737
dstUV[2 * i] = srcU[i] << 6;

0 commit comments

Comments
 (0)