@@ -46,15 +46,6 @@ VMSM_INLINE void VideoMaterialSIMD::RuntimeDispatch() noexcept
4646 else {
4747 m_memcpy_fn = memcpy_sse2;
4848 }
49-
50- // Conversion function dispatch
51- /* */
52- if ( CPUHasAVX2 () ) {
53- m_convert_u16_u8_rrs_fn = convert_u16_u8_rrs_avx2;
54- }
55- else {
56- m_convert_u16_u8_rrs_fn = convert_u16_u8_rrs_sse2;
57- }
5849}
5950
6051// --------------------------------------------------------------------------------------------
@@ -63,7 +54,6 @@ VMSM_INLINE void VideoMaterialSIMD::RuntimeDispatch() noexcept
6354// src - source buffer: AVFrame uint8_t/uint16_t, 32/64-byte aligned
6455// --------------------------------------------------------------------------------------------
6556
66-
6757#define MASK_64B 0x3F // 6: 64-byte blocks (used with __m512i, AVX-512 register = 512 bits)
6858#define MASK_32B 0x1F // 5: 32-byte blocks (used with __m256i, AVX/AVX2 register = 256 bits)
6959#define MASK_16B 0xF // 4: 16-byte blocks (used with __m128i, SSE/SSE2 register = 128 bits)
@@ -119,7 +109,7 @@ VMSM_INLINE void VideoMaterialSIMD::memcpy_sse2( uint8_t *VMSM_RESTRICT dst, uin
119109 mov edi, d
120110
121111 align_loop :
122- movdqa xmm0, [ esi ]
112+ movdqa xmm0, [ esi ]
123113 movdqa xmm1, [ esi + 16 ]
124114 movdqa xmm2, [ esi + 32 ]
125115 movdqa xmm3, [ esi + 48 ]
@@ -142,97 +132,6 @@ VMSM_INLINE void VideoMaterialSIMD::memcpy_sse2( uint8_t *VMSM_RESTRICT dst, uin
142132 memcpy ( d, s, rem );
143133}
144134
145- // --------------------------------------------------------------------------------------------
146- // SIMD Convert U16 to U8 - Right Shift Implementations for Source Engine Video Materials
147- // dst: Source Engine: IVTFTexture uint8_t, 1 byte per pixel, 16-byte aligned
148- // src: FFmpeg: AVFrame uint16_t, 2 bytes per pixel, 32/64-byte aligned
149- // --------------------------------------------------------------------------------------------
150-
151- // -----------------------------------------------------------------
152- // SIMD Convert_U16_U8_RS: AVX2 STREAM Aligned(dst)-Unaligned(src)
153- // -----------------------------------------------------------------
154-
155- VMSM_INLINE void VideoMaterialSIMD::convert_u16_u8_rrs_avx2 (
156- uint8_t *VMSM_RESTRICT dst, const uint16_t *VMSM_RESTRICT src, size_t bts, int sft ) noexcept
157- {
158- const int bs = 1 << ( sft - 1 );
159- const __m256i rnd = _mm256_set1_epi16 ( bs );
160-
161- size_t bts2a = ( 64 - ( reinterpret_cast < uintptr_t >( dst ) & MASK_64B ) ) & MASK_64B ;
162- if ( bts2a > 0 ) {
163- for ( size_t i = 0 ; i < bts2a; ++i ) {
164- dst[ i ] = static_cast < uint8_t >( ( src[ i ] + bs ) >> sft );
165- }
166- dst += bts2a;
167- src += bts2a;
168- bts -= bts2a;
169- }
170-
171- const __m256i *s = reinterpret_cast < const __m256i * >( src );
172- __m256i *d = reinterpret_cast < __m256i * >( dst );
173-
174- size_t blx = bts >> 6 ;
175- const size_t rem = bts & MASK_64B ;
176-
177- while ( blx-- )
178- {
179- __m256i v01 = _mm256_lddqu_si256 ( s++ );
180- __m256i v02 = _mm256_lddqu_si256 ( s++ );
181- __m256i v03 = _mm256_lddqu_si256 ( s++ );
182- __m256i v04 = _mm256_lddqu_si256 ( s++ );
183-
184- v01 = _mm256_srli_epi16 ( _mm256_add_epi16 ( v01, rnd ), sft );
185- v02 = _mm256_srli_epi16 ( _mm256_add_epi16 ( v02, rnd ), sft );
186- v03 = _mm256_srli_epi16 ( _mm256_add_epi16 ( v03, rnd ), sft );
187- v04 = _mm256_srli_epi16 ( _mm256_add_epi16 ( v04, rnd ), sft );
188-
189- _mm256_stream_si256 ( d++, _mm256_permute4x64_epi64 ( _mm256_packus_epi16 ( v01, v02 ), 0xD8 ) );
190- _mm256_stream_si256 ( d++, _mm256_permute4x64_epi64 ( _mm256_packus_epi16 ( v03, v04 ), 0xD8 ) );
191- }
192- _mm_sfence ();
193- if ( rem > 0 ) {
194- for ( size_t i = 0 ; i < rem; ++i ) {
195- reinterpret_cast < uint8_t * >( d )[ i ] =
196- static_cast < uint8_t >( ( reinterpret_cast < const uint16_t * >( s )[ i ] + bs ) >> sft );
197- }
198- }
199- }
200-
201- // -----------------------------------------------------------------
202- // SIMD Convert_U16_U8_RS: SSE2 STREAM Aligned(dst)-Aligned(src)
203- // -----------------------------------------------------------------
204-
205- VMSM_INLINE void VideoMaterialSIMD::convert_u16_u8_rrs_sse2 (
206- uint8_t *VMSM_RESTRICT dst, const uint16_t *VMSM_RESTRICT src, size_t bts, int sft ) noexcept
207- {
208- const int bs = 1 << ( sft - 1 );
209- const __m128i rnd = _mm_set1_epi16 ( bs );
210-
211- const __m128i *s = reinterpret_cast < const __m128i * >( src );
212- __m128i *d = reinterpret_cast < __m128i * >( dst );
213-
214- size_t blx = bts >> 4 ;
215- const size_t rem = bts & MASK_16B ;
216-
217- while ( blx-- )
218- {
219- __m128i v01 = _mm_load_si128 ( s++ );
220- __m128i v02 = _mm_load_si128 ( s++ );
221-
222- v01 = _mm_srli_epi16 ( _mm_add_epi16 ( v01, rnd ), sft );
223- v02 = _mm_srli_epi16 ( _mm_add_epi16 ( v02, rnd ), sft );
224-
225- _mm_stream_si128 ( d++, _mm_packus_epi16 ( v01, v02 ) );
226- }
227- _mm_sfence ();
228- if ( rem > 0 ) {
229- for ( size_t i = 0 ; i < rem; ++i ) {
230- reinterpret_cast < uint8_t * >( d )[ i ] =
231- static_cast < uint8_t >( ( reinterpret_cast < const uint16_t * >( s )[ i ] + bs ) >> sft );
232- }
233- }
234- }
235-
236135// --------------------------------------------------------------------------------------------
237136// CPU Feature Detection
238137// These functions check for the presence of specific SIMD instruction sets
0 commit comments