Skip to content

Commit 7cd99c6

Browse files
committed
moved from planar conversions from pixfmt_conv
removed all ad hoc optimizations: - -Ofast - removed by context (pixfmt_conv compiles with), _but_ see previous commit - even in pixfmt_conv it was not actually used - remove ALWAYS_INLINE + OPTIMIZED_FOR - from measurements it doesn't seem to make some difference
1 parent 791f13f commit 7cd99c6

8 files changed

Lines changed: 417 additions & 312 deletions

File tree

Makefile.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ COMMON_OBJS = \
136136
src/crypto/md5.o \
137137
src/crypto/random.o \
138138
src/export.o \
139+
src/from_planar.o \
139140
src/ihdtv/ihdtv.o \
140141
src/lib_common.o \
141142
src/module.o \

src/from_planar.c

Lines changed: 341 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,341 @@
1+
/**
2+
 * @file from_planar.c
3+
* @author Martin Pulec <pulec@cesnet.cz>
4+
*/
5+
/*
6+
* Copyright (c) 2026 CESNET, zájmové sdružení právnických osob
7+
* All rights reserved.
8+
*
9+
* Redistribution and use in source and binary forms, with or without
10+
* modification, is permitted provided that the following conditions
11+
* are met:
12+
*
13+
* 1. Redistributions of source code must retain the above copyright
14+
* notice, this list of conditions and the following disclaimer.
15+
*
16+
* 2. Redistributions in binary form must reproduce the above copyright
17+
* notice, this list of conditions and the following disclaimer in the
18+
* documentation and/or other materials provided with the distribution.
19+
*
20+
* 3. Neither the name of CESNET nor the names of its contributors may be
21+
* used to endorse or promote products derived from this software without
22+
* specific prior written permission.
23+
*
24+
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS
25+
* "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING,
26+
* BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
27+
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
28+
* EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
29+
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
30+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
31+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32+
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
34+
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
35+
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36+
*/
37+
38+
39+
#include "from_planar.h"
40+
41+
#include <assert.h> // for assert
42+
#include <stdint.h> // for uint32_t, uintptr_t, uint16_t, uint64_t
43+
#include <string.h> // for memcpy
44+
45+
#include "compat/c23.h" // for size_t, NULL, countof, nullptr, ptrdiff_t
46+
#include "compat/endian.h" // BYTE_ORDER, BIG_ENDIAN
47+
#include "types.h" // for depth
48+
#include "utils/misc.h" // for get_cpu_core_count
49+
#include "utils/worker.h" // for task_run_parallel
50+
51+
/*
 * Maps a byte index (0-3) inside a 32-bit output word to its position in
 * memory: mirrored within the word when BYTE_ORDER equals BIG_ENDIAN,
 * unchanged otherwise.
 */
#if BYTE_ORDER == BIG_ENDIAN
#define BYTE_SWAP(x) (3 - (x))
#else
#define BYTE_SWAP(x) (x)
#endif

/**
 * Packs three planes of 16-bit samples into 36-byte groups of 8 pixels.
 *
 * Every sample is first reduced to 12 bits (shifted right by in_depth - 12);
 * the R, G and B values of consecutive pixels are then concatenated
 * least-significant bits first. BYTE_SWAP() decides where each byte lands
 * inside its 32-bit word.
 *
 * @param out_data    destination; each started group of 8 pixels writes a
 *                    full 36 bytes, so a row needs 36 * ceil(width / 8) bytes
 * @param out_pitch   distance in bytes between consecutive output rows
 * @param in_data     plane base pointers, indexed by rind / gind / bind
 * @param in_linesize distance in bytes between rows of each plane (even)
 * @param width       pixels per row; if it is not a multiple of 8, the
 *                    missing pixels of the last group are packed as zeros
 * @param height      number of rows
 * @param in_depth    significant bits per input sample (12 to 16)
 * @param rind        index of the red plane in in_data / in_linesize
 * @param gind        index of the green plane
 * @param bind        index of the blue plane
 */
static void
gbrpXXle_to_r12l(unsigned char *const restrict out_data, const int out_pitch,
                 const unsigned char *const restrict *const restrict in_data,
                 const int *const restrict in_linesize, const int width,
                 const int height, const int in_depth, int rind, int gind, int bind)
{
        // S() shifts by (in_depth - 12); outside 12..16 the shift would be
        // negative or drop valid bits of a 16-bit sample
        assert(in_depth >= 12 && in_depth <= 16);
        assert((uintptr_t) in_linesize[0] % 2 == 0);
        assert((uintptr_t) in_linesize[1] % 2 == 0);
        assert((uintptr_t) in_linesize[2] % 2 == 0);

#define S(x) ((x) >> (in_depth - 12))
        // clang-format off
        for (ptrdiff_t y = 0; y < height; ++y) {
                const uint16_t *src_r = (const void *) (in_data[rind] + (in_linesize[rind] * y));
                const uint16_t *src_g = (const void *) (in_data[gind] + (in_linesize[gind] * y));
                const uint16_t *src_b = (const void *) (in_data[bind] + (in_linesize[bind] * y));
                unsigned char *dst = out_data + (y * out_pitch);

                for (int x = 0; x < width; x += 8) {
                        uint16_t tmpbuf[3][8];
                        const int remaining = width - x;
                        if (remaining < 8) {
                                // Incomplete last group: continue from a
                                // zero-padded copy so that neither the source
                                // rows are overread nor indeterminate values
                                // are packed into the output.
                                const size_t tail_bytes =
                                    sizeof(uint16_t) * (size_t) remaining;
                                memset(tmpbuf, 0, sizeof tmpbuf);
                                memcpy(tmpbuf[0], src_r, tail_bytes);
                                memcpy(tmpbuf[1], src_g, tail_bytes);
                                memcpy(tmpbuf[2], src_b, tail_bytes);
                                src_r = tmpbuf[0];
                                src_g = tmpbuf[1];
                                src_b = tmpbuf[2];
                        }
                        // two 12-bit values occupy three bytes: low 8 bits of
                        // the first, a shared nibble byte, high 8 bits of the
                        // second
                        dst[BYTE_SWAP(0)] = S(*src_r) & 0xff;
                        dst[BYTE_SWAP(1)] = (S(*src_g) & 0xf) << 4 | S(*src_r++) >> 8;
                        dst[BYTE_SWAP(2)] = S(*src_g++) >> 4;
                        dst[BYTE_SWAP(3)] = S(*src_b) & 0xff;
                        dst[4 + BYTE_SWAP(0)] = (S(*src_r) & 0xf) << 4 | S(*src_b++) >> 8;
                        dst[4 + BYTE_SWAP(1)] = S(*src_r++) >> 4;
                        dst[4 + BYTE_SWAP(2)] = S(*src_g) & 0xff;
                        dst[4 + BYTE_SWAP(3)] = (S(*src_b) & 0xf) << 4 | S(*src_g++) >> 8;
                        dst[8 + BYTE_SWAP(0)] = S(*src_b++) >> 4;
                        dst[8 + BYTE_SWAP(1)] = S(*src_r) & 0xff;
                        dst[8 + BYTE_SWAP(2)] = (S(*src_g) & 0xf) << 4 | S(*src_r++) >> 8;
                        dst[8 + BYTE_SWAP(3)] = S(*src_g++) >> 4;
                        dst[12 + BYTE_SWAP(0)] = S(*src_b) & 0xff;
                        dst[12 + BYTE_SWAP(1)] = (S(*src_r) & 0xf) << 4 | S(*src_b++) >> 8;
                        dst[12 + BYTE_SWAP(2)] = S(*src_r++) >> 4;
                        dst[12 + BYTE_SWAP(3)] = S(*src_g) & 0xff;
                        dst[16 + BYTE_SWAP(0)] = (S(*src_b) & 0xf) << 4 | S(*src_g++) >> 8;
                        dst[16 + BYTE_SWAP(1)] = S(*src_b++) >> 4;
                        dst[16 + BYTE_SWAP(2)] = S(*src_r) & 0xff;
                        dst[16 + BYTE_SWAP(3)] = (S(*src_g) & 0xf) << 4 | S(*src_r++) >> 8;
                        dst[20 + BYTE_SWAP(0)] = S(*src_g++) >> 4;
                        dst[20 + BYTE_SWAP(1)] = S(*src_b) & 0xff;
                        dst[20 + BYTE_SWAP(2)] = (S(*src_r) & 0xf) << 4 | S(*src_b++) >> 8;
                        dst[20 + BYTE_SWAP(3)] = S(*src_r++) >> 4;
                        dst[24 + BYTE_SWAP(0)] = S(*src_g) & 0xff;
                        dst[24 + BYTE_SWAP(1)] = (S(*src_b) & 0xf) << 4 | S(*src_g++) >> 8;
                        dst[24 + BYTE_SWAP(2)] = S(*src_b++) >> 4;
                        dst[24 + BYTE_SWAP(3)] = S(*src_r) & 0xff;
                        dst[28 + BYTE_SWAP(0)] = (S(*src_g) & 0xf) << 4 | S(*src_r++) >> 8;
                        dst[28 + BYTE_SWAP(1)] = S(*src_g++) >> 4;
                        dst[28 + BYTE_SWAP(2)] = S(*src_b) & 0xff;
                        dst[28 + BYTE_SWAP(3)] = (S(*src_r) & 0xf) << 4 | S(*src_b++) >> 8;
                        dst[32 + BYTE_SWAP(0)] = S(*src_r++) >> 4;
                        dst[32 + BYTE_SWAP(1)] = S(*src_g) & 0xff;
                        dst[32 + BYTE_SWAP(2)] = (S(*src_b) & 0xf) << 4 | S(*src_g++) >> 8;
                        dst[32 + BYTE_SWAP(3)] = S(*src_b++) >> 4;
                        dst += 36;
                }
        }
        // clang-format on
#undef S
}
129+
130+
/**
131+
* test with:
132+
* @code{.sh}
133+
* uv -t testcard:c=R12L -c lavc:e=libx265 -p change_pixfmt:RGBA -d gl
134+
* uv -t testcard:s=511x512c=R12L -c lavc:e=libx265 -p change_pixfmt:RGBA -d gl # irregular sz
135+
* # optionally also `--param decoder-use-codec=R12L` to ensure decoded codec
136+
* @endcode
137+
*/
138+
void
139+
gbrp12le_to_r12l(unsigned char *out_data, int out_pitch,
140+
const unsigned char *const *in_data, const int *in_linesize,
141+
int width, int height)
142+
{
143+
gbrpXXle_to_r12l(out_data, out_pitch, in_data, in_linesize, width,
144+
height, DEPTH12, 2, 0, 1);
145+
}
146+
147+
void
148+
gbrp16le_to_r12l(unsigned char *out_data, int out_pitch,
149+
const unsigned char *const *in_data, const int *in_linesize,
150+
int width, int height)
151+
{
152+
gbrpXXle_to_r12l(out_data, out_pitch, in_data, in_linesize, width,
153+
height, DEPTH16, 2, 0, 1);
154+
}
155+
156+
void
157+
rgbp12le_to_r12l(unsigned char *out_data, int out_pitch,
158+
const unsigned char *const *in_data, const int *in_linesize,
159+
int width, int height)
160+
{
161+
gbrpXXle_to_r12l(out_data, out_pitch, in_data, in_linesize, width,
162+
height, DEPTH12, 0, 1, 2);
163+
}
164+
165+
/**
 * Interleaves three planes of 16-bit samples into packed 16-bit R, G, B
 * triplets, shifting every sample left by (16 - in_depth) bits.
 *
 * @param out_data    destination (2-byte aligned); 6 bytes are written per pixel
 * @param out_pitch   distance in bytes between consecutive output rows
 * @param in_data     plane base pointers (2-byte aligned), indexed by
 *                    rind / gind / bind
 * @param in_linesize distance in bytes between rows of each plane
 * @param width       pixels per row
 * @param height      number of rows
 * @param in_depth    significant bits per input sample (at most 16)
 * @param rind        index of the red plane in in_data / in_linesize
 * @param gind        index of the green plane
 * @param bind        index of the blue plane
 */
static void
rgbpXXle_to_rg48(unsigned char *out_data, int out_pitch,
                 const unsigned char *const *in_data, const int *in_linesize,
                 int width, int height, unsigned int in_depth, int rind,
                 int gind, int bind)
{
        // 16U - in_depth would wrap for deeper input and make the shift
        // count undefined
        assert(in_depth <= 16U);
        assert((uintptr_t) out_data % 2 == 0);
        assert((uintptr_t) in_data[0] % 2 == 0);
        assert((uintptr_t) in_data[1] % 2 == 0);
        assert((uintptr_t) in_data[2] % 2 == 0);

        const unsigned int shift = 16U - in_depth; // same for every sample

        for (ptrdiff_t y = 0; y < height; ++y) {
                const uint16_t *src_r = (const void *) (in_data[rind] + (in_linesize[rind] * y));
                const uint16_t *src_g = (const void *) (in_data[gind] + (in_linesize[gind] * y));
                const uint16_t *src_b = (const void *) (in_data[bind] + (in_linesize[bind] * y));
                uint16_t *dst = (void *) (out_data + (y * out_pitch));

                for (int x = 0; x < width; ++x) {
                        // shift in unsigned arithmetic; the store keeps the
                        // low 16 bits
                        *dst++ = (uint16_t) ((unsigned int) *src_r++ << shift);
                        *dst++ = (uint16_t) ((unsigned int) *src_g++ << shift);
                        *dst++ = (uint16_t) ((unsigned int) *src_b++ << shift);
                }
        }
}
189+
190+
void
191+
gbrp10le_to_rg48(unsigned char *out_data, int out_pitch,
192+
const unsigned char *const *in_data, const int *in_linesize,
193+
int width, int height)
194+
{
195+
rgbpXXle_to_rg48(out_data, out_pitch, in_data, in_linesize, width,
196+
height, DEPTH10, 2, 0, 1);
197+
}
198+
199+
void
200+
gbrp12le_to_rg48(unsigned char *out_data, int out_pitch,
201+
const unsigned char *const *in_data, const int *in_linesize,
202+
int width, int height)
203+
{
204+
rgbpXXle_to_rg48(out_data, out_pitch, in_data, in_linesize, width,
205+
height, DEPTH12, 2, 0, 1);
206+
}
207+
208+
void
209+
gbrp16le_to_rg48(unsigned char *out_data, int out_pitch,
210+
const unsigned char *const *in_data, const int *in_linesize,
211+
int width, int height)
212+
{
213+
rgbpXXle_to_rg48(out_data, out_pitch, in_data, in_linesize, width,
214+
height, DEPTH16, 2, 0, 1);
215+
}
216+
217+
void
218+
rgbp12le_to_rg48(unsigned char *out_data, int out_pitch,
219+
const unsigned char *const *in_data, const int *in_linesize,
220+
int width, int height)
221+
{
222+
rgbpXXle_to_rg48(out_data, out_pitch, in_data, in_linesize, width,
223+
height, DEPTH12, 0, 1, 2);
224+
}
225+
226+
/**
 * Packs three planes of 16-bit samples into 4 bytes per pixel: the top
 * 10 bits of red, green and blue (relative to in_depth) are concatenated
 * most-significant bits first, and the two lowest bits of the fourth byte
 * are always set.
 *
 * @param out_data    destination; 4 bytes are written per pixel
 * @param out_pitch   distance in bytes between consecutive output rows
 * @param in_data     plane base pointers, indexed by rind / gind / bind
 * @param in_linesize distance in bytes between rows of each plane (even)
 * @param width       pixels per row
 * @param height      number of rows
 * @param in_depth    significant bits per input sample (10 to 16)
 * @param rind        index of the red plane in in_data / in_linesize
 * @param gind        index of the green plane
 * @param bind        index of the blue plane
 */
static void
gbrpXXle_to_r10k(unsigned char *out_data, int out_pitch,
                 const unsigned char *const *in_data, const int *in_linesize,
                 const int width, const int height, const unsigned int in_depth,
                 int rind, int gind, int bind)
{
        // (in_depth - 10U) would wrap for shallower input and make the
        // shift count undefined; samples are 16-bit, so 16 is the ceiling
        assert(in_depth >= 10U && in_depth <= 16U);
        assert((uintptr_t) in_linesize[0] % 2 == 0);
        assert((uintptr_t) in_linesize[1] % 2 == 0);
        assert((uintptr_t) in_linesize[2] % 2 == 0);

        for (ptrdiff_t y = 0; y < height; ++y) {
                const uint16_t *src_r = (const void *) (in_data[rind] + (in_linesize[rind] * y));
                const uint16_t *src_g = (const void *) (in_data[gind] + (in_linesize[gind] * y));
                const uint16_t *src_b = (const void *) (in_data[bind] + (in_linesize[bind] * y));
                unsigned char *dst = out_data + (y * out_pitch);

                for (int x = 0; x < width; ++x) {
                        const unsigned int r = *src_r++;
                        const unsigned int g = *src_g++;
                        const unsigned int b = *src_b++;

                        // byte 0: red bits 9..2
                        *dst++ = r >> (in_depth - 8U);
                        // byte 1: red bits 1..0, green bits 9..4
                        *dst++ = ((r >> (in_depth - 10U)) & 0x3U) << 6U | g >> (in_depth - 6U);
                        // byte 2: green bits 3..0, blue bits 9..6
                        *dst++ = ((g >> (in_depth - 10U)) & 0xFU) << 4U | b >> (in_depth - 4U);
                        // byte 3: blue bits 5..0, then two bits set to 1
                        *dst++ = ((b >> (in_depth - 10U)) & 0x3FU) << 2U | 0x3U;
                }
        }
}
251+
252+
void
253+
gbrp10le_to_r10k(unsigned char *out_data, int out_pitch,
254+
const unsigned char *const *in_data, const int *in_linesize,
255+
int width, int height)
256+
{
257+
gbrpXXle_to_r10k(out_data, out_pitch, in_data, in_linesize, width,
258+
height, DEPTH10, 2, 0, 1);
259+
}
260+
261+
void
262+
gbrp12le_to_r10k(unsigned char *out_data, int out_pitch,
263+
const unsigned char *const *in_data, const int *in_linesize,
264+
int width, int height)
265+
{
266+
gbrpXXle_to_r10k(out_data, out_pitch, in_data, in_linesize, width,
267+
height, DEPTH12, 2, 0, 1);
268+
}
269+
270+
void
271+
gbrp16le_to_r10k(unsigned char *out_data, int out_pitch,
272+
const unsigned char *const *in_data, const int *in_linesize,
273+
int width, int height)
274+
{
275+
gbrpXXle_to_r10k(out_data, out_pitch, in_data, in_linesize, width,
276+
height, DEPTH16, 2, 0, 1);
277+
}
278+
279+
void
280+
rgbp10le_to_r10k(unsigned char *out_data, int out_pitch,
281+
const unsigned char *const *in_data, const int *in_linesize,
282+
int width, int height)
283+
{
284+
gbrpXXle_to_r10k(out_data, out_pitch, in_data, in_linesize, width,
285+
height, DEPTH10, 0, 1, 2);
286+
}
287+
288+
struct convert_task_data {
289+
decode_planar_func_t *convert;
290+
int width;
291+
int height;
292+
const unsigned char *in_data[3];
293+
int in_linesize[3];
294+
unsigned char *out_data;
295+
int pitch;
296+
};
297+
298+
static void *
299+
convert_task(void *arg)
300+
{
301+
struct convert_task_data *d = arg;
302+
d->convert(d->out_data, d->pitch, d->in_data, d->in_linesize, d->width,
303+
d->height);
304+
return nullptr;
305+
}
306+
307+
// destiled from av_to_uv_convert
308+
void decode_planar_parallel(decode_planar_func_t *dec, unsigned char *out_data, int out_pitch,
309+
const unsigned char *const *in_data,
310+
const int *in_linesize, int width,
311+
int height)
312+
{
313+
const unsigned cpu_count = get_cpu_core_count();
314+
315+
struct convert_task_data d[cpu_count];
316+
for (size_t i = 0; i < cpu_count; ++i) {
317+
unsigned row_height = (height / cpu_count) & ~1; // needs to be even
318+
d[i].convert = dec;
319+
d[i].pitch = out_pitch;
320+
d[i].out_data = out_data + (i * row_height * out_pitch);
321+
memcpy(&d[i].in_linesize, in_linesize, sizeof d[i].in_linesize);
322+
323+
for (unsigned plane = 0; plane < countof(d[0].in_data); ++plane) {
324+
if (in_data[plane] == NULL) {
325+
break;
326+
}
327+
const int chroma_subs_log2 = 0;
328+
d[i].in_data[plane] =
329+
in_data[plane] +
330+
((i * row_height * in_linesize[plane]) >>
331+
(plane == 0 ? 0 : chroma_subs_log2));
332+
}
333+
if (i == cpu_count - 1) {
334+
row_height = height - (row_height * (cpu_count - 1));
335+
}
336+
d[i].width = width;
337+
d[i].height = (int) row_height;
338+
}
339+
task_run_parallel(convert_task, (int) cpu_count, d, sizeof d[0], NULL);
340+
}
341+

0 commit comments

Comments
 (0)