Skip to content

Commit 39c9e84

Browse files
authored
feat(linux): Add hardware yuv444 chromasubsampling support on nvidia cards (cuda/cuda gl) (#4965)
1 parent 2b440bc commit 39c9e84

14 files changed

Lines changed: 810 additions & 205 deletions

File tree

src/nvenc/nvenc_base.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -183,7 +183,7 @@ namespace nvenc {
183183
};
184184

185185
auto buffer_is_yuv444 = [&]() {
186-
return buffer_format == NV_ENC_BUFFER_FORMAT_AYUV || buffer_format == NV_ENC_BUFFER_FORMAT_YUV444_10BIT;
186+
return buffer_format == NV_ENC_BUFFER_FORMAT_AYUV || buffer_format == NV_ENC_BUFFER_FORMAT_YUV444 || buffer_format == NV_ENC_BUFFER_FORMAT_YUV444_10BIT;
187187
};
188188

189189
{

src/nvenc/nvenc_utils.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,9 @@ namespace nvenc {
4242
case platf::pix_fmt_e::ayuv:
4343
return NV_ENC_BUFFER_FORMAT_AYUV;
4444

45+
case platf::pix_fmt_e::yuv444p:
46+
return NV_ENC_BUFFER_FORMAT_YUV444;
47+
4548
case platf::pix_fmt_e::yuv444p16:
4649
return NV_ENC_BUFFER_FORMAT_YUV444_10BIT;
4750

src/nvhttp.cpp

Lines changed: 35 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -684,6 +684,39 @@ namespace nvhttp {
684684
return true;
685685
}
686686

687+
uint32_t get_codec_mode_flags() {
688+
uint32_t codec_mode_flags = SCM_H264;
689+
if (video::last_encoder_probe_supported_yuv444_for_codec[0]) {
690+
codec_mode_flags |= SCM_H264_HIGH8_444;
691+
}
692+
if (video::active_hevc_mode >= 2) {
693+
codec_mode_flags |= SCM_HEVC;
694+
if (video::last_encoder_probe_supported_yuv444_for_codec[1]) {
695+
codec_mode_flags |= SCM_HEVC_REXT8_444;
696+
}
697+
}
698+
if (video::active_hevc_mode == 3 || video::active_hevc_mode == 5) {
699+
codec_mode_flags |= SCM_HEVC_MAIN10;
700+
}
701+
if ((video::active_hevc_mode == 4 || video::active_hevc_mode == 5) && video::last_encoder_probe_supported_yuv444_for_codec[1]) {
702+
codec_mode_flags |= SCM_HEVC_REXT10_444;
703+
}
704+
705+
if (video::active_av1_mode >= 2) {
706+
codec_mode_flags |= SCM_AV1_MAIN8;
707+
if (video::last_encoder_probe_supported_yuv444_for_codec[2]) {
708+
codec_mode_flags |= SCM_AV1_HIGH8_444;
709+
}
710+
}
711+
if (video::active_av1_mode == 3 || video::active_av1_mode == 5) {
712+
codec_mode_flags |= SCM_AV1_MAIN10;
713+
}
714+
if ((video::active_av1_mode == 4 || video::active_av1_mode == 5) && video::last_encoder_probe_supported_yuv444_for_codec[2]) {
715+
codec_mode_flags |= SCM_AV1_HIGH10_444;
716+
}
717+
return codec_mode_flags;
718+
}
719+
687720
template<class T>
688721
void serverinfo(std::shared_ptr<typename SimpleWeb::ServerBase<T>::Response> response, std::shared_ptr<typename SimpleWeb::ServerBase<T>::Request> request) {
689722
print_req<T>(request);
@@ -735,34 +768,7 @@ namespace nvhttp {
735768
tree.put("root.LocalIP", net::addr_to_normalized_string(local_endpoint.address()));
736769
}
737770

738-
uint32_t codec_mode_flags = SCM_H264;
739-
if (video::last_encoder_probe_supported_yuv444_for_codec[0]) {
740-
codec_mode_flags |= SCM_H264_HIGH8_444;
741-
}
742-
if (video::active_hevc_mode >= 2) {
743-
codec_mode_flags |= SCM_HEVC;
744-
if (video::last_encoder_probe_supported_yuv444_for_codec[1]) {
745-
codec_mode_flags |= SCM_HEVC_REXT8_444;
746-
}
747-
}
748-
if (video::active_hevc_mode >= 3) {
749-
codec_mode_flags |= SCM_HEVC_MAIN10;
750-
if (video::last_encoder_probe_supported_yuv444_for_codec[1]) {
751-
codec_mode_flags |= SCM_HEVC_REXT10_444;
752-
}
753-
}
754-
if (video::active_av1_mode >= 2) {
755-
codec_mode_flags |= SCM_AV1_MAIN8;
756-
if (video::last_encoder_probe_supported_yuv444_for_codec[2]) {
757-
codec_mode_flags |= SCM_AV1_HIGH8_444;
758-
}
759-
}
760-
if (video::active_av1_mode >= 3) {
761-
codec_mode_flags |= SCM_AV1_MAIN10;
762-
if (video::last_encoder_probe_supported_yuv444_for_codec[2]) {
763-
codec_mode_flags |= SCM_AV1_HIGH10_444;
764-
}
765-
}
771+
const uint32_t codec_mode_flags = get_codec_mode_flags();
766772
tree.put("root.ServerCodecModeSupport", codec_mode_flags);
767773

768774
if (!config::nvhttp.external_ip.empty()) {
@@ -815,7 +821,7 @@ namespace nvhttp {
815821
for (auto &proc : proc::proc.get_apps()) {
816822
pt::ptree app;
817823

818-
app.put("IsHdrSupported"s, video::active_hevc_mode == 3 ? 1 : 0);
824+
app.put("IsHdrSupported"s, video::active_hevc_mode >= 3 ? 1 : 0);
819825
app.put("AppTitle"s, proc.name);
820826
app.put("ID", proc.id);
821827

src/platform/common.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -243,6 +243,7 @@ namespace platf {
243243
p010, ///< P010
244244
ayuv, ///< AYUV
245245
yuv444p16, ///< Planar 10-bit (shifted to 16-bit) YUV 4:4:4
246+
yuv444p, ///< Planar 8-bit YUV 4:4:4
246247
y410, ///< Y410
247248
unknown ///< Unknown
248249
};
@@ -259,6 +260,7 @@ namespace platf {
259260
_CONVERT(p010);
260261
_CONVERT(ayuv);
261262
_CONVERT(yuv444p16);
263+
_CONVERT(yuv444p);
262264
_CONVERT(y410);
263265
_CONVERT(unknown);
264266
}

src/platform/linux/cuda.cpp

Lines changed: 113 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -120,8 +120,10 @@ namespace cuda {
120120
this->frame = frame;
121121

122122
auto hwframe_ctx = (AVHWFramesContext *) hw_frames_ctx->data;
123-
if (hwframe_ctx->sw_format != AV_PIX_FMT_NV12) {
124-
BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv;
123+
124+
if (hwframe_ctx->sw_format != AV_PIX_FMT_NV12 &&
125+
hwframe_ctx->sw_format != AV_PIX_FMT_YUV444P) {
126+
BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12 and AV_PIX_FMT_YUV444P"sv;
125127
return -1;
126128
}
127129

@@ -132,6 +134,8 @@ namespace cuda {
132134
}
133135
}
134136

137+
is_yuv444 = (hwframe_ctx->sw_format == AV_PIX_FMT_YUV444P);
138+
135139
auto cuda_ctx = (AVCUDADeviceContext *) hwframe_ctx->device_ctx->hwctx;
136140

137141
stream = make_stream();
@@ -178,7 +182,11 @@ namespace cuda {
178182
return;
179183
}
180184

181-
sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, stream.get(), {frame->width, frame->height, 0, 0});
185+
if (is_yuv444) {
186+
sws.convert_yuv444(frame->data[0], frame->data[1], frame->data[2], frame->linesize[0], tex->texture.linear, stream.get(), {frame->width, frame->height, 0, 0});
187+
} else {
188+
sws.convert_nv12(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture.linear, stream.get(), {frame->width, frame->height, 0, 0});
189+
}
182190
}
183191

184192
cudaTextureObject_t tex_obj(const tex_t &tex) const {
@@ -194,13 +202,18 @@ namespace cuda {
194202
// When height and width don't change, it's not necessary to use linear interpolation
195203
bool linear_interpolation;
196204

205+
bool is_yuv444;
206+
197207
sws_t sws;
198208
};
199209

200210
class cuda_ram_t: public cuda_t {
201211
public:
202212
int convert(platf::img_t &img) override {
203-
return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex), stream.get());
213+
if (is_yuv444) {
214+
return sws.load_ram(img, tex.array) || sws.convert_yuv444(frame->data[0], frame->data[1], frame->data[2], frame->linesize[0], tex_obj(tex), stream.get());
215+
}
216+
return sws.load_ram(img, tex.array) || sws.convert_nv12(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(tex), stream.get());
204217
}
205218

206219
int set_frame(AVFrame *frame, AVBufferRef *hw_frames_ctx) override {
@@ -224,7 +237,10 @@ namespace cuda {
224237
class cuda_vram_t: public cuda_t {
225238
public:
226239
int convert(platf::img_t &img) override {
227-
return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *) &img)->tex), stream.get());
240+
if (is_yuv444) {
241+
return sws.convert_yuv444(frame->data[0], frame->data[1], frame->data[2], frame->linesize[0], tex_obj(((img_t *) &img)->tex), stream.get());
242+
}
243+
return sws.convert_nv12(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex_obj(((img_t *) &img)->tex), stream.get());
228244
}
229245
};
230246

@@ -274,6 +290,13 @@ namespace cuda {
274290
return -1;
275291
}
276292

293+
struct cu_resources {
294+
registered_resource_t y_res;
295+
registered_resource_t u_res;
296+
registered_resource_t v_res;
297+
registered_resource_t uv_res;
298+
};
299+
277300
class gl_cuda_vram_t: public platf::avcodec_encode_device_t {
278301
public:
279302
/**
@@ -335,28 +358,44 @@ namespace cuda {
335358
this->hwframe.reset(frame);
336359
this->frame = frame;
337360

361+
auto hw_frames_ctx = (AVHWFramesContext *) hw_frames_ctx_buf->data;
362+
363+
if (hw_frames_ctx->sw_format != AV_PIX_FMT_NV12 &&
364+
hw_frames_ctx->sw_format != AV_PIX_FMT_YUV444P) {
365+
BOOST_LOG(error) << "cuda::gl_cuda_vram_t doesn't support any format other than AV_PIX_FMT_NV12 and AV_PIX_FMT_YUV444P"sv;
366+
return -1;
367+
}
368+
338369
if (!frame->buf[0]) {
339370
if (av_hwframe_get_buffer(hw_frames_ctx_buf, frame, 0)) {
340-
BOOST_LOG(error) << "Couldn't get hwframe for VAAPI"sv;
371+
BOOST_LOG(error) << "Couldn't get hwframe for NVENC_GL"sv;
341372
return -1;
342373
}
343374
}
344375

345-
auto hw_frames_ctx = (AVHWFramesContext *) hw_frames_ctx_buf->data;
346376
sw_format = hw_frames_ctx->sw_format;
377+
is_yuv444 = (sw_format == AV_PIX_FMT_YUV444P);
347378

348-
auto nv12_opt = egl::create_target(frame->width, frame->height, sw_format);
349-
if (!nv12_opt) {
350-
return -1;
351-
}
352-
353-
auto sws_opt = egl::sws_t::make(width, height, frame->width, frame->height, sw_format);
379+
auto sws_opt = egl::sws_t::make(width, height, frame->width, frame->height, sw_format, is_yuv444);
354380
if (!sws_opt) {
355381
return -1;
356382
}
357383

358384
this->sws = std::move(*sws_opt);
359-
this->nv12 = std::move(*nv12_opt);
385+
386+
if (is_yuv444) {
387+
auto yuv444_opt = egl::create_yuv444_target(frame->width, frame->height, sw_format);
388+
if (!yuv444_opt) {
389+
return -1;
390+
}
391+
this->yuv444 = std::move(*yuv444_opt);
392+
} else {
393+
auto nv12_opt = egl::create_nv12_target(frame->width, frame->height, sw_format);
394+
if (!nv12_opt) {
395+
return -1;
396+
}
397+
this->nv12 = std::move(*nv12_opt);
398+
}
360399

361400
auto cuda_ctx = (AVCUDADeviceContext *) hw_frames_ctx->device_ctx->hwctx;
362401

@@ -367,9 +406,14 @@ namespace cuda {
367406

368407
cuda_ctx->stream = stream.get();
369408

370-
CU_CHECK(cdf->cuGraphicsGLRegisterImage(&y_res, nv12->tex[0], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register Y plane texture");
371-
CU_CHECK(cdf->cuGraphicsGLRegisterImage(&uv_res, nv12->tex[1], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register UV plane texture");
372-
409+
if (is_yuv444) {
410+
CU_CHECK(cdf->cuGraphicsGLRegisterImage(&cu_res.y_res, yuv444->tex[0], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register Y texture");
411+
CU_CHECK(cdf->cuGraphicsGLRegisterImage(&cu_res.u_res, yuv444->tex[1], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register U texture");
412+
CU_CHECK(cdf->cuGraphicsGLRegisterImage(&cu_res.v_res, yuv444->tex[2], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register V texture");
413+
} else {
414+
CU_CHECK(cdf->cuGraphicsGLRegisterImage(&cu_res.y_res, nv12->tex[0], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register Y plane texture");
415+
CU_CHECK(cdf->cuGraphicsGLRegisterImage(&cu_res.uv_res, nv12->tex[1], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register UV plane texture");
416+
}
373417
return 0;
374418
}
375419

@@ -398,33 +442,61 @@ namespace cuda {
398442
rgb = std::move(*rgb_opt);
399443
}
400444

401-
// Perform the color conversion and scaling in GL
402-
sws.load_vram(descriptor, offset_x, offset_y, rgb->tex[0]);
403-
sws.convert(nv12->buf);
404-
405445
auto fmt_desc = av_pix_fmt_desc_get(sw_format);
406446

407-
// Map the GL textures to read for CUDA
408-
CUgraphicsResource resources[2] = {y_res.get(), uv_res.get()};
409-
CU_CHECK(cdf->cuGraphicsMapResources(2, resources, stream.get()), "Couldn't map GL textures in CUDA");
447+
sws.load_vram(descriptor, offset_x, offset_y, rgb->tex[0], is_yuv444);
448+
449+
if (is_yuv444) {
450+
// Perform the color conversion and scaling in GL
451+
sws.convert_yuv444(yuv444->buf);
410452

411-
// Copy from the GL textures to the target CUDA frame
412-
for (int i = 0; i < 2; i++) {
413-
CUDA_MEMCPY2D cpy = {};
414-
cpy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
415-
CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&cpy.srcArray, resources[i], 0, 0), "Couldn't get mapped plane array");
453+
// Map the GL textures to read for CUDA
454+
std::array<CUgraphicsResource, 3> resources = {{cu_res.y_res.get(), cu_res.u_res.get(), cu_res.v_res.get()}};
455+
CU_CHECK(cdf->cuGraphicsMapResources(resources.size(), resources.data(), stream.get()), "Couldn't map GL textures in CUDA");
416456

417-
cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
418-
cpy.dstDevice = (CUdeviceptr) frame->data[i];
419-
cpy.dstPitch = frame->linesize[i];
420-
cpy.WidthInBytes = (frame->width * fmt_desc->comp[i].step) >> (i ? fmt_desc->log2_chroma_w : 0);
421-
cpy.Height = frame->height >> (i ? fmt_desc->log2_chroma_h : 0);
457+
// Copy from the GL textures to the target CUDA frame
458+
for (int i = 0; i < 3; i++) {
459+
CUDA_MEMCPY2D cpy = {};
460+
cpy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
461+
CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&cpy.srcArray, resources[i], 0, 0), "Couldn't get mapped plane array");
422462

423-
CU_CHECK_IGNORE(cdf->cuMemcpy2DAsync(&cpy, stream.get()), "Couldn't copy texture to CUDA frame");
463+
cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
464+
cpy.dstDevice = (CUdeviceptr) frame->data[i];
465+
cpy.dstPitch = frame->linesize[i];
466+
cpy.WidthInBytes = (frame->width * fmt_desc->comp[i].step);
467+
cpy.Height = frame->height;
468+
469+
CU_CHECK_IGNORE(cdf->cuMemcpy2DAsync(&cpy, stream.get()), "Couldn't copy texture to CUDA frame");
470+
}
471+
// Unmap the textures to allow modification from GL again
472+
CU_CHECK(cdf->cuGraphicsUnmapResources(resources.size(), resources.data(), stream.get()), "Couldn't unmap GL textures from CUDA");
473+
474+
} else {
475+
// Perform the color conversion and scaling in GL
476+
sws.convert_nv12(nv12->buf);
477+
478+
// Map the GL textures to read for CUDA
479+
std::array<CUgraphicsResource, 2> resources = {{cu_res.y_res.get(), cu_res.uv_res.get()}};
480+
CU_CHECK(cdf->cuGraphicsMapResources(resources.size(), resources.data(), stream.get()), "Couldn't map GL textures in CUDA");
481+
482+
// Copy from the GL textures to the target CUDA frame
483+
for (int i = 0; i < 2; i++) {
484+
CUDA_MEMCPY2D cpy = {};
485+
cpy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
486+
CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&cpy.srcArray, resources[i], 0, 0), "Couldn't get mapped plane array");
487+
488+
cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
489+
cpy.dstDevice = (CUdeviceptr) frame->data[i];
490+
cpy.dstPitch = frame->linesize[i];
491+
cpy.WidthInBytes = (frame->width * fmt_desc->comp[i].step) >> (i ? fmt_desc->log2_chroma_w : 0);
492+
cpy.Height = frame->height >> (i ? fmt_desc->log2_chroma_h : 0);
493+
494+
CU_CHECK_IGNORE(cdf->cuMemcpy2DAsync(&cpy, stream.get()), "Couldn't copy texture to CUDA frame");
495+
}
496+
// Unmap the textures to allow modification from GL again
497+
CU_CHECK(cdf->cuGraphicsUnmapResources(resources.size(), resources.data(), stream.get()), "Couldn't unmap GL textures from CUDA");
424498
}
425499

426-
// Unmap the textures to allow modification from GL again
427-
CU_CHECK(cdf->cuGraphicsUnmapResources(2, resources, stream.get()), "Couldn't unmap GL textures from CUDA");
428500
return 0;
429501
}
430502

@@ -446,6 +518,7 @@ namespace cuda {
446518

447519
egl::sws_t sws;
448520
egl::nv12_t nv12;
521+
egl::yuv444_t yuv444;
449522
AVPixelFormat sw_format;
450523

451524
int height;
@@ -454,11 +527,12 @@ namespace cuda {
454527
std::uint64_t sequence;
455528
egl::rgb_t rgb;
456529

457-
registered_resource_t y_res;
458-
registered_resource_t uv_res;
530+
cu_resources cu_res;
459531

460532
int offset_x;
461533
int offset_y;
534+
535+
bool is_yuv444;
462536
};
463537

464538
std::unique_ptr<platf::avcodec_encode_device_t> make_avcodec_encode_device(int width, int height, bool vram) {

0 commit comments

Comments
 (0)