Skip to content

Commit 21aacaf

Browse files
authored
feat(win32): no stall pipeline (#29)
* draft * update * update * comments
1 parent d19286c commit 21aacaf

4 files changed

Lines changed: 127 additions & 70 deletions

File tree

media_kit_video/windows/d3d11_renderer.h

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,14 @@
2727
//
2828
// The MailboxSwapChain is passed directly to mpv as the IDXGISwapChain* in
2929
// mpv_dxgi_init_params. mpv calls GetBuffer(0, ...) to obtain a render
30-
// target, renders into it, and flushes. The plugin then calls
31-
// ProducerCommit() to atomically publish the frame. Flutter's
32-
// GpuSurfaceTexture callback calls ConsumerAcquire() to receive the DXGI
33-
// shared HANDLE of the newest complete frame — with no copy and no OS lock.
30+
// target and submits GPU work into it. The plugin then calls
31+
// ProducerCommit(), which (a) signals a fence on the submitted work,
32+
// (b) non-blockingly checks the *previous* frame's fence and, if already
33+
// GPU-complete, promotes it to latest_completed_slot_, and (c) atomically
34+
// publishes write_slot_ as the new pending frame. Flutter's
35+
// GpuSurfaceTexture callback calls ConsumerAcquire() — a single acquire
36+
// load of latest_completed_slot_ — to receive the DXGI shared HANDLE of
37+
// the newest confirmed frame, with no copy, no flush, and no OS lock.
3438
class D3D11Renderer {
3539
public:
3640
int32_t width() const { return width_; }
@@ -57,11 +61,14 @@ class D3D11Renderer {
5761
void SetSize(int32_t width, int32_t height);
5862

5963
// Called from the producer thread (mpv thread pool) after
60-
// mpv_render_context_render returns. Publishes the rendered frame.
64+
// mpv_render_context_render returns. Signals the frame fence, then
65+
// non-blockingly attempts to promote the previous pending frame to
66+
// latest_completed_slot_, and finally publishes the new pending frame.
6167
void ProducerCommit();
6268

6369
// Called from the consumer thread (Flutter GpuSurfaceTexture callback).
64-
// Returns the DXGI shared HANDLE of the most recent complete frame.
70+
// Returns the DXGI shared HANDLE of the most recent fence-confirmed frame
71+
// via a single atomic load — no fence poll, no flush, no stall.
6572
HANDLE ConsumerAcquire();
6673

6774
// Returns the DXGI shared HANDLE for the current read slot without

media_kit_video/windows/mailbox_swap_chain.cc

Lines changed: 73 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -111,51 +111,92 @@ MailboxSwapChain::GetDesc(DXGI_SWAP_CHAIN_DESC* pDesc) {
111111
}
112112

113113
void MailboxSwapChain::ProducerCommit() {
114+
auto& ws = slots_[write_slot_];
115+
context4_->Signal(ws.fence.Get(), ++ws.fence_value);
116+
117+
// The check below runs one full render-cycle after the *previous* Signal was enqueued.
118+
// By then the D3D11 runtime has had ample opportunity to submit the prior
119+
// command buffer to the GPU, so GetCompletedValue() is far more likely to
120+
// have advanced than it would be inside ConsumerAcquire (which can be
121+
// called microseconds after the Signal). The check is non-blocking: if
122+
// the fence isn't done yet, we simply leave latest_completed_slot_ as-is
123+
// and try again next frame.
124+
//
125+
// On success we do a combined promotion CAS on mailbox_state_:
126+
// (has_pending=1, pending=P, completed=C, free=F)
127+
// → (has_pending=0, extra=C, completed=P, free=F)
128+
// then store latest_completed_slot_ = P with release ordering so that
129+
// ConsumerAcquire's acquire load cannot observe P before mailbox_state_
130+
// reflects P in the 'completed' role (i.e., protected from the producer).
114131
{
115-
auto& ws = slots_[write_slot_];
116-
context4_->Signal(ws.fence.Get(), ++ws.fence_value);
132+
uint32_t snap = mailbox_state_.load(std::memory_order_acquire);
133+
if (snap & (1u << 6)) { // has_pending
134+
const int pend = static_cast<int>((snap >> 4) & 0x3u);
135+
const int comp = static_cast<int>((snap >> 2) & 0x3u);
136+
const int fr = static_cast<int>( snap & 0x3u);
137+
if (slots_[pend].fence->GetCompletedValue() >=
138+
slots_[pend].fence_value) {
139+
const uint32_t snap_desired =
140+
(static_cast<uint32_t>(comp) << 4) |
141+
(static_cast<uint32_t>(pend) << 2) |
142+
static_cast<uint32_t>(fr);
143+
if (mailbox_state_.compare_exchange_strong(
144+
snap, snap_desired,
145+
std::memory_order_acq_rel,
146+
std::memory_order_relaxed)) {
147+
latest_completed_slot_.store(pend, std::memory_order_release);
148+
}
149+
// ProducerCommit runs on a single producer thread and ConsumerAcquire
150+
// never writes mailbox_state_, so this strong CAS is not expected to
151+
// fail. If it ever does, nothing is promoted: latest_completed_slot_ is
152+
// left untouched and promotion is attempted again on the next commit.
153+
}
154+
}
117155
}
118156

119-
const uint32_t desired = static_cast<uint32_t>(write_slot_) | 0x4u;
157+
// Desired state:
158+
// has_pending = 1
159+
// pending = write_slot_ (new latest frame)
160+
// completed = old completed_slot (unchanged)
161+
// free = old pending_or_extra (recycled: was old pending or extra_free)
162+
//
163+
// new write_slot_ (producer-private) = old free_slot.
120164
uint32_t expected = mailbox_state_.load(std::memory_order_relaxed);
121-
while (!mailbox_state_.compare_exchange_weak(
122-
expected, desired, std::memory_order_release,
123-
std::memory_order_relaxed)) {}
124-
write_slot_ = static_cast<int>(expected & 0x3u);
165+
while (true) {
166+
const int old_free = static_cast<int>( expected & 0x3u);
167+
const int old_completed = static_cast<int>((expected >> 2) & 0x3u);
168+
const int old_poe = static_cast<int>((expected >> 4) & 0x3u);
169+
const uint32_t desired =
170+
(1u << 6) |
171+
(static_cast<uint32_t>(write_slot_) << 4) |
172+
(static_cast<uint32_t>(old_completed) << 2) |
173+
static_cast<uint32_t>(old_poe);
174+
if (mailbox_state_.compare_exchange_weak(
175+
expected, desired,
176+
std::memory_order_release,
177+
std::memory_order_relaxed)) {
178+
write_slot_ = old_free;
179+
break;
180+
}
181+
}
125182
}
126183

127184
HANDLE MailboxSwapChain::ConsumerAcquire() {
128-
uint32_t expected = mailbox_state_.load(std::memory_order_acquire);
129-
if (!(expected & 0x4u)) {
130-
// No new frame — we already waited for this slot last time we acquired it.
131-
return slots_[read_slot_].shared_handle;
132-
}
133-
const uint32_t desired = static_cast<uint32_t>(read_slot_); // dirty=0
134-
while (!mailbox_state_.compare_exchange_weak(
135-
expected, desired, std::memory_order_acq_rel,
136-
std::memory_order_relaxed)) {
137-
if (!(expected & 0x4u))
138-
return slots_[read_slot_].shared_handle;
139-
}
140-
read_slot_ = static_cast<int>(expected & 0x3u);
141-
142-
auto& rs = slots_[read_slot_];
143-
if (rs.fence->GetCompletedValue() < rs.fence_value) {
144-
if (SUCCEEDED(rs.fence->SetEventOnCompletion(rs.fence_value,
145-
rs.fence_event))) {
146-
::WaitForSingleObject(rs.fence_event, INFINITE);
147-
}
148-
}
149-
return rs.shared_handle;
185+
// Always return the most recently fence-confirmed frame.
186+
// Advancement is handled exclusively by ProducerCommit (called one full
187+
// render-cycle after each Signal, where fence completion is far more
188+
// likely).
189+
return slots_[latest_completed_slot_.load(std::memory_order_acquire)]
190+
.shared_handle;
150191
}
151192

152193
HRESULT MailboxSwapChain::Resize(int32_t width, int32_t height) {
153194
ReleaseSlots();
154195
width_ = (width > 0) ? width : 1;
155196
height_ = (height > 0) ? height : 1;
156-
mailbox_state_.store(2u, std::memory_order_relaxed);
197+
mailbox_state_.store(57u, std::memory_order_relaxed);
198+
latest_completed_slot_.store(2, std::memory_order_relaxed);
157199
write_slot_ = 0;
158-
read_slot_ = 1;
159200
return AllocateSlots();
160201
}
161202

@@ -184,7 +225,7 @@ HRESULT MailboxSwapChain::AllocateSlots() {
184225
desc.CPUAccessFlags = 0;
185226
desc.MiscFlags = D3D11_RESOURCE_MISC_SHARED;
186227

187-
for (int i = 0; i < 3; ++i) {
228+
for (int i = 0; i < 4; ++i) {
188229
HRESULT hr = device_->CreateTexture2D(&desc, nullptr, &slots_[i].texture);
189230
if (FAILED(hr)) {
190231
std::cout << "media_kit: MailboxSwapChain: CreateTexture2D slot " << i
@@ -220,17 +261,6 @@ HRESULT MailboxSwapChain::AllocateSlots() {
220261
return hr;
221262
}
222263
slots_[i].fence_value = 0;
223-
224-
slots_[i].fence_event =
225-
::CreateEventW(nullptr, /*bManualReset=*/FALSE, /*bInitialState=*/FALSE,
226-
nullptr);
227-
if (!slots_[i].fence_event) {
228-
const HRESULT hrE = HRESULT_FROM_WIN32(::GetLastError());
229-
std::cout << "media_kit: MailboxSwapChain: CreateEvent slot " << i
230-
<< " failed (hr=0x" << std::hex << hrE << std::dec << ")"
231-
<< std::endl;
232-
return hrE;
233-
}
234264
}
235265

236266
return S_OK;
@@ -242,9 +272,5 @@ void MailboxSwapChain::ReleaseSlots() {
242272
slot.shared_handle = nullptr;
243273
slot.fence.Reset();
244274
slot.fence_value = 0;
245-
if (slot.fence_event) {
246-
::CloseHandle(slot.fence_event);
247-
slot.fence_event = nullptr;
248-
}
249275
}
250276
}

media_kit_video/windows/mailbox_swap_chain.h

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,25 @@
1818
#include <atomic>
1919
#include <cstdint>
2020

21-
// Minimal IDXGISwapChain facade backed by a lock-free triple-buffer mailbox.
21+
// Minimal IDXGISwapChain facade backed by a lock-free 4-slot mailbox with
22+
// a last-completed-frame cache.
2223
//
23-
// Three BGRA8 textures are kept, each with a DXGI shared HANDLE.
24+
// Four BGRA8 textures are kept, each with a DXGI shared HANDLE.
2425
// mailbox_state_ is a single atomic<uint32_t>:
25-
// bits [1:0] slot index in the mailbox (0-2)
26-
// bit [2] dirty flag: 1 = producer has committed a new frame
26+
// bits [1:0] = free_slot (0-3): producer takes this for the next frame
27+
// bits [3:2] = completed_slot (0-3): most recent fence-confirmed frame;
28+
// safe Consumer fallback at any time
29+
// bits [5:4] = pending_or_extra (0-3): has_pending=1 → latest submitted frame
30+
// (fence may still be in-flight);
31+
// has_pending=0 → second free slot
32+
// bit [6] = has_pending : 1 = a submitted frame is awaiting fence confirmation and promotion to completed
2733
//
28-
// {write_slot_, mailbox slot, read_slot_} is always a permutation of {0,1,2}.
34+
// 4-slot invariant (all roles are always distinct):
35+
// has_pending=1: write_slot_(private) | free | pending | completed = 4 slots
36+
// has_pending=0: write_slot_(private) | free | extra_free | completed = 4 slots
37+
//
38+
// Initial value 57u = 0b0_11_10_01:
39+
// has_pending=0, extra_free=3, completed=2, free=1, write_slot_=0 (private)
2940
class MailboxSwapChain final : public IDXGISwapChain {
3041
public:
3142
// Returns an AddRef'd pointer (ref count = 1). device must outlive this.
@@ -95,19 +106,28 @@ class MailboxSwapChain final : public IDXGISwapChain {
95106
}
96107

97108
// Called from the producer thread after mpv_render_context_render returns.
109+
// In addition to publishing write_slot_ as the new pending frame, it
110+
// non-blockingly polls the *previous* pending frame's fence and, if the GPU
111+
// has already completed it, promotes it to completed and updates
112+
// latest_completed_slot_ (release store). This is the sole site that
113+
// advances latest_completed_slot_; ConsumerAcquire never touches the fence.
98114
void ProducerCommit();
99115

100116
// Called from the consumer thread (Flutter GpuSurfaceTexture callback).
101-
// Returns the DXGI shared HANDLE of the most recent complete frame.
117+
// Returns the DXGI shared HANDLE of the most recent fence-confirmed frame.
118+
// Implementation is a single acquire load of latest_completed_slot_ —
119+
// no CAS, no fence poll, no flush, no stall, no KeyedMutex.
102120
HANDLE ConsumerAcquire();
103121

104122
// Recreates all four texture slots at the new dimensions.
105123
// Must only be called from the producer thread with no active consumer.
106124
HRESULT Resize(int32_t width, int32_t height);
107125

108-
// Returns the current read-slot HANDLE without advancing mailbox state.
126+
// Returns the latest GPU-confirmed HANDLE without advancing mailbox state.
127+
// Safe to call before the consumer thread starts.
109128
HANDLE ReadHandleSnapshot() const {
110-
return slots_[read_slot_].shared_handle;
129+
return slots_[latest_completed_slot_.load(std::memory_order_acquire)]
130+
.shared_handle;
111131
}
112132

113133
int32_t width() const { return width_; }
@@ -127,7 +147,6 @@ class MailboxSwapChain final : public IDXGISwapChain {
127147
Microsoft::WRL::ComPtr<ID3D11Texture2D> texture;
128148
HANDLE shared_handle = nullptr;
129149
Microsoft::WRL::ComPtr<ID3D11Fence> fence;
130-
HANDLE fence_event = nullptr;
131150
uint64_t fence_value = 0;
132151
};
133152

@@ -137,13 +156,20 @@ class MailboxSwapChain final : public IDXGISwapChain {
137156
int32_t width_ = 1;
138157
int32_t height_ = 1;
139158

140-
TextureSlot slots_[3];
159+
TextureSlot slots_[4];
160+
161+
// Lock-free mailbox state — see bit-field comment at top of class.
162+
// Initial value 57u = 0b0_11_10_01.
163+
std::atomic<uint32_t> mailbox_state_{57u};
141164

142-
// Lock-free mailbox: bits [1:0] = slot index (0-2), bit [2] = dirty; init = 2u.
143-
std::atomic<uint32_t> mailbox_state_{2u};
165+
// Cache of the most recently fence-confirmed completed slot.
166+
// ConsumerAcquire reads this directly (one atomic load, no CAS, no fence
167+
// poll). Updated by ProducerCommit after a successful non-blocking
168+
// pending→completed promotion. Initialised to 2, which matches the
169+
// 'completed' field in mailbox_state_'s initial value 57u.
170+
std::atomic<int> latest_completed_slot_{2};
144171

145172
int write_slot_ = 0; // producer-private
146-
int read_slot_ = 1; // consumer-private
147173

148174
std::atomic<ULONG> ref_count_{1u};
149175
};

media_kit_video/windows/video_output.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -166,8 +166,6 @@ void VideoOutput::Render() {
166166
if (d3d11_renderer_ != nullptr) {
167167
mpv_render_context_render(render_context_, nullptr);
168168
mpv_render_context_report_swap(render_context_);
169-
// Atomically publish the rendered slot to the mailbox so that Flutter's
170-
// GpuSurfaceTexture callback can import it without a copy.
171169
d3d11_renderer_->ProducerCommit();
172170
}
173171
// S/W
@@ -299,8 +297,8 @@ void VideoOutput::Resize(int64_t required_width, int64_t required_height) {
299297

300298
auto texture = std::make_unique<FlutterDesktopGpuSurfaceDescriptor>();
301299
texture->struct_size = sizeof(FlutterDesktopGpuSurfaceDescriptor);
302-
// Seed with the current read-slot handle so Flutter has a valid surface
303-
// even before the first mpv frame is committed.
300+
// Seed with the latest-completed-slot handle so Flutter has a valid
301+
// surface even before the first mpv frame is committed.
304302
texture->handle = d3d11_renderer_->ReadHandleSnapshot();
305303
texture->width = texture->visible_width = d3d11_renderer_->width();
306304
texture->height = texture->visible_height = d3d11_renderer_->height();

0 commit comments

Comments
 (0)