@@ -243,136 +243,134 @@ float4 main(
243243
244244 HRESULT STDMETHODCALLTYPE CustomComputeBridgeEffect::Dispatch (
245245 ID2D1DeviceContext5* dc,
246- ID2D1Image* inputImage,
246+ ID2D1Image* const * inputImages,
247+ UINT32 inputCount,
247248 const BYTE * cbufferData,
248249 UINT32 cbufferSize,
249250 UINT32 analysisFloat4Count,
250251 UINT32 imageOutputW,
251252 UINT32 imageOutputH,
252253 std::vector<float >* outAnalysisFloats)
253254 {
254- if (!dc || !inputImage) return E_INVALIDARG ;
255+ if (!dc || !inputImages || inputCount == 0 || !inputImages[0 ])
256+ return E_INVALIDARG ;
255257
256- // Pre-render input to FP32 D3D11 texture. Mirrors the pre-Phase-8
257- // logic that lived inline in `GraphEvaluator::DispatchUserD3D11Compute`.
258- // 96 DPI keeps GetImageLocalBounds in pixels.
258+ // Pre-render each input to its own FP32 D3D11 texture. 96 DPI
259+ // keeps GetImageLocalBounds in pixels.
259260 float oldDpiX = 0 , oldDpiY = 0 ;
260261 dc->GetDpi (&oldDpiX, &oldDpiY);
261262 dc->SetDpi (96 .0f , 96 .0f );
262263
263- D2D1_RECT_F bounds{};
264- dc->GetImageLocalBounds (inputImage, &bounds);
265- UINT32 w = static_cast <UINT32 >((std::min)(bounds.right - bounds.left , 8192 .0f ));
266- UINT32 h = static_cast <UINT32 >((std::min)(bounds.bottom - bounds.top , 8192 .0f ));
267- if (w == 0 || h == 0 )
268- {
269- dc->SetDpi (oldDpiX, oldDpiY);
270- return E_NOT_VALID_STATE ;
271- }
264+ if (m_inputBitmaps.size () < inputCount)
265+ m_inputBitmaps.resize (inputCount);
272266
273- winrt::com_ptr<ID2D1Bitmap1> inputBmp;
274- // Fast path: if `inputImage` is already an FP32-RGBA D2D bitmap
275- // of the correct size (e.g. produced by GraphEvaluator's
276- // PreRenderInputBitmap and shared across all deferred-compute
277- // consumers in the same frame), we can grab its DXGI surface
278- // directly and skip the per-dispatch DrawImage round-trip.
279- // This is the dominant cost on multi-consumer graphs (4 stats +
280- // 1 tonemap = 5 dispatches all consuming the same Source ->
281- // 5 redundant DrawImage calls + 5 FP32 bitmap allocations
282- // before this fast path).
283- bool inputIsAlreadyFp32Bitmap = false ;
267+ // Hold a reference per slot so the textures don't get released
268+ // while we're building SRVs / driving the dispatch below.
269+ std::vector<winrt::com_ptr<ID3D11Texture2D>> inputTextures;
270+ inputTextures.reserve (inputCount);
271+ UINT32 firstW = 0 , firstH = 0 ;
272+
273+ for (UINT32 idx = 0 ; idx < inputCount; ++idx)
284274 {
285- winrt::com_ptr<ID2D1Bitmap1> asBitmap ;
286- if (SUCCEEDED ( inputImage-> QueryInterface (asBitmap. put ())) && asBitmap )
275+ ID2D1Image* inputImage = inputImages[idx] ;
276+ if (! inputImage)
287277 {
288- auto px = asBitmap->GetPixelFormat ();
289- auto sz = asBitmap->GetPixelSize ();
290- if (px.format == DXGI_FORMAT_R32G32B32A32_FLOAT &&
291- sz.width == w && sz.height == h)
292- {
293- inputBmp = asBitmap;
294- inputIsAlreadyFp32Bitmap = true ;
295- }
278+ dc->SetDpi (oldDpiX, oldDpiY);
279+ return E_INVALIDARG ;
296280 }
297- }
298281
299- if (!inputIsAlreadyFp32Bitmap)
300- {
301- if (m_inputBitmap && m_inputBitmapW == w && m_inputBitmapH == h)
282+ D2D1_RECT_F bounds{};
283+ dc->GetImageLocalBounds (inputImage, &bounds);
284+ UINT32 w = static_cast <UINT32 >((std::min)(bounds.right - bounds.left , 8192 .0f ));
285+ UINT32 h = static_cast <UINT32 >((std::min)(bounds.bottom - bounds.top , 8192 .0f ));
286+ if (w == 0 || h == 0 )
287+ {
288+ dc->SetDpi (oldDpiX, oldDpiY);
289+ return E_NOT_VALID_STATE ;
290+ }
291+ if (idx == 0 ) { firstW = w; firstH = h; }
292+
293+ winrt::com_ptr<ID2D1Bitmap1> inputBmp;
294+ // Fast path: input is already an FP32-RGBA D2D bitmap of the
295+ // expected size (e.g. produced by GraphEvaluator's
296+ // PreRenderInputBitmap and shared across multiple deferred-
297+ // compute consumers). Skip the per-dispatch DrawImage
298+ // round-trip and grab the DXGI surface directly.
299+ bool inputIsAlreadyFp32Bitmap = false ;
302300 {
303- inputBmp = m_inputBitmap;
301+ winrt::com_ptr<ID2D1Bitmap1> asBitmap;
302+ if (SUCCEEDED (inputImage->QueryInterface (asBitmap.put ())) && asBitmap)
303+ {
304+ auto px = asBitmap->GetPixelFormat ();
305+ auto sz = asBitmap->GetPixelSize ();
306+ if (px.format == DXGI_FORMAT_R32G32B32A32_FLOAT &&
307+ sz.width == w && sz.height == h)
308+ {
309+ inputBmp = asBitmap;
310+ inputIsAlreadyFp32Bitmap = true ;
311+ }
312+ }
304313 }
305- else
314+
315+ if (!inputIsAlreadyFp32Bitmap)
306316 {
307- D2D1_BITMAP_PROPERTIES1 bp{};
308- bp.pixelFormat = { DXGI_FORMAT_R32G32B32A32_FLOAT , D2D1_ALPHA_MODE_PREMULTIPLIED };
309- bp.bitmapOptions = D2D1_BITMAP_OPTIONS_TARGET ;
310- bp.dpiX = 96 .0f ;
311- bp.dpiY = 96 .0f ;
312- HRESULT hrAlloc = dc->CreateBitmap (D2D1::SizeU (w, h), nullptr , 0 , bp, inputBmp.put ());
313- if (FAILED (hrAlloc)) { dc->SetDpi (oldDpiX, oldDpiY); return hrAlloc; }
314- m_inputBitmap = inputBmp;
315- m_inputBitmapW = w;
316- m_inputBitmapH = h;
317+ auto & cache = m_inputBitmaps[idx];
318+ if (cache.bitmap && cache.width == w && cache.height == h)
319+ {
320+ inputBmp = cache.bitmap ;
321+ }
322+ else
323+ {
324+ D2D1_BITMAP_PROPERTIES1 bp{};
325+ bp.pixelFormat = { DXGI_FORMAT_R32G32B32A32_FLOAT , D2D1_ALPHA_MODE_PREMULTIPLIED };
326+ bp.bitmapOptions = D2D1_BITMAP_OPTIONS_TARGET ;
327+ bp.dpiX = 96 .0f ;
328+ bp.dpiY = 96 .0f ;
329+ HRESULT hrAlloc = dc->CreateBitmap (D2D1::SizeU (w, h), nullptr , 0 , bp, inputBmp.put ());
330+ if (FAILED (hrAlloc)) { dc->SetDpi (oldDpiX, oldDpiY); return hrAlloc; }
331+ cache.bitmap = inputBmp;
332+ cache.width = w;
333+ cache.height = h;
334+ }
335+
336+ // Render upstream chain into our cached bitmap. Necessary
337+ // when inputImage is a D2D effect output (not already an
338+ // FP32 bitmap) -- the format-convert happens via DrawImage.
339+ winrt::com_ptr<ID2D1Image> prevTarget;
340+ dc->GetTarget (prevTarget.put ());
341+ dc->SetTarget (inputBmp.get ());
342+ dc->Clear (D2D1::ColorF (0 , 0 , 0 , 0 ));
343+ dc->DrawImage (inputImage, D2D1::Point2F (-bounds.left , -bounds.top ));
344+ dc->SetTarget (prevTarget.get ());
345+ dc->Flush (); // D2D batches DrawImage until Flush/EndDraw.
317346 }
318347
319- // Render upstream chain into our cached bitmap. Necessary
320- // when inputImage is a D2D effect output (not already an
321- // FP32 bitmap) -- the format-convert happens via DrawImage.
322- winrt::com_ptr<ID2D1Image> prevTarget;
323- dc->GetTarget (prevTarget.put ());
324- dc->SetTarget (inputBmp.get ());
325- dc->Clear (D2D1::ColorF (0 , 0 , 0 , 0 ));
326- dc->DrawImage (inputImage, D2D1::Point2F (-bounds.left , -bounds.top ));
327- dc->SetTarget (prevTarget.get ());
328- dc->Flush (); // D2D batches DrawImage until Flush/EndDraw.
348+ winrt::com_ptr<IDXGISurface> surface;
349+ HRESULT hr = inputBmp->GetSurface (surface.put ());
350+ if (FAILED (hr)) { dc->SetDpi (oldDpiX, oldDpiY); return hr; }
351+ winrt::com_ptr<ID3D11Texture2D> tex;
352+ hr = surface->QueryInterface (tex.put ());
353+ if (FAILED (hr)) { dc->SetDpi (oldDpiX, oldDpiY); return hr; }
354+ inputTextures.push_back (std::move (tex));
329355 }
330- dc->SetDpi (oldDpiX, oldDpiY);
331356
332- winrt::com_ptr<IDXGISurface> surface;
333- HRESULT hr = inputBmp->GetSurface (surface.put ());
334- if (FAILED (hr)) return hr;
335- winrt::com_ptr<ID3D11Texture2D> inputTex;
336- hr = surface->QueryInterface (inputTex.put ());
337- if (FAILED (hr)) return hr;
357+ dc->SetDpi (oldDpiX, oldDpiY);
338358
339- // Lazily initialize the runner with the same D3D11 device the
340- // input texture is on (matches D2D's underlying device).
359+ // Lazily initialize the runner with the same D3D11 device as the
360+ // input texture (matches D2D's underlying device).
341361 winrt::com_ptr<ID3D11Device> device;
342- inputTex ->GetDevice (device.put ());
362+ inputTextures[ 0 ] ->GetDevice (device.put ());
343363 if (!m_runner.IsInitialized ())
344364 m_runner.Initialize (device.get ());
345365
346366 if (m_bytecodeDirty)
347367 {
348- // The runner's CompileShader takes HLSL source -- but for
349- // the bridge we have pre-compiled bytecode. The runner
350- // accepts a compile that re-emits the same bytecode by
351- // including a marker; for a fully clean cut we'd add a
352- // SetCompiledBytecode method to the runner. For now,
353- // call the runner's existing path: the Dispatch() method
354- // requires the runner's m_shader to be populated, which
355- // happens via CompileShader. We bypass by going direct
356- // through CreateComputeShader on the device.
357368 winrt::com_ptr<ID3D11ComputeShader> tempShader;
358- hr = device->CreateComputeShader (
369+ HRESULT hr = device->CreateComputeShader (
359370 m_pendingBytecode.data (),
360371 m_pendingBytecode.size (),
361372 nullptr , tempShader.put ());
362373 if (FAILED (hr)) return hr;
363- // Stash bytecode + shader on the runner via its public
364- // interface. Today the runner exposes CompileShader(string)
365- // which compiles at runtime; we want to install pre-compiled
366- // bytecode. The runner's path through Dispatch will work as
367- // long as m_shader is set -- we extend the runner with
368- // SetPrecompiledShader in a follow-up. For now, fall back
369- // to D3DCompile path: not ideal but functional.
370- //
371- // Actually -- we only need a single cs_5_0 shader install,
372- // and the runner's Dispatch internally uses m_shader.get().
373- // The CreateComputeShader on `device` produced a valid
374- // shader; we can pass it to the runner via a small helper
375- // we add below.
376374 m_runner.InstallPrecompiledShader (
377375 m_pendingBytecode, tempShader);
378376 m_bytecodeDirty = false ;
@@ -385,13 +383,11 @@ float4 main(
385383 // re-created on size change.
386384 if (imageOutputW > 0 && imageOutputH > 0 )
387385 {
388- hr = EnsureImageOutputTexture (device.get (), dc, imageOutputW, imageOutputH);
386+ HRESULT hr = EnsureImageOutputTexture (device.get (), dc, imageOutputW, imageOutputH);
389387 if (FAILED (hr)) return hr;
390388 }
391389 else if (m_imageOutputTex)
392390 {
393- // Tear down the image-output side if a prior instance had
394- // one and this dispatch doesn't.
395391 m_imageOutput = nullptr ;
396392 m_imageOutputTex = nullptr ;
397393 m_imageOutputW = 0 ;
@@ -405,21 +401,18 @@ float4 main(
405401 if (cbufferData && cbufferSize > 0 )
406402 cbBytes.assign (cbufferData, cbufferData + cbufferSize);
407403
408- // Drive the dispatch through the runner's existing entry. This
409- // populates the runner's structured-buffer SRV (analysis output)
410- // and reads it back to floats. The image-output side runs in
411- // parallel via a u1 binding the bridge sets up below. Phase 8
412- // GPU bindings (m_gpuBindingSrvs / m_gpuBindingSlots) flow
413- // through as extra SRVs at consumer-declared t-slots.
414- //
415- // Phase 8c: when caller passes outAnalysisFloats=nullptr it
416- // signals "no CPU consumer this frame" -- skip the runner's
417- // CopyResource + Map. The structured-buffer UAV is still sized
418- // and populated by the dispatch; only the readback round-trip
419- // is elided. Downstream GPU SRV consumers see the same buffer.
404+ // Drive the dispatch through the runner. Phase 8c: when caller
405+ // passes outAnalysisFloats=nullptr it signals "no CPU consumer
406+ // this frame" -- skip the runner's CopyResource + Map.
420407 const bool readbackToCpu = (outAnalysisFloats != nullptr );
408+
409+ // Build a raw-pointer array of input textures for the runner.
410+ std::vector<ID3D11Texture2D*> inputRaw;
411+ inputRaw.reserve (inputTextures.size ());
412+ for (const auto & t : inputTextures) inputRaw.push_back (t.get ());
413+
421414 auto floats = m_runner.DispatchWithImageOutput (
422- inputTex. get () ,
415+ inputRaw ,
423416 cbBytes,
424417 analysisFloat4Count,
425418 m_imageOutputTex.get (),
0 commit comments