Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 178 additions & 0 deletions packages/dev/core/src/Compute/prefixSumCompute.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
import { type Nullable } from "core/types";
import { type AbstractEngine } from "core/Engines/abstractEngine.pure";
import { type WebGPUEngine } from "core/Engines/webgpuEngine.pure";
import { type DataBuffer } from "core/Buffers/dataBuffer";
import { type ComputeBindingMapping } from "core/Engines/Extensions/engine.computeShader.pure";
import { ComputeShader } from "core/Compute/computeShader.pure";
import { StorageBuffer } from "core/Buffers/storageBuffer";
import { UniformBuffer } from "core/Materials/uniformBuffer";
import { Constants } from "core/Engines/constants";

// Number of u32 elements covered by one scan block: a scan over `count` entries dispatches
// ceil(count / BlockSize) groups, and the per-level scratch buffers are padded to a multiple of it.
// NOTE(review): presumably this must match the workgroup size declared in the prefixSum WGSL shaders — confirm.
const BlockSize = 256;

/**
 * @internal
 * WebGPU compute utility that performs an in-place hierarchical EXCLUSIVE prefix sum (scan) over a
 * `StorageBuffer` of u32 values.
 *
 * It scans the array in blocks of {@link BlockSize}, recursively scans the per-block totals, and adds
 * the resulting offsets back, so it handles arrays much larger than a single workgroup. It is used by
 * the Gaussian Splatting GPU depth sort (histogram into per-bucket start offsets) and its interval
 * culling pass.
 *
 * The WGSL compute shaders are loaded asynchronously during construction (they self-register into the
 * ShaderStore), so this module stays side-effect free: no top-level shader imports and therefore no
 * `.pure` / side-effect-wrapper split. {@link isReady} returns false until the shaders have loaded and
 * the compute pipelines have compiled; {@link scanExclusive} is a no-op until then.
 */
export class PrefixSumCompute {
    private readonly _engine: AbstractEngine;
    private _scanBlock: Nullable<ComputeShader> = null;
    private _addOffsets: Nullable<ComputeShader> = null;
    // One UBO per dispatch, cycled per scan, to avoid a later dispatch's parameters overwriting an
    // earlier (still-pending) dispatch's uniform buffer (the update/dispatch hazard).
    private _ubos: UniformBuffer[] = [];
    private _uboIndex = 0;
    // Per-recursion-level scratch buffers holding the block totals; grown/cached by level.
    private _levelBuffers: StorageBuffer[] = [];
    // Set once the WGSL shader modules have been dynamically imported (and thus registered in the ShaderStore).
    private _shadersLoaded = false;

    /**
     * Creates a new prefix-sum compute helper and kicks off asynchronous loading of its WGSL shaders.
     * @param engine the (WebGPU) engine to run the compute passes on
     */
    public constructor(engine: AbstractEngine) {
        this._engine = engine;
        // Async-load the compute shaders during construction. Importing the generated shader modules registers
        // them into the ShaderStore as a side effect; isReady() stays false until this resolves.
        // eslint-disable-next-line @typescript-eslint/no-floating-promises
        this._loadShadersAsync();
    }

    /**
     * Dynamically imports both WGSL shader modules in parallel and flags them as loaded once both resolve.
     */
    private async _loadShadersAsync(): Promise<void> {
        await Promise.all([import("../ShadersWGSL/prefixSumScanBlock.compute"), import("../ShadersWGSL/prefixSumAddOffsets.compute")]);
        this._shadersLoaded = true;
    }

    /**
     * Lazily creates the two ComputeShader objects. Does nothing if they already exist or if the
     * shader modules have not finished loading yet.
     */
    private _ensureShaders(): void {
        if (this._scanBlock || !this._shadersLoaded) {
            return;
        }
        const scanBlockBindings: ComputeBindingMapping = {
            data: { group: 0, binding: 0 },
            blockSums: { group: 0, binding: 1 },
            params: { group: 0, binding: 2 },
        };
        this._scanBlock = new ComputeShader("prefixSumScanBlock", this._engine, "prefixSumScanBlock", { bindingsMapping: scanBlockBindings });
        const addOffsetsBindings: ComputeBindingMapping = {
            data: { group: 0, binding: 0 },
            blockOffsets: { group: 0, binding: 1 },
            params: { group: 0, binding: 2 },
        };
        this._addOffsets = new ComputeShader("prefixSumAddOffsets", this._engine, "prefixSumAddOffsets", { bindingsMapping: addOffsetsBindings });
    }

    /**
     * Returns the next uniform buffer of the ring (creating it on demand) with its `count` uniform set.
     * @param count element count to upload for the dispatch that will use this buffer
     * @returns the updated uniform buffer
     */
    private _getUbo(count: number): UniformBuffer {
        if (this._uboIndex >= this._ubos.length) {
            const ubo = new UniformBuffer(this._engine, undefined, undefined, "PrefixSumComputeParams", false, false);
            ubo.addUniform("count", 1);
            this._ubos.push(ubo);
        }
        const ubo = this._ubos[this._uboIndex++];
        ubo.updateUInt("count", count);
        ubo.update();
        return ubo;
    }

    /**
     * Returns the cached scratch buffer for a recursion level, reallocating it when it is too small.
     * @param level recursion depth the buffer belongs to
     * @param numBlocks number of block totals the buffer must hold
     * @returns a storage buffer with room for at least `numBlocks` u32 values (rounded up to a multiple of BlockSize)
     */
    private _getLevelBuffer(level: number, numBlocks: number): StorageBuffer {
        // Padded to a multiple of BlockSize so the next level's block scan never reads out of bounds.
        const capacity = (Math.ceil(numBlocks / BlockSize) * BlockSize) | 0;
        const existing = this._levelBuffers[level];
        if (existing && existing.getBuffer().capacity >= capacity * Uint32Array.BYTES_PER_ELEMENT) {
            return existing;
        }
        existing?.dispose();
        const buffer = new StorageBuffer(
            this._engine as WebGPUEngine,
            Math.max(capacity, BlockSize) * Uint32Array.BYTES_PER_ELEMENT,
            Constants.BUFFER_CREATIONFLAG_READWRITE,
            "PrefixSumLevel" + level
        );
        this._levelBuffers[level] = buffer;
        return buffer;
    }

    /**
     * Whether both compute shaders are loaded, compiled and ready to dispatch.
     * @returns true when the scan can run
     */
    public isReady(): boolean {
        if (!this._shadersLoaded) {
            return false;
        }
        this._ensureShaders();
        const scanBlock = this._scanBlock;
        const addOffsets = this._addOffsets;
        if (!scanBlock || !addOffsets) {
            return false;
        }
        return !!scanBlock.isReady() && !!addOffsets.isReady();
    }

    /**
     * Runs an in-place exclusive prefix sum over the first `count` entries of `buffer`.
     * Call {@link resetForFrame} once before the sequence of scans issued in a frame.
     *
     * Does nothing while {@link isReady} is false: the shaders are loaded asynchronously, so a scan
     * requested too early is skipped rather than dereferencing the not-yet-created compute shaders
     * or running only part of the pass sequence.
     * @param buffer the u32 storage buffer to scan in place
     * @param count number of valid entries to scan
     */
    public scanExclusive(buffer: StorageBuffer, count: number): void {
        if (!this.isReady()) {
            return;
        }
        this._scanRecursive(buffer.getBuffer(), count, 0);
    }

    /**
     * Resets the internal UBO ring. Must be called once at the start of each frame's scan sequence.
     */
    public resetForFrame(): void {
        this._uboIndex = 0;
    }

    /**
     * Scans one level: block-scans `data`, recursively scans the block totals when there is more than
     * one block, then adds the scanned totals back as per-block offsets.
     * @param data buffer scanned in place at this level
     * @param count number of valid entries in `data`
     * @param level recursion depth, used to pick the scratch buffer for the block totals
     */
    private _scanRecursive(data: DataBuffer, count: number, level: number): void {
        const scanBlock = this._scanBlock;
        const addOffsets = this._addOffsets;
        // count === 1 must still run the block scan so the single element is written to its exclusive value (0);
        // returning here would leave it unchanged. Only an empty scan (or missing shaders) is a true no-op.
        if (count <= 0 || !scanBlock || !addOffsets) {
            return;
        }
        const numBlocks = Math.ceil(count / BlockSize);
        const blockSums = this._getLevelBuffer(level, numBlocks);

        const scanUbo = this._getUbo(count);
        scanBlock.setStorageBuffer("data", data);
        scanBlock.setStorageBuffer("blockSums", blockSums);
        scanBlock.setUniformBuffer("params", scanUbo);
        scanBlock.dispatch(numBlocks);

        if (numBlocks > 1) {
            // The block totals themselves need scanning before they can be used as offsets.
            this._scanRecursive(blockSums.getBuffer(), numBlocks, level + 1);

            const addUbo = this._getUbo(count);
            addOffsets.setStorageBuffer("data", data);
            addOffsets.setStorageBuffer("blockOffsets", blockSums);
            addOffsets.setUniformBuffer("params", addUbo);
            addOffsets.dispatch(numBlocks);
        }
    }

    /**
     * Releases all GPU resources held by the helper.
     */
    public dispose(): void {
        for (const ubo of this._ubos) {
            ubo.dispose();
        }
        for (const buffer of this._levelBuffers) {
            buffer.dispose();
        }
        this._ubos = [];
        this._levelBuffers = [];
        this._uboIndex = 0;
        this._scanBlock = null;
        this._addOffsets = null;
    }
}
10 changes: 10 additions & 0 deletions packages/dev/core/src/Engines/WebGPU/webgpuDrawContext.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@ export class WebGPUDrawContext implements IDrawContext {

public indirectDrawBuffer?: GPUBuffer;

/**
* @internal
* The caller-owned (external) indirect buffer and offset the cached {@link fastBundle} was recorded against,
* if any. Used to invalidate the bundle when the caller passes a different external indirect buffer/offset
* (the bundle records drawIndirect against a specific buffer, which is otherwise not part of the dirty key).
*/
public _externalIndirectBuffer?: GPUBuffer;
/**
 * @internal
 * Byte offset within {@link _externalIndirectBuffer} that the cached {@link fastBundle} was recorded against.
 * Stored as 0 when the bundle was recorded without an external indirect buffer.
 */
public _externalIndirectOffset = 0;

private _materialContextUpdateId: number;
private _bufferManager: WebGPUBufferManager;
private _useInstancing: boolean;
Expand Down
13 changes: 13 additions & 0 deletions packages/dev/core/src/Engines/abstractEngine.pure.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1345,6 +1345,19 @@ export abstract class AbstractEngine {
*/
public abstract drawElementsType(fillMode: number, indexStart: number, indexCount: number, instancesCount?: number): void;

/**
 * Issues an indexed, instanced draw whose arguments (instance count included) come from a GPU indirect
 * buffer owned by the caller. This base implementation always throws; the parameters are unused here.
 * @param _fillMode defines the primitive to use
 * @param _indexStart defines the starting index
 * @param _indexCount defines the number of indices per instance
 * @param _indirectBuffer the GPU buffer holding the draw-indexed-indirect arguments
 * @param _indirectByteOffset byte offset of the arguments within the buffer (default 0)
 * @throws Error always, in this base implementation
 */
public drawElementsInstancedIndirect(_fillMode: number, _indexStart: number, _indexCount: number, _indirectBuffer: DataBuffer, _indirectByteOffset = 0): void {
    const unsupportedMessage = "drawElementsInstancedIndirect is only supported on WebGPU engines.";
    throw new Error(unsupportedMessage);
}

/**
* Unbind the current render target texture from the webGL context
* @param texture defines the render target wrapper to unbind
Expand Down
50 changes: 46 additions & 4 deletions packages/dev/core/src/Engines/webgpuEngine.pure.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3722,7 +3722,7 @@ export class WebGPUEngine extends ThinWebGPUEngine {
}
}

private _draw(drawType: number, fillMode: number, start: number, count: number, instancesCount: number): void {
private _draw(drawType: number, fillMode: number, start: number, count: number, instancesCount: number, externalIndirect?: { buffer: GPUBuffer; offset: number }): void {
const renderPass = this._getCurrentRenderPass();
const bundleList = this._bundleList;

Expand Down Expand Up @@ -3750,9 +3750,19 @@ export class WebGPUEngine extends ThinWebGPUEngine {
return;
}

// The cached fast bundle records drawIndirect against a specific external buffer/offset, which is not part
// of the draw/material context dirty key. Invalidate it when the caller supplies a different external
// indirect buffer/offset (or stops/starts using one), so a stale bundle can't draw from the wrong buffer.
const externalIndirectChanged = externalIndirect
? this._currentDrawContext._externalIndirectBuffer !== externalIndirect.buffer || this._currentDrawContext._externalIndirectOffset !== externalIndirect.offset
: this._currentDrawContext._externalIndirectBuffer !== undefined;

if (
!this.compatibilityMode &&
(this._currentDrawContext.isDirty(this._currentMaterialContext.updateId) || this._currentMaterialContext.isDirty || this._currentMaterialContext.forceBindGroupCreation)
(this._currentDrawContext.isDirty(this._currentMaterialContext.updateId) ||
this._currentMaterialContext.isDirty ||
this._currentMaterialContext.forceBindGroupCreation ||
externalIndirectChanged)
) {
this._currentDrawContext.fastBundle = undefined;
}
Expand All @@ -3764,7 +3774,10 @@ export class WebGPUEngine extends ThinWebGPUEngine {
this._applyRenderPassChanges(bundleList);
if (!this._snapshotRendering.record) {
this._counters.numBundleReuseNonCompatMode++;
if (this._currentDrawContext.indirectDrawBuffer) {
if (externalIndirect) {
// The instance count lives in an app-owned indirect buffer written by a compute shader; the
// cached bundle already records drawIndexedIndirect against it, so nothing to update here.
} else if (this._currentDrawContext.indirectDrawBuffer) {
this._currentDrawContext.setIndirectData(count, instancesCount || 1, start);
Comment thread
CedricGuillemet marked this conversation as resolved.
}
bundleList.addBundle(this._currentDrawContext.fastBundle);
Expand Down Expand Up @@ -3838,7 +3851,15 @@ export class WebGPUEngine extends ThinWebGPUEngine {
// draw
const nonCompatMode = !this.compatibilityMode && !this._snapshotRendering.record;

if ((nonCompatMode || this._currentDrawContext._enableIndirectDrawInCompatMode) && this._currentDrawContext.indirectDrawBuffer) {
if (externalIndirect) {
// App-driven indirect draw: the draw arguments (including a GPU-computed instance count) live in a
// caller-owned buffer. Used by the Gaussian Splatting GPU culling path.
if (drawType === 0) {
renderPass2.drawIndexedIndirect(externalIndirect.buffer, externalIndirect.offset);
} else {
renderPass2.drawIndirect(externalIndirect.buffer, externalIndirect.offset);
}
} else if ((nonCompatMode || this._currentDrawContext._enableIndirectDrawInCompatMode) && this._currentDrawContext.indirectDrawBuffer) {
this._currentDrawContext.setIndirectData(count, instancesCount || 1, start);
if (drawType === 0) {
renderPass2.drawIndexedIndirect(this._currentDrawContext.indirectDrawBuffer, 0);
Expand All @@ -3852,6 +3873,10 @@ export class WebGPUEngine extends ThinWebGPUEngine {
}

if (nonCompatMode) {
// Remember which external indirect buffer/offset this bundle draws from, so it is invalidated above if
// the caller changes it on a later draw sharing this draw context.
this._currentDrawContext._externalIndirectBuffer = externalIndirect?.buffer;
this._currentDrawContext._externalIndirectOffset = externalIndirect?.offset ?? 0;
this._currentDrawContext.fastBundle = (renderPass2 as GPURenderBundleEncoder).finish();
bundleList.addBundle(this._currentDrawContext.fastBundle);
}
Expand Down Expand Up @@ -3882,6 +3907,23 @@ export class WebGPUEngine extends ThinWebGPUEngine {
this._draw(1, fillMode, verticesStart, verticesCount, instancesCount);
}

/**
 * Issues an indexed, instanced draw whose arguments (instance count included) are fetched from a
 * caller-owned GPU buffer via drawIndexedIndirect. The buffer must hold 5 consecutive u32 values,
 * [indexCount, instanceCount, firstIndex, baseVertex, firstInstance], and must have been created with the
 * {@link Constants.BUFFER_CREATIONFLAG_INDIRECT} usage. The Gaussian Splatting GPU culling path relies on
 * this so a visible count produced by a compute pass can drive the draw with no CPU readback.
 * @param fillMode defines the primitive to use
 * @param indexStart defines the starting index (used for pipeline/index-buffer binding)
 * @param indexCount defines the number of indices per instance (used for pipeline/index-buffer binding)
 * @param indirectBuffer the GPU buffer holding the draw-indexed-indirect arguments
 * @param indirectByteOffset byte offset of the arguments within the buffer (default 0)
 */
public override drawElementsInstancedIndirect(fillMode: number, indexStart: number, indexCount: number, indirectBuffer: DataBuffer, indirectByteOffset: number = 0): void {
    // Unwrap the engine-level data buffer to the raw GPUBuffer the render pass expects.
    const gpuBuffer = (indirectBuffer as WebGPUDataBuffer).underlyingResource as GPUBuffer;
    const externalIndirect = { buffer: gpuBuffer, offset: indirectByteOffset };
    // Draw type 0 selects the indexed path; the indirect draw call reads its arguments from the buffer.
    this._draw(0, fillMode, indexStart, indexCount, 1, externalIndirect);
}

//------------------------------------------------------------------------------
// Async Pipeline Pre-Warming
//------------------------------------------------------------------------------
Expand Down
Loading