Skip to content
10 changes: 9 additions & 1 deletion dub.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"dependencies": {
"derelict-cl" : "~>3.2.0",
"bindbc-cuda": "~>0.1.0",
"metal-d": "~>0.5.3",
"taggedalgebraic": "~>0.10.7"
},
"configurations": [
Expand Down Expand Up @@ -35,5 +36,12 @@
"targetType": "executable",
"versions": ["DComputeTestOpenCL"],
},
]
{
"name" : "test-metal",
"dflags": ["-mdcompute-targets=metal-400", "-version=LDC_DCompute","-oq"],
"dflags-ldc": ["-wi"],
"targetType": "executable",
"versions": ["DComputeTestMetal"],
},
]
}
30 changes: 30 additions & 0 deletions source/dcompute/driver/metal/buffer.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
module dcompute.driver.metal.buffer;
import metal;
import dcompute.driver.metal.program;
import dcompute.driver.metal;
import core.stdc.string;

/**
 * Pairs a Metal buffer object with the host-side array it is associated with.
 *
 * Params:
 *     T = element type used when viewing the buffer contents
 */
struct Buffer(T)
{
    /// The underlying Metal buffer object.
    MTLBuffer mtlBuffer;

    /// Host memory associated with this buffer.
    T[] hostMemory;

    /**
     * Stores the given Metal buffer together with its host array.
     *
     * Params:
     *     _mtlBuffer = Metal buffer object to wrap
     *     array      = host array to remember alongside it
     */
    this(MTLBuffer _mtlBuffer, T[] array)
    {
        this.mtlBuffer = _mtlBuffer;
        this.hostMemory = array;
    }

    /// Returns: the pointer reported by `mtlBuffer.contents()`, typed as `T*`.
    T* contents()
    {
        auto rawContents = mtlBuffer.contents();
        return cast(T*) rawContents;
    }

    /**
     * Resets both fields of this struct to `null`.
     *
     * NOTE(review): this only clears the references held here; it does not call
     * any release method on the Metal object — confirm that is intended.
     */
    void release()
    {
        hostMemory = null;
        mtlBuffer = null;
    }
}
44 changes: 44 additions & 0 deletions source/dcompute/driver/metal/device.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
module dcompute.driver.metal.device;
import dcompute.driver.metal.buffer;
import core.stdc.string;
import metal;

/**
 * Thin wrapper around a Metal device handle that can create buffers.
 */
struct Device
{
    /**
     * The wrapped $(D MTLDevice), stored as an untyped pointer.
     *
     * It is kept as $(D void*) because, when an array of $(D Device) is stored,
     * the linker looks for the $(D MTLDevice) symbol and fails to find it (it is
     * an Objective-C binding), so the handle has to be wrapped this way.
     */
    void* raw;

    /// Returns: `raw` cast back to $(D MTLDevice).
    @property MTLDevice mtlDevice()
    {
        return cast(MTLDevice) raw;
    }

    /// Wraps the given $(D MTLDevice) by storing it as an untyped pointer.
    this(MTLDevice device)
    {
        raw = cast(void*)device;
    }

    /**
     * Asks the device for a new buffer using `MTLResourceOptions.StorageModeShared`.
     *
     * Params:
     *     sizeInBytes = requested buffer length in bytes
     * Returns: whatever the device's `newBuffer` call returns.
     */
    MTLBuffer newBuffer(size_t sizeInBytes)
    {
        return mtlDevice.newBuffer(sizeInBytes, MTLResourceOptions.StorageModeShared);
    }

    /**
     * Creates a buffer sized for `hostMemory` and copies the array into it.
     *
     * An empty (or null) array performs no copy and simply returns the wrapper
     * around whatever `newBuffer` produced. For a non-empty array, a missing
     * destination pointer is reported and treated as fatal instead of being
     * handed to `memcpy`.
     *
     * Params:
     *     hostMemory = host array whose bytes initialise the buffer
     * Returns: a `Buffer!T` holding the new Metal buffer and `hostMemory`.
     */
    Buffer!T makeBuffer(T)(T[] hostMemory)
    {
        import core.stdc.stdio : printf;

        immutable size_t sizeInBytes = hostMemory.length * T.sizeof;

        auto mtlBuffer = newBuffer(sizeInBytes);
        auto buffer = Buffer!T(mtlBuffer, hostMemory);

        // Nothing to copy: keep the original behaviour of returning as-is.
        if (hostMemory.ptr is null || sizeInBytes == 0)
        {
            return buffer;
        }

        // Never pass a null destination to memcpy: fail loudly, in the same
        // printf + assert(0) style used by the rest of the driver.
        auto destination = (mtlBuffer is null) ? null : mtlBuffer.contents();
        if (destination is null)
        {
            printf("Error: Could not allocate a Metal buffer of %zu bytes.\n", sizeInBytes);
            assert(0);
        }

        memcpy(destination, hostMemory.ptr, sizeInBytes);

        return buffer;
    }
}
12 changes: 12 additions & 0 deletions source/dcompute/driver/metal/kernel.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
module dcompute.driver.metal.kernel;
import metal.library;

/**
 * Wrapper around a Metal function object, tagged with the kernel's D type.
 *
 * Params:
 *     F = function type of the kernel, or `void` for an untyped kernel
 */
struct Kernel(F) if (is(F==function) || is(F==void))
{
    /// The wrapped Metal function object.
    MTLFunction kernelFunction;

    /// Wraps the given Metal function object.
    this(MTLFunction _kernelFunction)
    {
        this.kernelFunction = _kernelFunction;
    }
}
26 changes: 26 additions & 0 deletions source/dcompute/driver/metal/package.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
module dcompute.driver.metal;
import ldc.dcompute;
import std.range;
import std.meta;
import std.traits;

public import dcompute.driver.metal.buffer;
public import dcompute.driver.metal.device;
public import dcompute.driver.metal.kernel;
public import dcompute.driver.metal.platform;
public import dcompute.driver.metal.program;
public import dcompute.driver.metal.queue;


/**
 * Maps the parameter types of kernel function type `F` to the types used for
 * the corresponding host-side arguments.
 *
 * Every parameter that matches `Pointer!(n, U)` becomes `Buffer!U`; every other
 * parameter type is passed through unchanged.
 */
template HostArgsOf(F)
{
    // Per-parameter mapping applied by staticMap below.
    template hostTypeOf(Param)
    {
        static if (is(Param: Pointer!(n,U), uint n, U))
        {
            alias hostTypeOf = Buffer!U;
        }
        else
        {
            alias hostTypeOf = Param;
        }
    }

    alias KernelParams = Parameters!F;
    alias HostArgsOf = staticMap!(hostTypeOf, KernelParams);
}
26 changes: 26 additions & 0 deletions source/dcompute/driver/metal/platform.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
module dcompute.driver.metal.platform;

import dcompute.driver.metal.device;
import metal.device;

/**
 * Entry point for discovering Metal devices.
 */
struct Platform
{
    /**
     * Wraps every device reported by `MTLCopyAllDevices()` in a `Device`.
     *
     * Returns: a newly allocated array with one `Device` per reported device,
     *          in the same order.
     */
    static Device[] getDevices()
    {
        auto available = MTLCopyAllDevices();
        auto wrapped = new Device[available.length];

        int idx = 0;
        while (idx < available.length)
        {
            wrapped[idx] = Device(available[idx]);
            ++idx;
        }

        return wrapped;
    }

    /// Returns: a `Device` wrapping the result of `MTLCreateSystemDefaultDevice()`.
    static Device getDefaultDevice()
    {
        return Device(MTLCreateSystemDefaultDevice());
    }
}
62 changes: 62 additions & 0 deletions source/dcompute/driver/metal/program.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
module dcompute.driver.metal.program;
import dcompute.driver.metal.device;
import dcompute.driver.metal.kernel;
import objc;
import foundation;
import core.stdc.stdio;
import std.string;
import std.path;
import metal.library;
import metal.device;

struct Program
{
MTLLibrary metalLibrary;

Device device;

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why does the program contain a device?
The program represents the code.
It seems to me that you should split out the concept of a kernel from its binding to a pipeline state object.


/**
 * Looks up a function in `metalLibrary` by name.
 *
 * Params:
 *     name = zero-terminated name of the kernel function
 * Returns: an untyped `Kernel!void` wrapping the function that was found.
 *
 * If the library returns `null` for the name, an error is printed and the
 * program is halted with `assert(0)`.
 */
Kernel!void getKernelByName(immutable(char)* name)
{
    auto nsName = NSString.create(fromStringz(name));
    auto found = metalLibrary.newFunctionWithName(nsName);

    if (found !is null)
    {
        return Kernel!void(found);
    }

    printf("Error: Could not find kernel function %s in library.\n", name);
    assert(0);
}

/**
 * Typed lookup of kernel `k`: resolves it by its mangled name through
 * `getKernelByName` and casts the untyped result to `Kernel!(typeof(k))`.
 */
Kernel!(typeof(k)) getKernel(alias k)()
{
    immutable(char)* mangledName = k.mangleof.ptr;
    return cast(typeof(return)) getKernelByName(mangledName);
}

/**
 * Loads a Metal library from a file and wraps it in a `Program`.
 *
 * Params:
 *     device = device used to load the library
 *     path   = path to the library file; it is made absolute before loading
 * Returns: a `Program` holding the loaded library and `device`.
 *
 * If loading yields `null`, the error's localized description is printed and
 * the program is halted with `assert(0)`.
 */
static Program fromFile(Device device, string path)
{
    NSError error;
    auto libraryUrl = NSURL.fromPath(NSString.create(absolutePath(path)));

    auto library = device.mtlDevice.newLibrary(libraryUrl, error);

    if (library !is null)
    {
        return Program(library, device);
    }

    printf("Error loading .metallib: %s\n", error.localizedDescription().ptr);
    assert(0);
}

__gshared static Program globalProgram;

/// Drops this program's reference to the Metal library by setting it to `null`.
void unload()
{
    this.metalLibrary = null;
}
}
105 changes: 105 additions & 0 deletions source/dcompute/driver/metal/queue.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
module dcompute.driver.metal.queue;
import dcompute.driver.metal.buffer;

import dcompute.driver.metal;
import dcompute.driver.metal.device;
import dcompute.driver.metal.program;
import metal;
import metal.argument;
import metal.types;
import core.stdc.stdio;
import objc;
import foundation;

struct Queue

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a pipeline state seems much more associated with a Queue.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes agreed, I moved it to Queue as it fits better there

{
Device device;
MTLCommandQueue commandQueue;
MTLCommandBuffer lastActiveBuffer;

// TODO(asadbek): explore options to make the use of async execution with events
this (Device _device /*bool async*/)
{
device = _device;

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

device is unused outside of the constructor, why cache a reference to it in the Queue?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know how to encode it another way. I use the device inside the `opCall` of the struct returned by `enqueue` (which makes it a functor-style call). I could add a new parameter to pass the device in, but I thought it would be nicer to keep it as a field than to add another parameter to `enqueue`.

commandQueue = device.mtlDevice.newCommandQueue();
}

/**
 * Prepares a launch of kernel `k` on this queue.
 *
 * Params:
 *     _grid  = number of threadgroups in each of the three dimensions
 *     _block = number of threads per threadgroup in each dimension
 * Returns: a callable object; calling it with the kernel's host-side arguments
 *          (see `HostArgsOf`) encodes the dispatch and commits it.
 *
 * The kernel is looked up in `Program.globalProgram`. The committed command
 * buffer is remembered in `lastActiveBuffer` so that `finish` can wait on it.
 */
auto enqueue(alias k)(uint[3] _grid, uint[3] _block)
{
    static struct Call
    {
        Queue* q;
        uint[3] grid, block;

        this(Queue* _q, uint[3] _grid, uint[3] _block)
        {
            q = _q;
            grid = _grid;
            block = _block;
        }

        /// Binds `args` to the kernel, then encodes and commits one dispatch.
        void opCall(HostArgsOf!(typeof(k)) args)
        {
            NSError error;

            auto kernel = Program.globalProgram.getKernel!k();

            auto pipelineState = q.device.mtlDevice.newComputePipelineStateWithFunction(
                kernel.kernelFunction,
                MTLPipelineOption.None,
                null,
                error
            );

            if (pipelineState is null)
            {
                printf("Error: Backend compilation failed: %s\n", error.localizedDescription().ptr);
                assert(0);
            }

            auto commandBuffer = q.commandQueue.commandBuffer();

            auto computeEncoder = commandBuffer.computeCommandEncoder();

            computeEncoder.setComputePipelineState(pipelineState);

            // Argument i of the call is bound to index i: buffers by
            // reference, scalars by copying their bytes.
            foreach (i, arg; args)
            {
                static if (is(typeof(arg): Buffer!U, U))
                {
                    computeEncoder.setBuffer(arg.mtlBuffer, 0, i);
                }
                else static if (__traits(isScalar, typeof(arg)))
                {
                    computeEncoder.setBytes(&arg, typeof(arg).sizeof, i);
                }
                else
                {
                    static assert(0, "Unsupported argument type for Metal kernel dispatch!");
                }
            }

            // dispatchThreads takes the TOTAL number of threads in the grid,
            // not the number of threadgroups. `grid` counts threadgroups, so
            // scale each dimension by the threadgroup size (in size_t, to avoid
            // 32-bit overflow of the product).
            auto threadsPerGrid = MTLSize(
                cast(size_t) grid[0] * block[0],
                cast(size_t) grid[1] * block[1],
                cast(size_t) grid[2] * block[2]
            );

            auto threadsPerThreadgroup = MTLSize(block[0], block[1], block[2]);

            computeEncoder.dispatchThreads(threadsPerGrid, threadsPerThreadgroup);

            computeEncoder.endEncoding();
            commandBuffer.commit();

            q.lastActiveBuffer = commandBuffer;
        }
    }

    return Call(&this, _grid, _block);
}

/**
 * Waits for the most recently committed command buffer, if there is one.
 *
 * When `lastActiveBuffer` is set, this waits until it has completed, calls
 * `release()` on it and resets the field to `null`. Otherwise it does nothing.
 */
void finish()
{
    if (lastActiveBuffer is null)
    {
        return;
    }

    lastActiveBuffer.waitUntilCompleted();
    lastActiveBuffer.release();
    lastActiveBuffer = null;
}
}
Loading