1- // Test: Verify GPU array addition matches CPU computation
2- // Use a small array so the test runs on both real Apple Silicon and the
3- // CI paravirtual GPU (which can't handle the 108 M element sample size).
4- # define METAL_ADDER_ARRAY_LENGTH 10000
1+ // Verify the add_arrays GPU kernel from 01-MetalAdder.
2+ // Loads default.metallib by explicit path so this test works on both real
3+ // Apple Silicon and the CI paravirtual GPU, where newDefaultLibrary() is
4+ // unusable (that API searches for an app bundle and silently fails on the VM).
55
6- #include < iostream>
76#include < cstdlib>
8- #include < cmath >
7+ #include < iostream >
98
109#define NS_PRIVATE_IMPLEMENTATION
1110#define CA_PRIVATE_IMPLEMENTATION
1413#include " Metal/Metal.hpp"
1514#include " QuartzCore/QuartzCore.hpp"
1615
17- # include " MetalAdder.hpp "
16+ static const unsigned int kN = 10000 ;
1817
1918int main ()
2019{
@@ -24,31 +23,74 @@ int main()
2423 std::cerr << " FAIL: No Metal device found." << std::endl;
2524 return 1 ;
2625 }
27- std::string deviceName = device->name ()->utf8String ();
28- std::cout << " Running on " << deviceName << std::endl;
26+ std::cout << " Running on " << device->name ()->utf8String () << std::endl;
27+
28+ NS::Error *error = nullptr ;
2929
30- // MetalAdder uses newDefaultLibrary() which the Apple Paravirtual device
31- // cannot load correctly — results are silently all-zero regardless of size.
32- // Tests 02 and 03 load their metallib explicitly and run fine on the VM.
33- if (deviceName. find ( " Paravirtual " ) != std::string::npos )
30+ // Load by explicit path — works on both real and paravirtual GPUs.
31+ auto libPath = NS ::String::string ( " default.metallib " , NS::ASCIIStringEncoding);
32+ MTL::Library *lib = device-> newLibrary (libPath, &error);
33+ if (!lib )
3434 {
35- std::cout << " SKIP: newDefaultLibrary() not supported on Paravirtual device." << std::endl;
35+ std::cerr << " FAIL: Could not load default.metallib: "
36+ << (error ? error->description ()->utf8String () : " unknown" ) << std::endl;
3637 device->release ();
37- return 77 ; // CTest SKIP_RETURN_CODE
38+ return 1 ;
3839 }
3940
40- MetalAdder *adder = new MetalAdder (device);
41+ auto fnName = NS::String::string (" add_arrays" , NS::ASCIIStringEncoding);
42+ MTL::Function *fn = lib->newFunction (fnName);
43+ lib->release ();
44+ if (!fn)
45+ {
46+ std::cerr << " FAIL: add_arrays not found in default.metallib" << std::endl;
47+ device->release ();
48+ return 1 ;
49+ }
4150
42- // Run GPU addition
43- adder->sendComputeCommand ();
51+ MTL::ComputePipelineState *pso = device->newComputePipelineState (fn, &error);
52+ fn->release ();
53+ if (!pso)
54+ {
55+ std::cerr << " FAIL: Could not create pipeline state" << std::endl;
56+ device->release ();
57+ return 1 ;
58+ }
59+
60+ MTL::CommandQueue *queue = device->newCommandQueue ();
61+
62+ // Allocate shared-memory buffers and fill inputs with random data.
63+ size_t nbytes = kN * sizeof (float );
64+ MTL::Buffer *bufA = device->newBuffer (nbytes, MTL::ResourceStorageModeShared);
65+ MTL::Buffer *bufB = device->newBuffer (nbytes, MTL::ResourceStorageModeShared);
66+ MTL::Buffer *bufResult = device->newBuffer (nbytes, MTL::ResourceStorageModeShared);
67+
68+ float *a = (float *)bufA->contents ();
69+ float *b = (float *)bufB->contents ();
70+ for (unsigned int i = 0 ; i < kN ; i++)
71+ {
72+ a[i] = (float )rand () / RAND_MAX;
73+ b[i] = (float )rand () / RAND_MAX;
74+ }
4475
45- // Verify against CPU
46- float *a = (float *)adder->_mBufferA ->contents ();
47- float *b = (float *)adder->_mBufferB ->contents ();
48- float *result = (float *)adder->_mBufferResult ->contents ();
76+ // Dispatch the kernel.
77+ auto cmdBuf = queue->commandBuffer ();
78+ auto enc = cmdBuf->computeCommandEncoder ();
79+ enc->setComputePipelineState (pso);
80+ enc->setBuffer (bufA, 0 , 0 );
81+ enc->setBuffer (bufB, 0 , 1 );
82+ enc->setBuffer (bufResult, 0 , 2 );
83+ NS::UInteger tgSize = pso->maxTotalThreadsPerThreadgroup ();
84+ if (tgSize > kN ) tgSize = kN ;
85+ enc->dispatchThreads (MTL::Size::Make (kN , 1 , 1 ), MTL::Size::Make (tgSize, 1 , 1 ));
86+ enc->endEncoding ();
87+ cmdBuf->commit ();
88+ cmdBuf->waitUntilCompleted ();
4989
90+ // Verify against CPU reference.
91+ float *result = (float *)bufResult->contents ();
5092 int errors = 0 ;
51- for (unsigned long i = 0 ; i < arrayLength ; i++)
93+ for (unsigned int i = 0 ; i < kN ; i++)
5294 {
5395 if (result[i] != (a[i] + b[i]))
5496 {
@@ -59,14 +101,18 @@ int main()
59101 }
60102 }
61103
62- delete adder;
104+ pso->release ();
105+ queue->release ();
106+ bufA->release ();
107+ bufB->release ();
108+ bufResult->release ();
63109 device->release ();
64110
65111 if (errors > 0 )
66112 {
67- std::cerr << " FAIL: " << errors << " mismatches out of " << arrayLength << std::endl;
113+ std::cerr << " FAIL: " << errors << " mismatches out of " << kN << std::endl;
68114 return 1 ;
69115 }
70- std::cout << " PASS: GPU addition matches CPU (" << arrayLength << " elements)" << std::endl;
116+ std::cout << " PASS: GPU addition matches CPU (" << kN << " elements)" << std::endl;
71117 return 0 ;
72118}
0 commit comments