Skip to content

Commit 7fa6071

Browse files
committed
Fix broken multi-GPU support (no unique deviceIndex used in salt). Remove broken cache system since kernel builds quickly.
1 parent 5cac43b commit 7fa6071

5 files changed

Lines changed: 51 additions & 95 deletions

File tree

Dispatcher.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,8 @@ void Dispatcher::run(const mode & mode) {
112112
d.m_memResult.setKernelArg(d.m_kernelIterate, 0);
113113
d.m_memMode.setKernelArg(d.m_kernelIterate, 1);
114114
CLMemory<cl_uchar>::setKernelArg(d.m_kernelIterate, 2, d.m_clScoreMax); // Updated in handleResult()
115-
CLMemory<cl_ulong>::setKernelArg(d.m_kernelIterate, 3, d.m_round); // Round information updated in deviceDispatch()
115+
CLMemory<cl_uint>::setKernelArg(d.m_kernelIterate, 3, d.m_index);
116+
// Round information updated in deviceDispatch()
116117
}
117118

118119
m_quit = false;
@@ -195,7 +196,7 @@ void Dispatcher::deviceDispatch(Device & d) {
195196
cl_event event;
196197
d.m_memResult.read(false, &event);
197198

198-
CLMemory<cl_ulong>::setKernelArg(d.m_kernelIterate, 3, ++d.m_round); // Round information updated in deviceDispatch()
199+
CLMemory<cl_uint>::setKernelArg(d.m_kernelIterate, 4, ++d.m_round); // Round information updated in deviceDispatch()
199200
enqueueKernelDevice(d, d.m_kernelIterate, m_size);
200201
clFlush(d.m_clQueue);
201202

Dispatcher.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ class Dispatcher {
5252
CLMemory<result> m_memResult;
5353
CLMemory<mode> m_memMode;
5454

55-
cl_ulong m_round;
55+
cl_uint m_round;
5656
};
5757

5858
public:

eradicate2.cl

Lines changed: 44 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -14,23 +14,27 @@ typedef struct __attribute__((packed)) {
1414
uint found;
1515
} result;
1616

17-
__kernel void eradicate2_iterate(__global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round);
18-
void eradicate2_result_update(const uchar * const hash, __global result * const pResult, const uchar score, const uchar scoreMax, const ulong round);
19-
void eradicate2_score_leading(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round);
20-
void eradicate2_score_benchmark(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round);
21-
void eradicate2_score_matching(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round);
22-
void eradicate2_score_range(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round);
23-
void eradicate2_score_leadingrange(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round);
24-
void eradicate2_score_mirror(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round);
25-
void eradicate2_score_doubles(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round);
26-
27-
__kernel void eradicate2_iterate(__global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round) {
17+
__kernel void eradicate2_iterate(__global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round);
18+
void eradicate2_result_update(const uchar * const hash, __global result * const pResult, const uchar score, const uchar scoreMax, const uint deviceIndex, const uint round);
19+
void eradicate2_score_leading(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round);
20+
void eradicate2_score_benchmark(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round);
21+
void eradicate2_score_matching(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round);
22+
void eradicate2_score_range(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round);
23+
void eradicate2_score_leadingrange(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round);
24+
void eradicate2_score_mirror(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round);
25+
void eradicate2_score_doubles(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round);
26+
27+
__kernel void eradicate2_iterate(__global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round) {
2828
ethhash h = { .q = { ERADICATE2_INITHASH } };
2929

30-
// Salt have index h.b[21:52] inclusive, which covers QWORDS with index h.q[3:5] inclusive (they represent h.b[24:47] inclusive)
31-
// We use two of those three QWORD indexes to generate a unique salt value for each round.
32-
h.q[3] += get_global_id(0);
33-
h.q[4] += round;
30+
// Salt have index h.b[21:52] inclusive, which covers WORDS with index h.d[6:12] inclusive (they represent h.b[24:51] inclusive)
31+
// We use three out of those six words to generate a unique salt value for each device, thread and round. We ignore any overflows
32+
// and assume that there'll never be more than 2**32 devices, threads or rounds. Worst case scenario with default settings
33+
// of 16777216 = 2**24 threads means the assumption fails after a device has tried 2**32 * 2**24 = 2**56 salts, enough to match
34+
// 14 characters in the address! A GTX 1070 with speed of ~700*10**6 combinations per second would hit this target after ~3 years.
35+
h.d[6] += deviceIndex;
36+
h.d[7] += get_global_id(0);
37+
h.d[8] += round;
3438

3539
// Hash
3640
sha3_keccakf(&h);
@@ -41,45 +45,46 @@ __kernel void eradicate2_iterate(__global result * const pResult, __global const
4145
*/
4246
switch (pMode->function) {
4347
case Benchmark:
44-
eradicate2_score_benchmark(h.b + 12, pResult, pMode, scoreMax, round);
48+
eradicate2_score_benchmark(h.b + 12, pResult, pMode, scoreMax, deviceIndex, round);
4549
break;
4650

4751
case Matching:
48-
eradicate2_score_matching(h.b + 12, pResult, pMode, scoreMax, round);
52+
eradicate2_score_matching(h.b + 12, pResult, pMode, scoreMax, deviceIndex, round);
4953
break;
5054

5155
case Leading:
52-
eradicate2_score_leading(h.b + 12, pResult, pMode, scoreMax, round);
56+
eradicate2_score_leading(h.b + 12, pResult, pMode, scoreMax, deviceIndex, round);
5357
break;
5458

5559
case Range:
56-
eradicate2_score_range(h.b + 12, pResult, pMode, scoreMax, round);
60+
eradicate2_score_range(h.b + 12, pResult, pMode, scoreMax, deviceIndex, round);
5761
break;
5862

5963
case Mirror:
60-
eradicate2_score_mirror(h.b + 12, pResult, pMode, scoreMax, round);
64+
eradicate2_score_mirror(h.b + 12, pResult, pMode, scoreMax, deviceIndex, round);
6165
break;
6266

6367
case Doubles:
64-
eradicate2_score_doubles(h.b + 12, pResult, pMode, scoreMax, round);
68+
eradicate2_score_doubles(h.b + 12, pResult, pMode, scoreMax, deviceIndex, round);
6569
break;
6670

6771
case LeadingRange:
68-
eradicate2_score_leadingrange(h.b + 12, pResult, pMode, scoreMax, round);
72+
eradicate2_score_leadingrange(h.b + 12, pResult, pMode, scoreMax, deviceIndex, round);
6973
break;
7074
}
7175
}
7276

73-
void eradicate2_result_update(const uchar * const H, __global result * const pResult, const uchar score, const uchar scoreMax, const ulong round) {
77+
void eradicate2_result_update(const uchar * const H, __global result * const pResult, const uchar score, const uchar scoreMax, const uint deviceIndex, const uint round) {
7478
if (score && score > scoreMax) {
7579
const uchar hasResult = atomic_inc(&pResult[score].found); // NOTE: If "too many" results are found it'll wrap around to 0 again and overwrite last result. Only relevant if global worksize exceeds MAX(uint).
7680

7781
// Save only one result for each score, the first.
7882
if (hasResult == 0) {
7983
// Reconstruct state with hash and extract salt
8084
ethhash h = { .q = { ERADICATE2_INITHASH } };
81-
h.q[3] += get_global_id(0);
82-
h.q[4] += round;
85+
h.d[6] += deviceIndex;
86+
h.d[7] += get_global_id(0);
87+
h.d[8] += round;
8388

8489
ethhash be;
8590

@@ -94,7 +99,7 @@ void eradicate2_result_update(const uchar * const H, __global result * const pRe
9499
}
95100
}
96101

97-
void eradicate2_score_leading(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round) {
102+
void eradicate2_score_leading(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round) {
98103
int score = 0;
99104

100105
for (int i = 0; i < 20; ++i) {
@@ -111,17 +116,17 @@ void eradicate2_score_leading(const uchar * const hash, __global result * const
111116
}
112117
}
113118

114-
eradicate2_result_update(hash, pResult, score, scoreMax, round);
119+
eradicate2_result_update(hash, pResult, score, scoreMax, deviceIndex, round);
115120
}
116121

117-
void eradicate2_score_benchmark(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round) {
122+
void eradicate2_score_benchmark(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round) {
118123
const size_t id = get_global_id(0);
119124
int score = 0;
120125

121-
eradicate2_result_update(hash, pResult, score, scoreMax, round);
126+
eradicate2_result_update(hash, pResult, score, scoreMax, deviceIndex, round);
122127
}
123128

124-
void eradicate2_score_matching(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round) {
129+
void eradicate2_score_matching(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round) {
125130
const size_t id = get_global_id(0);
126131
int score = 0;
127132

@@ -131,10 +136,10 @@ void eradicate2_score_matching(const uchar * const hash, __global result * const
131136
}
132137
}
133138

134-
eradicate2_result_update(hash, pResult, score, scoreMax, round);
139+
eradicate2_result_update(hash, pResult, score, scoreMax, deviceIndex, round);
135140
}
136141

137-
void eradicate2_score_range(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round) {
142+
void eradicate2_score_range(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round) {
138143
const size_t id = get_global_id(0);
139144
int score = 0;
140145

@@ -151,10 +156,10 @@ void eradicate2_score_range(const uchar * const hash, __global result * const pR
151156
}
152157
}
153158

154-
eradicate2_result_update(hash, pResult, score, scoreMax, round);
159+
eradicate2_result_update(hash, pResult, score, scoreMax, deviceIndex, round);
155160
}
156161

157-
void eradicate2_score_leadingrange(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round) {
162+
void eradicate2_score_leadingrange(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round) {
158163
const size_t id = get_global_id(0);
159164
int score = 0;
160165

@@ -177,10 +182,10 @@ void eradicate2_score_leadingrange(const uchar * const hash, __global result * c
177182
}
178183
}
179184

180-
eradicate2_result_update(hash, pResult, score, scoreMax, round);
185+
eradicate2_result_update(hash, pResult, score, scoreMax, deviceIndex, round);
181186
}
182187

183-
void eradicate2_score_mirror(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round) {
188+
void eradicate2_score_mirror(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round) {
184189
const size_t id = get_global_id(0);
185190
int score = 0;
186191

@@ -204,10 +209,10 @@ void eradicate2_score_mirror(const uchar * const hash, __global result * const p
204209
++score;
205210
}
206211

207-
eradicate2_result_update(hash, pResult, score, scoreMax, round);
212+
eradicate2_result_update(hash, pResult, score, scoreMax, deviceIndex, round);
208213
}
209214

210-
void eradicate2_score_doubles(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const ulong round) {
215+
void eradicate2_score_doubles(const uchar * const hash, __global result * const pResult, __global const mode * const pMode, const uchar scoreMax, const uint deviceIndex, const uint round) {
211216
const size_t id = get_global_id(0);
212217
int score = 0;
213218

@@ -220,5 +225,5 @@ void eradicate2_score_doubles(const uchar * const hash, __global result * const
220225
}
221226
}
222227

223-
eradicate2_result_update(hash, pResult, score, scoreMax, round);
228+
eradicate2_result_update(hash, pResult, score, scoreMax, deviceIndex, round);
224229
}

eradicate2.cpp

Lines changed: 3 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,10 @@
1212

1313
#if defined(__APPLE__) || defined(__MACOSX)
1414
#include <OpenCL/cl.h>
15-
#include <OpenCL/cl_ext.h> // Included to get topology to get an actual unique identifier per device
1615
#else
1716
#include <CL/cl.h>
18-
#include <CL/cl_ext.h> // Included to get topology to get an actual unique identifier per device
1917
#endif
2018

21-
#define CL_DEVICE_PCI_BUS_ID_NV 0x4008
22-
#define CL_DEVICE_PCI_SLOT_ID_NV 0x4009
23-
2419
#include "hexadecimal.hpp"
2520
#include "Dispatcher.hpp"
2621
#include "ArgParser.hpp"
@@ -117,18 +112,6 @@ std::vector<std::string> getBinaries(cl_program & clProgram) {
117112
return vReturn;
118113
}
119114

120-
unsigned int getUniqueDeviceIdentifier(const cl_device_id & deviceId) {
121-
#if defined(CL_DEVICE_TOPOLOGY_AMD)
122-
auto topology = clGetWrapper<cl_device_topology_amd>(clGetDeviceInfo, deviceId, CL_DEVICE_TOPOLOGY_AMD);
123-
if (topology.raw.type == CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD) {
124-
return (topology.pcie.bus << 16) + (topology.pcie.device << 8) + topology.pcie.function;
125-
}
126-
#endif
127-
cl_int bus_id = clGetWrapper<cl_int>(clGetDeviceInfo, deviceId, CL_DEVICE_PCI_BUS_ID_NV);
128-
cl_int slot_id = clGetWrapper<cl_int>(clGetDeviceInfo, deviceId, CL_DEVICE_PCI_SLOT_ID_NV);
129-
return (bus_id << 16) + slot_id;
130-
}
131-
132115
template <typename T> bool printResult(const T & t, const cl_int & err) {
133116
std::cout << ((t == NULL) ? lexical_cast::write(err) : "OK") << std::endl;
134117
return t == NULL;
@@ -139,11 +122,6 @@ bool printResult(const cl_int err) {
139122
return err != CL_SUCCESS;
140123
}
141124

142-
std::string getDeviceCacheFilename(cl_device_id & d) {
143-
const auto uniqueId = getUniqueDeviceIdentifier(d);
144-
return "cache-opencl." + lexical_cast::write(uniqueId);
145-
}
146-
147125
std::string keccakDigest(const std::string data) {
148126
char digest[32];
149127
sha3(data.c_str(), data.size(), digest, 32);
@@ -215,7 +193,6 @@ int main(int argc, char * * argv) {
215193
std::vector<size_t> vDeviceSkipIndex;
216194
size_t worksizeLocal = 128;
217195
size_t worksizeMax = 0; // Will be automatically determined later if not overriden by user
218-
bool bNoCache = false;
219196
size_t size = 16777216;
220197
std::string strAddress;
221198
std::string strInitCode;
@@ -237,7 +214,6 @@ int main(int argc, char * * argv) {
237214
argp.addMultiSwitch('s', "skip", vDeviceSkipIndex);
238215
argp.addSwitch('w', "work", worksizeLocal);
239216
argp.addSwitch('W', "work-max", worksizeMax);
240-
argp.addSwitch('n', "no-cache", bNoCache);
241217
argp.addSwitch('S', "size", size);
242218
argp.addSwitch('A', "address", strAddress);
243219
argp.addSwitch('I', "init-code", strInitCode);
@@ -267,7 +243,7 @@ int main(int argc, char * * argv) {
267243
const std::string strAddressBinary = parseHexadecimalBytes(strAddress);
268244
const std::string strInitCodeBinary = parseHexadecimalBytes(strInitCode);
269245
const std::string strInitCodeDigest = keccakDigest(strInitCodeBinary);
270-
const std::string strPreprocessorInitHash = makePreprocessorInitHashExpression(strAddressBinary, strInitCodeDigest);
246+
const std::string strPreprocessorInitStructure = makePreprocessorInitHashExpression(strAddressBinary, strInitCodeDigest);
271247

272248
mode mode = ModeFactory::benchmark();
273249
if (bModeBenchmark) {
@@ -302,7 +278,6 @@ int main(int argc, char * * argv) {
302278
std::vector<std::string> vDeviceBinary;
303279
std::vector<size_t> vDeviceBinarySize;
304280
cl_int errorCode;
305-
bool bUsedCache = false;
306281

307282
std::cout << "Devices:" << std::endl;
308283
for (size_t i = 0; i < vFoundDevices.size(); ++i) {
@@ -316,19 +291,8 @@ int main(int argc, char * * argv) {
316291
const auto strName = clGetWrapperString(clGetDeviceInfo, deviceId, CL_DEVICE_NAME);
317292
const auto computeUnits = clGetWrapper<cl_uint>(clGetDeviceInfo, deviceId, CL_DEVICE_MAX_COMPUTE_UNITS);
318293
const auto globalMemSize = clGetWrapper<cl_ulong>(clGetDeviceInfo, deviceId, CL_DEVICE_GLOBAL_MEM_SIZE);
319-
bool precompiled = false;
320-
321-
// Check if there's a prebuilt binary for this device and load it
322-
if(!bNoCache) {
323-
std::ifstream fileIn(getDeviceCacheFilename(deviceId), std::ios::binary);
324-
if (fileIn.is_open()) {
325-
vDeviceBinary.push_back(std::string((std::istreambuf_iterator<char>(fileIn)), std::istreambuf_iterator<char>()));
326-
vDeviceBinarySize.push_back(vDeviceBinary.back().size());
327-
precompiled = true;
328-
}
329-
}
330294

331-
std::cout << " GPU" << i << ": " << strName << ", " << globalMemSize << " bytes available, " << computeUnits << " compute units (precompiled = " << (precompiled ? "yes" : "no") << ")" << std::endl;
295+
std::cout << " GPU" << i << ": " << strName << ", " << globalMemSize << " bytes available, " << computeUnits << " compute units" << std::endl;
332296
vDevices.push_back(vFoundDevices[i]);
333297
mDeviceIndex[vFoundDevices[i]] = i;
334298
}
@@ -348,8 +312,6 @@ int main(int argc, char * * argv) {
348312
cl_program clProgram;
349313
if (vDeviceBinary.size() == vDevices.size()) {
350314
// Create program from binaries
351-
bUsedCache = true;
352-
353315
std::cout << " Loading kernel from binary..." << std::flush;
354316
const unsigned char * * pKernels = new const unsigned char *[vDevices.size()];
355317
for (size_t i = 0; i < vDeviceBinary.size(); ++i) {
@@ -378,7 +340,7 @@ int main(int argc, char * * argv) {
378340
// Build the program
379341
std::cout << " Building program..." << std::flush;
380342

381-
const std::string strBuildOptions = "-D ERADICATE2_MAX_SCORE=" + lexical_cast::write(ERADICATE2_MAX_SCORE) + " -D ERADICATE2_INITHASH=" + strPreprocessorInitHash;
343+
const std::string strBuildOptions = "-D ERADICATE2_MAX_SCORE=" + lexical_cast::write(ERADICATE2_MAX_SCORE) + " -D ERADICATE2_INITHASH=" + strPreprocessorInitStructure;
382344
if (printResult(clBuildProgram(clProgram, vDevices.size(), vDevices.data(), strBuildOptions.c_str(), NULL, NULL))) {
383345
#ifdef ERADICATE2_DEBUG
384346
std::cout << std::endl;
@@ -395,17 +357,6 @@ int main(int argc, char * * argv) {
395357
return 1;
396358
}
397359

398-
// Save binary for each device to improve future start times
399-
if( !bUsedCache && !bNoCache ) {
400-
std::cout << " Saving program..." << std::flush;
401-
auto binaries = getBinaries(clProgram);
402-
for (size_t i = 0; i < binaries.size(); ++i) {
403-
std::ofstream fileOut(getDeviceCacheFilename(vDevices[i]), std::ios::binary);
404-
fileOut.write(binaries[i].data(), binaries[i].size());
405-
}
406-
std::cout << "OK" << std::endl;
407-
}
408-
409360
std::cout << std::endl;
410361

411362
Dispatcher d(clContext, clProgram, worksizeMax == 0 ? size : worksizeMax, size);

help.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ usage: ./ERADICATE2 [OPTIONS]
4040
4141
Device control:
4242
-s, --skip <index> Skip device given by index.
43-
-n, --no-cache Don't load cached pre-compiled version of kernel.
4443
4544
Tweaking:
4645
-w, --work <size> Set OpenCL local work size. [default = 64]

0 commit comments

Comments
 (0)