fix: undo accidental crippling (#27)

stringhandler · web-flow · commit 1ea0c0ac4279 · 2024-11-26T09:04:26.000+02:00
A previous attempt to fix a bug on AMD GPUs accidentally crippled all
miners.

The main problem was trying to initialize the output to 0. This would
not change the hashrate but would result in valid nonces being missed
diff --git a/src/main.rs b/src/main.rs
@@ -514,7 +514,7 @@ fn run_thread<T: EngineImpl>(
                 debug!(target: LOG_TARGET, "Elapsed {:?} > {:?}", elapsed.elapsed().as_secs(), config.template_refresh_secs );
                 break;
             }
-            let num_iterations = 16;
+            let num_iterations = 1;
             let result = gpu_engine.mine(
                 &gpu_function,
                 &context,
diff --git a/src/opencl_engine.rs b/src/opencl_engine.rs
@@ -1,3 +1,4 @@
+use core::ffi::c_void;
 use std::{
     io::Read,
     ptr,
@@ -11,7 +12,7 @@ use opencl3::{
     context::Context,
     device::{Device, CL_DEVICE_TYPE_GPU},
     kernel::{ExecuteKernel, Kernel},
-    memory::{Buffer, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY},
+    memory::{Buffer, CL_MEM_COPY_HOST_PTR, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY},
     platform::{get_platforms, Platform},
     program::Program,
     types::{cl_ulong, CL_TRUE},
@@ -169,8 +170,8 @@ impl EngineImpl for OpenClEngine {
 
             debug!(target: LOG_TARGET, "OpenClEngine: created queue");
 
-            let batch_size = 1 << 19; // According to tests, but we can try work this out
-            let global_dimensions = [batch_size as usize];
+            // let batch_size = 1 << 19; // According to tests, but we can try work this out
+            // let global_dimensions = [batch_size as usize];
             // let max_workgroups = Device::new(context.context.devices()[0]).max_work_group_size().unwrap();
             // dbg!(max_compute);
             // let max_work_items = queue.max_work_item_dimensions();
@@ -195,14 +196,19 @@ impl EngineImpl for OpenClEngine {
             };
 
             debug!(target: LOG_TARGET, "OpenClEngine: buffer created",);
-            let output_buffer =
-                match Buffer::<cl_ulong>::create(&context.context, CL_MEM_WRITE_ONLY, 2, ptr::null_mut()) {
-                    Ok(buffer) => buffer,
-                    Err(e) => {
-                        error!(target: LOG_TARGET, "OpenClEngine: failed to create output buffer: {}", e);
-                        return Err(e.into());
-                    },
-                };
+            let initial_output = vec![0u64, 0u64];
+            let output_buffer = match Buffer::<cl_ulong>::create(
+                &context.context,
+                CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+                2,
+                initial_output.as_ptr() as *mut c_void,
+            ) {
+                Ok(buffer) => buffer,
+                Err(e) => {
+                    error!(target: LOG_TARGET, "OpenClEngine: failed to create output buffer: {}", e);
+                    return Err(e.into());
+                },
+            };
             // dbg!(block_size);
             // dbg!(grid_size);
             debug!(target: LOG_TARGET, "OpenClEngine: output buffer created",);
@@ -243,9 +249,9 @@ impl EngineImpl for OpenClEngine {
             queue.finish()?;
 
             let mut output = vec![0u64, 0u64];
-            debug!(target: LOG_TARGET, "OpenClEngine: mine output {:?}", output[0] > 0);
             queue.enqueue_read_buffer(&output_buffer, CL_TRUE, 0, output.as_mut_slice(), &[])?;
             if output[0] > 0 {
+                println!("output and diff {:?} {:?}", output[0], u64::MAX / output[1]);
                 return Ok((
                     Some(output[0]),
                     grid_size * block_size * num_iterations,
diff --git a/src/opencl_sha3.cl b/src/opencl_sha3.cl
@@ -17,167 +17,156 @@ constant static const ulong RC[] = {
     0x8000000000008080ul, 0x0000000080000001ul, 0x8000000080008008ul,
 };
 
-
 ulong swap_endian_64(ulong value) {
-    return ((value & 0x00000000000000FFULL) << 56) |
-           ((value & 0x000000000000FF00ULL) << 40) |
-           ((value & 0x0000000000FF0000ULL) << 24) |
-           ((value & 0x00000000FF000000ULL) << 8)  |
-           ((value & 0x000000FF00000000ULL) >> 8)  |
-           ((value & 0x0000FF0000000000ULL) >> 24) |
-           ((value & 0x00FF000000000000ULL) >> 40) |
-           ((value & 0xFF00000000000000ULL) >> 56);
+  return ((value & 0x00000000000000FFULL) << 56) |
+         ((value & 0x000000000000FF00ULL) << 40) |
+         ((value & 0x0000000000FF0000ULL) << 24) |
+         ((value & 0x00000000FF000000ULL) << 8) |
+         ((value & 0x000000FF00000000ULL) >> 8) |
+         ((value & 0x0000FF0000000000ULL) >> 24) |
+         ((value & 0x00FF000000000000ULL) >> 40) |
+         ((value & 0xFF00000000000000ULL) >> 56);
 }
 
+kernel void sha3(global ulong *buffer, ulong nonce_start, ulong difficulty,
+                 uint num_rounds, global ulong *output_1) {
+
+  // output_1[0] = 0;
+  // output_1[1] = 0;
+  ulong state[25];
+  for (uint i = 0; i < num_rounds; i++) {
+
+    for (uint j = 0; j < 25; j++) {
+      state[j] = 0;
+    }
+    state[0] = nonce_start + get_global_id(0) + i * get_global_size(0);
+    state[1] = buffer[1];
+    state[2] = buffer[2];
+    state[3] = buffer[3];
+
+    state[4] = buffer[4];
+    state[5] = buffer[5];
+
+    state[16] ^= 0x8000000000000000ul;
+
+    uint r, x, y, t;
+    ulong tmp, current, C[5];
+    for (r = 0; r < 24; ++r) {
+      for (x = 0; x < 5; ++x) {
+        C[x] = state[x] ^ state[x + 5] ^ state[x + 10] ^ state[x + 15] ^
+               state[x + 20];
+      }
+      for (x = 0; x < 5; ++x) {
+        tmp = C[(x + 4) % 5] ^ rotate(C[(x + 1) % 5], 1ul);
+        for (y = 0; y < 5; ++y) {
+          state[x + y * 5] ^= tmp;
+        }
+      }
+      current = state[1];
+      for (t = 0; t < 24; ++t) {
+        tmp = state[pos[t]];
+        state[pos[t]] = rotate(current, rot[t]);
+        current = tmp;
+      }
+      for (y = 0; y < 25; y += 5) {
+        for (x = 0; x < 5; ++x)
+          C[x] = state[y + x];
+        for (x = 0; x < 5; ++x) {
+          state[x + y] = C[x] ^ (~C[(x + 1) % 5] & C[(x + 2) % 5]);
+        }
+      }
+      state[0] ^= RC[r];
+    }
+
+    for (uint j = 4; j < 25; j++) {
+      state[j] = 0;
+    }
+    state[4] = 0x06;
+    state[16] = 0x8000000000000000ul;
+
+    for (r = 0; r < 24; ++r) {
+      for (x = 0; x < 5; ++x) {
+        C[x] = state[x] ^ state[x + 5] ^ state[x + 10] ^ state[x + 15] ^
+               state[x + 20];
+      }
+      for (x = 0; x < 5; ++x) {
+        tmp = C[(x + 4) % 5] ^ rotate(C[(x + 1) % 5], 1ul);
+        for (y = 0; y < 5; ++y) {
+          state[x + y * 5] ^= tmp;
+        }
+      }
+      current = state[1];
+      for (t = 0; t < 24; ++t) {
+        tmp = state[pos[t]];
+        state[pos[t]] = rotate(current, rot[t]);
+        current = tmp;
+      }
+      for (y = 0; y < 25; y += 5) {
+        for (x = 0; x < 5; ++x)
+          C[x] = state[y + x];
+        for (x = 0; x < 5; ++x) {
+          state[x + y] = C[x] ^ (~C[(x + 1) % 5] & C[(x + 2) % 5]);
+        }
+      }
+      state[0] ^= RC[r];
+    }
+
+    for (uint j = 4; j < 25; j++) {
+      state[j] = 0;
+    }
+    state[4] = 0x06;
+    state[16] = 0x8000000000000000ul;
+
+    // round 3
+    for (r = 0; r < 24; ++r) {
+      for (x = 0; x < 5; ++x) {
+        C[x] = state[x] ^ state[x + 5] ^ state[x + 10] ^ state[x + 15] ^
+               state[x + 20];
+      }
+      for (x = 0; x < 5; ++x) {
+        tmp = C[(x + 4) % 5] ^ rotate(C[(x + 1) % 5], 1ul);
+        for (y = 0; y < 5; ++y) {
+          state[x + y * 5] ^= tmp;
+        }
+      }
+      current = state[1];
+      for (t = 0; t < 24; ++t) {
+        tmp = state[pos[t]];
+        state[pos[t]] = rotate(current, rot[t]);
+        current = tmp;
+      }
+      for (y = 0; y < 25; y += 5) {
+        for (x = 0; x < 5; ++x)
+          C[x] = state[y + x];
+        for (x = 0; x < 5; ++x) {
+          state[x + y] = C[x] ^ (~C[(x + 1) % 5] & C[(x + 2) % 5]);
+        }
+      }
+      state[0] ^= RC[r];
+    }
 
-kernel void sha3(global ulong *buffer,
-                 ulong nonce_start, ulong difficulty,
-                 uint num_rounds, global ulong *output_1
-                ) {
-
-output_1[0] = 0;
-output_1[1] = 0;
-ulong state[25];
-for (uint i = 0;i< num_rounds; i++) {
-
-  for (uint j = 0; j < 25; j++) {
-    state[j] = 0;
-  }
-  state[0] = nonce_start + get_global_id(0) + i * get_global_size(0);
-  state[1] = buffer[1];
-  state[2] = buffer[2];
-  state[3] = buffer[3];
-
-  state[4] = buffer[4];
-  state[5] = buffer[5];
-  
-  state[16] ^= 0x8000000000000000ul;
-
-
-
-  
- 
-     uint r, x, y, t;
-     ulong tmp, current, C[5];
-     for (r = 0; r < 24; ++r) {
-       for (x = 0; x < 5; ++x) {
-         C[x] = state[x] ^ state[x + 5] ^ state[x + 10] ^ state[x + 15] ^
-                state[x + 20];
-       }
-       for (x = 0; x < 5; ++x) {
-         tmp = C[(x + 4) % 5] ^ rotate(C[(x + 1) % 5], 1ul);
-         for (y = 0; y < 5; ++y) {
-           state[x + y * 5] ^= tmp;
-         }
-       }
-       current = state[1];
-       for (t = 0; t < 24; ++t) {
-         tmp = state[pos[t]];
-         state[pos[t]] = rotate(current, rot[t]);
-         current = tmp;
-       }
-       for (y = 0; y < 25; y += 5) {
-         for (x = 0; x < 5; ++x)
-           C[x] = state[y + x];
-         for (x = 0; x < 5; ++x) {
-           state[x + y] = C[x] ^ (~C[(x + 1) % 5] & C[(x + 2) % 5]);
-         }
-       }
-       state[0] ^= RC[r];
-     }
-   
-
-  for (uint j = 4; j < 25; j++) {
-    state[j] = 0;
-  }
-  state[4] = 0x06;
-  state[16] = 0x8000000000000000ul;
-
-  for (r = 0; r < 24; ++r) {
-       for (x = 0; x < 5; ++x) {
-         C[x] = state[x] ^ state[x + 5] ^ state[x + 10] ^ state[x + 15] ^
-                state[x + 20];
-       }
-       for (x = 0; x < 5; ++x) {
-         tmp = C[(x + 4) % 5] ^ rotate(C[(x + 1) % 5], 1ul);
-         for (y = 0; y < 5; ++y) {
-           state[x + y * 5] ^= tmp;
-         }
-       }
-       current = state[1];
-       for (t = 0; t < 24; ++t) {
-         tmp = state[pos[t]];
-         state[pos[t]] = rotate(current, rot[t]);
-         current = tmp;
-       }
-       for (y = 0; y < 25; y += 5) {
-         for (x = 0; x < 5; ++x)
-           C[x] = state[y + x];
-         for (x = 0; x < 5; ++x) {
-           state[x + y] = C[x] ^ (~C[(x + 1) % 5] & C[(x + 2) % 5]);
-         }
-       }
-       state[0] ^= RC[r];
-     }
-   
-
-  for (uint j = 4; j < 25; j++) {
-    state[j] = 0;
-  }
-  state[4] = 0x06;
-  state[16] = 0x8000000000000000ul;
-
-  // round 3
-   for (r = 0; r < 24; ++r) {
-       for (x = 0; x < 5; ++x) {
-         C[x] = state[x] ^ state[x + 5] ^ state[x + 10] ^ state[x + 15] ^
-                state[x + 20];
-       }
-       for (x = 0; x < 5; ++x) {
-         tmp = C[(x + 4) % 5] ^ rotate(C[(x + 1) % 5], 1ul);
-         for (y = 0; y < 5; ++y) {
-           state[x + y * 5] ^= tmp;
-         }
-       }
-       current = state[1];
-       for (t = 0; t < 24; ++t) {
-         tmp = state[pos[t]];
-         state[pos[t]] = rotate(current, rot[t]);
-         current = tmp;
-       }
-       for (y = 0; y < 25; y += 5) {
-         for (x = 0; x < 5; ++x)
-           C[x] = state[y + x];
-         for (x = 0; x < 5; ++x) {
-           state[x + y] = C[x] ^ (~C[(x + 1) % 5] & C[(x + 2) % 5]);
-         }
-       }
-       state[0] ^= RC[r];
-     }
-
-
-     // check difficulty
-     ulong swap = swap_endian_64(state[0]);
-     if (swap < difficulty) {
-       output_1[0] = nonce_start + get_global_id(0) + i * get_global_size(0);
-       output_1[1] = swap; 
-     }
-     else {
+    // check difficulty
+    ulong swap = swap_endian_64(state[0]);
+    if (swap < difficulty) {
       if (output_1[1] == 0 || output_1[1] > swap) {
-          output_1[1] = swap;
+        output_1[0] = nonce_start + get_global_id(0) + i * get_global_size(0);
+        output_1[1] = swap;
       }
-     // if (output_1[1] < nonce_start+ get_global_id(0)) {
-     //   output_1[1] = nonce_start + get_global_id(0);
-     // }
-     }
+    } else {
+      if (output_1[1] == 0 || output_1[1] > swap) {
+        // output_1[0] = nonce_start + get_global_id(0) + i *
+        // get_global_size(0);
+        output_1[1] = swap;
+      }
+      // if (output_1[1] < nonce_start+ get_global_id(0)) {
+      //   output_1[1] = nonce_start + get_global_id(0);
+      // }
+    }
 
-     //output_1[0] = difficulty;
-     //  output_1[0] = nonce_start + get_global_id(0) ;
+    // output_1[0] = difficulty;
+    //   output_1[0] = nonce_start + get_global_id(0) ;
     //  output_1[0] = 1;
-}
-
-                
-
+  }
 
   // // Compare difficulty
   // bool le = true;
@@ -225,7 +214,8 @@ for (uint i = 0;i< num_rounds; i++) {
   //       // n -= d
   //       int r = 0;
   //       for (int j = 31; j >= 0; --j) {
-  //         // There is no temporary overflow, because in OpenCL uchar + uchar is
+  //         // There is no temporary overflow, because in OpenCL uchar + uchar
+  //         is
   //         // ulong (not really sure, but it's bigger than uchar)
   //         if (n[j] < output_buffer[j] + r) {
   //           n[j] = n[j] - r - output_buffer[j];

Original file line number	Diff line number	Diff line change
`@@ -514,7 +514,7 @@ fn run_thread<T: EngineImpl>(`
`514`	`514`	`debug!(target: LOG_TARGET, "Elapsed {:?} > {:?}", elapsed.elapsed().as_secs(), config.template_refresh_secs );`
`515`	`515`	`break;`
`516`	`516`	`}`
`517`		`- let num_iterations = 16;`
	`517`	`+ let num_iterations = 1;`
`518`	`518`	`let result = gpu_engine.mine(`
`519`	`519`	`&gpu_function,`
`520`	`520`	`&context,`