fix error-free arithmetic

tk-yoshimura · tk-yoshimura · commit 63b95338cf16 · 2022-09-18T23:06:25.000+09:00
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@
 
 **Supports High-Dimensional Convolution Neural Networks.** (Complex, Quaternion, Vector3D)
 
-For regression problems: **High precision calculate** by FP32-FP32 arithmetic (1/8 error of FP32 arithmetic)
+For regression problems: **High-precision calculation** using FP32-FP32 arithmetic
 
 ## Requirement  
 .NET 6.0  
diff --git a/TensorShaderCudaBackend/ShaderDefines.cs b/TensorShaderCudaBackend/ShaderDefines.cs
@@ -402,7 +402,7 @@ static __inline__ __device__ void floatfloat_hilo_add(float &hi, float &lo, floa
                 public static string AtomicAdd =>
                 $@"
                 static __inline__ __device__ void floatfloat_atomicadd(float *ptr, float hi, float lo){{
-                    float tmp = atomicAdd(ptr, hi + lo);
+                    float tmp = atomicAdd(ptr, hi);
                     atomicAdd(ptr + 1, lo - (((tmp + hi) - tmp) - hi));
                 }}";
 
@@ -465,9 +465,9 @@ static __inline__ __device__ void complex_mulgrad(float2 &hi, float2 &lo, float2
                     static __inline__ __device__ void floatfloat_atomicadd(float2 *ptr, float2 hi, float2 lo){{
                         float *ptr_float = (float*)ptr;
 
-                        float tmpx = atomicAdd(ptr_float, hi.x + lo.x);
+                        float tmpx = atomicAdd(ptr_float, hi.x);
                         atomicAdd(ptr_float + 1, lo.x - (((tmpx + hi.x) - tmpx) - hi.x));
-                        float tmpy = atomicAdd(ptr_float + 2, hi.y + lo.y);
+                        float tmpy = atomicAdd(ptr_float + 2, hi.y);
                         atomicAdd(ptr_float + 3, lo.y - (((tmpy + hi.y) - tmpy) - hi.y));
                     }}";
                 }
@@ -585,13 +585,13 @@ static __inline__ __device__ void quaternion_mulgrad(float4 &hi, float4 &lo, flo
                     static __inline__ __device__ void floatfloat_atomicadd(float4 *ptr, float4 hi, float4 lo){{
                         float *ptr_float = (float*)ptr;
 
-                        float tmpx = atomicAdd(ptr_float, hi.x + lo.x);
+                        float tmpx = atomicAdd(ptr_float, hi.x);
                         atomicAdd(ptr_float + 1, lo.x - (((tmpx + hi.x) - tmpx) - hi.x));
-                        float tmpy = atomicAdd(ptr_float + 2, hi.y + lo.y);
+                        float tmpy = atomicAdd(ptr_float + 2, hi.y);
                         atomicAdd(ptr_float + 3, lo.y - (((tmpy + hi.y) - tmpy) - hi.y));
-                        float tmpz = atomicAdd(ptr_float + 4, hi.z + lo.z);
+                        float tmpz = atomicAdd(ptr_float + 4, hi.z);
                         atomicAdd(ptr_float + 5, lo.z - (((tmpz + hi.z) - tmpz) - hi.z));
-                        float tmpw = atomicAdd(ptr_float + 6, hi.w + lo.w);
+                        float tmpw = atomicAdd(ptr_float + 6, hi.w);
                         atomicAdd(ptr_float + 7, lo.w - (((tmpw + hi.w) - tmpw) - hi.w));
                     }}";
                 }
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution1D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution1D.cs
@@ -90,7 +90,7 @@ __global__ void complex_convolution_1d(const float2* __restrict__ inmap, float2*
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution2D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution2D.cs
@@ -103,7 +103,7 @@ __global__ void complex_convolution_2d(const float2* __restrict__ inmap, float2*
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution3D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Convolution3D.cs
@@ -111,7 +111,7 @@ __global__ void complex_convolution_3d(const float2* __restrict__ inmap, float2*
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution1D.cs
@@ -90,7 +90,7 @@ __global__ void complex_deconvolution_1d(const float2* __restrict__ inmap, float
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution2D.cs
@@ -106,7 +106,7 @@ __global__ void complex_deconvolution_2d(const float2* __restrict__ inmap, float
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Deconvolution3D.cs
@@ -118,7 +118,7 @@ __global__ void complex_deconvolution_3d(const float2* __restrict__ inmap, float
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outmap_idx] = ctor_float2(uv_hi.x, uv_hi.y);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Dense.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/Dense.cs
@@ -76,7 +76,7 @@ __global__ void complex_dense(const float2* __restrict__ inmap, float2* __restri
 
                         filter_idx += {OutChannels};
                     }}
-                    outmap[outch] = ctor_float2(vu_hi.x + vu_lo.x, vu_hi.y + vu_lo.y);
+                    outmap[outch] = ctor_float2(vu_hi.x, vu_hi.y);
 
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
diff --git a/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/TransposeDense.cs b/TensorShaderCudaBackend/Shaders/Complex/Convolution/FloatFloatPrecision/TransposeDense.cs
@@ -74,7 +74,7 @@ __global__ void complex_transpose_dense(const float2* __restrict__ inmap, float2
 
                         filter_idx += {OutChannels};
                     }}
-                    outmap[outch] = ctor_float2(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y);
+                    outmap[outch] = ctor_float2(uv_hi.x, uv_hi.y);
 
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution1D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution1D.cs
@@ -59,7 +59,7 @@ __global__ void chwise_convolution_1d(const float* __restrict__ inmap, float* __
 
                 unsigned int outmap_idx = ch + {Channels} * ox;
 
-                outmap[outmap_idx] = uv_hi + uv_lo;
+                outmap[outmap_idx] = uv_hi;
             }}";
 
             this.Kernel = new Kernel(code, "chwise_convolution_1d");
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution2D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution2D.cs
@@ -72,7 +72,7 @@ __global__ void chwise_convolution_2d(const float* __restrict__ inmap, float* __
 
                 unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * oy);
 
-                outmap[outmap_idx] = uv_hi + uv_lo;
+                outmap[outmap_idx] = uv_hi;
             }}";
 
             this.Kernel = new Kernel(code, "chwise_convolution_2d");
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution3D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseConvolution3D.cs
@@ -79,7 +79,7 @@ __global__ void chwise_convolution_3d(const float* __restrict__ inmap, float* __
 
                 unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * (oy + outheight * oz));
 
-                outmap[outmap_idx] = uv_hi + uv_lo;
+                outmap[outmap_idx] = uv_hi;
             }}";
 
             this.Kernel = new Kernel(code, "chwise_convolution_3d");
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution1D.cs
@@ -61,7 +61,7 @@ __global__ void chwise_deconvolution_1d(const float* __restrict__ inmap, float*
 
                 unsigned int outmap_idx = ch + {Channels} * ox;
 
-                outmap[outmap_idx] = uv_hi + uv_lo;
+                outmap[outmap_idx] = uv_hi;
             }}";
 
             this.Kernel = new Kernel(code, "chwise_deconvolution_1d");
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution2D.cs
@@ -77,7 +77,7 @@ __global__ void chwise_deconvolution_2d(const float* __restrict__ inmap, float*
 
                 unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * oy);
 
-                outmap[outmap_idx] = uv_hi + uv_lo;
+                outmap[outmap_idx] = uv_hi;
             }}";
 
             this.Kernel = new Kernel(code, "chwise_deconvolution_2d");
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/ChannelwiseDeconvolution3D.cs
@@ -88,7 +88,7 @@ __global__ void chwise_deconvolution_3d(const float* __restrict__ inmap, float*
 
                 unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * (oy + outheight * oz));
 
-                outmap[outmap_idx] = uv_hi + uv_lo;
+                outmap[outmap_idx] = uv_hi;
             }}";
 
             this.Kernel = new Kernel(code, "chwise_deconvolution_3d");
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution1D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution1D.cs
@@ -81,7 +81,7 @@ __global__ void convolution_1d(const float* __restrict__ inmap, float* __restric
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution2D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution2D.cs
@@ -94,7 +94,7 @@ __global__ void convolution_2d(const float* __restrict__ inmap, float* __restric
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution3D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Convolution3D.cs
@@ -101,7 +101,7 @@ __global__ void convolution_3d(const float* __restrict__ inmap, float* __restric
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution1D.cs
@@ -81,7 +81,7 @@ __global__ void deconvolution_1d(const float* __restrict__ inmap, float* __restr
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution2D.cs
@@ -97,7 +97,7 @@ __global__ void deconvolution_2d(const float* __restrict__ inmap, float* __restr
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Deconvolution3D.cs
@@ -108,7 +108,7 @@ __global__ void deconvolution_3d(const float* __restrict__ inmap, float* __restr
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Dense.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/Dense.cs
@@ -70,7 +70,7 @@ __global__ void dense(const float* __restrict__ inmap, float* __restrict__ outma
                         filter_idx += {OutChannels};
                     }}
 
-                    outmap[outch] = uv_hi + uv_lo;
+                    outmap[outch] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseConvolution.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseConvolution.cs
@@ -70,7 +70,7 @@ __global__ void ptwise_convolution(const float* __restrict__ inmap, float* __res
 
                     unsigned int outmap_idx = outch + {OutChannels} * i;
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseDeconvolution.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/PointwiseDeconvolution.cs
@@ -68,7 +68,7 @@ __global__ void ptwise_deconvolution(const float* __restrict__ inmap, float* __r
 
                     unsigned int outmap_idx = outch + {OutChannels} * i;
 
-                    outmap[outmap_idx] = uv_hi + uv_lo;
+                    outmap[outmap_idx] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/TransposeDense.cs b/TensorShaderCudaBackend/Shaders/Convolution/FloatFloatPrecision/TransposeDense.cs
@@ -65,7 +65,7 @@ __global__ void transpose_dense(const float* __restrict__ inmap, float* __restri
                         filter_idx += {OutChannels};
                     }}
 
-                    outmap[outch] = uv_hi + uv_lo;
+                    outmap[outch] = uv_hi;
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution1D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution1D.cs
@@ -90,7 +90,7 @@ __global__ void quaternion_convolution_1d(const float4* __restrict__ inmap, floa
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution2D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution2D.cs
@@ -103,7 +103,7 @@ __global__ void quaternion_convolution_2d(const float4* __restrict__ inmap, floa
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution3D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Convolution3D.cs
@@ -111,7 +111,7 @@ __global__ void quaternion_convolution_3d(const float4* __restrict__ inmap, floa
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution1D.cs
@@ -90,7 +90,7 @@ __global__ void quaternion_deconvolution_1d(const float4* __restrict__ inmap, fl
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution2D.cs
@@ -106,7 +106,7 @@ __global__ void quaternion_deconvolution_2d(const float4* __restrict__ inmap, fl
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Deconvolution3D.cs
@@ -118,7 +118,7 @@ __global__ void quaternion_deconvolution_3d(const float4* __restrict__ inmap, fl
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * (oy + outheight * oz));
 
-                    outmap[outmap_idx] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outmap_idx] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Dense.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/Dense.cs
@@ -77,7 +77,7 @@ __global__ void quaternion_dense(const float4* __restrict__ inmap, float4* __res
                         filter_idx += {OutChannels};
                     }}
 
-                    outmap[outch] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outch] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/TransposeDense.cs b/TensorShaderCudaBackend/Shaders/Quaternion/Convolution/FloatFloatPrecision/TransposeDense.cs
@@ -75,7 +75,7 @@ __global__ void quaternion_transpose_dense(const float4* __restrict__ inmap, flo
                         filter_idx += {OutChannels};
                     }}
 
-                    outmap[outch] = ctor_float4(uv_hi.x + uv_lo.x, uv_hi.y + uv_lo.y, uv_hi.z + uv_lo.z, uv_hi.w + uv_lo.w);
+                    outmap[outch] = ctor_float4(uv_hi.x, uv_hi.y, uv_hi.z, uv_hi.w);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage1D.cs b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage1D.cs
@@ -57,7 +57,7 @@ __global__ void column_to_image_1d(const float* __restrict__ inmap, float* __res
                 }}
 
                 unsigned int outmap_idx = ch + {Channels} * ox;
-                outmap[outmap_idx] = hi + lo;
+                outmap[outmap_idx] = hi;
             }}";
 
             this.Kernel = new Kernel(code, "column_to_image_1d");
diff --git a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage2D.cs b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage2D.cs
@@ -69,7 +69,7 @@ __global__ void column_to_image_2d(const float* __restrict__ inmap, float* __res
                 }}
 
                 unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * oy);
-                outmap[outmap_idx] = hi + lo;
+                outmap[outmap_idx] = hi;
             }}";
 
             this.Kernel = new Kernel(code, "column_to_image_2d");
diff --git a/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage3D.cs b/TensorShaderCudaBackend/Shaders/Transform/ColumnToImage3D.cs
@@ -81,7 +81,7 @@ __global__ void column_to_image_3d(const float* __restrict__ inmap, float* __res
                 }}
 
                 unsigned int outmap_idx = ch + {Channels} * (ox + outwidth * (oy + outheight * oz));
-                outmap[outmap_idx] = hi + lo;
+                outmap[outmap_idx] = hi;
             }}";
 
             this.Kernel = new Kernel(code, "column_to_image_3d");
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution1D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution1D.cs
@@ -89,7 +89,7 @@ __global__ void trivector_convolution_1d(const float3* __restrict__ inmap, float
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * ox;
 
-                    outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z);
+                    outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution2D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution2D.cs
@@ -102,7 +102,7 @@ __global__ void trivector_convolution_2d(const float3* __restrict__ inmap, float
                 { (OutChannels % ThreadsX != 0 ? $"if(outch < {OutChannels}){{" : "") }
                     unsigned int outmap_idx = outch + {OutChannels} * (ox + outwidth * oy);
 
-                    outmap[outmap_idx] = ctor_float3(vq_hi.x + vq_lo.x, vq_hi.y + vq_lo.y, vq_hi.z + vq_lo.z);
+                    outmap[outmap_idx] = ctor_float3(vq_hi.x, vq_hi.y, vq_hi.z);
                 { (OutChannels % ThreadsX != 0 ? "}" : "") }
             }}";
 
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution3D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Convolution3D.cs
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution1D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution1D.cs
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution2D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution2D.cs
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution3D.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Deconvolution3D.cs
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Dense.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/Dense.cs
diff --git a/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/TransposeDense.cs b/TensorShaderCudaBackend/Shaders/Trivector/Convolution/FloatFloatPrecision/TransposeDense.cs
diff --git a/TensorShaderSample/MNIST/MnistDownloader.cs b/TensorShaderSample/MNIST/MnistDownloader.cs