Commit e084b69

issue/1126 fix softmax and conv2d
1 parent 1cee498 commit e084b69

2 files changed

Lines changed: 62 additions & 4 deletions

src/infinicore/ops/conv2d/conv2d.cc

Lines changed: 35 additions & 4 deletions
@@ -37,10 +37,41 @@ Tensor conv2d(Tensor input,
               const std::vector<size_t> &pads,
               const std::vector<size_t> &strides,
               const std::vector<size_t> &dilations) {
-    // Output shape should be pre-computed by caller; allocate a conservative placeholder.
-    // This helper is rarely used in performance-critical paths.
-    Shape shape = input->shape();
-    auto output = Tensor::empty(shape, input->dtype(), input->device());
+    const auto &in_shape = input->shape(); // [N, C_in, H_in, W_in]
+    const auto &w_shape = weight->shape(); // [C_out, C_in, kH, kW]
+
+    // -------------------------------
+    // Extract dimensions
+    // -------------------------------
+    size_t N = in_shape[0];
+    size_t C_in = in_shape[1];
+    size_t H_in = in_shape[2];
+    size_t W_in = in_shape[3];
+
+    size_t C_out = w_shape[0];
+    size_t kH = w_shape[2];
+    size_t kW = w_shape[3];
+
+    size_t pad_h = pads[0];
+    size_t pad_w = pads[1];
+
+    size_t stride_h = strides[0];
+    size_t stride_w = strides[1];
+
+    size_t dil_h = dilations[0];
+    size_t dil_w = dilations[1];
+
+    auto calc_out = [](size_t in, size_t pad, size_t dilation, size_t kernel, size_t stride) {
+        return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
+    };
+    size_t H_out = calc_out(H_in, pad_h, dil_h, kH, stride_h);
+    size_t W_out = calc_out(W_in, pad_w, dil_w, kW, stride_w);
+    if ((int64_t)H_out <= 0 || (int64_t)W_out <= 0) {
+        throw std::runtime_error("Invalid conv2d output shape (negative or zero)");
+    }
+    Shape out_shape = {N, C_out, H_out, W_out};
+
+    auto output = Tensor::empty(out_shape, input->dtype(), input->device());
     conv2d_(output, input, weight, bias, pads, strides, dilations);
     return output;
 }
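
The new code derives the output spatial size from the standard dilated-convolution formula, out = (in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1, instead of reusing the input shape as the old placeholder did. A minimal standalone sketch of the same formula (illustrative only, not part of the commit; the concrete shapes are made-up examples):

// Check of the output-size formula used in conv2d above (illustrative only).
#include <cstddef>
#include <cstdio>

static size_t calc_out(size_t in, size_t pad, size_t dilation, size_t kernel, size_t stride) {
    return (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}

int main() {
    // 3x3 kernel, pad 1, stride 1, dilation 1 keeps a 224x224 input at 224x224.
    std::printf("%zu\n", calc_out(224, 1, 1, 3, 1)); // prints 224
    // 7x7 kernel, pad 3, stride 2, dilation 1 downsamples 224x224 to 112x112.
    std::printf("%zu\n", calc_out(224, 3, 1, 7, 2)); // prints 112
    return 0;
}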

src/infiniop/ops/softmax/nvidia/softmax_nvidia.cu

Lines changed: 27 additions & 0 deletions
@@ -81,6 +81,33 @@ infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype,
                                                      othersize, dimsize, stride);
         }
 
+    } else if (dtype == INFINI_DTYPE_BF16) {
+        if (dimsize > 1024) {
+            blockSoftmax<cuda_bfloat16, BLOCK_SIZE>
+                <<<num_blocks, BLOCK_SIZE, 0, stream>>>((cuda_bfloat16 *)y, (const cuda_bfloat16 *)x,
+                                                        dimsize, stride);
+        } else if (dimsize > 31) {
+            constexpr unsigned int BLOCK_SIZE_x = 32;
+            constexpr unsigned int BLOCK_SIZE_y = 32;
+            constexpr int numPerThreadx = 32;
+            int num_block_x = (num_blocks + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y;
+            dim3 block_dim(BLOCK_SIZE_x, BLOCK_SIZE_y, 1);
+            dim3 grid_dim(num_block_x, 1, 1);
+            warpSoftmax<cuda_bfloat16, BLOCK_SIZE_x, BLOCK_SIZE_y, numPerThreadx>
+                <<<grid_dim, block_dim, 0, stream>>>((cuda_bfloat16 *)y, (const cuda_bfloat16 *)x,
+                                                     othersize, dimsize, stride);
+        } else {
+            constexpr unsigned int BLOCK_SIZE_x = 16;
+            constexpr unsigned int BLOCK_SIZE_y = 32;
+            constexpr int numPerThreadx = 2;
+            int num_block_x = (num_blocks + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y;
+            dim3 block_dim(BLOCK_SIZE_x, BLOCK_SIZE_y, 1);
+            dim3 grid_dim(num_block_x, 1, 1);
+            warpSoftmax<cuda_bfloat16, BLOCK_SIZE_x, BLOCK_SIZE_y, numPerThreadx>
+                <<<grid_dim, block_dim, 0, stream>>>((cuda_bfloat16 *)y, (const cuda_bfloat16 *)x,
+                                                     othersize, dimsize, stride);
+        }
+
     } else if (dtype == INFINI_DTYPE_F32) {
         if (dimsize > 1024) {
             blockSoftmax<float, BLOCK_SIZE>
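
The BF16 path added above mirrors the existing F16/F32 dispatch: rows longer than 1024 elements fall back to blockSoftmax (a full thread block per row), mid-sized rows use warpSoftmax with BLOCK_SIZE_x = 32 and numPerThreadx = 32, and rows of at most 31 elements use BLOCK_SIZE_x = 16 with numPerThreadx = 2. A small host-side sketch of the threshold selection (illustrative only, not part of the commit; reading BLOCK_SIZE_x * numPerThreadx as the per-row capacity is an assumption):

// Which launch path the BF16 branch above would take for a given row length.
#include <cstdio>
#include <initializer_list>

const char *pick_softmax_path(int dimsize) {
    if (dimsize > 1024) {
        return "blockSoftmax<cuda_bfloat16, BLOCK_SIZE>";  // one block reduces a full row
    } else if (dimsize > 31) {
        return "warpSoftmax<cuda_bfloat16, 32, 32, 32>";   // 32 lanes x 32 elements, presumably up to 1024
    } else {
        return "warpSoftmax<cuda_bfloat16, 16, 32, 2>";    // 16 lanes x 2 elements, presumably up to 32
    }
}

int main() {
    for (int d : {16, 256, 4096}) {
        std::printf("dimsize=%d -> %s\n", d, pick_softmax_path(d));
    }
    return 0;
}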
