issue/1153 - apply clang-format-16 and fix MSVC C4996 on Windows

ZhouBencheng · ZhouBencheng · commit 38d3dc36d413 · 2026-05-08T13:05:11.000+08:00
diff --git a/src/infinicore-test/test_mutual_awareness.cc b/src/infinicore-test/test_mutual_awareness.cc
@@ -13,22 +13,22 @@ namespace {
 
 using namespace infinicore::analyzer;
 
-#define MA_ASSERT_TRUE(cond)                                                  \
-    do {                                                                      \
-        if (!(cond)) {                                                        \
-            spdlog::error("ASSERT_TRUE failed: {} ({}:{})",                   \
-                          #cond, __FILE__, __LINE__);                         \
-            return false;                                                     \
-        }                                                                     \
+#define MA_ASSERT_TRUE(cond)                                \
+    do {                                                    \
+        if (!(cond)) {                                      \
+            spdlog::error("ASSERT_TRUE failed: {} ({}:{})", \
+                          #cond, __FILE__, __LINE__);       \
+            return false;                                   \
+        }                                                   \
     } while (0)
 
-#define MA_ASSERT_EQ(a, b)                                                    \
-    do {                                                                      \
-        if (!((a) == (b))) {                                                  \
-            spdlog::error("ASSERT_EQ failed: {} == {} ({}:{})",               \
-                          #a, #b, __FILE__, __LINE__);                        \
-            return false;                                                     \
-        }                                                                     \
+#define MA_ASSERT_EQ(a, b)                                      \
+    do {                                                        \
+        if (!((a) == (b))) {                                    \
+            spdlog::error("ASSERT_EQ failed: {} == {} ({}:{})", \
+                          #a, #b, __FILE__, __LINE__);          \
+            return false;                                       \
+        }                                                       \
     } while (0)
 
 std::vector<OpTraceEntry> makeWindow(const std::vector<OpType> &types,
diff --git a/src/infinicore/analyzer/mutual_awareness_analyzer.cc b/src/infinicore/analyzer/mutual_awareness_analyzer.cc
@@ -15,8 +15,8 @@ DeviceResourceSnapshot buildSnapshotFromMemoryStats(
     snapshot.device_type = device_type;
     snapshot.has_memory_capacity = stats.total_capacity > 0;
     snapshot.free_bytes = stats.total_capacity >= stats.allocated_bytes
-        ? (stats.total_capacity - stats.allocated_bytes)
-        : 0;
+                            ? (stats.total_capacity - stats.allocated_bytes)
+                            : 0;
     snapshot.total_bytes = stats.total_capacity;
     snapshot.used_bytes = stats.allocated_bytes;
     snapshot.reserved_bytes = stats.total_capacity;
@@ -233,7 +233,9 @@ const OptimizationIntent &MutualAwarenessAnalyzer::lastIntent() const {
 // ============================================================
 
 void MutualAwarenessAnalyzer::onGraphRecordingStop() {
-    if (!enabled_) return;
+    if (!enabled_) {
+        return;
+    }
 
     // Analyze the op sequence recorded during graph capture
     // and cache the result. Graph ops are static, so we only
diff --git a/src/infinicore/pybind11/analyzer.hpp b/src/infinicore/pybind11/analyzer.hpp
@@ -11,7 +11,7 @@ namespace infinicore::analyzer::pybind {
 
 inline void bind(py::module &m) {
     auto analyzer_mod = m.def_submodule("analyzer",
-        "Hardware-Task Mutual Awareness Analysis Module");
+                                        "Hardware-Task Mutual Awareness Analysis Module");
 
     // --- Enums ---
     py::enum_<PhaseType>(analyzer_mod, "PhaseType")
@@ -121,12 +121,12 @@ inline void bind(py::module &m) {
 
     // --- Top-level functions ---
     analyzer_mod.def("analyze", &analyzeCurrentState,
-        "Analyze current state and return an OptimizationIntent");
+                     "Analyze current state and return an OptimizationIntent");
     analyzer_mod.def("get_current_phase", &getCurrentPhase,
-        "Get the current detected task phase");
+                     "Get the current detected task phase");
     analyzer_mod.def("set_enabled", &setAnalyzerEnabled,
-        "Enable/disable the mutual awareness analyzer",
-        py::arg("enabled"));
+                     "Enable/disable the mutual awareness analyzer",
+                     py::arg("enabled"));
     analyzer_mod.def(
         "trace_op_for_test",
         [](OpType op_type,
@@ -142,15 +142,17 @@ inline void bind(py::module &m) {
         py::arg("dtype") = 0,
         py::arg("device_type") = 0,
         py::arg("device_id") = 0);
-    analyzer_mod.def("clear_trace", []() {
-        getGlobalOpTrace().clear();
-        MutualAwarenessAnalyzer::instance().clearGraphCache();
-    }, "Clear the global OpTrace ring and analyzer graph cache");
+    analyzer_mod.def(
+        "clear_trace", []() {
+            getGlobalOpTrace().clear();
+            MutualAwarenessAnalyzer::instance().clearGraphCache();
+        },
+        "Clear the global OpTrace ring and analyzer graph cache");
 
     // --- Access to analyzer instance for advanced usage ---
     analyzer_mod.def("get_analyzer", &MutualAwarenessAnalyzer::instance,
-        py::return_value_policy::reference,
-        "Get the MutualAwarenessAnalyzer singleton instance");
+                     py::return_value_policy::reference,
+                     "Get the MutualAwarenessAnalyzer singleton instance");
 
     py::class_<MutualAwarenessAnalyzer>(analyzer_mod, "MutualAwarenessAnalyzer")
         .def("analyze", py::overload_cast<>(&MutualAwarenessAnalyzer::analyze))
diff --git a/src/infiniop/ops/fused_ffn/cpu/fused_ffn_cpu.cc b/src/infiniop/ops/fused_ffn/cpu/fused_ffn_cpu.cc
@@ -125,7 +125,7 @@ infiniStatus_t calculateTyped(
     {
         bool fuse_residual = info.has_residual && (out_ptr == residual_ptr);
         for (size_t t = 0; t < ntok; t++) {
-            const Tdata *hidden = gate_up_buf + t * 2 * di;  // stride = 2*di to match non-fused
+            const Tdata *hidden = gate_up_buf + t * 2 * di; // stride = 2*di to match non-fused
             Tdata *o = out_ptr + t * info.out_stride;
 
             if (fuse_residual) {
diff --git a/src/infiniop/ops/fused_ffn/info.h b/src/infiniop/ops/fused_ffn/info.h
@@ -122,16 +122,13 @@ class FusedFFNInfo {
         {
             size_t dw_dim0 = down_weight_desc->dim(0);
             size_t dw_dim1 = down_weight_desc->dim(1);
-            if (!((dw_dim0 == hidden_dim && dw_dim1 == intermediate_dim) ||
-                  (dw_dim0 == intermediate_dim && dw_dim1 == hidden_dim))) {
+            if (!((dw_dim0 == hidden_dim && dw_dim1 == intermediate_dim) || (dw_dim0 == intermediate_dim && dw_dim1 == hidden_dim))) {
                 return INFINI_STATUS_BAD_TENSOR_SHAPE;
             }
         }
 
         // Check contiguity of the last dimension for activation tensors and norm weights
-        if (out_desc->stride(out_ndim - 1) != 1 ||
-            in_desc->stride(in_ndim - 1) != 1 ||
-            norm_weight_desc->stride(0) != 1) {
+        if (out_desc->stride(out_ndim - 1) != 1 || in_desc->stride(in_ndim - 1) != 1 || norm_weight_desc->stride(0) != 1) {
             return INFINI_STATUS_BAD_TENSOR_STRIDES;
         }
         // For matrix weights, at least one stride dimension must be 1 (contiguous along one axis)
diff --git a/src/infiniop/ops/fused_ffn/nvidia/fused_ffn_nvidia.cu b/src/infiniop/ops/fused_ffn/nvidia/fused_ffn_nvidia.cu
@@ -86,9 +86,9 @@ struct Descriptor::Opaque {
 
     // Workspace slab sizes (bytes), padded to kWsAlign.
     size_t normalized_bytes = 0;
-    size_t gate_up_bytes    = 0;
-    size_t hidden_bytes     = 0;
-    size_t inner_ws_bytes   = 0; // max of sub-descriptor workspaceSize()
+    size_t gate_up_bytes = 0;
+    size_t hidden_bytes = 0;
+    size_t inner_ws_bytes = 0; // max of sub-descriptor workspaceSize()
 
     bool has_residual = false;
 
@@ -108,10 +108,10 @@ struct Descriptor::Opaque {
     // Sub-descriptors owned by this fused op; each one is a standard
     // InfiniopDescriptor for the corresponding standalone operator.
     std::unique_ptr<op::rms_norm::nvidia::Descriptor> rms_norm;
-    std::unique_ptr<op::gemm::nvidia::Descriptor>     gate_up_gemm;
-    std::unique_ptr<op::swiglu::nvidia::Descriptor>   swiglu;
-    std::unique_ptr<op::gemm::nvidia::Descriptor>     down_gemm;
-    std::unique_ptr<op::add::nvidia::Descriptor>      residual_add;
+    std::unique_ptr<op::gemm::nvidia::Descriptor> gate_up_gemm;
+    std::unique_ptr<op::swiglu::nvidia::Descriptor> swiglu;
+    std::unique_ptr<op::gemm::nvidia::Descriptor> down_gemm;
+    std::unique_ptr<op::add::nvidia::Descriptor> residual_add;
 };
 
 Descriptor::~Descriptor() {
@@ -138,12 +138,12 @@ infiniStatus_t Descriptor::create(
     auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
 
     auto opaque = std::make_unique<Opaque>();
-    opaque->internal     = handle->internal();
+    opaque->internal = handle->internal();
     opaque->has_residual = info.has_residual;
 
     const size_t ntok = info.ntok();
-    const size_t d    = info.d();
-    const size_t di   = info.di();
+    const size_t d = info.d();
+    const size_t di = info.di();
     const size_t dtype_sz = infiniSizeOf(info.dtype);
 
     // Profile-driven scheduler for the deep-fused kernel path.
@@ -164,7 +164,9 @@ infiniStatus_t Descriptor::create(
             const char *thr = std::getenv("INFINIOP_FUSED_FFN_DEEP_MAX_NTOK");
             if (thr != nullptr) {
                 max_ntok = static_cast<size_t>(std::atol(thr));
-                if (max_ntok == 0) max_ntok = 4;
+                if (max_ntok == 0) {
+                    max_ntok = 4;
+                }
             }
             opaque->use_deep_fused = (ntok <= max_ntok);
         }
@@ -199,9 +201,9 @@ infiniStatus_t Descriptor::create(
     // The compact hidden slab (stride=di instead of stride=2*di) gives the
     // Down-GEMM a tightly packed K dimension, which matters on BIV150 where
     // cuBLAS 10.2 tensor-core paths prefer aligned contiguous leading dims.
-    opaque->normalized_bytes = alignUp(ntok * d      * dtype_sz, kWsAlign);
-    opaque->gate_up_bytes    = alignUp(ntok * 2 * di * dtype_sz, kWsAlign);
-    opaque->hidden_bytes     = alignUp(ntok * di     * dtype_sz, kWsAlign);
+    opaque->normalized_bytes = alignUp(ntok * d * dtype_sz, kWsAlign);
+    opaque->gate_up_bytes = alignUp(ntok * 2 * di * dtype_sz, kWsAlign);
+    opaque->hidden_bytes = alignUp(ntok * di * dtype_sz, kWsAlign);
 
     DescScope scope;
 
@@ -219,8 +221,7 @@ infiniStatus_t Descriptor::create(
             normalized_desc, in_view, norm_weight_desc,
             info.epsilon));
         opaque->rms_norm.reset(sub);
-        opaque->inner_ws_bytes =
-            std::max(opaque->inner_ws_bytes, sub->workspaceSize());
+        opaque->inner_ws_bytes = std::max(opaque->inner_ws_bytes, sub->workspaceSize());
     }
 
     // ── GateUp GEMM sub-descriptor ──
@@ -235,8 +236,7 @@ infiniStatus_t Descriptor::create(
         CHECK_STATUS(op::gemm::nvidia::Descriptor::create(
             handle_, &sub, gate_up_c_desc, normalized_desc, gate_up_b_desc));
         opaque->gate_up_gemm.reset(sub);
-        opaque->inner_ws_bytes =
-            std::max(opaque->inner_ws_bytes, sub->workspaceSize());
+        opaque->inner_ws_bytes = std::max(opaque->inner_ws_bytes, sub->workspaceSize());
     }
 
     // ── SwiGLU sub-descriptor ──
@@ -256,8 +256,7 @@ infiniStatus_t Descriptor::create(
         CHECK_STATUS(op::swiglu::nvidia::Descriptor::create(
             handle_, &sub, hidden_desc, {half_desc, half_desc}));
         opaque->swiglu.reset(sub);
-        opaque->inner_ws_bytes =
-            std::max(opaque->inner_ws_bytes, sub->workspaceSize());
+        opaque->inner_ws_bytes = std::max(opaque->inner_ws_bytes, sub->workspaceSize());
     }
 
     // ── Down GEMM sub-descriptor ──
@@ -274,8 +273,7 @@ infiniStatus_t Descriptor::create(
         CHECK_STATUS(op::gemm::nvidia::Descriptor::create(
             handle_, &sub, out_view, hidden_desc, down_b_desc));
         opaque->down_gemm.reset(sub);
-        opaque->inner_ws_bytes =
-            std::max(opaque->inner_ws_bytes, sub->workspaceSize());
+        opaque->inner_ws_bytes = std::max(opaque->inner_ws_bytes, sub->workspaceSize());
     }
 
     // ── Residual add sub-descriptor (optional) ──
@@ -293,15 +291,10 @@ infiniStatus_t Descriptor::create(
             handle_, &sub,
             out_view_for_add, {out_view_for_add, residual_view}));
         opaque->residual_add.reset(sub);
-        opaque->inner_ws_bytes =
-            std::max(opaque->inner_ws_bytes, sub->workspaceSize());
+        opaque->inner_ws_bytes = std::max(opaque->inner_ws_bytes, sub->workspaceSize());
     }
 
-    const size_t workspace_size =
-        opaque->normalized_bytes +
-        opaque->gate_up_bytes +
-        opaque->hidden_bytes +
-        alignUp(opaque->inner_ws_bytes, kWsAlign);
+    const size_t workspace_size = opaque->normalized_bytes + opaque->gate_up_bytes + opaque->hidden_bytes + alignUp(opaque->inner_ws_bytes, kWsAlign);
 
     *desc_ptr = new Descriptor(
         opaque.release(),
@@ -325,16 +318,19 @@ infiniStatus_t Descriptor::calculate(
         return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
     }
 
-    const size_t di       = _info.di();
+    const size_t di = _info.di();
     const size_t dtype_sz = infiniSizeOf(_info.dtype);
 
     // Partition the workspace into the three persistent slabs plus an
     // inner scratch buffer shared by all sub-descriptors.
     char *ws = static_cast<char *>(workspace);
-    void *normalized_buf = ws; ws += _opaque->normalized_bytes;
-    void *gate_up_buf    = ws; ws += _opaque->gate_up_bytes;
-    void *hidden_buf     = ws; ws += _opaque->hidden_bytes;
-    void *inner_ws       = ws;
+    void *normalized_buf = ws;
+    ws += _opaque->normalized_bytes;
+    void *gate_up_buf = ws;
+    ws += _opaque->gate_up_bytes;
+    void *hidden_buf = ws;
+    ws += _opaque->hidden_bytes;
+    void *inner_ws = ws;
     const size_t inner_ws_size = _opaque->inner_ws_bytes;
 
     // gate and up are two halves of the interleaved gate_up buffer.
@@ -344,7 +340,7 @@ infiniStatus_t Descriptor::calculate(
     // shared half_desc at create time).
     const char *gu_bytes = static_cast<const char *>(gate_up_buf);
     const void *gate_ptr = gu_bytes;
-    const void *up_ptr   = gu_bytes + di * dtype_sz;
+    const void *up_ptr = gu_bytes + di * dtype_sz;
 
     // Stage 1: RMSNorm
     CHECK_STATUS(_opaque->rms_norm->calculate(
@@ -365,17 +361,17 @@ infiniStatus_t Descriptor::calculate(
         dim3 grid(static_cast<unsigned>(ntok), static_cast<unsigned>(di));
         dim3 block(kBlock);
 
-#define DEEP_FUSED_LAUNCH(TD, TW)                                        \
-    deepFusedGateUpSiluKernel<kBlock, float, TD, TW>                    \
-        <<<grid, block, 0, cuda_stream>>>(                              \
-            reinterpret_cast<TD *>(hidden_buf),                          \
-            reinterpret_cast<const TD *>(normalized_buf),                \
-            reinterpret_cast<const TW *>(gate_up_weight),                \
-            ntok, d, di,                                                 \
-            static_cast<ptrdiff_t>(d),                                   \
-            static_cast<ptrdiff_t>(di),                                  \
-            _opaque->gate_up_w_k_stride,                                 \
-            _opaque->gate_up_w_col_stride,                               \
+#define DEEP_FUSED_LAUNCH(TD, TW)                         \
+    deepFusedGateUpSiluKernel<kBlock, float, TD, TW>      \
+        <<<grid, block, 0, cuda_stream>>>(                \
+            reinterpret_cast<TD *>(hidden_buf),           \
+            reinterpret_cast<const TD *>(normalized_buf), \
+            reinterpret_cast<const TW *>(gate_up_weight), \
+            ntok, d, di,                                  \
+            static_cast<ptrdiff_t>(d),                    \
+            static_cast<ptrdiff_t>(di),                   \
+            _opaque->gate_up_w_k_stride,                  \
+            _opaque->gate_up_w_col_stride,                \
             /*gate_col_base=*/0u, /*up_col_base=*/di)
 
         if (_info.dtype == INFINI_DTYPE_F16 && _info.mtype == INFINI_DTYPE_F16) {
@@ -416,8 +412,7 @@ infiniStatus_t Descriptor::calculate(
     // Stage 4: Down GEMM, with optional in-place residual fuse via beta=1.
     //   fuse path : out = 1.0 * out + hidden_buf @ down_weight
     //   plain path: out = 0.0 * out + hidden_buf @ down_weight
-    const bool fuse_residual =
-        _opaque->has_residual && (out == residual);
+    const bool fuse_residual = _opaque->has_residual && (out == residual);
     CHECK_STATUS(_opaque->down_gemm->calculate(
         inner_ws, inner_ws_size,
         out, /*beta=*/fuse_residual ? 1.f : 0.f,
diff --git a/src/infiniop/ops/fused_ffn/operator.cc b/src/infiniop/ops/fused_ffn/operator.cc
diff --git a/src/infinirt/cpu/infinirt_cpu.cc b/src/infinirt/cpu/infinirt_cpu.cc
diff --git a/src/infinirt/cuda/infinirt_cuda.cu b/src/infinirt/cuda/infinirt_cuda.cu

Original file line number	Diff line number	Diff line change
`@@ -125,7 +125,7 @@ infiniStatus_t calculateTyped(`
`125`	`125`	`{`
`126`	`126`	`bool fuse_residual = info.has_residual && (out_ptr == residual_ptr);`
`127`	`127`	`for (size_t t = 0; t < ntok; t++) {`
`128`		`- const Tdata *hidden = gate_up_buf + t * 2 * di;  // stride = 2*di to match non-fused`
	`128`	`+ const Tdata *hidden = gate_up_buf + t * 2 * di; // stride = 2*di to match non-fused`
`129`	`129`	`Tdata *o = out_ptr + t * info.out_stride;`
`130`	`130`
`131`	`131`	`if (fuse_residual) {`
Original file line number	Diff line number	Diff line change
`@@ -122,16 +122,13 @@ class FusedFFNInfo {`
`122`	`122`	`{`
`123`	`123`	`size_t dw_dim0 = down_weight_desc->dim(0);`
`124`	`124`	`size_t dw_dim1 = down_weight_desc->dim(1);`
`125`		`- if (!((dw_dim0 == hidden_dim && dw_dim1 == intermediate_dim) \|\|`
`126`		`- (dw_dim0 == intermediate_dim && dw_dim1 == hidden_dim))) {`
	`125`	`+ if (!((dw_dim0 == hidden_dim && dw_dim1 == intermediate_dim) \|\| (dw_dim0 == intermediate_dim && dw_dim1 == hidden_dim))) {`
`127`	`126`	`return INFINI_STATUS_BAD_TENSOR_SHAPE;`
`128`	`127`	`}`
`129`	`128`	`}`
`130`	`129`
`131`	`130`	`// Check contiguity of the last dimension for activation tensors and norm weights`
`132`		`- if (out_desc->stride(out_ndim - 1) != 1 \|\|`
`133`		`- in_desc->stride(in_ndim - 1) != 1 \|\|`
`134`		`- norm_weight_desc->stride(0) != 1) {`
	`131`	`+ if (out_desc->stride(out_ndim - 1) != 1 \|\| in_desc->stride(in_ndim - 1) != 1 \|\| norm_weight_desc->stride(0) != 1) {`
`135`	`132`	`return INFINI_STATUS_BAD_TENSOR_STRIDES;`
`136`	`133`	`}`
`137`	`134`	`// For matrix weights, at least one stride dimension must be 1 (contiguous along one axis)`