fix(dflash): address weicj review — HIP link, loader validation, hipMalloc checks

smpurkis · claude · smpurkis · commit 5d155493a176 · 2026-05-10T02:07:42.000+01:00
- CMakeLists.txt: pflash_daemon and spike_thin_copy now link against
  ${_dflash27b_ggml_backend_lib} instead of hardcoded ggml-cuda, fixing
  HIP build link failures reported by weicj.
- gguf_target_loader.cpp: restore structural validation removed in
  prior cleanup — kl!=vl check, n_layer%fai divisibility, full
  rope_sections validation (presence, count, bounds vs head_dim),
  EOS metadata assignment, capture_layer_ids recomputation, and
  output.weight requirement in top-level tensor check.
- bsa_launcher_hip.cu: check hipMalloc return values; on failure,
  roll back partial allocations, reset kv_buf_cap, and return -1
  instead of silently proceeding with null buffers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt
@@ -359,7 +359,7 @@ endif()
 if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/pflash_daemon.cpp")
     add_executable(pflash_daemon test/pflash_daemon.cpp)
     target_include_directories(pflash_daemon PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
-    target_link_libraries(pflash_daemon PRIVATE dflash27b ggml ggml-cuda)
+    target_link_libraries(pflash_daemon PRIVATE dflash27b ggml ${_dflash27b_ggml_backend_lib})
 endif()
 
 # ─── Tests (numerics vs oracle) ────────────────────────────────────
@@ -392,7 +392,7 @@ if(DFLASH27B_TESTS)
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/spike_thin_copy.cpp")
         add_executable(spike_thin_copy test/spike_thin_copy.cpp)
         target_include_directories(spike_thin_copy PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
-        target_link_libraries(spike_thin_copy PRIVATE ggml ggml-cuda)
+        target_link_libraries(spike_thin_copy PRIVATE ggml ${_dflash27b_ggml_backend_lib})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_draft_graph.cpp")
         add_executable(smoke_draft_graph test/smoke_draft_graph.cpp)
diff --git a/dflash/src/bsa_launcher_hip.cu b/dflash/src/bsa_launcher_hip.cu
@@ -72,10 +72,17 @@ extern "C" int launch_bsa_sparse_flash_forward_bf16(
     // Ensure persistent transpose buffers are large enough.
     const size_t kv_bytes = (size_t)B * Hk * S * D * 2;  // sizeof(bfloat16)=2
     if (kv_bytes > kv_buf_cap) {
-        if (kv_buf_K) hipFree(kv_buf_K);
-        if (kv_buf_V) hipFree(kv_buf_V);
-        hipMalloc(&kv_buf_K, kv_bytes);
-        hipMalloc(&kv_buf_V, kv_bytes);
+        if (kv_buf_K) { hipFree(kv_buf_K); kv_buf_K = nullptr; }
+        if (kv_buf_V) { hipFree(kv_buf_V); kv_buf_V = nullptr; }
+        hipError_t err_k = hipMalloc(&kv_buf_K, kv_bytes);
+        hipError_t err_v = hipMalloc(&kv_buf_V, kv_bytes);
+        if (err_k != hipSuccess || err_v != hipSuccess) {
+            // Roll back: free any partial allocation and reset state.
+            if (kv_buf_K) { hipFree(kv_buf_K); kv_buf_K = nullptr; }
+            if (kv_buf_V) { hipFree(kv_buf_V); kv_buf_V = nullptr; }
+            kv_buf_cap = 0;
+            return -1;
+        }
         kv_buf_cap = kv_bytes;
     }
 
diff --git a/dflash/src/gguf_target_loader.cpp b/dflash/src/gguf_target_loader.cpp
@@ -305,16 +305,59 @@ bool load_target_gguf_partial(const std::string & path,
         return false;
     }
 
+    // Structural invariants required by the graph builder.
+    if (kl != vl) {
+        set_last_error("key_length != value_length not supported");
+        gguf_free(gctx); return false;
+    }
+    if (n_layer % fai != 0) {
+        char buf[128];
+        std::snprintf(buf, sizeof(buf), "block_count=%u not divisible by full_attention_interval=%u", n_layer, fai);
+        set_last_error(buf);
+        gguf_free(gctx); return false;
+    }
+
     // rope dimension_sections (array of 4 uint32)
     int rope_sections[4] = {0, 0, 0, 0};
     {
         int64_t rid = gguf_find_key(gctx, "qwen35.rope.dimension_sections");
-        if (rid >= 0) {
-            size_t n = gguf_get_arr_n(gctx, rid);
-            if (n >= 4) {
-                const int32_t * arr = (const int32_t *)gguf_get_arr_data(gctx, rid);
-                for (int k = 0; k < 4; k++) rope_sections[k] = arr[k];
+        if (rid < 0) {
+            set_last_error("missing qwen35.rope.dimension_sections");
+            gguf_free(gctx); return false;
+        }
+        size_t n = gguf_get_arr_n(gctx, rid);
+        if (n < 4) {
+            set_last_error("qwen35.rope.dimension_sections has < 4 entries");
+            gguf_free(gctx); return false;
+        }
+        const int32_t * arr = (const int32_t *)gguf_get_arr_data(gctx, rid);
+        for (int k = 0; k < 4; k++) rope_sections[k] = arr[k];
+    }
+
+    // Validate rope_sections against head_dim. n_rot = 2 * sum(sections) is
+    // the number of dims rotated by ggml_rope_multi; it must be even, > 0,
+    // and ≤ head_dim, otherwise rope reads/writes out of bounds.
+    {
+        long sum = 0;
+        for (int k = 0; k < 4; k++) {
+            if (rope_sections[k] < 0) {
+                char buf[160];
+                std::snprintf(buf, sizeof(buf),
+                    "rope_sections[%d]=%d is negative", k, rope_sections[k]);
+                set_last_error(buf);
+                gguf_free(gctx); return false;
             }
+            sum += rope_sections[k];
+        }
+        const long n_rot = 2 * sum;
+        if (n_rot <= 0 || n_rot > (long)kl) {
+            char buf[200];
+            std::snprintf(buf, sizeof(buf),
+                "rope_sections {%d,%d,%d,%d} → n_rot=%ld invalid for head_dim=%u",
+                rope_sections[0], rope_sections[1], rope_sections[2], rope_sections[3],
+                n_rot, kl);
+            set_last_error(buf);
+            gguf_free(gctx); return false;
         }
     }
 
@@ -351,6 +394,28 @@ bool load_target_gguf_partial(const std::string & path,
     out.rope_dimension_count = (int)get_u32_or(gctx, "qwen35.rope.dimension_count", 64);
     out.rope_theta = get_f32_or(gctx, "qwen35.rope.freq_base", 10000000.0f);
     out.rms_eps = get_f32_or(gctx, "qwen35.attention.layer_norm_rms_epsilon", 1e-6f);
+
+    // EOS token ids from GGUF tokenizer metadata (stored as UINT32 by the
+    // GGUF spec; we use the u32 helper and cast). UINT32_MAX is the
+    // missing-key sentinel and maps to int32_t -1, which the runtime EOS
+    // check rejects via the `>= 0` guard.
+    {
+        const uint32_t kEosKeyMissing = 0xFFFFFFFFu;
+        const uint32_t raw_eos      = get_u32_or(gctx, "tokenizer.ggml.eos_token_id", kEosKeyMissing);
+        const uint32_t raw_eos_chat = get_u32_or(gctx, "tokenizer.ggml.eot_token_id", kEosKeyMissing);
+        out.eos_id      = (raw_eos      == kEosKeyMissing) ? -1 : (int32_t)raw_eos;
+        out.eos_chat_id = (raw_eos_chat == kEosKeyMissing) ? -1 : (int32_t)raw_eos_chat;
+        std::printf("[loader] eos_id=%d eos_chat_id=%d\n", out.eos_id, out.eos_chat_id);
+    }
+
+    // Compute capture layer IDs: evenly spaced through the target layers.
+    // step = (n_layer - 2) / (N - 1), ids[k] = 1 + k * step.
+    {
+        const int N = DFLASH27B_DRAFT_N_TARGET_LAYERS;
+        const int step = ((int)n_layer - 2) / (N - 1);
+        for (int k = 0; k < N; k++) out.capture_layer_ids[k] = 1 + k * step;
+    }
+
     out.layers.assign((size_t)n_layer, TargetLayer{});
 
     // ── 2. Wire our layer pointers to tensors inside meta_ctx ─────────
@@ -360,8 +425,8 @@ bool load_target_gguf_partial(const std::string & path,
     out.tok_embd = g("token_embd.weight");
     out.out_norm = g("output_norm.weight");
     out.output   = g("output.weight");
-    if (!out.tok_embd || !out.out_norm) {
-        set_last_error("missing top-level tensors (token_embd/output_norm)");
+    if (!out.tok_embd || !out.out_norm || !out.output) {
+        set_last_error("missing top-level tensors (token_embd/output_norm/output)");
         gguf_free(gctx);
         return false;
     }