InfiniTensor
diff --git a/csrc/engine/rank_worker.cpp b/csrc/engine/rank_worker.cpp
Lines changed: 3 additions & 37 deletions
diff --git a/csrc/engine/rank_worker.hpp b/csrc/engine/rank_worker.hpp
Lines changed: 0 additions & 12 deletions
diff --git a/csrc/models/minicpm_sala/minicpm_sala_attention.cpp b/csrc/models/minicpm_sala/minicpm_sala_attention.cpp
Lines changed: 0 additions & 108 deletions
diff --git a/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp b/csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp
Lines changed: 0 additions & 136 deletions
@@ -358,17 +358,6 @@ void RankWorker::thread_loop() {
                         // Fall back to eager mode
                         if (!logits) {
                             auto model_args = local_args.to_model_input(rank_info_.device);
-                            // Ensure H2D copies for input tensors (e.g. input_ids / position_ids
-                            // used by embedding) complete before forward.
-                            // Profiling-sensitive path: avoid device-wide sync by default.
-                            // Enable the extra `syncDevice()` barrier for stricter logit alignment
-                            // when running debug/alignment modes.
-                            // infinicore::context::syncStream();
-                            if (std::getenv("INFINI_LOGIT_ALIGNMENT_EXTRA_SYNC_DEVICE") != nullptr ||
-                                std::getenv("INFINI_DEBUG_ATTN_DUMP") != nullptr ||
-                                std::getenv("INFINI_DEBUG_LOG") != nullptr) {
-                                infinicore::context::syncDevice();
-                            }
                             logits = model_->forward(model_args).logits;
                         }
 
@@ -387,43 +376,20 @@ void RankWorker::thread_loop() {
                             int32_t *input_offsets = (int32_t *)local_args.input_offsets.value()->data();
 
                             auto output_ids{infinicore::Tensor::empty({n_req}, infinicore::DataType::I64, rank_info_.device)};
-                            // Request-0 last-token logits on CPU (see RankWorker::Output::logits).
-                            // InferEngine.forward_logits() returns this; it must not be left default-constructed
-                            // (null tensor) or Python will segfault on .device / readback.
-                            // Greedy sampling uses top_k==1; capture in that case by default.
-                            // Profiling: set INFINI_SKIP_LAST_LOGITS_CPU=1 to skip the extra D2H copy.
-                            const char *skip_logits_env = std::getenv("INFINI_SKIP_LAST_LOGITS_CPU");
-                            const bool skip_last_logits_cpu =
-                                skip_logits_env && skip_logits_env[0] != '\0' && skip_logits_env[0] != '0';
-                            const bool enable_last_logits_cpu = (top_k == 1) && !skip_last_logits_cpu;
-                            infinicore::Tensor last_logits_cpu;
 
                             for (auto i{decltype(n_req)(0)}; i < n_req; ++i) {
                                 auto score{logits->view({batch_size * total_len, vocab_size})->narrow({{0, size_t(input_offsets[i + 1] - 1), 1}})->view({vocab_size})};
                                 auto out{output_ids->narrow({{0, i, 1}})->view({})};
-                                if (enable_last_logits_cpu && i == 0) {
-                                    // Capture request-0 score vector before sampling.
-                                    last_logits_cpu = score->contiguous()->to(infinicore::Device::cpu());
-                                }
                                 float random_val = std::uniform_real_distribution<float>(0, 1)(rng_);
                                 infinicore::op::random_sample_(
                                     out, score, random_val, top_p, top_k, temperature);
                             }
 
-                            // Profiling optimization: avoid per-step GPU->CPU copies for the predicted token ids.
-                            // Set `INFINI_PROFILE_KEEP_OUTPUT_IDS_ON_DEVICE=1` to keep `output_ids` on GPU.
-                            const bool keep_output_ids_on_device =
-                                std::getenv("INFINI_PROFILE_KEEP_OUTPUT_IDS_ON_DEVICE") != nullptr;
-                            if (!keep_output_ids_on_device) {
-                                output_ids = output_ids->to(infinicore::Device::cpu());
-                            }
+                            output_ids = output_ids->to(infinicore::Device::cpu());
 
-                            // Only sync when we actually kicked off device->host copies.
-                            if (!keep_output_ids_on_device || enable_last_logits_cpu) {
-                                infinicore::context::syncStream();
-                            }
+                            infinicore::context::syncStream();
 
-                            auto out{Output{output_ids, last_logits_cpu}};
+                            auto out{Output{output_ids}};
 
                             output_ = std::move(out);
                         }
 
@@ -61,18 +61,6 @@ class RankWorker {
 
     struct Output {
         infinicore::Tensor output_ids;
-        // DEBUG-ONLY HOOK:
-        // Last-token logits for request 0 (CPU), shape [vocab_size].
-        //
-        // This is intentionally a minimal “sanity check” export to help validate
-        // model math during bring-up (e.g. compare against HF on a fixed prompt).
-        //
-        // Limitations / assumptions:
-        // - Only populated on rank 0 (tp_rank==0)
-        // - Only captures request 0 (i==0) in a continuous batch
-        // - Copied to CPU when top_k==1 (greedy); skip with INFINI_SKIP_LAST_LOGITS_CPU=1
-        // - Not guaranteed to be set if caller doesn’t provide input_offsets
-        infinicore::Tensor logits;
     };
 
     RankWorker(const InfinilmModel::Config &model_config,
 
@@ -165,95 +165,6 @@ void MiniCPMSALAAttention::reset_cache() {
     // KV state is maintained by the shared engine cache (StaticKVCache).
 }
 
-static void dump_tensor_brief_append(const infinicore::Tensor &t, const char *name, const char *path) {
-    if (!path) return;
-    try {
-        auto cpu_t = t->to(infinicore::Device::cpu());
-        const auto &shp = cpu_t->shape();
-        const auto dt = cpu_t->dtype();
-        std::ofstream f(path, std::ios::app);
-        if (!f) return;
-        f << name << " shape=[";
-        for (size_t i = 0; i < shp.size(); ++i) {
-            if (i) f << ",";
-            f << shp[i];
-        }
-        f << "] dtype=" << static_cast<int>(dt) << "\n";
-
-        const size_t n = cpu_t->numel();
-        const size_t k = std::min<size_t>(n, 16);
-        std::vector<float> buf(k);
-        if (dt == infinicore::DataType::BF16) {
-            const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
-            for (size_t i = 0; i < k; ++i) {
-                uint32_t u = static_cast<uint32_t>(p[i]) << 16;
-                buf[i] = *reinterpret_cast<float *>(&u);
-            }
-        } else if (dt == infinicore::DataType::F16) {
-            const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
-            for (size_t i = 0; i < k; ++i) {
-                uint32_t u = (p[i] & 0x8000) << 16 | ((p[i] & 0x7fff) + (127 - 15)) << 23 | (p[i] & 0x03ff) << 13;
-                buf[i] = *reinterpret_cast<float *>(&u);
-            }
-        } else if (dt == infinicore::DataType::F32) {
-            const float *p = reinterpret_cast<const float *>(cpu_t->data());
-            for (size_t i = 0; i < k; ++i) buf[i] = p[i];
-        } else {
-            f << "  (brief dump skipped for dtype)\n";
-            return;
-        }
-        f << "  first[" << k << "]:";
-        for (size_t i = 0; i < k; ++i) f << " " << buf[i];
-        f << "\n";
-    } catch (...) {
-    }
-}
-
-static void dump_tensor_brief_tail_append(const infinicore::Tensor &t, const char *name, const char *path) {
-    if (!path) return;
-    try {
-        auto cpu_t = t->to(infinicore::Device::cpu());
-        const auto &shp = cpu_t->shape();
-        const auto dt = cpu_t->dtype();
-        std::ofstream f(path, std::ios::app);
-        if (!f) return;
-        f << name << " shape=[";
-        for (size_t i = 0; i < shp.size(); ++i) {
-            if (i) f << ",";
-            f << shp[i];
-        }
-        f << "] dtype=" << static_cast<int>(dt) << "\n";
-
-        const size_t n = cpu_t->numel();
-        const size_t k = std::min<size_t>(n, 16);
-        std::vector<float> buf(k);
-        if (dt == infinicore::DataType::BF16) {
-            const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
-            for (size_t i = 0; i < k; ++i) {
-                uint32_t u = static_cast<uint32_t>(p[n - k + i]) << 16;
-                buf[i] = *reinterpret_cast<float *>(&u);
-            }
-        } else if (dt == infinicore::DataType::F16) {
-            const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
-            for (size_t i = 0; i < k; ++i) {
-                uint32_t u = (p[n - k + i] & 0x8000) << 16 | ((p[n - k + i] & 0x7fff) + (127 - 15)) << 23 |
-                             (p[n - k + i] & 0x03ff) << 13;
-                buf[i] = *reinterpret_cast<float *>(&u);
-            }
-        } else if (dt == infinicore::DataType::F32) {
-            const float *p = reinterpret_cast<const float *>(cpu_t->data());
-            for (size_t i = 0; i < k; ++i) buf[i] = p[n - k + i];
-        } else {
-            f << "  (tail dump skipped for dtype)\n";
-            return;
-        }
-
-        f << "  tail[" << k << "]:";
-        for (size_t i = 0; i < k; ++i) f << " " << buf[i];
-        f << "\n";
-    } catch (...) {
-    }
-}
 
 infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &hidden_states,
                                                  const infinicore::Tensor &position_ids,
@@ -373,25 +284,6 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor
     k_total = k_total->narrow({{2, 0, total_seq_len}});
     v_total = v_total->narrow({{2, 0, total_seq_len}});
 
-    // Debug KV cache parity: dump brief k/v for layer0 when enabled.
-    // Helps verify that decode cached KV matches full-sequence KV.
-    {
-        const char *kv_prefix = std::getenv("INFINI_DEBUG_KV_DUMP_PREFIX");
-        if (kv_prefix && kv_prefix[0] != '\0' && kv_prefix[0] != '0' && layer_idx_ == 0 && batch_size == 1) {
-            std::string path = std::string("/tmp/kv_dump_") + kv_prefix + ".txt";
-            char namek[256];
-            char namev[256];
-            std::snprintf(namek, sizeof(namek), "k_total cache_pos=%zu total_seq_len=%zu seq_len=%zu", cache_pos, total_seq_len, seq_len);
-            std::snprintf(namev, sizeof(namev), "v_total cache_pos=%zu total_seq_len=%zu seq_len=%zu", cache_pos, total_seq_len, seq_len);
-            // Ensure cache update kernels are finished before dumping to CPU.
-            infinicore::context::syncStream();
-            dump_tensor_brief_append(k_total, namek, path.c_str());
-            dump_tensor_brief_append(v_total, namev, path.c_str());
-            dump_tensor_brief_tail_append(k_total, namek, path.c_str());
-            dump_tensor_brief_tail_append(v_total, namev, path.c_str());
-        }
-    }
-
     infinicore::Tensor attn_output;
     if (!is_sparse_layer_) {
         // Lightning-attn: Simple GLA (HF-aligned).
 
@@ -11,111 +11,6 @@
 
 namespace infinilm::models::minicpm_sala {
 
-namespace {
-
-void log_tensor_stats_if_enabled(const infinicore::Tensor &tensor,
-                                 size_t layer_idx,
-                                 const char *hypothesis_id,
-                                 const char *location,
-                                 const char *message) {
-    const char *log_path = std::getenv("INFINI_DEBUG_LOG");
-    if (!log_path) {
-        return;
-    }
-    try {
-        auto cpu_t = tensor->to(infinicore::Device::cpu());
-        const size_t n = cpu_t->numel();
-        const auto &shp = cpu_t->shape();
-        const auto dt = cpu_t->dtype();
-        std::vector<float> f32_buf(n);
-        if (dt == infinicore::DataType::BF16) {
-            const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
-            for (size_t i = 0; i < n; ++i) {
-                uint32_t u = static_cast<uint32_t>(p[i]) << 16;
-                f32_buf[i] = *reinterpret_cast<float *>(&u);
-            }
-        } else if (dt == infinicore::DataType::F32) {
-            const float *p = reinterpret_cast<const float *>(cpu_t->data());
-            for (size_t i = 0; i < n; ++i) f32_buf[i] = p[i];
-        } else if (dt == infinicore::DataType::F16) {
-            const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
-            for (size_t i = 0; i < n; ++i) {
-                uint32_t u = (p[i] & 0x8000) << 16 | ((p[i] & 0x7fff) + (127 - 15)) << 23 | (p[i] & 0x03ff) << 13;
-                f32_buf[i] = *reinterpret_cast<float *>(&u);
-            }
-        }
-        float mn = f32_buf.empty() ? 0.f : f32_buf[0];
-        float mx = mn;
-        double sum = 0.0;
-        double ss = 0.0;
-        for (float v : f32_buf) {
-            mn = std::min(mn, v);
-            mx = std::max(mx, v);
-            sum += v;
-            ss += static_cast<double>(v) * static_cast<double>(v);
-        }
-        const double mean = n ? (sum / static_cast<double>(n)) : 0.0;
-        const double norm = ss > 0.0 ? std::sqrt(ss) : 0.0;
-
-        std::ofstream log(log_path, std::ios::app);
-        if (log) {
-            std::string shape_json = "[";
-            for (size_t i = 0; i < shp.size(); ++i) {
-                shape_json += (i ? "," : "") + std::to_string(shp[i]);
-            }
-            shape_json += "]";
-            const auto now_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
-                                    std::chrono::system_clock::now().time_since_epoch())
-                                    .count();
-            log << "{\"sessionId\":\"9146ea\",\"hypothesisId\":\"" << hypothesis_id
-                << "\",\"location\":\"" << location
-                << "\",\"message\":\"" << message
-                << "\",\"data\":{\"layer\":" << layer_idx
-                << ",\"shape\":" << shape_json
-                << ",\"min\":" << mn
-                << ",\"max\":" << mx
-                << ",\"mean\":" << mean
-                << ",\"l2\":" << norm
-                << "},\"timestamp\":" << now_ms << "}\n";
-        }
-    } catch (...) {
-        // Best-effort diagnostics; never throw from logging path.
-    }
-}
-
-// Convert to float and optionally dump to binary (for alignment checks).
-void tensor_to_f32_and_dump(const infinicore::Tensor &tensor, const char *bin_path) {
-    if (!bin_path || !std::getenv("INFINI_DEBUG_ATTN_DUMP")) return;
-    try {
-        // Debug-only: ensure pending GPU work for `tensor` is complete
-        // before CPU copy so the binary dump reflects real values.
-        infinicore::context::syncStream();
-        auto cpu_t = tensor->to(infinicore::Device::cpu());
-        const size_t n = cpu_t->numel();
-        const auto dt = cpu_t->dtype();
-        std::vector<float> f32_buf(n);
-        if (dt == infinicore::DataType::BF16) {
-            const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
-            for (size_t i = 0; i < n; ++i) {
-                uint32_t u = static_cast<uint32_t>(p[i]) << 16;
-                f32_buf[i] = *reinterpret_cast<float *>(&u);
-            }
-        } else if (dt == infinicore::DataType::F32) {
-            const float *p = reinterpret_cast<const float *>(cpu_t->data());
-            for (size_t i = 0; i < n; ++i) f32_buf[i] = p[i];
-        } else if (dt == infinicore::DataType::F16) {
-            const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
-            for (size_t i = 0; i < n; ++i) {
-                uint32_t u = (p[i] & 0x8000) << 16 | ((p[i] & 0x7fff) + (127 - 15)) << 23 | (p[i] & 0x03ff) << 13;
-                f32_buf[i] = *reinterpret_cast<float *>(&u);
-            }
-        } else return;
-        std::ofstream bin(bin_path, std::ios::binary);
-        if (bin) bin.write(reinterpret_cast<const char *>(f32_buf.data()), n * sizeof(float));
-    } catch (...) {}
-}
-
-} // namespace
 
 MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptr<infinilm::config::ModelConfig> model_config,
                                                  const infinicore::Device &device,
@@ -158,22 +53,6 @@ infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hi
                                                     std::optional<infinicore::Tensor> slot_mapping) const {
     // Pre-norm attention
     auto hs1 = input_layernorm_->forward(hidden_states);
-    if (layer_idx_ == 0)
-        tensor_to_f32_and_dump(hs1, "/tmp/inf_layer0_attn_input.bin");
-
-    const bool log_all_layers = []() {
-        const char *env = std::getenv("INFINI_DEBUG_LOG_ALL_LAYERS");
-        if (!env) return false;
-        return env[0] != '\0' && env[0] != '0';
-    }();
-
-    if (log_all_layers || layer_idx_ < 2) {
-        log_tensor_stats_if_enabled(hs1,
-                                    layer_idx_,
-                                    "INF_B",
-                                    "minicpm_sala_decoder_layer.cpp:input_layernorm",
-                                    "Inf input layernorm output");
-    }
     auto attn_out = self_attn_->forward(
         hs1,
         position_ids,
@@ -198,21 +77,6 @@ infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hi
     infinicore::op::ones_(ones_mlp);
     auto out2 = infinicore::op::addcmul(out1, mlp_out, ones_mlp, static_cast<float>(residual_scale_));
 
-    // #region agent log
-    if (log_all_layers || layer_idx_ < 3) {
-        log_tensor_stats_if_enabled(out2,
-                                    layer_idx_,
-                                    "INF_L",
-                                    "minicpm_sala_decoder_layer.cpp:forward_output",
-                                    "Inf decoder layer output stats");
-        if (std::getenv("INFINI_DEBUG_ATTN_DUMP")) {
-            char path[64];
-            std::snprintf(path, sizeof(path), "/tmp/inf_layer_out_%zu.bin", layer_idx_);
-            tensor_to_f32_and_dump(out2, path);
-        }
-    }
-    // #endregion
-
     return out2;
 }