Skip to content

Commit c0cbec6

Browse files
committed
cleanup code
Signed-off-by: Ceng23333 <441651826@qq.com>
1 parent 0194fc9 commit c0cbec6

7 files changed

Lines changed: 8 additions & 515 deletions

File tree

csrc/engine/rank_worker.cpp

Lines changed: 3 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -358,17 +358,6 @@ void RankWorker::thread_loop() {
358358
// Fall back to eager mode
359359
if (!logits) {
360360
auto model_args = local_args.to_model_input(rank_info_.device);
361-
// Ensure H2D copies for input tensors (e.g. input_ids / position_ids
362-
// used by embedding) complete before forward.
363-
// Profiling-sensitive path: avoid device-wide sync by default.
364-
// Enable the extra `syncDevice()` barrier for stricter logit alignment
365-
// when running debug/alignment modes.
366-
// infinicore::context::syncStream();
367-
if (std::getenv("INFINI_LOGIT_ALIGNMENT_EXTRA_SYNC_DEVICE") != nullptr ||
368-
std::getenv("INFINI_DEBUG_ATTN_DUMP") != nullptr ||
369-
std::getenv("INFINI_DEBUG_LOG") != nullptr) {
370-
infinicore::context::syncDevice();
371-
}
372361
logits = model_->forward(model_args).logits;
373362
}
374363

@@ -387,43 +376,20 @@ void RankWorker::thread_loop() {
387376
int32_t *input_offsets = (int32_t *)local_args.input_offsets.value()->data();
388377

389378
auto output_ids{infinicore::Tensor::empty({n_req}, infinicore::DataType::I64, rank_info_.device)};
390-
// Request-0 last-token logits on CPU (see RankWorker::Output::logits).
391-
// InferEngine.forward_logits() returns this; it must not be left default-constructed
392-
// (null tensor) or Python will segfault on .device / readback.
393-
// Greedy sampling uses top_k==1; capture in that case by default.
394-
// Profiling: set INFINI_SKIP_LAST_LOGITS_CPU=1 to skip the extra D2H copy.
395-
const char *skip_logits_env = std::getenv("INFINI_SKIP_LAST_LOGITS_CPU");
396-
const bool skip_last_logits_cpu =
397-
skip_logits_env && skip_logits_env[0] != '\0' && skip_logits_env[0] != '0';
398-
const bool enable_last_logits_cpu = (top_k == 1) && !skip_last_logits_cpu;
399-
infinicore::Tensor last_logits_cpu;
400379

401380
for (auto i{decltype(n_req)(0)}; i < n_req; ++i) {
402381
auto score{logits->view({batch_size * total_len, vocab_size})->narrow({{0, size_t(input_offsets[i + 1] - 1), 1}})->view({vocab_size})};
403382
auto out{output_ids->narrow({{0, i, 1}})->view({})};
404-
if (enable_last_logits_cpu && i == 0) {
405-
// Capture request-0 score vector before sampling.
406-
last_logits_cpu = score->contiguous()->to(infinicore::Device::cpu());
407-
}
408383
float random_val = std::uniform_real_distribution<float>(0, 1)(rng_);
409384
infinicore::op::random_sample_(
410385
out, score, random_val, top_p, top_k, temperature);
411386
}
412387

413-
// Profiling optimization: avoid per-step GPU->CPU copies for the predicted token ids.
414-
// Set `INFINI_PROFILE_KEEP_OUTPUT_IDS_ON_DEVICE=1` to keep `output_ids` on GPU.
415-
const bool keep_output_ids_on_device =
416-
std::getenv("INFINI_PROFILE_KEEP_OUTPUT_IDS_ON_DEVICE") != nullptr;
417-
if (!keep_output_ids_on_device) {
418-
output_ids = output_ids->to(infinicore::Device::cpu());
419-
}
388+
output_ids = output_ids->to(infinicore::Device::cpu());
420389

421-
// Only sync when we actually kicked off device->host copies.
422-
if (!keep_output_ids_on_device || enable_last_logits_cpu) {
423-
infinicore::context::syncStream();
424-
}
390+
infinicore::context::syncStream();
425391

426-
auto out{Output{output_ids, last_logits_cpu}};
392+
auto out{Output{output_ids}};
427393

428394
output_ = std::move(out);
429395
}

csrc/engine/rank_worker.hpp

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -61,18 +61,6 @@ class RankWorker {
6161

6262
struct Output {
6363
infinicore::Tensor output_ids;
64-
// DEBUG-ONLY HOOK:
65-
// Last-token logits for request 0 (CPU), shape [vocab_size].
66-
//
67-
// This is intentionally a minimal “sanity check” export to help validate
68-
// model math during bring-up (e.g. compare against HF on a fixed prompt).
69-
//
70-
// Limitations / assumptions:
71-
// - Only populated on rank 0 (tp_rank==0)
72-
// - Only captures request 0 (i==0) in a continuous batch
73-
// - Copied to CPU when top_k==1 (greedy); skip with INFINI_SKIP_LAST_LOGITS_CPU=1
74-
// - Not guaranteed to be set if caller doesn’t provide input_offsets
75-
infinicore::Tensor logits;
7664
};
7765

7866
RankWorker(const InfinilmModel::Config &model_config,

csrc/models/minicpm_sala/minicpm_sala_attention.cpp

Lines changed: 0 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -165,95 +165,6 @@ void MiniCPMSALAAttention::reset_cache() {
165165
// KV state is maintained by the shared engine cache (StaticKVCache).
166166
}
167167

168-
static void dump_tensor_brief_append(const infinicore::Tensor &t, const char *name, const char *path) {
169-
if (!path) return;
170-
try {
171-
auto cpu_t = t->to(infinicore::Device::cpu());
172-
const auto &shp = cpu_t->shape();
173-
const auto dt = cpu_t->dtype();
174-
std::ofstream f(path, std::ios::app);
175-
if (!f) return;
176-
f << name << " shape=[";
177-
for (size_t i = 0; i < shp.size(); ++i) {
178-
if (i) f << ",";
179-
f << shp[i];
180-
}
181-
f << "] dtype=" << static_cast<int>(dt) << "\n";
182-
183-
const size_t n = cpu_t->numel();
184-
const size_t k = std::min<size_t>(n, 16);
185-
std::vector<float> buf(k);
186-
if (dt == infinicore::DataType::BF16) {
187-
const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
188-
for (size_t i = 0; i < k; ++i) {
189-
uint32_t u = static_cast<uint32_t>(p[i]) << 16;
190-
buf[i] = *reinterpret_cast<float *>(&u);
191-
}
192-
} else if (dt == infinicore::DataType::F16) {
193-
const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
194-
for (size_t i = 0; i < k; ++i) {
195-
uint32_t u = (p[i] & 0x8000) << 16 | ((p[i] & 0x7fff) + (127 - 15)) << 23 | (p[i] & 0x03ff) << 13;
196-
buf[i] = *reinterpret_cast<float *>(&u);
197-
}
198-
} else if (dt == infinicore::DataType::F32) {
199-
const float *p = reinterpret_cast<const float *>(cpu_t->data());
200-
for (size_t i = 0; i < k; ++i) buf[i] = p[i];
201-
} else {
202-
f << " (brief dump skipped for dtype)\n";
203-
return;
204-
}
205-
f << " first[" << k << "]:";
206-
for (size_t i = 0; i < k; ++i) f << " " << buf[i];
207-
f << "\n";
208-
} catch (...) {
209-
}
210-
}
211-
212-
static void dump_tensor_brief_tail_append(const infinicore::Tensor &t, const char *name, const char *path) {
213-
if (!path) return;
214-
try {
215-
auto cpu_t = t->to(infinicore::Device::cpu());
216-
const auto &shp = cpu_t->shape();
217-
const auto dt = cpu_t->dtype();
218-
std::ofstream f(path, std::ios::app);
219-
if (!f) return;
220-
f << name << " shape=[";
221-
for (size_t i = 0; i < shp.size(); ++i) {
222-
if (i) f << ",";
223-
f << shp[i];
224-
}
225-
f << "] dtype=" << static_cast<int>(dt) << "\n";
226-
227-
const size_t n = cpu_t->numel();
228-
const size_t k = std::min<size_t>(n, 16);
229-
std::vector<float> buf(k);
230-
if (dt == infinicore::DataType::BF16) {
231-
const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
232-
for (size_t i = 0; i < k; ++i) {
233-
uint32_t u = static_cast<uint32_t>(p[n - k + i]) << 16;
234-
buf[i] = *reinterpret_cast<float *>(&u);
235-
}
236-
} else if (dt == infinicore::DataType::F16) {
237-
const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
238-
for (size_t i = 0; i < k; ++i) {
239-
uint32_t u = (p[n - k + i] & 0x8000) << 16 | ((p[n - k + i] & 0x7fff) + (127 - 15)) << 23 |
240-
(p[n - k + i] & 0x03ff) << 13;
241-
buf[i] = *reinterpret_cast<float *>(&u);
242-
}
243-
} else if (dt == infinicore::DataType::F32) {
244-
const float *p = reinterpret_cast<const float *>(cpu_t->data());
245-
for (size_t i = 0; i < k; ++i) buf[i] = p[n - k + i];
246-
} else {
247-
f << " (tail dump skipped for dtype)\n";
248-
return;
249-
}
250-
251-
f << " tail[" << k << "]:";
252-
for (size_t i = 0; i < k; ++i) f << " " << buf[i];
253-
f << "\n";
254-
} catch (...) {
255-
}
256-
}
257168

258169
infinicore::Tensor MiniCPMSALAAttention::forward(const infinicore::Tensor &hidden_states,
259170
const infinicore::Tensor &position_ids,
@@ -373,25 +284,6 @@ infinicore::Tensor MiniCPMSALAAttention::forward_dense_(const infinicore::Tensor
373284
k_total = k_total->narrow({{2, 0, total_seq_len}});
374285
v_total = v_total->narrow({{2, 0, total_seq_len}});
375286

376-
// Debug KV cache parity: dump brief k/v for layer0 when enabled.
377-
// Helps verify that decode cached KV matches full-sequence KV.
378-
{
379-
const char *kv_prefix = std::getenv("INFINI_DEBUG_KV_DUMP_PREFIX");
380-
if (kv_prefix && kv_prefix[0] != '\0' && kv_prefix[0] != '0' && layer_idx_ == 0 && batch_size == 1) {
381-
std::string path = std::string("/tmp/kv_dump_") + kv_prefix + ".txt";
382-
char namek[256];
383-
char namev[256];
384-
std::snprintf(namek, sizeof(namek), "k_total cache_pos=%zu total_seq_len=%zu seq_len=%zu", cache_pos, total_seq_len, seq_len);
385-
std::snprintf(namev, sizeof(namev), "v_total cache_pos=%zu total_seq_len=%zu seq_len=%zu", cache_pos, total_seq_len, seq_len);
386-
// Ensure cache update kernels are finished before dumping to CPU.
387-
infinicore::context::syncStream();
388-
dump_tensor_brief_append(k_total, namek, path.c_str());
389-
dump_tensor_brief_append(v_total, namev, path.c_str());
390-
dump_tensor_brief_tail_append(k_total, namek, path.c_str());
391-
dump_tensor_brief_tail_append(v_total, namev, path.c_str());
392-
}
393-
}
394-
395287
infinicore::Tensor attn_output;
396288
if (!is_sparse_layer_) {
397289
// Lightning-attn: Simple GLA (HF-aligned).

csrc/models/minicpm_sala/minicpm_sala_decoder_layer.cpp

Lines changed: 0 additions & 136 deletions
Original file line numberDiff line numberDiff line change
@@ -11,111 +11,6 @@
1111

1212
namespace infinilm::models::minicpm_sala {
1313

14-
namespace {
15-
16-
void log_tensor_stats_if_enabled(const infinicore::Tensor &tensor,
17-
size_t layer_idx,
18-
const char *hypothesis_id,
19-
const char *location,
20-
const char *message) {
21-
const char *log_path = std::getenv("INFINI_DEBUG_LOG");
22-
if (!log_path) {
23-
return;
24-
}
25-
try {
26-
auto cpu_t = tensor->to(infinicore::Device::cpu());
27-
const size_t n = cpu_t->numel();
28-
const auto &shp = cpu_t->shape();
29-
const auto dt = cpu_t->dtype();
30-
std::vector<float> f32_buf(n);
31-
if (dt == infinicore::DataType::BF16) {
32-
const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
33-
for (size_t i = 0; i < n; ++i) {
34-
uint32_t u = static_cast<uint32_t>(p[i]) << 16;
35-
f32_buf[i] = *reinterpret_cast<float *>(&u);
36-
}
37-
} else if (dt == infinicore::DataType::F32) {
38-
const float *p = reinterpret_cast<const float *>(cpu_t->data());
39-
for (size_t i = 0; i < n; ++i) f32_buf[i] = p[i];
40-
} else if (dt == infinicore::DataType::F16) {
41-
const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
42-
for (size_t i = 0; i < n; ++i) {
43-
uint32_t u = (p[i] & 0x8000) << 16 | ((p[i] & 0x7fff) + (127 - 15)) << 23 | (p[i] & 0x03ff) << 13;
44-
f32_buf[i] = *reinterpret_cast<float *>(&u);
45-
}
46-
}
47-
float mn = f32_buf.empty() ? 0.f : f32_buf[0];
48-
float mx = mn;
49-
double sum = 0.0;
50-
double ss = 0.0;
51-
for (float v : f32_buf) {
52-
mn = std::min(mn, v);
53-
mx = std::max(mx, v);
54-
sum += v;
55-
ss += static_cast<double>(v) * static_cast<double>(v);
56-
}
57-
const double mean = n ? (sum / static_cast<double>(n)) : 0.0;
58-
const double norm = ss > 0.0 ? std::sqrt(ss) : 0.0;
59-
60-
std::ofstream log(log_path, std::ios::app);
61-
if (log) {
62-
std::string shape_json = "[";
63-
for (size_t i = 0; i < shp.size(); ++i) {
64-
shape_json += (i ? "," : "") + std::to_string(shp[i]);
65-
}
66-
shape_json += "]";
67-
const auto now_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
68-
std::chrono::system_clock::now().time_since_epoch())
69-
.count();
70-
log << "{\"sessionId\":\"9146ea\",\"hypothesisId\":\"" << hypothesis_id
71-
<< "\",\"location\":\"" << location
72-
<< "\",\"message\":\"" << message
73-
<< "\",\"data\":{\"layer\":" << layer_idx
74-
<< ",\"shape\":" << shape_json
75-
<< ",\"min\":" << mn
76-
<< ",\"max\":" << mx
77-
<< ",\"mean\":" << mean
78-
<< ",\"l2\":" << norm
79-
<< "},\"timestamp\":" << now_ms << "}\n";
80-
}
81-
} catch (...) {
82-
// Best-effort diagnostics; never throw from logging path.
83-
}
84-
}
85-
86-
// Convert to float and optionally dump to binary (for alignment checks).
87-
void tensor_to_f32_and_dump(const infinicore::Tensor &tensor, const char *bin_path) {
88-
if (!bin_path || !std::getenv("INFINI_DEBUG_ATTN_DUMP")) return;
89-
try {
90-
// Debug-only: ensure pending GPU work for `tensor` is complete
91-
// before CPU copy so the binary dump reflects real values.
92-
infinicore::context::syncStream();
93-
auto cpu_t = tensor->to(infinicore::Device::cpu());
94-
const size_t n = cpu_t->numel();
95-
const auto dt = cpu_t->dtype();
96-
std::vector<float> f32_buf(n);
97-
if (dt == infinicore::DataType::BF16) {
98-
const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
99-
for (size_t i = 0; i < n; ++i) {
100-
uint32_t u = static_cast<uint32_t>(p[i]) << 16;
101-
f32_buf[i] = *reinterpret_cast<float *>(&u);
102-
}
103-
} else if (dt == infinicore::DataType::F32) {
104-
const float *p = reinterpret_cast<const float *>(cpu_t->data());
105-
for (size_t i = 0; i < n; ++i) f32_buf[i] = p[i];
106-
} else if (dt == infinicore::DataType::F16) {
107-
const uint16_t *p = reinterpret_cast<const uint16_t *>(cpu_t->data());
108-
for (size_t i = 0; i < n; ++i) {
109-
uint32_t u = (p[i] & 0x8000) << 16 | ((p[i] & 0x7fff) + (127 - 15)) << 23 | (p[i] & 0x03ff) << 13;
110-
f32_buf[i] = *reinterpret_cast<float *>(&u);
111-
}
112-
} else return;
113-
std::ofstream bin(bin_path, std::ios::binary);
114-
if (bin) bin.write(reinterpret_cast<const char *>(f32_buf.data()), n * sizeof(float));
115-
} catch (...) {}
116-
}
117-
118-
} // namespace
11914

12015
MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer(std::shared_ptr<infinilm::config::ModelConfig> model_config,
12116
const infinicore::Device &device,
@@ -158,22 +53,6 @@ infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hi
15853
std::optional<infinicore::Tensor> slot_mapping) const {
15954
// Pre-norm attention
16055
auto hs1 = input_layernorm_->forward(hidden_states);
161-
if (layer_idx_ == 0)
162-
tensor_to_f32_and_dump(hs1, "/tmp/inf_layer0_attn_input.bin");
163-
164-
const bool log_all_layers = []() {
165-
const char *env = std::getenv("INFINI_DEBUG_LOG_ALL_LAYERS");
166-
if (!env) return false;
167-
return env[0] != '\0' && env[0] != '0';
168-
}();
169-
170-
if (log_all_layers || layer_idx_ < 2) {
171-
log_tensor_stats_if_enabled(hs1,
172-
layer_idx_,
173-
"INF_B",
174-
"minicpm_sala_decoder_layer.cpp:input_layernorm",
175-
"Inf input layernorm output");
176-
}
17756
auto attn_out = self_attn_->forward(
17857
hs1,
17958
position_ids,
@@ -198,21 +77,6 @@ infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hi
19877
infinicore::op::ones_(ones_mlp);
19978
auto out2 = infinicore::op::addcmul(out1, mlp_out, ones_mlp, static_cast<float>(residual_scale_));
20079

201-
// #region agent log
202-
if (log_all_layers || layer_idx_ < 3) {
203-
log_tensor_stats_if_enabled(out2,
204-
layer_idx_,
205-
"INF_L",
206-
"minicpm_sala_decoder_layer.cpp:forward_output",
207-
"Inf decoder layer output stats");
208-
if (std::getenv("INFINI_DEBUG_ATTN_DUMP")) {
209-
char path[64];
210-
std::snprintf(path, sizeof(path), "/tmp/inf_layer_out_%zu.bin", layer_idx_);
211-
tensor_to_f32_and_dump(out2, path);
212-
}
213-
}
214-
// #endregion
215-
21680
return out2;
21781
}
21882

0 commit comments

Comments
 (0)