1111
namespace infinilm::models::minicpm_sala {
1313
14- namespace {
15-
16- void log_tensor_stats_if_enabled (const infinicore::Tensor &tensor,
17- size_t layer_idx,
18- const char *hypothesis_id,
19- const char *location,
20- const char *message) {
21- const char *log_path = std::getenv (" INFINI_DEBUG_LOG" );
22- if (!log_path) {
23- return ;
24- }
25- try {
26- auto cpu_t = tensor->to (infinicore::Device::cpu ());
27- const size_t n = cpu_t ->numel ();
28- const auto &shp = cpu_t ->shape ();
29- const auto dt = cpu_t ->dtype ();
30- std::vector<float > f32_buf (n);
31- if (dt == infinicore::DataType::BF16) {
32- const uint16_t *p = reinterpret_cast <const uint16_t *>(cpu_t ->data ());
33- for (size_t i = 0 ; i < n; ++i) {
34- uint32_t u = static_cast <uint32_t >(p[i]) << 16 ;
35- f32_buf[i] = *reinterpret_cast <float *>(&u);
36- }
37- } else if (dt == infinicore::DataType::F32) {
38- const float *p = reinterpret_cast <const float *>(cpu_t ->data ());
39- for (size_t i = 0 ; i < n; ++i) f32_buf[i] = p[i];
40- } else if (dt == infinicore::DataType::F16) {
41- const uint16_t *p = reinterpret_cast <const uint16_t *>(cpu_t ->data ());
42- for (size_t i = 0 ; i < n; ++i) {
43- uint32_t u = (p[i] & 0x8000 ) << 16 | ((p[i] & 0x7fff ) + (127 - 15 )) << 23 | (p[i] & 0x03ff ) << 13 ;
44- f32_buf[i] = *reinterpret_cast <float *>(&u);
45- }
46- }
47- float mn = f32_buf.empty () ? 0 .f : f32_buf[0 ];
48- float mx = mn;
49- double sum = 0.0 ;
50- double ss = 0.0 ;
51- for (float v : f32_buf) {
52- mn = std::min (mn, v);
53- mx = std::max (mx, v);
54- sum += v;
55- ss += static_cast <double >(v) * static_cast <double >(v);
56- }
57- const double mean = n ? (sum / static_cast <double >(n)) : 0.0 ;
58- const double norm = ss > 0.0 ? std::sqrt (ss) : 0.0 ;
59-
60- std::ofstream log (log_path, std::ios::app);
61- if (log) {
62- std::string shape_json = " [" ;
63- for (size_t i = 0 ; i < shp.size (); ++i) {
64- shape_json += (i ? " ," : " " ) + std::to_string (shp[i]);
65- }
66- shape_json += " ]" ;
67- const auto now_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
68- std::chrono::system_clock::now ().time_since_epoch ())
69- .count ();
70- log << " {\" sessionId\" :\" 9146ea\" ,\" hypothesisId\" :\" " << hypothesis_id
71- << " \" ,\" location\" :\" " << location
72- << " \" ,\" message\" :\" " << message
73- << " \" ,\" data\" :{\" layer\" :" << layer_idx
74- << " ,\" shape\" :" << shape_json
75- << " ,\" min\" :" << mn
76- << " ,\" max\" :" << mx
77- << " ,\" mean\" :" << mean
78- << " ,\" l2\" :" << norm
79- << " },\" timestamp\" :" << now_ms << " }\n " ;
80- }
81- } catch (...) {
82- // Best-effort diagnostics; never throw from logging path.
83- }
84- }
85-
86- // Convert to float and optionally dump to binary (for alignment checks).
87- void tensor_to_f32_and_dump (const infinicore::Tensor &tensor, const char *bin_path) {
88- if (!bin_path || !std::getenv (" INFINI_DEBUG_ATTN_DUMP" )) return ;
89- try {
90- // Debug-only: ensure pending GPU work for `tensor` is complete
91- // before CPU copy so the binary dump reflects real values.
92- infinicore::context::syncStream ();
93- auto cpu_t = tensor->to (infinicore::Device::cpu ());
94- const size_t n = cpu_t ->numel ();
95- const auto dt = cpu_t ->dtype ();
96- std::vector<float > f32_buf (n);
97- if (dt == infinicore::DataType::BF16) {
98- const uint16_t *p = reinterpret_cast <const uint16_t *>(cpu_t ->data ());
99- for (size_t i = 0 ; i < n; ++i) {
100- uint32_t u = static_cast <uint32_t >(p[i]) << 16 ;
101- f32_buf[i] = *reinterpret_cast <float *>(&u);
102- }
103- } else if (dt == infinicore::DataType::F32) {
104- const float *p = reinterpret_cast <const float *>(cpu_t ->data ());
105- for (size_t i = 0 ; i < n; ++i) f32_buf[i] = p[i];
106- } else if (dt == infinicore::DataType::F16) {
107- const uint16_t *p = reinterpret_cast <const uint16_t *>(cpu_t ->data ());
108- for (size_t i = 0 ; i < n; ++i) {
109- uint32_t u = (p[i] & 0x8000 ) << 16 | ((p[i] & 0x7fff ) + (127 - 15 )) << 23 | (p[i] & 0x03ff ) << 13 ;
110- f32_buf[i] = *reinterpret_cast <float *>(&u);
111- }
112- } else return ;
113- std::ofstream bin (bin_path, std::ios::binary);
114- if (bin) bin.write (reinterpret_cast <const char *>(f32_buf.data ()), n * sizeof (float ));
115- } catch (...) {}
116- }
117-
118- } // namespace
11914
12015MiniCPMSALADecoderLayer::MiniCPMSALADecoderLayer (std::shared_ptr<infinilm::config::ModelConfig> model_config,
12116 const infinicore::Device &device,
@@ -158,22 +53,6 @@ infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hi
15853 std::optional<infinicore::Tensor> slot_mapping) const {
15954 // Pre-norm attention
16055 auto hs1 = input_layernorm_->forward (hidden_states);
161- if (layer_idx_ == 0 )
162- tensor_to_f32_and_dump (hs1, " /tmp/inf_layer0_attn_input.bin" );
163-
164- const bool log_all_layers = []() {
165- const char *env = std::getenv (" INFINI_DEBUG_LOG_ALL_LAYERS" );
166- if (!env) return false ;
167- return env[0 ] != ' \0 ' && env[0 ] != ' 0' ;
168- }();
169-
170- if (log_all_layers || layer_idx_ < 2 ) {
171- log_tensor_stats_if_enabled (hs1,
172- layer_idx_,
173- " INF_B" ,
174- " minicpm_sala_decoder_layer.cpp:input_layernorm" ,
175- " Inf input layernorm output" );
176- }
17756 auto attn_out = self_attn_->forward (
17857 hs1,
17958 position_ids,
@@ -198,21 +77,6 @@ infinicore::Tensor MiniCPMSALADecoderLayer::forward(const infinicore::Tensor &hi
19877 infinicore::op::ones_ (ones_mlp);
19978 auto out2 = infinicore::op::addcmul (out1, mlp_out, ones_mlp, static_cast <float >(residual_scale_));
20079
201- // #region agent log
202- if (log_all_layers || layer_idx_ < 3 ) {
203- log_tensor_stats_if_enabled (out2,
204- layer_idx_,
205- " INF_L" ,
206- " minicpm_sala_decoder_layer.cpp:forward_output" ,
207- " Inf decoder layer output stats" );
208- if (std::getenv (" INFINI_DEBUG_ATTN_DUMP" )) {
209- char path[64 ];
210- std::snprintf (path, sizeof (path), " /tmp/inf_layer_out_%zu.bin" , layer_idx_);
211- tensor_to_f32_and_dump (out2, path);
212- }
213- }
214- // #endregion
215-
21680 return out2;
21781}
21882
0 commit comments