@@ -83,9 +83,6 @@ inline std::vector<uint16_t> float_to_fp16_bits(const std::vector<float> &values
8383}
8484} // anonymous namespace
8585
86- // ============================================================================
87- // GPTQ_QY Class (最终修复版)
88- // ============================================================================
8986namespace infinicore ::quantization {
9087
9188class GPTQ_QY : public BaseQuantization {
@@ -123,9 +120,6 @@ class GPTQ_QY : public BaseQuantization {
123120 const int bits = weight_bits ();
124121 const int values_per_int32 = 32 / bits;
125122
126- // --------------------------------------------------------------------
127- // 1. qweight 转换(保持不变)
128- // --------------------------------------------------------------------
129123 {
130124 const auto &shape = original_qweight->shape ();
131125 assert (shape.size () == 2 );
@@ -148,9 +142,6 @@ class GPTQ_QY : public BaseQuantization {
148142 target_device);
149143 }
150144
151- // --------------------------------------------------------------------
152- // 2. qzeros 转换(保持 int32 -> fp32 -> fp16 逻辑)
153- // --------------------------------------------------------------------
154145 {
155146 const auto &shape = original_qzeros->shape ();
156147 assert (shape.size () == 2 );
@@ -167,24 +158,15 @@ class GPTQ_QY : public BaseQuantization {
167158 target_device);
168159 }
169160
170- // --------------------------------------------------------------------
171- // 3. scales 转换(核心修复:如果输入已是 FP16,直接内存拷贝)
172- // --------------------------------------------------------------------
173161 {
174162 auto scales_cpu = original_scales->to (Device::Type::CPU);
175163 size_t num_elements = scales_cpu->numel ();
176164 const void *raw_data = scales_cpu->data ();
177165
178166 std::vector<uint16_t > scales_fp16 (num_elements);
179-
180- // 关键:根据输入 dtype 决定处理方式
181167 if (scales_cpu->dtype () == DataType::F16) {
182- // 输入已经是 FP16,直接 memcpy,不做任何转换!
183- // spdlog::info("Scales is already FP16, performing direct memory copy");
184168 std::memcpy (scales_fp16.data (), raw_data, num_elements * sizeof (uint16_t ));
185169 } else if (scales_cpu->dtype () == DataType::F32) {
186- // 输入是 FP32,才需要转换
187- // spdlog::info("Scales is FP32, converting to FP16");
188170 std::vector<float > scales_fp32 (num_elements);
189171 std::memcpy (scales_fp32.data (), raw_data, num_elements * sizeof (float ));
190172 scales_fp16 = ::float_to_fp16_bits (scales_fp32);
@@ -201,19 +183,15 @@ class GPTQ_QY : public BaseQuantization {
201183 target_device);
202184 }
203185
204- // --------------------------------------------------------------------
205- // 4. g_idx 处理
206- // --------------------------------------------------------------------
207186 if (g_idx->numel () > 0 ) {
208187 g_idx_ = g_idx->to (target_device);
209188 }
210189
211190 converted_ = true ;
212- // spdlog::info("GPTQ_QY conversion completed successfully");
213191 }
214192
215193 void release_buffers () {
216- converted_weight_ = Tensor (); // 赋值为空 Tensor,释放显存
194+ converted_weight_ = Tensor ();
217195 converted_zeros_ = Tensor ();
218196 converted_scales_ = Tensor ();
219197 g_idx_ = Tensor ();
@@ -226,16 +204,12 @@ class GPTQ_QY : public BaseQuantization {
226204 return ;
227205 }
228206
229- // 1. 执行转换(只读传入的原始数据)
230207 convert_from_gptq_w4a16 (weight, zeros, scales, g_idx, target_device);
231208
232- // 2. 转移所有权(Move 语义:converted_weight_ 的 impl_ 指针会置为 nullptr)
233- // 原 weight/zeros/scales 持有的旧 shared_ptr 会被自动析构,释放显存
234209 weight = std::move (converted_weight_);
235210 zeros = std::move (converted_zeros_);
236211 scales = std::move (converted_scales_);
237212
238- // 3. 清理内部状态
239213 converted_ = false ;
240214 spdlog::debug (" GPTQ_QY: Ownership transferred, internal buffers cleared." );
241215 }
@@ -343,4 +317,4 @@ class GPTQ_QY : public BaseQuantization {
343317 bool converted_ = false ;
344318};
345319
346- } // namespace infinicore::quantization
320+ } // namespace infinicore::quantization
0 commit comments