@@ -175,7 +175,8 @@ class MxRequantizer[T <: Data](
// Mark quantized_buffer and should_compute with dontTouch so they are kept in the
// generated design, then give should_compute a default of false; any later
// assignment in the class body overrides this default.
175175 dontTouch(quantized_buffer)
176176 dontTouch( should_compute)
177177 should_compute := false .B
178-
// Two-cycle accumulation registers for FP4 / FP6.
//   quant_half_counter: Bool register, reset to false.
//   first_half_buf:     128-bit register, reset to 0.
// NOTE(review): declared here, ahead of the gpu_addr update logic that reads
// quant_half_counter — presumably the reason they were moved up; confirm.
178+ val quant_half_counter = RegInit (false .B )
179+ val first_half_buf = RegInit (0 .U (128 .W ))
179180
// 5-bit wire (default 0) driven with 1 + exp_bits + mant_bits, using
// width-expanding adds (+&) for the intermediate sum.
// NOTE(review): the leading 1 is presumably the sign bit — confirm.
180181 val total_bits_per_element = WireDefault (0 .U (5 .W ))
181182 total_bits_per_element := 1 .U +& exp_bits +& mant_bits
@@ -213,7 +214,13 @@ class MxRequantizer[T <: Data](
213214 data_buffer_counter := ~ data_buffer_counter
214215 }
215216
216-
// gpu_addr: 32-bit register (reset 0) whose value is forwarded as
// pipe_in.bits.gpu_addr for GPU requests.
// It is updated only when io.requant_data_in_gpu fires while data_buffer_counter is 0;
// in every other cycle it keeps its previous value.
// NOTE(review): the right shifts presumably rescale the incoming address to the
// narrower output element size — confirm against the address map.
217+ val gpu_addr = RegInit (0 .U ((32 ).W ))
// 8 bits per element: latch the incoming address shifted right by 1.
218+ when(io.requant_data_in_gpu.fire && data_buffer_counter === 0 .U && total_bits_per_element === 8 .U ){
219+ gpu_addr := (io.requant_data_in_gpu.bits.address >> 1 )
// Any other element width, and only while quant_half_counter is clear: latch the
// address shifted right by 2. (The `=/= 8 .U` term is already implied by the first
// branch not having been taken.)
220+ }.elsewhen(io.requant_data_in_gpu.fire && data_buffer_counter === 0 .U && total_bits_per_element =/= 8 .U && ! quant_half_counter){
221+ gpu_addr := (io.requant_data_in_gpu.bits.address >> 2 )
222+ }
223+
217224 when(io.mxacc_req.mx_data_in.fire) {
218225 pipe_in.valid := true .B
219226 pipe_in.bits.mx_mode := io.mxacc_req.mx_mode
@@ -232,9 +239,11 @@ class MxRequantizer[T <: Data](
232239 pipe_in.bits.out.full_mx_data_out := VecInit (combined.reverse).asTypeOf(half_acc_row_t)
233240 pipe_in.bits.out.fromDMA := false .B
234241 pipe_in.bits.is_gpu := true .B
235- pipe_in.bits.gpu_addr := io.requant_data_in_gpu.bits.address
242+ pipe_in.bits.gpu_addr := gpu_addr
236243 }
237244
245+
246+
238247 // for (i <- 1 until pipelineLatency) {
239248 // pipelined_out(i) := pipelined_out(i-1)
240249 // }
@@ -276,8 +285,7 @@ class MxRequantizer[T <: Data](
276285 // }
277286
278287 // The two-cycle accumulation registers for FP4 / FP6 (quant_half_counter, first_half_buf) are declared earlier in the class body.
279- val quant_half_counter = RegInit (false .B )
280- val first_half_buf = RegInit (0 .U (128 .W ))
288+
281289
282290
// Flatten the quantLut projected_data elements into one UInt. The sequence is reversed
// before Cat, so element 0 ends up in the least-significant position.
283291 val fp6_lut_out = Cat (quantLut.io.projected_data.bits.reverse)
@@ -369,7 +377,7 @@ class MxRequantizer[T <: Data](
369377
370378
// Ready signal for final_pipe_out:
//   - is_gpu entries are accepted when io.requant_data_out is ready, or
//     unconditionally when the entry is flagged is_garbage (the term added by the
//     `+` line; the `-` line is the previous form without it);
//   - all other entries follow io.mxacc_req.mx_data_out.ready.
371379 final_pipe_out.ready := Mux (final_pipe_out.bits.is_gpu,
372- io.requant_data_out.ready,
380+ io.requant_data_out.ready || final_pipe_out.bits.out.is_garbage ,
373381 io.mxacc_req.mx_data_out.ready)
// oldest_pipe_out is driven with the same ready value.
374382 oldest_pipe_out.ready := final_pipe_out.ready
375383
@@ -378,33 +386,33 @@ class MxRequantizer[T <: Data](
// helding_flag: register (reset 0) that records that a GPU result is being held.
//   - cleared whenever io.requant_data_out fires;
//   - otherwise set to 1 while final_pipe_out presents a valid, non-garbage GPU
//     entry and io.requant_data_out is not ready.
// NOTE(review): gpu_out_held is declared outside this view; it is written below in
// the not-ready case and read back when helding_flag is 1 — presumably a register.
378386 val helding_flag = RegInit (0 .U )
379387 when (io.requant_data_out.fire){
380388 helding_flag := 0 .U
381- }.elsewhen(final_pipe_out.bits.is_gpu && final_pipe_out.valid && ! io.requant_data_out.ready){
389+ }.elsewhen(final_pipe_out.bits.is_gpu && final_pipe_out.valid && ! io.requant_data_out.ready && ! final_pipe_out.bits.out.is_garbage ){
382390 helding_flag := 1 .U
383391 }
384392

// Drive io.requant_data_out. dataType is RequantizerDataType(format_reg) in every
// branch. In the `+` lines the address is final_pipe_out.bits.gpu_addr passed through
// unchanged; the `-` lines are the previous form, which shifted it right by 1 when
// format_reg was non-zero.
// Outer condition: a valid GPU entry at final_pipe_out (the `+` line also requires
// that it is not flagged is_garbage).
385- when(final_pipe_out.bits.is_gpu && final_pipe_out.valid){
393+ when(final_pipe_out.bits.is_gpu && final_pipe_out.valid && ! final_pipe_out.bits.out.is_garbage ){
// Held result and the consumer is ready: present gpu_out_held.
386394 when(helding_flag === 1 .U && io.requant_data_out.ready){
387395 io.requant_data_out.bits.data := gpu_out_held
388396 io.requant_data_out.valid := true .B
389397 io.requant_data_out.bits.dataType := RequantizerDataType (format_reg)
390- io.requant_data_out.bits.address := Mux (format_reg === 0 . U , final_pipe_out.bits.gpu_addr, final_pipe_out.bits.gpu_addr >> 1 )
398+ io.requant_data_out.bits.address := final_pipe_out.bits.gpu_addr
// Nothing held and the consumer is ready: present the pipeline output directly.
391399 }.elsewhen(helding_flag === 0 .U && io.requant_data_out.ready){
392400 io.requant_data_out.bits.data := final_pipe_out.bits.out.quant_mx_data_out.asUInt
393401 io.requant_data_out.valid := true .B
394402 io.requant_data_out.bits.dataType := RequantizerDataType (format_reg)
395- io.requant_data_out.bits.address := Mux (format_reg === 0 . U , final_pipe_out.bits.gpu_addr, final_pipe_out.bits.gpu_addr >> 1 )
403+ io.requant_data_out.bits.address := final_pipe_out.bits.gpu_addr
// Remaining case (consumer not ready): capture the pipeline output into gpu_out_held
// and keep valid low with data 0.
396404 }.otherwise{
397405 gpu_out_held := final_pipe_out.bits.out.quant_mx_data_out.asUInt
398406 io.requant_data_out.bits.data := 0 .U
399407 io.requant_data_out.valid := false .B
400408 io.requant_data_out.bits.dataType := RequantizerDataType (format_reg)
401- io.requant_data_out.bits.address := Mux (format_reg === 0 . U , final_pipe_out.bits.gpu_addr, final_pipe_out.bits.gpu_addr >> 1 )
409+ io.requant_data_out.bits.address := final_pipe_out.bits.gpu_addr
402410 }
// Outer condition not met: output is idle (valid low, data 0).
403411 }.otherwise{
404412 io.requant_data_out.bits.data := 0 .U
405413 io.requant_data_out.valid := false .B
406414 io.requant_data_out.bits.dataType := RequantizerDataType (format_reg)
407- io.requant_data_out.bits.address := Mux (format_reg === 0 . U , final_pipe_out.bits.gpu_addr, final_pipe_out.bits.gpu_addr >> 1 )
415+ io.requant_data_out.bits.address := final_pipe_out.bits.gpu_addr
408416 }
409417
410418 should_compute := false .B
0 commit comments