Tencent · yghstill · Dec 22, 2025 · Dec 19, 2025 · Dec 19, 2025 · Dec 19, 2025
diff --git a/angelslim/compressor/quant/modules/awq/search.py b/angelslim/compressor/quant/modules/awq/search.py
@@ -63,6 +63,7 @@ def search_by_block(
         best_ratio = -1
         best_scales = None
         dev = get_best_device()
+        block.to(dev)
         with torch.no_grad():
             if cache is not None:
                 origin_out = torch.ones_like(act)
@@ -97,7 +98,8 @@ def search_by_block(
                     if layer.weight.dtype == torch.float8_e4m3fn:
                         w = weight_dequant(layer.weight, layer.weight_scale_inv)
                         layer.weight.data = w
-                    layer.weight.mul_(scales.view(1, -1))
+                    # Fix for low_memory mode: ensure scales is on same device as weights
+                    layer.weight.mul_(scales.to(layer.weight.device).view(1, -1))
                     if type(layer) in self.observer_layer_classes:
                         quant_dequant_weight = pseudo_quantize_tensor(
                             layer.weight,
@@ -130,7 +132,8 @@ def search_by_block(
                     best_scales = scales
 
                 for layer, w in zip(layers, org_w):
-                    layer.weight.data = w.to(act.device)
+                    # Fix for low_memory mode: restore weights to GPU, not act.device
+                    layer.weight.data = w.to(dev)
 
         origin_out = origin_out.detach().cpu()
         new_out = w.detach().cpu()