fix lint

jiqing-feng · jiqing-feng · commit aefc6bf44fb1 · 2026-03-18T10:35:06.000Z
Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
diff --git a/bitsandbytes/backends/cpu/ops.py b/bitsandbytes/backends/cpu/ops.py
@@ -322,7 +322,10 @@ def _(
 
 
 def _compute_update_norm_and_scale(
-    update: torch.Tensor, unorm_vec: Optional[torch.Tensor], max_unorm: float, param_norm: float,
+    update: torch.Tensor,
+    unorm_vec: Optional[torch.Tensor],
+    max_unorm: float,
+    param_norm: float,
 ) -> float:
     """Compute trust-ratio scaling factor for LAMB/LARS and store update norm."""
     if max_unorm <= 0.0:
@@ -446,26 +449,35 @@ def _optimizer_update_32bit_cpu(
 
 
 @torch.no_grad()
-def _dequant_blockwise_fp32_direct(A_uint8: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor,
-                                    blocksize: int) -> torch.Tensor:
+def _dequant_blockwise_fp32_direct(
+    A_uint8: torch.Tensor, absmax: torch.Tensor, code: torch.Tensor, blocksize: int
+) -> torch.Tensor:
     """Dequantize blockwise via direct C lib call, avoiding torch.ops dispatch overhead."""
     n = A_uint8.numel()
     out = torch.empty(n, dtype=torch.float32, device=A_uint8.device)
     lib.cdequantize_blockwise_cpu_fp32(
-        get_ptr(code), get_ptr(A_uint8.reshape(-1)), get_ptr(absmax),
-        get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(n),
+        get_ptr(code),
+        get_ptr(A_uint8.reshape(-1)),
+        get_ptr(absmax),
+        get_ptr(out),
+        ct.c_longlong(blocksize),
+        ct.c_longlong(n),
     )
     return out.reshape(A_uint8.shape)
 
 
-def _quant_blockwise_fp32_direct(A_fp32: torch.Tensor, code: torch.Tensor,
-                                  absmax_out: torch.Tensor, out_uint8: torch.Tensor,
-                                  blocksize: int) -> None:
+def _quant_blockwise_fp32_direct(
+    A_fp32: torch.Tensor, code: torch.Tensor, absmax_out: torch.Tensor, out_uint8: torch.Tensor, blocksize: int
+) -> None:
     """Quantize blockwise via direct C lib call, writing into existing buffers (zero-alloc)."""
     n = A_fp32.numel()
     lib.cquantize_blockwise_cpu_fp32(
-        get_ptr(code), get_ptr(A_fp32.reshape(-1)), get_ptr(absmax_out),
-        get_ptr(out_uint8.reshape(-1)), ct.c_longlong(blocksize), ct.c_longlong(n),
+        get_ptr(code),
+        get_ptr(A_fp32.reshape(-1)),
+        get_ptr(absmax_out),
+        get_ptr(out_uint8.reshape(-1)),
+        ct.c_longlong(blocksize),
+        ct.c_longlong(n),
     )
 
 
diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py
@@ -2,20 +2,20 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
-import logging
 from collections import abc as container_abcs, defaultdict
 from copy import deepcopy
 from itertools import chain
+import logging
 from typing import Optional
 import warnings
 
 import torch
 
-logger = logging.getLogger(__name__)
-
 import bitsandbytes.functional as F
 from bitsandbytes.utils import sync_gpu
 
+logger = logging.getLogger(__name__)
+
 
 class MockArgs:
     def __init__(self, initial_data):
@@ -375,8 +375,7 @@ def get_state_buffer(self, p, dtype=torch.float32):
         if p.device.type == "cpu":
             if self.is_paged and not getattr(self, "_cpu_paged_warned", False):
                 warnings.warn(
-                    "Paged optimizers are not supported on CPU. "
-                    "Falling back to non-paged optimizer behavior.",
+                    "Paged optimizers are not supported on CPU. Falling back to non-paged optimizer behavior.",
                     stacklevel=2,
                 )
                 self._cpu_paged_warned = True
diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
@@ -252,11 +252,8 @@ struct LUTCache {
         float fp[4];
         compute_fingerprint(code, fp);
         for (int i = 0; i < kLUTCacheSlots; ++i) {
-            if (cached_codes[i] == code &&
-                cached_fingerprints[i][0] == fp[0] &&
-                cached_fingerprints[i][1] == fp[1] &&
-                cached_fingerprints[i][2] == fp[2] &&
-                cached_fingerprints[i][3] == fp[3]) {
+            if (cached_codes[i] == code && cached_fingerprints[i][0] == fp[0] && cached_fingerprints[i][1] == fp[1] &&
+                cached_fingerprints[i][2] == fp[2] && cached_fingerprints[i][3] == fp[3]) {
                 return luts[i];
             }
         }
diff --git a/csrc/cpu_ops.h b/csrc/cpu_ops.h
@@ -196,7 +196,7 @@ static inline float fp16_to_float(uint16_t h) {
 
     if (exp == 0) {
         if (mant == 0) {
-            bits = sign << 31;  // zero
+            bits = sign << 31; // zero
         } else {
             // subnormal fp16 -> normal fp32
             exp = 1;
@@ -208,7 +208,7 @@ static inline float fp16_to_float(uint16_t h) {
             bits = (sign << 31) | ((exp + 127 - 15) << 23) | (mant << 13);
         }
     } else if (exp == 0x1F) {
-        bits = (sign << 31) | (0xFF << 23) | (mant ? (mant << 13) : 0);  // Inf or NaN
+        bits = (sign << 31) | (0xFF << 23) | (mant ? (mant << 13) : 0); // Inf or NaN
     } else {
         bits = (sign << 31) | ((exp + 127 - 15) << 23) | (mant << 13);
     }
diff --git a/examples/cpu/cpu_training.py b/examples/cpu/cpu_training.py
@@ -164,7 +164,10 @@ def run_single(args):
 
     ds = prepare_data(tokenizer, args.dataset, args.max_length)
     dataloader = torch.utils.data.DataLoader(
-        ds, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn,
+        ds,
+        batch_size=args.batch_size,
+        shuffle=True,
+        collate_fn=collate_fn,
     )
 
     optimizer = create_optimizer(model, args.optimizer, args.lr)
@@ -198,7 +201,10 @@ def run_compare(args):
 
     ds = prepare_data(tokenizer, args.dataset, args.max_length, num_samples=100)
     dataloader = torch.utils.data.DataLoader(
-        ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn,
+        ds,
+        batch_size=args.batch_size,
+        shuffle=False,
+        collate_fn=collate_fn,
     )
 
     results = {}
@@ -362,4 +368,3 @@ def main():
 # Training runtime: 3.2s
 # Steps/sec: 9.5
 # Optimizer: bnb.optim.adamw8bit | Dtype: bf16
-
diff --git a/tests/test_optim.py b/tests/test_optim.py
@@ -425,8 +425,12 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name, device):
         else:
             assert err.mean() < 0.00006
             # Lion on CPU fp16 has slightly higher relative error due to sign-based updates at boundary
-            relerr_thr = 0.00062 if (device == "cpu" and optim_name == "lion8bit_blockwise" and gtype == torch.float16) else 0.0006
-            assert relerr.mean() < relerr_thr
+            relerr_the = (
+                0.00062
+                if (device == "cpu" and optim_name == "lion8bit_blockwise" and gtype == torch.float16)
+                else 0.0006
+            )
+            assert relerr.mean() < relerr_the
 
         errors.append(err.mean().item())
         relerrors.append(relerr.mean().item())

Original file line number	Diff line number	Diff line change
`@@ -252,11 +252,8 @@ struct LUTCache {`
`252`	`252`	`float fp[4];`
`253`	`253`	`compute_fingerprint(code, fp);`
`254`	`254`	`for (int i = 0; i < kLUTCacheSlots; ++i) {`
`255`		`- if (cached_codes[i] == code &&`
`256`		`- cached_fingerprints[i][0] == fp[0] &&`
`257`		`- cached_fingerprints[i][1] == fp[1] &&`
`258`		`- cached_fingerprints[i][2] == fp[2] &&`
`259`		`- cached_fingerprints[i][3] == fp[3]) {`
	`255`	`+ if (cached_codes[i] == code && cached_fingerprints[i][0] == fp[0] && cached_fingerprints[i][1] == fp[1] &&`
	`256`	`+ cached_fingerprints[i][2] == fp[2] && cached_fingerprints[i][3] == fp[3]) {`
`260`	`257`	`return luts[i];`
`261`	`258`	`}`
`262`	`259`	`}`