
Commit 52d67f2

LemonPi and claude committed
Optimize math_utils for performance and fix sqrtm numpy 2.0 crash
- replace_nan_and_inf: use torch.nan_to_num (~3.9x faster)
- angular_diff_batch: use modulo wrapping (~1.4x faster, fixes correctness for large diffs)
- angle_between_stable: use broadcasting instead of .repeat() (~1.15x faster)
- sqrtm: replace removed np.float_ alias with np.float64
- bench_compile: remove lambda wrappers that broke torch.compile tracing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 5dc5a71 commit 52d67f2

4 files changed

Lines changed: 40 additions & 35 deletions
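
The headline numbers in the commit message come from the repo's benchmark harness; a rough spot-check of the replace_nan_and_inf claim is possible with a plain timing loop. A minimal sketch (the tensor shape matches the benchmark setup below, but the timing loop is an illustrative assumption, not the project's bench() helper):

import time
import torch

def replace_masked(a, replacement=0):
    # old approach: three separate boolean-mask writes
    a[torch.isnan(a)] = replacement
    a[a == float('inf')] = replacement
    a[a == -float('inf')] = replacement
    return a

def replace_fused(a, replacement=0):
    # new approach: one fused kernel, written in place via out=
    torch.nan_to_num(a, nan=replacement, posinf=replacement, neginf=replacement, out=a)
    return a

x = torch.randn(10000, 100)
x[torch.rand_like(x) < 0.1] = float('nan')

for fn in (replace_masked, replace_fused):
    start = time.perf_counter()
    for _ in range(100):
        fn(x.clone(), 0)  # fresh clone so each call sees dirty input
    print(f"{fn.__name__}: {(time.perf_counter() - start) * 10:.3f} ms/iter")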


benchmarks/bench_compile.py

Lines changed: 22 additions & 21 deletions
@@ -84,69 +84,73 @@ def run_benchmarks(device_str):
     x_nan = torch.randn(10000, 100, device=device)
     mask = torch.rand_like(x_nan) < 0.1
     x_nan[mask] = float('nan')
-    benchmarks['replace_nan_and_inf'] = (math_utils.replace_nan_and_inf, (x_nan.clone(), 0))
+    benchmarks['replace_nan_and_inf'] = (math_utils.replace_nan_and_inf, (x_nan.clone(), 0), True)

     # angular_diff_batch
     a_ang = torch.randn(100000, device=device)
     b_ang = torch.randn(100000, device=device)
-    benchmarks['angular_diff_batch'] = (math_utils.angular_diff_batch, (a_ang, b_ang))
+    benchmarks['angular_diff_batch'] = (math_utils.angular_diff_batch, (a_ang, b_ang), False)

     # angle_between_stable
     u_abs = torch.randn(200, 50, device=device)
     v_abs = torch.randn(150, 50, device=device)
-    benchmarks['angle_between_stable'] = (math_utils.angle_between_stable, (u_abs, v_abs))
+    benchmarks['angle_between_stable'] = (math_utils.angle_between_stable, (u_abs, v_abs), False)

     # cos_sim_pairwise
     x1_cos = torch.randn(500, 50, device=device)
     x2_cos = torch.randn(300, 50, device=device)
-    benchmarks['cos_sim_pairwise'] = (math_utils.cos_sim_pairwise, (x1_cos, x2_cos))
+    benchmarks['cos_sim_pairwise'] = (math_utils.cos_sim_pairwise, (x1_cos, x2_cos), False)

     # batch_batch_product
     X_bbp = torch.randn(10000, 20, device=device)
     A_bbp = torch.randn(10000, 20, 20, device=device)
-    benchmarks['batch_batch_product'] = (linalg.batch_batch_product, (X_bbp, A_bbp))
+    benchmarks['batch_batch_product'] = (linalg.batch_batch_product, (X_bbp, A_bbp), False)

     # batch_quadratic_product
     X_bqp = torch.randn(10000, 20, device=device)
     A_bqp = make_psd(20, device)
-    benchmarks['batch_quadratic_product'] = (linalg.batch_quadratic_product, (X_bqp, A_bqp))
+    benchmarks['batch_quadratic_product'] = (linalg.batch_quadratic_product, (X_bqp, A_bqp), False)

     # batch_outer_product
     u_bop = torch.randn(10000, 20, device=device)
     v_bop = torch.randn(10000, 20, device=device)
-    benchmarks['batch_outer_product'] = (linalg.batch_outer_product, (u_bop, v_bop))
+    benchmarks['batch_outer_product'] = (linalg.batch_outer_product, (u_bop, v_bop), False)

     # squeeze_n
     x_sq = torch.randn(1, 1, 1, 1000, 50, device=device)
-    benchmarks['squeeze_n'] = (lambda x: tensor_utils.squeeze_n(x, 3), (x_sq,))
+    benchmarks['squeeze_n'] = (tensor_utils.squeeze_n, (x_sq, 3), False)

     # MinMaxScaler.transform
     x_mm = torch.randn(10000, 50, device=device)
     scaler = preprocess.MinMaxScaler()
     scaler.fit(x_mm)
-    benchmarks['MinMaxScaler.transform'] = (scaler.transform, (x_mm,))
+    benchmarks['MinMaxScaler.transform'] = (scaler.transform, (x_mm,), False)

     # SoftKNN.forward
     x_knn = torch.randn(200, 10, device=device)
     knn = softknn.SoftKNN(min_k=20)
-    benchmarks['SoftKNN.forward'] = (knn, (x_knn,))
+    benchmarks['SoftKNN.forward'] = (knn, (x_knn,), False)

     # sqrtm (CPU only due to .numpy())
     if device_str == 'cpu':
         A_sqrtm = make_psd(50, device)
-        benchmarks['sqrtm'] = (linalg.sqrtm, (A_sqrtm,))
+        benchmarks['sqrtm'] = (linalg.sqrtm, (A_sqrtm,), False)

     # --- Run benchmarks ---
     print(f"\n{'Function':<30} {'Eager (ms)':>12} {'Compile (ms)':>14} {'Speedup':>10} {'Compile OK':>12}")
     print("-" * 80)

-    for name, (fn, args) in benchmarks.items():
+    for name, (fn, args, needs_clone) in benchmarks.items():
         # Eager benchmark
-        # For replace_nan_and_inf, need fresh clone each call
-        if name == 'replace_nan_and_inf':
-            def eager_fn(x_template=x_nan):
-                return math_utils.replace_nan_and_inf(x_template.clone(), 0)
-            eager_ms = bench(eager_fn, warmup=5, repeats=20, device=device_str)
+        if needs_clone:
+            # For in-place functions, clone first arg each call
+            template = args[0]
+            rest_args = args[1:]
+
+            def cloning_fn(*a, _fn=fn, _tpl=template, _rest=rest_args):
+                return _fn(_tpl.clone(), *_rest)
+
+            eager_ms = bench(cloning_fn, warmup=5, repeats=20, device=device_str)
         else:
             try:
                 eager_ms = bench(fn, *args, device=device_str)
@@ -157,10 +161,7 @@ def eager_fn(x_template=x_nan):
                 continue

         # Compile benchmark
-        if name == 'replace_nan_and_inf':
-            compile_result = try_compile_bench(eager_fn, device=device_str)
-        else:
-            compile_result = try_compile_bench(fn, *args, device=device_str)
+        compile_result = try_compile_bench(fn, *args, device=device_str)

         if len(compile_result) == 2:
             compile_ms, compile_ok = compile_result
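
The (fn, args, needs_clone) triple replaces the old special-casing by name. Two details are worth noting: the clone keeps in-place functions from seeing already-cleaned input on repeat calls, and binding fn/template through default arguments freezes them per entry, sidestepping Python's late-binding closures. A standalone sketch of the same pattern (function names here are illustrative, not from the repo):

import torch

def add(a, b):
    return a + b

def fill_nan_(a, value):
    # in-place: mutates its first argument
    return torch.nan_to_num(a, nan=value, out=a)

x = torch.randn(4)
x[0] = float('nan')

# each entry: (callable, args, needs_clone)
benchmarks = {
    'add': (add, (torch.randn(4), torch.randn(4)), False),
    'fill_nan_': (fill_nan_, (x, 0.0), True),
}

for name, (fn, args, needs_clone) in benchmarks.items():
    if needs_clone:
        template, rest = args[0], args[1:]

        # default-argument binding freezes fn/template for this entry
        def call(_fn=fn, _tpl=template, _rest=rest):
            return _fn(_tpl.clone(), *_rest)
    else:
        def call(_fn=fn, _args=args):
            return _fn(*_args)

    print(name, call())

Per the commit message, dropping the lambda wrappers also matters for the compile column: try_compile_bench now always receives the plain function plus its args, which torch.compile can trace.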

src/arm_pytorch_utilities/linalg.py

Lines changed: 3 additions & 3 deletions
@@ -115,7 +115,7 @@ class MatrixSquareRoot(Function):

     @staticmethod
     def forward(ctx, input):
-        m = input.detach().numpy().astype(np.float_)
+        m = input.detach().numpy().astype(np.float64)
         sqrtm = torch.from_numpy(scipy.linalg.sqrtm(m).real).type_as(input)
         ctx.save_for_backward(sqrtm)
         return sqrtm
@@ -125,8 +125,8 @@ def backward(ctx, grad_output):
         grad_input = None
         if ctx.needs_input_grad[0]:
             sqrtm, = ctx.saved_tensors
-            sqrtm = sqrtm.data.numpy().astype(np.float_)
-            gm = grad_output.data.numpy().astype(np.float_)
+            sqrtm = sqrtm.data.numpy().astype(np.float64)
+            gm = grad_output.data.numpy().astype(np.float64)

             # Given a positive semi-definite matrix X,
             # since X = X^{1/2}X^{1/2}, we can compute the gradient of the
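
NumPy 2.0 removed the np.float_ alias (it was identical to np.float64), so naming the dtype explicitly is the drop-in replacement. A round-trip sanity check of the forward path, as a sketch (the PSD construction is an assumption standing in for make_psd):

import numpy as np
import scipy.linalg
import torch

# build a well-conditioned PSD matrix (stand-in for make_psd)
A = torch.randn(50, 50, dtype=torch.float64)
A = A @ A.T + 50 * torch.eye(50, dtype=torch.float64)

# same cast as MatrixSquareRoot.forward, with the NumPy 2.0-safe dtype
m = A.detach().numpy().astype(np.float64)
s = torch.from_numpy(scipy.linalg.sqrtm(m).real).type_as(A)

# sqrt(A) @ sqrt(A) should recover A
assert torch.allclose(s @ s, A, atol=1e-6)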

src/arm_pytorch_utilities/math_utils.py

Lines changed: 4 additions & 8 deletions
@@ -11,9 +11,7 @@ def clip(a, min_val, max_val):

 def replace_nan_and_inf(a, replacement=0):
     """Replaces nan,inf,-inf values with replacement value in place"""
-    a[torch.isnan(a)] = replacement
-    a[a == float('inf')] = replacement
-    a[a == -float('inf')] = replacement
+    torch.nan_to_num(a, nan=replacement, posinf=replacement, neginf=replacement, out=a)
     return a


@@ -66,8 +64,8 @@ def angle_between_stable(u: torch.tensor, v: torch.tensor):
     dim = -1
     u_norm = u.norm(dim=dim, keepdim=True)
     v_norm = v.norm(dim=dim, keepdim=True)
-    uv = u.unsqueeze(1).repeat(1, v.shape[0], 1) * v_norm
-    vu = v.unsqueeze(0).repeat(u.shape[0], 1, 1) * u_norm.unsqueeze(1)
+    uv = u.unsqueeze(1) * v_norm.transpose(-2, -1).unsqueeze(-1)
+    vu = v.unsqueeze(0) * u_norm.unsqueeze(1)
     num = (uv - vu).norm(dim=dim)
     den = (uv + vu).norm(dim=dim)
     return 2 * torch.atan2(num, den)
@@ -104,9 +102,7 @@ def angular_diff(a, b):
 def angular_diff_batch(a, b):
     """Angle difference from b to a (a - b)"""
     d = a - b
-    d[d > math.pi] -= 2 * math.pi
-    d[d < -math.pi] += 2 * math.pi
-    return d
+    return ((d + math.pi) % (2 * math.pi)) - math.pi


 def angle_normalize(a):
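
The modulo form is where the correctness fix in the commit title lives: the old pair of masked shifts wraps at most once, so it only maps differences smaller than one full turn into (-pi, pi]. A quick equivalence check, as a sketch (wrap_once/wrap_mod are illustrative names for the old and new bodies):

import math
import torch

def wrap_once(d):
    # old body: shifts by at most one turn
    d = d.clone()
    d[d > math.pi] -= 2 * math.pi
    d[d < -math.pi] += 2 * math.pi
    return d

def wrap_mod(d):
    # new body: correct for any magnitude
    return ((d + math.pi) % (2 * math.pi)) - math.pi

d_small = (torch.rand(1000) - 0.5) * 2 * math.pi  # within one wrap
d_large = torch.tensor([10.0, -10.0, 20.0])       # several turns out

assert torch.allclose(wrap_once(d_small), wrap_mod(d_small), atol=1e-6)
print(wrap_mod(d_large))  # ~[-2.566, 2.566, 1.150], all in (-pi, pi]

The angle_between_stable change is a pure memory optimization: broadcasting produces the same (N_u, N_v) pairwise grid that .repeat() did, without materializing the repeated copies.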

tests/test_math.py

Lines changed: 11 additions & 3 deletions
@@ -130,15 +130,23 @@ def test_angular_diff_batch():
     assert (result > -math.pi).all()
     assert (result <= math.pi + 1e-6).all()

-    # Compare against element-wise angular_diff
+    # Compare against element-wise angular_diff with inputs where |a-b| < 2*pi
+    # (scalar angular_diff only wraps once, so it's only correct in that range)
     N = 50
-    a = (torch.rand(N) - 0.5) * 4 * math.pi
-    b = (torch.rand(N) - 0.5) * 4 * math.pi
+    a = (torch.rand(N) - 0.5) * 2 * math.pi
+    b = (torch.rand(N) - 0.5) * 2 * math.pi
     batch_result = math_utils.angular_diff_batch(a, b)
     for i in range(N):
         scalar_result = math_utils.angular_diff(a[i].item(), b[i].item())
         assert abs(batch_result[i].item() - scalar_result) < 1e-5

+    # Verify batch version handles large differences correctly (beyond single-wrap range)
+    a_large = torch.tensor([10.0, -10.0, 20.0])
+    b_large = torch.tensor([0.0, 0.0, 0.0])
+    result_large = math_utils.angular_diff_batch(a_large, b_large)
+    assert (result_large > -math.pi).all()
+    assert (result_large <= math.pi + 1e-6).all()
+

 def test_get_bounds():
     assert math_utils.get_bounds(None, 5) == (-5, 5)
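
The narrowed comparison range plus the new large-angle block pin down the asymmetry the comment describes. A quick way to see it interactively, as a sketch (the import path is an assumption mirroring the tests, and the printed values follow from the single-wrap behavior the comment states):

import torch
from arm_pytorch_utilities import math_utils

# the scalar helper wraps once, so 20 rad stays out of range;
# the batch version reduces it fully
print(math_utils.angular_diff(20.0, 0.0))                  # ~13.72, outside (-pi, pi]
print(math_utils.angular_diff_batch(torch.tensor([20.0]),
                                    torch.tensor([0.0])))  # ~1.15, inside (-pi, pi]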
