From 64836bca72052adfcf1937b9b876692bf1c346ff Mon Sep 17 00:00:00 2001
From: Alain Denzler <adenzler@nvidia.com>
Date: Wed, 13 May 2026 17:18:30 +0200
Subject: [PATCH] Use upper Cholesky factors

---
 mujoco_warp/_src/block_cholesky.py | 50 ++++++++---------
 mujoco_warp/_src/forward.py        |  4 +-
 mujoco_warp/_src/io.py             | 20 +++----
 mujoco_warp/_src/smooth.py         | 20 +++----
 mujoco_warp/_src/smooth_test.py    | 15 +++--
 mujoco_warp/_src/solver.py         | 90 +++++++++++++++---------------
 mujoco_warp/_src/solver_test.py    | 16 ++++++
 mujoco_warp/_src/support_test.py   | 20 +++----
 mujoco_warp/_src/types.py          | 14 +++--
 pyproject.toml                     |  2 +-
 uv.lock                            | 12 ++--
 11 files changed, 146 insertions(+), 117 deletions(-)

diff --git a/mujoco_warp/_src/block_cholesky.py b/mujoco_warp/_src/block_cholesky.py
index 855f9686c..dbef87c64 100644
--- a/mujoco_warp/_src/block_cholesky.py
+++ b/mujoco_warp/_src/block_cholesky.py
@@ -26,11 +26,11 @@ def blocked_cholesky_func(
     A: wp.array2d[float],
     matrix_size: int,
     # Out:
-    L: wp.array2d[float],
+    U: wp.array2d[float],
   ):
     """Computes the Cholesky factorization of a symmetric positive definite matrix A in blocks.
 
-    It returns a lower-triangular matrix L such that A = L L^T.
+    It returns an upper-triangular matrix U such that A = U^T U.
     """
     # Process the matrix in blocks along its leading dimension.
     for k in range(0, matrix_size, block_size):
@@ -41,24 +41,24 @@ def blocked_cholesky_func(
       A_kk_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(k, k), storage="shared")
 
       for j in range(0, k, block_size):
-        L_block = wp.tile_load(L, shape=(block_size, block_size), offset=(k, j), storage="shared")
-        wp.tile_matmul(L_block, wp.tile_transpose(L_block), A_kk_tile, alpha=-1.0)
+        U_block = wp.tile_load(U, shape=(block_size, block_size), offset=(j, k), storage="shared")
+        wp.tile_matmul(wp.tile_transpose(U_block), U_block, A_kk_tile, alpha=-1.0)
 
       # Compute the Cholesky factorization for the block
-      wp.tile_cholesky_inplace(A_kk_tile)
-      wp.tile_store(L, A_kk_tile, offset=(k, k))
+      wp.tile_cholesky_inplace(A_kk_tile, fill_mode="upper")
+      wp.tile_store(U, A_kk_tile, offset=(k, k))
 
-      # Process the blocks below the current block
+      # Process the blocks to the right of the current block
       for i in range(end, matrix_size, block_size):
-        A_ik_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(i, k), storage="shared")
+        A_ki_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(k, i), storage="shared")
 
         for j in range(0, k, block_size):
-          L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i, j), storage="shared")
-          L_2_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(k, j), storage="shared")
-          wp.tile_matmul(L_tile, wp.tile_transpose(L_2_tile), A_ik_tile, alpha=-1.0)
+          U_jk_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(j, k), storage="shared")
+          U_ji_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(j, i), storage="shared")
+          wp.tile_matmul(wp.tile_transpose(U_jk_tile), U_ji_tile, A_ki_tile, alpha=-1.0)
 
-        wp.tile_lower_solve_inplace(A_kk_tile, wp.tile_transpose(A_ik_tile))
-        wp.tile_store(L, A_ik_tile, offset=(i, k))
+        wp.tile_lower_solve_inplace(wp.tile_transpose(A_kk_tile), A_ki_tile)
+        wp.tile_store(U, A_ki_tile, offset=(k, i))
 
   return blocked_cholesky_func
 
@@ -68,7 +68,7 @@ def create_blocked_cholesky_solve_func(block_size: int, matrix_size_static: int)
   @wp.func
   def blocked_cholesky_solve_func(
     # In:
-    L: wp.array2d[float],
+    U: wp.array2d[float],
     b: wp.array2d[float],
     matrix_size: int,
     # Out:
@@ -76,33 +76,33 @@ def blocked_cholesky_solve_func(
   ):
     """Block Cholesky factorization and solve.
 
-    Solves A x = b given the Cholesky factor L (A = L L^T) using blocked forward and backward
+    Solves A x = b given the Cholesky factor U (A = U^T U) using blocked forward and backward
     substitution.
     """
     rhs_tile = wp.tile_load(b, shape=(matrix_size_static, 1), offset=(0, 0), storage="shared", bounds_check=False)
 
-    # Forward substitution: solve L y = b
+    # Forward substitution: solve U^T y = b
     for i in range(0, matrix_size, block_size):
       rhs_view = wp.tile_view(rhs_tile, shape=(block_size, 1), offset=(i, 0))
       for j in range(0, i, block_size):
-        L_block = wp.tile_load(L, shape=(block_size, block_size), offset=(i, j), storage="shared")
+        U_block = wp.tile_load(U, shape=(block_size, block_size), offset=(j, i), storage="shared")
         y_block = wp.tile_view(rhs_tile, shape=(block_size, 1), offset=(j, 0))
-        wp.tile_matmul(L_block, y_block, rhs_view, alpha=-1.0)
+        wp.tile_matmul(wp.tile_transpose(U_block), y_block, rhs_view, alpha=-1.0)
 
-      L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i, i), storage="shared")
-      wp.tile_lower_solve_inplace(L_tile, rhs_view)
+      U_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(i, i), storage="shared")
+      wp.tile_lower_solve_inplace(wp.tile_transpose(U_tile), rhs_view)
 
-    # Backward substitution: solve L^T x = y
+    # Backward substitution: solve U x = y
     for i in range(matrix_size - block_size, -1, -block_size):
       i_end = i + block_size
       tmp_tile = wp.tile_view(rhs_tile, shape=(block_size, 1), offset=(i, 0))
       for j in range(i_end, matrix_size, block_size):
-        L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(j, i), storage="shared")
+        U_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(i, j), storage="shared")
         x_tile = wp.tile_view(rhs_tile, shape=(block_size, 1), offset=(j, 0))
-        wp.tile_matmul(wp.tile_transpose(L_tile), x_tile, tmp_tile, alpha=-1.0)
-      L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i, i), storage="shared")
+        wp.tile_matmul(U_tile, x_tile, tmp_tile, alpha=-1.0)
+      U_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(i, i), storage="shared")
 
-      wp.tile_upper_solve_inplace(wp.tile_transpose(L_tile), tmp_tile)
+      wp.tile_upper_solve_inplace(U_tile, tmp_tile)
 
     wp.tile_store(x, rhs_tile, offset=(0, 0), bounds_check=False)
 
diff --git a/mujoco_warp/_src/forward.py b/mujoco_warp/_src/forward.py
index df4e98097..46455bb06 100644
--- a/mujoco_warp/_src/forward.py
+++ b/mujoco_warp/_src/forward.py
@@ -385,8 +385,8 @@ def euler_dense(
     qm_integration_tile = wp.tile_diag_add(M_tile, damping_scaled)
 
     Ma_tile = wp.tile_load(efc_Ma_in[worldid], shape=(TILE_SIZE,), offset=(dofid,))
-    L_tile = wp.tile_cholesky(qm_integration_tile)
-    qacc_tile = wp.tile_cholesky_solve(L_tile, Ma_tile)
+    L_tile = wp.tile_cholesky(qm_integration_tile, fill_mode="upper")
+    qacc_tile = wp.tile_cholesky_solve(L_tile, Ma_tile, fill_mode="upper")
     wp.tile_store(qacc_out[worldid], qacc_tile, offset=(dofid))
 
   return euler_dense
diff --git a/mujoco_warp/_src/io.py b/mujoco_warp/_src/io.py
index 67abfa17e..f74c640a1 100644
--- a/mujoco_warp/_src/io.py
+++ b/mujoco_warp/_src/io.py
@@ -271,7 +271,7 @@ def _check_friction(name: str, id_: int, condim: int, friction, checks):
   jnt_limited_slide_hinge = mjm.jnt_limited & np.isin(mjm.jnt_type, (mujoco.mjtJoint.mjJNT_SLIDE, mujoco.mjtJoint.mjJNT_HINGE))
   m.jnt_limited_slide_hinge_adr = np.nonzero(jnt_limited_slide_hinge)[0]
   m.jnt_limited_ball_adr = np.nonzero(mjm.jnt_limited & (mjm.jnt_type == mujoco.mjtJoint.mjJNT_BALL))[0]
-  m.dof_tri_row, m.dof_tri_col = np.tril_indices(mjm.nv)
+  m.dof_tri_row, m.dof_tri_col = np.triu_indices(mjm.nv)
 
   # precompute body_isdofancestor: which DOFs affect each body
   # TODO: Investigate alternative approach such as bitmap
@@ -640,6 +640,12 @@ def _check_margin(name, t1, t2, margin):
       elemid += 1
       j = mjm.dof_parentid[j]
   m.qM_fullm_elemid = qM_fullm_elemid
+  upper_j, upper_i = np.triu_indices(mjm.nv)
+  upper_elemid = qM_fullm_elemid[upper_i, upper_j]
+  valid_mask = upper_elemid != -1
+  m.qM_fullm_upper_i = upper_j[valid_mask].tolist()
+  m.qM_fullm_upper_j = upper_i[valid_mask].tolist()
+  m.qM_fullm_upper_elemid = upper_elemid[valid_mask].tolist()
 
   # indices for sparse qD_fullm (used in RNE derivatives)
   # D-structure is the full square sparsity pattern (both upper and lower triangle)
@@ -1290,10 +1296,10 @@ def put_data(
     qM = np.zeros((mjm.nv, mjm.nv))
     if check_version("mujoco>=3.8.1.dev910242375"):
       mujoco.mju_sym2dense(qM, mjd.M, mjm.M_rownnz, mjm.M_rowadr, mjm.M_colind)
-      qLD = np.linalg.cholesky(qM) if (mjd.M != 0.0).any() and (mjd.qLD != 0.0).any() else np.zeros((mjm.nv, mjm.nv))
+      qLD = np.linalg.cholesky(qM).T if (mjd.M != 0.0).any() and (mjd.qLD != 0.0).any() else np.zeros((mjm.nv, mjm.nv))
     else:
       mujoco.mj_fullM(mjm, qM, mjd.qM)
-      qLD = np.linalg.cholesky(qM) if (mjd.qM != 0.0).any() and (mjd.qLD != 0.0).any() else np.zeros((mjm.nv, mjm.nv))
+      qLD = np.linalg.cholesky(qM).T if (mjd.qM != 0.0).any() and (mjd.qLD != 0.0).any() else np.zeros((mjm.nv, mjm.nv))
     padding = sizes["nv_pad"] - mjm.nv
     qM_padded = np.pad(qM, ((0, padding), (0, padding)), mode="constant", constant_values=0.0)
     d.qM = wp.array(np.full((nworld, sizes["nv_pad"], sizes["nv_pad"]), qM_padded), dtype=float)
@@ -2813,13 +2819,7 @@ def create_render_context(
   mjd = mujoco.MjData(mjm)
   mujoco.mj_forward(mjm, mjd)
 
-  constructor = "sah"
-  if check_version("warp>=1.13.0.dev20260325"):
-    # TODO: The cubql constructor and is_cubql_available exist only in
-    # recent Warp 1.13+ builds, modify this after warp is updated to 1.13+.
-    _cubql_avail = getattr(wp, "is_cubql_available", None)
-    if callable(_cubql_avail) and _cubql_avail():
-      constructor = "cubql"
+  constructor = "cubql"
 
   # Mesh BVHs – build for all meshes so per-world variants are available
   nmesh = mjm.nmesh
diff --git a/mujoco_warp/_src/smooth.py b/mujoco_warp/_src/smooth.py
index a05c250dd..913d31234 100644
--- a/mujoco_warp/_src/smooth.py
+++ b/mujoco_warp/_src/smooth.py
@@ -1082,7 +1082,7 @@ def cholesky_factorize(
 
     dofid = adr[nodeid]
     M_tile = wp.tile_load(qM_in[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(dofid, dofid))
-    L_tile = wp.tile_cholesky(M_tile)
+    L_tile = wp.tile_cholesky(M_tile, fill_mode="upper")
     wp.tile_store(L_out[worldid], L_tile, offset=(dofid, dofid))
 
   return cholesky_factorize
@@ -2842,14 +2842,14 @@ def cholesky_solve(
     dofid = adr[nodeid]
     y_slice = wp.tile_load(y[worldid], shape=(TILE_SIZE,), offset=(dofid,))
     L_tile = wp.tile_load(L[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(dofid, dofid))
-    x_slice = wp.tile_cholesky_solve(L_tile, y_slice)
+    x_slice = wp.tile_cholesky_solve(L_tile, y_slice, fill_mode="upper")
     wp.tile_store(x[worldid], x_slice, offset=(dofid,))
 
   return cholesky_solve
 
 
 def _solve_LD_dense(m: Model, d: Data, L: wp.array3d[float], x: wp.array2d[float], y: wp.array2d[float]):
-  """Computes dense backsubstitution: x = inv(L'*L)*y."""
+  """Computes dense backsubstitution: x = inv(U.T @ U) * y."""
   for tile in m.qM_tiles:
     wp.launch_tiled(
       _tile_cholesky_solve(tile),
@@ -2868,9 +2868,9 @@ def solve_LD(
   x: wp.array2d[float],
   y: wp.array2d[float],
 ):
-  """Computes backsubstitution to solve a linear system of the form x = inv(L'*D*L) * y.
+  """Computes backsubstitution for the inertia factorization.
 
-  L and D are the factors from the Cholesky factorization of the inertia matrix.
+  Sparse models use MuJoCo's L'*D*L factors; dense models use an upper Cholesky factor U.
 
   This function dispatches to either a sparse or dense solver depending on Model options.
 
@@ -2922,9 +2922,9 @@ def cholesky_factorize_solve(
     M_tile = wp.tile_load(M[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(dofid, dofid))
     y_slice = wp.tile_load(y[worldid], shape=(TILE_SIZE,), offset=(dofid,))
 
-    L_tile = wp.tile_cholesky(M_tile)
+    L_tile = wp.tile_cholesky(M_tile, fill_mode="upper")
     wp.tile_store(L[worldid], L_tile, offset=(dofid, dofid))
-    x_slice = wp.tile_cholesky_solve(L_tile, y_slice)
+    x_slice = wp.tile_cholesky_solve(L_tile, y_slice, fill_mode="upper")
     wp.tile_store(x[worldid], x_slice, offset=(dofid,))
 
   return cholesky_factorize_solve
@@ -2949,9 +2949,9 @@ def _factor_solve_i_dense(
 
 
 def factor_solve_i(m, d, M, L, D, x, y):
-  """Factorizes and solves the linear system: x = inv(L'*D*L) * y or x = inv(L'*L) * y.
+  """Factorizes and solves the inertia-like linear system.
 
-  M is an inertia-like matrix and L, D are its Cholesky-like factors.
+  Sparse models use MuJoCo's L'*D*L factors; dense models use an upper Cholesky factor U.
 
   This function first factorizes the matrix M (sparse or dense depending on model options),
   then solves the system for x given right-hand side y.
@@ -2960,7 +2960,7 @@ def factor_solve_i(m, d, M, L, D, x, y):
     m: The model containing factorization and sparsity information.
     d: The data object containing workspace and factorization results.
     M: The inertia-like matrix to factorize.
-    L: Output lower-triangular factor from the factorization (sparse or dense).
+    L: Output sparse factor or dense upper Cholesky factor.
     D: Output diagonal factor from the factorization (only used for sparse).
     x: Output array for the solution.
     y: Input right-hand side array.
diff --git a/mujoco_warp/_src/smooth_test.py b/mujoco_warp/_src/smooth_test.py
index 8fa64207b..47dc0ccc6 100644
--- a/mujoco_warp/_src/smooth_test.py
+++ b/mujoco_warp/_src/smooth_test.py
@@ -212,11 +212,16 @@ def test_crb(self, jacobian):
   @parameterized.parameters(mujoco.mjtJacobian.mjJAC_SPARSE, mujoco.mjtJacobian.mjJAC_DENSE)
   def test_factor_m(self, jacobian):
     """Tests factor_m."""
-    _, mjd, m, d = test_data.fixture("pendula.xml", overrides={"opt.jacobian": jacobian})
+    mjm, mjd, m, d = test_data.fixture("pendula.xml", overrides={"opt.jacobian": jacobian})
 
-    qLD = d.qLD.numpy()[0].copy()
     d.qLDiagInv.fill_(wp.inf)
     if jacobian == mujoco.mjtJacobian.mjJAC_DENSE:
+      qM = np.zeros((mjm.nv, mjm.nv))
+      if check_version("mujoco>=3.8.1.dev910242375"):
+        mujoco.mju_sym2dense(qM, mjd.M, mjm.M_rownnz, mjm.M_rowadr, mjm.M_colind)
+      else:
+        mujoco.mj_fullM(mjm, qM, mjd.qM)
+      qLD = np.linalg.cholesky(qM).T
       wp_qLD = qLD.copy()
       wp_qLD[wp_qLD != 0.0] = np.inf
       wp.copy(d.qLD, wp.array(wp_qLD, dtype=float))
@@ -229,7 +234,7 @@ def test_factor_m(self, jacobian):
       _assert_eq(d.qLD.numpy()[0, 0], mjd.qLD, "qLD (sparse)")
       _assert_eq(d.qLDiagInv.numpy()[0], mjd.qLDiagInv, "qLDiagInv")
     else:
-      _assert_eq(d.qLD.numpy()[0], qLD, "qLD (dense)")
+      _assert_eq(d.qLD.numpy()[0], qLD, "qLD (dense upper)")
 
   @parameterized.parameters(mujoco.mjtJacobian.mjJAC_SPARSE, mujoco.mjtJacobian.mjJAC_DENSE)
   def test_solve_m(self, jacobian):
@@ -507,8 +512,8 @@ def test_factor_solve_i(self, jacobian):
       _assert_eq(d.qLD.numpy()[0].reshape(-1), mjd.qLD, "qLD")
       _assert_eq(d.qLDiagInv.numpy()[0], mjd.qLDiagInv, "qLDiagInv")
     else:
-      qLD = np.linalg.cholesky(qM)
-      _assert_eq(d.qLD.numpy()[0], qLD, "qLD")
+      qLD = np.linalg.cholesky(qM).T
+      _assert_eq(d.qLD.numpy()[0], qLD, "qLD upper")
 
   def test_tendon_armature(self):
     mjm, mjd, m, d = test_data.fixture("tendon/armature.xml", keyframe=0)
diff --git a/mujoco_warp/_src/solver.py b/mujoco_warp/_src/solver.py
index 74f13433a..0c9e2881d 100644
--- a/mujoco_warp/_src/solver.py
+++ b/mujoco_warp/_src/solver.py
@@ -2063,9 +2063,9 @@ def update_gradient_h_incremental(
   # Out:
   ctx_h_out: wp.array3d[float],
 ):
-  """Incrementally update lower triangle of H for changed constraints.
+  """Incrementally update upper triangle of H for changed constraints.
 
-  Each thread handles one (i, j) element of the lower triangle.
+  Each thread handles one unique (i, j) element and writes it to the upper triangle.
   For each changed constraint, adds or subtracts D * J[i] * J[j].
   """
   worldid, elementid = wp.tid()
@@ -2074,28 +2074,28 @@ def update_gradient_h_incremental(
   if n_changes == 0:
     return
 
-  # Lower triangle index: elementid -> (i, j) where i >= j
-  i = (int(wp.sqrt(float(1 + 8 * elementid))) - 1) // 2
-  j = elementid - (i * (i + 1)) // 2
+  # Upper-triangle enumeration: elementid -> (row, col) where row <= col.
+  col = (int(wp.sqrt(float(1 + 8 * elementid))) - 1) // 2
+  row = elementid - (col * (col + 1)) // 2
 
   delta = float(0.0)
   for change_idx in range(n_changes):
     efcid = changed_ids_in[worldid, change_idx]
-    Ji = efc_J_in[worldid, efcid, i]
-    if Ji == 0.0:
+    Jrow = efc_J_in[worldid, efcid, row]
+    if Jrow == 0.0:
       continue
-    Jj = efc_J_in[worldid, efcid, j]
-    if Jj == 0.0:
+    Jcol = efc_J_in[worldid, efcid, col]
+    if Jcol == 0.0:
       continue
 
     D = efc_D_in[worldid, efcid]
     if efc_state_in[worldid, efcid] == types.ConstraintState.QUADRATIC.value:
-      delta += D * Ji * Jj
+      delta += D * Jrow * Jcol
     else:
-      delta -= D * Ji * Jj
+      delta -= D * Jrow * Jcol
 
   if delta != 0.0:
-    ctx_h_out[worldid, i, j] += delta
+    ctx_h_out[worldid, row, col] += delta
 
 
 @wp.kernel
@@ -2113,7 +2113,7 @@ def update_gradient_h_incremental_sparse(
   # Out:
   ctx_h_out: wp.array3d[float],
 ):
-  """Incrementally update lower triangle of H for changed constraints (sparse J)."""
+  """Incrementally update upper triangle of H for changed constraints (sparse J)."""
   worldid, change_idx = wp.tid()
 
   n_changes = changed_count_in[worldid]
@@ -2144,8 +2144,8 @@ def update_gradient_h_incremental_sparse(
         continue
       colindj = efc_J_colind_in[worldid, 0, sparseidj]
       h = sign * Ji * Jj
-      # Ensure lower triangle: larger index first
-      if colindi >= colindj:
+      # Ensure upper triangle: smaller index first.
+      if colindi <= colindj:
         wp.atomic_add(ctx_h_out[worldid, colindi], colindj, h)
       else:
         wp.atomic_add(ctx_h_out[worldid, colindj], colindi, h)
@@ -2257,10 +2257,11 @@ def update_gradient_grad(
 
 
 @wp.kernel
-def update_gradient_set_h_qM_lower_sparse(
+def update_gradient_set_h_qM_upper_sparse(
   # Model:
-  qM_fullm_i: wp.array[int],
-  qM_fullm_j: wp.array[int],
+  qM_fullm_upper_i: wp.array[int],
+  qM_fullm_upper_j: wp.array[int],
+  qM_fullm_upper_elemid: wp.array[int],
   # Data in:
   qM_in: wp.array3d[float],
   # In:
@@ -2273,9 +2274,10 @@ def update_gradient_set_h_qM_lower_sparse(
   if ctx_done_in[worldid]:
     return
 
-  i = qM_fullm_i[elementid]
-  j = qM_fullm_j[elementid]
-  ctx_h_out[worldid, i, j] += qM_in[worldid, 0, elementid]
+  i = qM_fullm_upper_i[elementid]
+  j = qM_fullm_upper_j[elementid]
+  qM_elemid = qM_fullm_upper_elemid[elementid]
+  ctx_h_out[worldid, i, j] += qM_in[worldid, 0, qM_elemid]
 
 
 @wp.func
@@ -2317,12 +2319,12 @@ def kernel(
 
     nefc = nefc_in[worldid]
 
-    # get lower diagonal index
-    i = (int(sqrt(float(1 + 8 * elementid))) - 1) // 2
-    j = elementid - (i * (i + 1)) // 2
+    # Upper-triangle tile index: elementid -> (row, col) where row <= col.
+    col = (int(sqrt(float(1 + 8 * elementid))) - 1) // 2
+    row = elementid - (col * (col + 1)) // 2
 
-    offset_i = i * TILE_SIZE
-    offset_j = j * TILE_SIZE
+    offset_row = row * TILE_SIZE
+    offset_col = col * TILE_SIZE
 
     sum_val = wp.tile_zeros(shape=(TILE_SIZE, TILE_SIZE), dtype=wp.float32)
 
@@ -2334,12 +2336,12 @@ def kernel(
       # AD: leaving bounds-check disabled here because I'm not entirely sure that
       # everything always hits the fast path. The padding takes care of any
       # potential OOB accesses.
-      J_ki = wp.tile_load(efc_J_in[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(k, offset_i), bounds_check=False)
+      J_krow = wp.tile_load(efc_J_in[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(k, offset_row), bounds_check=False)
 
-      if offset_i != offset_j:
-        J_kj = wp.tile_load(efc_J_in[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(k, offset_j), bounds_check=False)
+      if offset_row != offset_col:
+        J_kcol = wp.tile_load(efc_J_in[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(k, offset_col), bounds_check=False)
       else:
-        wp.tile_assign(J_kj, J_ki, (0, 0))
+        wp.tile_assign(J_kcol, J_krow, (0, 0))
 
       D_k = wp.tile_load(efc_D_in[worldid], shape=TILE_SIZE, offset=k, bounds_check=False)
       state = wp.tile_load(efc_state_in[worldid], shape=TILE_SIZE, offset=k, bounds_check=False)
@@ -2353,13 +2355,13 @@ def kernel(
       active_tile = wp.tile_map(active_check, tid_tile, threshold_tile)
       D_k = wp.tile_map(wp.mul, active_tile, D_k)
 
-      J_ki = wp.tile_map(wp.mul, wp.tile_transpose(J_ki), wp.tile_broadcast(D_k, shape=(TILE_SIZE, TILE_SIZE)))
+      J_krow = wp.tile_map(wp.mul, wp.tile_transpose(J_krow), wp.tile_broadcast(D_k, shape=(TILE_SIZE, TILE_SIZE)))
 
-      sum_val += wp.tile_matmul(J_ki, J_kj)
+      sum_val += wp.tile_matmul(J_krow, J_kcol)
 
     # AD: setting bounds_check to True explicitly here because for some reason it was
     # slower to disable it.
-    wp.tile_store(ctx_h_out[worldid], sum_val, offset=(offset_i, offset_j), bounds_check=True)
+    wp.tile_store(ctx_h_out[worldid], sum_val, offset=(offset_row, offset_col), bounds_check=True)
 
   return kernel
 
@@ -2746,9 +2748,9 @@ def kernel(
       return
 
     mat_tile = wp.tile_load(h_in[worldid], shape=(TILE_SIZE, TILE_SIZE))
-    fact_tile = wp.tile_cholesky(mat_tile)
+    fact_tile = wp.tile_cholesky(mat_tile, fill_mode="upper")
     input_tile = wp.tile_load(ctx_grad_in[worldid], shape=TILE_SIZE)
-    output_tile = wp.tile_cholesky_solve(fact_tile, input_tile)
+    output_tile = wp.tile_cholesky_solve(fact_tile, input_tile, fill_mode="upper")
     wp.tile_store(ctx_Mgrad_out[worldid], output_tile)
 
   return kernel
@@ -2914,9 +2916,9 @@ def _JTDAJ_sparse(
         colindj = efc_J_colind_in[worldid, 0, sparseidj]
 
       h = Ji * Jj * efc_D
-      # Store in lower triangle only: ensure row >= col
-      row = wp.max(colindi, colindj)
-      col = wp.min(colindi, colindj)
+      # Store in upper triangle only: ensure row <= col.
+      row = wp.min(colindi, colindj)
+      col = wp.max(colindi, colindj)
       wp.atomic_add(h_out[worldid, row], col, h)
 
 
@@ -2945,9 +2947,9 @@ def _update_gradient(m: types.Model, d: types.Data, ctx: SolverContext):
       )
 
       wp.launch(
-        update_gradient_set_h_qM_lower_sparse,
-        dim=(d.nworld, m.qM_fullm_i.size),
-        inputs=[m.qM_fullm_i, m.qM_fullm_j, d.qM, ctx.done],
+        update_gradient_set_h_qM_upper_sparse,
+        dim=(d.nworld, m.qM_fullm_upper_i.size),
+        inputs=[m.qM_fullm_upper_i, m.qM_fullm_upper_j, m.qM_fullm_upper_elemid, d.qM, ctx.done],
         outputs=[ctx.h],
       )
     else:
@@ -3067,7 +3069,7 @@ def _update_gradient_incremental(m: types.Model, d: types.Data, ctx: SolverConte
     outputs=[ctx.grad, ctx.grad_dot],
   )
 
-  # Update lower triangle of H with delta from changed constraints
+  # Update upper triangle of H with delta from changed constraints.
   if m.is_sparse:
     wp.launch(
       update_gradient_h_incremental_sparse,
@@ -3085,10 +3087,10 @@ def _update_gradient_incremental(m: types.Model, d: types.Data, ctx: SolverConte
       outputs=[ctx.h],
     )
   else:
-    lower_tri_dim = m.nv * (m.nv + 1) // 2
+    tri_dim = m.nv * (m.nv + 1) // 2
     wp.launch(
       update_gradient_h_incremental,
-      dim=(d.nworld, lower_tri_dim),
+      dim=(d.nworld, tri_dim),
       inputs=[
         d.efc.J,
         d.efc.D,
diff --git a/mujoco_warp/_src/solver_test.py b/mujoco_warp/_src/solver_test.py
index a1f0b5ffb..70c62e49e 100644
--- a/mujoco_warp/_src/solver_test.py
+++ b/mujoco_warp/_src/solver_test.py
@@ -40,6 +40,22 @@ def _assert_eq(a, b, name):
 
 
 class SolverTest(parameterized.TestCase):
+  def test_qM_fullm_upper_indices_are_row_sorted(self):
+    """Sparse qM seeding uses upper-triangle row-sorted writes."""
+    _, _, m, _ = test_data.fixture("humanoid/humanoid.xml")
+
+    lower_row = m.qM_fullm_i.numpy()
+    lower_col = m.qM_fullm_j.numpy()
+    upper_row = m.qM_fullm_upper_i.numpy()
+    upper_col = m.qM_fullm_upper_j.numpy()
+    upper_elemid = m.qM_fullm_upper_elemid.numpy()
+
+    self.assertEqual(upper_row.size, lower_row.size)
+    self.assertTrue(np.all(upper_row <= upper_col))
+    self.assertTrue(np.all(upper_row[:-1] <= upper_row[1:]))
+    np.testing.assert_array_equal(upper_row, lower_col[upper_elemid])
+    np.testing.assert_array_equal(upper_col, lower_row[upper_elemid])
+
   @parameterized.product(
     cone=tuple(ConeType),
     solver_=tuple(SolverType),
diff --git a/mujoco_warp/_src/support_test.py b/mujoco_warp/_src/support_test.py
index ba5b89a3e..86b25596f 100644
--- a/mujoco_warp/_src/support_test.py
+++ b/mujoco_warp/_src/support_test.py
@@ -265,18 +265,18 @@ def combined_cholesky_kernel(
     )
     wp.synchronize()
 
-    L_result = hfactor.numpy()[0]
+    U_result = hfactor.numpy()[0]
     x_result = Mgrad.numpy()[0]
 
     # Verify padding outside active region doesn't affect active computation
     # Off-diagonal padding should be zero (active region shouldn't touch padding)
     np.testing.assert_array_equal(
-      L_result[nv:, :nv],
+      U_result[nv:, :nv],
       0.0,
       err_msg="padding rows in active region were overwritten",
     )
     np.testing.assert_array_equal(
-      L_result[:nv, nv:],
+      U_result[:nv, nv:],
       0.0,
       err_msg="padding columns in active region were overwritten",
     )
@@ -284,7 +284,7 @@ def combined_cholesky_kernel(
     # Check that padding region remains identity after factorization
     padding_size = nv_pad - nv
     if padding_size > 0:
-      padding_square = L_result[nv:, nv:]
+      padding_square = U_result[nv:, nv:]
       expected_identity = np.eye(padding_size, dtype=np.float32)
       np.testing.assert_allclose(
         padding_square,
@@ -295,23 +295,23 @@ def combined_cholesky_kernel(
       )
 
     # Compare with numpy cholesky on symmetrized Hessian
-    L_numpy = np.linalg.cholesky(SPD_active_hessian)
+    U_numpy = np.linalg.cholesky(SPD_active_hessian).T
     np.testing.assert_allclose(
-      L_result[:nv, :nv],
-      L_numpy,
+      U_result[:nv, :nv],
+      U_numpy,
       rtol=1e-4,
       atol=1e-4,
       err_msg="Cholesky decomposition mismatch with numpy",
     )
 
-    # Verify L @ L.T = A (symmetrized Hessian)
-    A_reconstructed = L_result[:nv, :nv] @ L_result[:nv, :nv].T
+    # Verify U.T @ U = A (symmetrized Hessian)
+    A_reconstructed = U_result[:nv, :nv].T @ U_result[:nv, :nv]
     np.testing.assert_allclose(
       A_reconstructed,
       SPD_active_hessian,
       rtol=1e-5,
       atol=1e-5,
-      err_msg="L @ L.T does not equal symmetrized Hessian",
+      err_msg="U.T @ U does not equal symmetrized Hessian",
     )
 
     # Verify solution: A @ x = b using the symmetrized Hessian
diff --git a/mujoco_warp/_src/types.py b/mujoco_warp/_src/types.py
index 44fa4b3c0..d3c4e7696 100644
--- a/mujoco_warp/_src/types.py
+++ b/mujoco_warp/_src/types.py
@@ -1188,8 +1188,8 @@ class Model:
     jnt_limited_slide_hinge_adr: limited/slide/hinge jntadr
     jnt_limited_ball_adr: limited/ball jntadr
     body_isdofancestor: precomputed mask of which DOFs affect each body
-    dof_tri_row: dof lower triangle row (used in solver)
-    dof_tri_col: dof lower triangle col (used in solver)
+    dof_tri_row: dof upper triangle row (used in solver)
+    dof_tri_col: dof upper triangle col (used in solver)
     nxn_geom_pair: collision pair geom ids [-2, ngeom-1]
     nxn_geom_pair_filtered: valid collision pair geom ids
                             [-1, ngeom - 1]
@@ -1251,6 +1251,9 @@ class Model:
     qM_fullm_i: sparse mass matrix addressing
     qM_fullm_j: sparse mass matrix addressing
     qM_fullm_elemid: (row, col) -> elemid into qM_fullm_i/qM_fullm_j; -1 if not a chain ancestor
+    qM_fullm_upper_i: upper-triangle row indices for solver h seeding
+    qM_fullm_upper_j: upper-triangle column indices for solver h seeding
+    qM_fullm_upper_elemid: source elemid into qM_fullm_i/qM_fullm_j
     qD_fullm_i: D-structure row indices for RNE derivatives
     qD_fullm_j: D-structure column indices for RNE derivatives
     qM_mulm_rowadr: sparse matmul row pointers
@@ -1647,6 +1650,9 @@ class Model:
   qM_fullm_i: wp.array[int]
   qM_fullm_j: wp.array[int]
   qM_fullm_elemid: wp.array2d[int]  # (row, col) -> elemid in qM_fullm_i/j; -1 if col is not a chain ancestor of row
+  qM_fullm_upper_i: wp.array[int]
+  qM_fullm_upper_j: wp.array[int]
+  qM_fullm_upper_elemid: wp.array[int]
   qD_fullm_i: wp.array[int]  # D-structure (full square) row indices for RNE derivatives
   qD_fullm_j: wp.array[int]  # D-structure (full square) column indices for RNE derivatives
   # Gather-based sparse mul_m indices (thread per DOF, no atomics)
@@ -1814,8 +1820,8 @@ class Data:
     crb: com-based composite inertia and mass                   (nworld, nbody, 10)
     qM: total inertia                                           (nworld, nv, nv) if dense
                                                                 (nworld, 1, nM) if sparse
-    qLD: L'*D*L factorization of M                              (nworld, nv, nv) if dense
-                                                                (nworld, 1, nC) if sparse
+    qLD: upper Cholesky factorization if dense                  (nworld, nv, nv)
+         L'*D*L factorization of M if sparse                    (nworld, 1, nC)
     qLDiagInv: 1/diag(D)                                        (nworld, nv)
     flexedge_velocity: flex edge velocities                     (nworld, nflexedge)
     ten_velocity: tendon velocities                             (nworld, ntendon)
diff --git a/pyproject.toml b/pyproject.toml
index 9032af0ed..a97a5832e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
     "etils[epath]",
     "mujoco>=3.8.0",
     "numpy",
-    "warp-lang>=1.12",
+    "warp-lang>=1.13",
 ]
 
 [[tool.uv.index]]
diff --git a/uv.lock b/uv.lock
index 7375993bc..5f2264ed6 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1136,7 +1136,7 @@ requires-dist = [
     { name = "pytest", marker = "extra == 'dev'" },
     { name = "pytest-xdist", marker = "extra == 'dev'" },
     { name = "ruff", marker = "extra == 'dev'" },
-    { name = "warp-lang", specifier = ">=1.12", index = "https://pypi.nvidia.com/" },
+    { name = "warp-lang", specifier = ">=1.13", index = "https://pypi.nvidia.com/" },
     { name = "warp-lang", marker = "extra == 'dev'", specifier = ">=1.11.0.dev0", index = "https://pypi.nvidia.com/" },
 ]
 provides-extras = ["dev", "cpu", "cuda"]
@@ -2397,17 +2397,17 @@ wheels = [
 
 [[package]]
 name = "warp-lang"
-version = "1.13.0.dev20260227"
+version = "1.13.0"
 source = { registry = "https://pypi.nvidia.com/" }
 dependencies = [
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
 wheels = [
-    { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0.dev20260227-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5e0bd6abe4ea0b91ed2539fac04d1b86b6c29bd65304ee81032b420cafe91ae9" },
-    { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0.dev20260227-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:382583045cd214e586d249f2391f9fe6abd28feaff0da5bfc6a9c45657b28116" },
-    { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0.dev20260227-py3-none-manylinux_2_34_aarch64.whl", hash = "sha256:ab957adfc8df513462cd8e2f62f946650dbd6dc04268ccfbecb9627b3d454466" },
-    { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0.dev20260227-py3-none-win_amd64.whl", hash = "sha256:3b07f61e1ac60b1b74dba936bc4f307af36ed0595f18032150e673367fe35f8e" },
+    { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4375f572301991fe0fbf0af29fc84d76cd27d531432d6df8452b25088c21ea5a" },
+    { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:ac2479c70ad410d58deb088c2a64168792be588efc1bec22dc51f92238e8c8c3" },
+    { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0-py3-none-manylinux_2_34_aarch64.whl", hash = "sha256:476b54f0dcf6767f23305a328660a9a74d01e371b6b7c3d77e03e18b4a1bb1a5" },
+    { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0-py3-none-win_amd64.whl", hash = "sha256:47975ea07252d45a4d09d2d1a6cffc55002fa7fbde771c8dceb1baf4b32d4fb8" },
 ]
 
 [[package]]