google-deepmind · adenzler-nvidia · May 13, 2026
diff --git a/mujoco_warp/_src/block_cholesky.py b/mujoco_warp/_src/block_cholesky.py
@@ -26,11 +26,11 @@ def blocked_cholesky_func(
     A: wp.array2d[float],
     matrix_size: int,
     # Out:
-    L: wp.array2d[float],
+    U: wp.array2d[float],
   ):
     """Computes the Cholesky factorization of a symmetric positive definite matrix A in blocks.
 
-    It returns a lower-triangular matrix L such that A = L L^T.
+    It returns an upper-triangular matrix U such that A = U^T U.
     """
     # Process the matrix in blocks along its leading dimension.
     for k in range(0, matrix_size, block_size):
@@ -41,24 +41,24 @@ def blocked_cholesky_func(
       A_kk_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(k, k), storage="shared")
 
       for j in range(0, k, block_size):
-        L_block = wp.tile_load(L, shape=(block_size, block_size), offset=(k, j), storage="shared")
-        wp.tile_matmul(L_block, wp.tile_transpose(L_block), A_kk_tile, alpha=-1.0)
+        U_block = wp.tile_load(U, shape=(block_size, block_size), offset=(j, k), storage="shared")
+        wp.tile_matmul(wp.tile_transpose(U_block), U_block, A_kk_tile, alpha=-1.0)
 
       # Compute the Cholesky factorization for the block
-      wp.tile_cholesky_inplace(A_kk_tile)
-      wp.tile_store(L, A_kk_tile, offset=(k, k))
+      wp.tile_cholesky_inplace(A_kk_tile, fill_mode="upper")
+      wp.tile_store(U, A_kk_tile, offset=(k, k))
 
-      # Process the blocks below the current block
+      # Process the blocks to the right of the current block
       for i in range(end, matrix_size, block_size):
-        A_ik_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(i, k), storage="shared")
+        A_ki_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(k, i), storage="shared")
 
         for j in range(0, k, block_size):
-          L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i, j), storage="shared")
-          L_2_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(k, j), storage="shared")
-          wp.tile_matmul(L_tile, wp.tile_transpose(L_2_tile), A_ik_tile, alpha=-1.0)
+          U_jk_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(j, k), storage="shared")
+          U_ji_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(j, i), storage="shared")
+          wp.tile_matmul(wp.tile_transpose(U_jk_tile), U_ji_tile, A_ki_tile, alpha=-1.0)
 
-        wp.tile_lower_solve_inplace(A_kk_tile, wp.tile_transpose(A_ik_tile))
-        wp.tile_store(L, A_ik_tile, offset=(i, k))
+        wp.tile_lower_solve_inplace(wp.tile_transpose(A_kk_tile), A_ki_tile)
+        wp.tile_store(U, A_ki_tile, offset=(k, i))
 
   return blocked_cholesky_func
 
@@ -68,41 +68,41 @@ def create_blocked_cholesky_solve_func(block_size: int, matrix_size_static: int)
   @wp.func
   def blocked_cholesky_solve_func(
     # In:
-    L: wp.array2d[float],
+    U: wp.array2d[float],
     b: wp.array2d[float],
     matrix_size: int,
     # Out:
     x: wp.array2d[float],
   ):
     """Block Cholesky factorization and solve.
 
-    Solves A x = b given the Cholesky factor L (A = L L^T) using blocked forward and backward
+    Solves A x = b given the Cholesky factor U (A = U^T U) using blocked forward and backward
     substitution.
     """
     rhs_tile = wp.tile_load(b, shape=(matrix_size_static, 1), offset=(0, 0), storage="shared", bounds_check=False)
 
-    # Forward substitution: solve L y = b
+    # Forward substitution: solve U^T y = b
     for i in range(0, matrix_size, block_size):
       rhs_view = wp.tile_view(rhs_tile, shape=(block_size, 1), offset=(i, 0))
       for j in range(0, i, block_size):
-        L_block = wp.tile_load(L, shape=(block_size, block_size), offset=(i, j), storage="shared")
+        U_block = wp.tile_load(U, shape=(block_size, block_size), offset=(j, i), storage="shared")
         y_block = wp.tile_view(rhs_tile, shape=(block_size, 1), offset=(j, 0))
-        wp.tile_matmul(L_block, y_block, rhs_view, alpha=-1.0)
+        wp.tile_matmul(wp.tile_transpose(U_block), y_block, rhs_view, alpha=-1.0)
 
-      L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i, i), storage="shared")
-      wp.tile_lower_solve_inplace(L_tile, rhs_view)
+      U_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(i, i), storage="shared")
+      wp.tile_lower_solve_inplace(wp.tile_transpose(U_tile), rhs_view)
 
-    # Backward substitution: solve L^T x = y
+    # Backward substitution: solve U x = y
     for i in range(matrix_size - block_size, -1, -block_size):
       i_end = i + block_size
       tmp_tile = wp.tile_view(rhs_tile, shape=(block_size, 1), offset=(i, 0))
       for j in range(i_end, matrix_size, block_size):
-        L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(j, i), storage="shared")
+        U_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(i, j), storage="shared")
         x_tile = wp.tile_view(rhs_tile, shape=(block_size, 1), offset=(j, 0))
-        wp.tile_matmul(wp.tile_transpose(L_tile), x_tile, tmp_tile, alpha=-1.0)
-      L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i, i), storage="shared")
+        wp.tile_matmul(U_tile, x_tile, tmp_tile, alpha=-1.0)
+      U_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(i, i), storage="shared")
 
-      wp.tile_upper_solve_inplace(wp.tile_transpose(L_tile), tmp_tile)
+      wp.tile_upper_solve_inplace(U_tile, tmp_tile)
 
     wp.tile_store(x, rhs_tile, offset=(0, 0), bounds_check=False)
 

diff --git a/mujoco_warp/_src/forward.py b/mujoco_warp/_src/forward.py
@@ -385,8 +385,8 @@ def euler_dense(
     qm_integration_tile = wp.tile_diag_add(M_tile, damping_scaled)
 
     Ma_tile = wp.tile_load(efc_Ma_in[worldid], shape=(TILE_SIZE,), offset=(dofid,))
-    L_tile = wp.tile_cholesky(qm_integration_tile)
-    qacc_tile = wp.tile_cholesky_solve(L_tile, Ma_tile)
+    L_tile = wp.tile_cholesky(qm_integration_tile, fill_mode="upper")
+    qacc_tile = wp.tile_cholesky_solve(L_tile, Ma_tile, fill_mode="upper")
     wp.tile_store(qacc_out[worldid], qacc_tile, offset=(dofid))
 
   return euler_dense

diff --git a/mujoco_warp/_src/io.py b/mujoco_warp/_src/io.py
@@ -271,7 +271,7 @@ def _check_friction(name: str, id_: int, condim: int, friction, checks):
   jnt_limited_slide_hinge = mjm.jnt_limited & np.isin(mjm.jnt_type, (mujoco.mjtJoint.mjJNT_SLIDE, mujoco.mjtJoint.mjJNT_HINGE))
   m.jnt_limited_slide_hinge_adr = np.nonzero(jnt_limited_slide_hinge)[0]
   m.jnt_limited_ball_adr = np.nonzero(mjm.jnt_limited & (mjm.jnt_type == mujoco.mjtJoint.mjJNT_BALL))[0]
-  m.dof_tri_row, m.dof_tri_col = np.tril_indices(mjm.nv)
+  m.dof_tri_row, m.dof_tri_col = np.triu_indices(mjm.nv)
 
   # precompute body_isdofancestor: which DOFs affect each body
   # TODO: Investigate alternative approach such as bitmap
@@ -640,6 +640,15 @@ def _check_margin(name, t1, t2, margin):
       elemid += 1
       j = mjm.dof_parentid[j]
   m.qM_fullm_elemid = qM_fullm_elemid
+  m.qM_fullm_upper_i, m.qM_fullm_upper_j, m.qM_fullm_upper_elemid = [], [], []
+  for j in range(mjm.nv):
+    for i in range(j, mjm.nv):
+      elemid = qM_fullm_elemid[i, j]
+      if elemid == -1:
+        continue
+      m.qM_fullm_upper_i.append(j)
+      m.qM_fullm_upper_j.append(i)
+      m.qM_fullm_upper_elemid.append(elemid)
 
   # indices for sparse qD_fullm (used in RNE derivatives)
   # D-structure is the full square sparsity pattern (both upper and lower triangle)
@@ -1290,10 +1299,10 @@ def put_data(
     qM = np.zeros((mjm.nv, mjm.nv))
     if check_version("mujoco>=3.8.1.dev910242375"):
       mujoco.mju_sym2dense(qM, mjd.M, mjm.M_rownnz, mjm.M_rowadr, mjm.M_colind)
-      qLD = np.linalg.cholesky(qM) if (mjd.M != 0.0).any() and (mjd.qLD != 0.0).any() else np.zeros((mjm.nv, mjm.nv))
+      qLD = np.linalg.cholesky(qM).T if (mjd.M != 0.0).any() and (mjd.qLD != 0.0).any() else np.zeros((mjm.nv, mjm.nv))
     else:
       mujoco.mj_fullM(mjm, qM, mjd.qM)
-      qLD = np.linalg.cholesky(qM) if (mjd.qM != 0.0).any() and (mjd.qLD != 0.0).any() else np.zeros((mjm.nv, mjm.nv))
+      qLD = np.linalg.cholesky(qM).T if (mjd.qM != 0.0).any() and (mjd.qLD != 0.0).any() else np.zeros((mjm.nv, mjm.nv))
     padding = sizes["nv_pad"] - mjm.nv
     qM_padded = np.pad(qM, ((0, padding), (0, padding)), mode="constant", constant_values=0.0)
     d.qM = wp.array(np.full((nworld, sizes["nv_pad"], sizes["nv_pad"]), qM_padded), dtype=float)

diff --git a/mujoco_warp/_src/smooth.py b/mujoco_warp/_src/smooth.py
@@ -1082,7 +1082,7 @@ def cholesky_factorize(
 
     dofid = adr[nodeid]
     M_tile = wp.tile_load(qM_in[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(dofid, dofid))
-    L_tile = wp.tile_cholesky(M_tile)
+    L_tile = wp.tile_cholesky(M_tile, fill_mode="upper")
     wp.tile_store(L_out[worldid], L_tile, offset=(dofid, dofid))
 
   return cholesky_factorize
@@ -2842,14 +2842,14 @@ def cholesky_solve(
     dofid = adr[nodeid]
     y_slice = wp.tile_load(y[worldid], shape=(TILE_SIZE,), offset=(dofid,))
     L_tile = wp.tile_load(L[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(dofid, dofid))
-    x_slice = wp.tile_cholesky_solve(L_tile, y_slice)
+    x_slice = wp.tile_cholesky_solve(L_tile, y_slice, fill_mode="upper")
     wp.tile_store(x[worldid], x_slice, offset=(dofid,))
 
   return cholesky_solve
 
 
 def _solve_LD_dense(m: Model, d: Data, L: wp.array3d[float], x: wp.array2d[float], y: wp.array2d[float]):
-  """Computes dense backsubstitution: x = inv(L'*L)*y."""
+  """Computes dense backsubstitution: x = inv(U.T @ U) * y."""
   for tile in m.qM_tiles:
     wp.launch_tiled(
       _tile_cholesky_solve(tile),
@@ -2868,9 +2868,9 @@ def solve_LD(
   x: wp.array2d[float],
   y: wp.array2d[float],
 ):
-  """Computes backsubstitution to solve a linear system of the form x = inv(L'*D*L) * y.
+  """Computes backsubstitution for the inertia factorization.
 
-  L and D are the factors from the Cholesky factorization of the inertia matrix.
+  Sparse models use MuJoCo's L'*D*L factors; dense models use an upper Cholesky factor U.
 
   This function dispatches to either a sparse or dense solver depending on Model options.
 
@@ -2922,9 +2922,9 @@ def cholesky_factorize_solve(
     M_tile = wp.tile_load(M[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(dofid, dofid))
     y_slice = wp.tile_load(y[worldid], shape=(TILE_SIZE,), offset=(dofid,))
 
-    L_tile = wp.tile_cholesky(M_tile)
+    L_tile = wp.tile_cholesky(M_tile, fill_mode="upper")
     wp.tile_store(L[worldid], L_tile, offset=(dofid, dofid))
-    x_slice = wp.tile_cholesky_solve(L_tile, y_slice)
+    x_slice = wp.tile_cholesky_solve(L_tile, y_slice, fill_mode="upper")
     wp.tile_store(x[worldid], x_slice, offset=(dofid,))
 
   return cholesky_factorize_solve
@@ -2949,9 +2949,9 @@ def _factor_solve_i_dense(
 
 
 def factor_solve_i(m, d, M, L, D, x, y):
-  """Factorizes and solves the linear system: x = inv(L'*D*L) * y or x = inv(L'*L) * y.
+  """Factorizes and solves the inertia-like linear system.
 
-  M is an inertia-like matrix and L, D are its Cholesky-like factors.
+  Sparse models use MuJoCo's L'*D*L factors; dense models use an upper Cholesky factor U.
 
   This function first factorizes the matrix M (sparse or dense depending on model options),
   then solves the system for x given right-hand side y.
@@ -2960,7 +2960,7 @@ def factor_solve_i(m, d, M, L, D, x, y):
     m: The model containing factorization and sparsity information.
     d: The data object containing workspace and factorization results.
     M: The inertia-like matrix to factorize.
-    L: Output lower-triangular factor from the factorization (sparse or dense).
+    L: Output sparse factor or dense upper Cholesky factor.
     D: Output diagonal factor from the factorization (only used for sparse).
     x: Output array for the solution.
     y: Input right-hand side array.

diff --git a/mujoco_warp/_src/smooth_test.py b/mujoco_warp/_src/smooth_test.py
@@ -212,11 +212,16 @@ def test_crb(self, jacobian):
   @parameterized.parameters(mujoco.mjtJacobian.mjJAC_SPARSE, mujoco.mjtJacobian.mjJAC_DENSE)
   def test_factor_m(self, jacobian):
     """Tests factor_m."""
-    _, mjd, m, d = test_data.fixture("pendula.xml", overrides={"opt.jacobian": jacobian})
+    mjm, mjd, m, d = test_data.fixture("pendula.xml", overrides={"opt.jacobian": jacobian})
 
-    qLD = d.qLD.numpy()[0].copy()
     d.qLDiagInv.fill_(wp.inf)
     if jacobian == mujoco.mjtJacobian.mjJAC_DENSE:
+      qM = np.zeros((mjm.nv, mjm.nv))
+      if check_version("mujoco>=3.8.1.dev910242375"):
+        mujoco.mju_sym2dense(qM, mjd.M, mjm.M_rownnz, mjm.M_rowadr, mjm.M_colind)
+      else:
+        mujoco.mj_fullM(mjm, qM, mjd.qM)
+      qLD = np.linalg.cholesky(qM).T
       wp_qLD = qLD.copy()
       wp_qLD[wp_qLD != 0.0] = np.inf
       wp.copy(d.qLD, wp.array(wp_qLD, dtype=float))
@@ -229,7 +234,7 @@ def test_factor_m(self, jacobian):
       _assert_eq(d.qLD.numpy()[0, 0], mjd.qLD, "qLD (sparse)")
       _assert_eq(d.qLDiagInv.numpy()[0], mjd.qLDiagInv, "qLDiagInv")
     else:
-      _assert_eq(d.qLD.numpy()[0], qLD, "qLD (dense)")
+      _assert_eq(d.qLD.numpy()[0], qLD, "qLD (dense upper)")
 
   @parameterized.parameters(mujoco.mjtJacobian.mjJAC_SPARSE, mujoco.mjtJacobian.mjJAC_DENSE)
   def test_solve_m(self, jacobian):
@@ -507,8 +512,8 @@ def test_factor_solve_i(self, jacobian):
       _assert_eq(d.qLD.numpy()[0].reshape(-1), mjd.qLD, "qLD")
       _assert_eq(d.qLDiagInv.numpy()[0], mjd.qLDiagInv, "qLDiagInv")
     else:
-      qLD = np.linalg.cholesky(qM)
-      _assert_eq(d.qLD.numpy()[0], qLD, "qLD")
+      qLD = np.linalg.cholesky(qM).T
+      _assert_eq(d.qLD.numpy()[0], qLD, "qLD upper")
 
   def test_tendon_armature(self):
     mjm, mjd, m, d = test_data.fixture("tendon/armature.xml", keyframe=0)