From 64836bca72052adfcf1937b9b876692bf1c346ff Mon Sep 17 00:00:00 2001 From: Alain Denzler Date: Wed, 13 May 2026 17:18:30 +0200 Subject: [PATCH] Use upper Cholesky factors --- mujoco_warp/_src/block_cholesky.py | 50 ++++++++--------- mujoco_warp/_src/forward.py | 4 +- mujoco_warp/_src/io.py | 20 +++---- mujoco_warp/_src/smooth.py | 20 +++---- mujoco_warp/_src/smooth_test.py | 15 +++-- mujoco_warp/_src/solver.py | 90 +++++++++++++++--------------- mujoco_warp/_src/solver_test.py | 16 ++++++ mujoco_warp/_src/support_test.py | 20 +++---- mujoco_warp/_src/types.py | 14 +++-- pyproject.toml | 2 +- uv.lock | 12 ++-- 11 files changed, 146 insertions(+), 117 deletions(-) diff --git a/mujoco_warp/_src/block_cholesky.py b/mujoco_warp/_src/block_cholesky.py index 855f9686c..dbef87c64 100644 --- a/mujoco_warp/_src/block_cholesky.py +++ b/mujoco_warp/_src/block_cholesky.py @@ -26,11 +26,11 @@ def blocked_cholesky_func( A: wp.array2d[float], matrix_size: int, # Out: - L: wp.array2d[float], + U: wp.array2d[float], ): """Computes the Cholesky factorization of a symmetric positive definite matrix A in blocks. - It returns a lower-triangular matrix L such that A = L L^T. + It returns an upper-triangular matrix U such that A = U^T U. """ # Process the matrix in blocks along its leading dimension. for k in range(0, matrix_size, block_size): @@ -41,24 +41,24 @@ def blocked_cholesky_func( A_kk_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(k, k), storage="shared") for j in range(0, k, block_size): - L_block = wp.tile_load(L, shape=(block_size, block_size), offset=(k, j), storage="shared") - wp.tile_matmul(L_block, wp.tile_transpose(L_block), A_kk_tile, alpha=-1.0) + U_block = wp.tile_load(U, shape=(block_size, block_size), offset=(j, k), storage="shared") + wp.tile_matmul(wp.tile_transpose(U_block), U_block, A_kk_tile, alpha=-1.0) # Compute the Cholesky factorization for the block - wp.tile_cholesky_inplace(A_kk_tile) - wp.tile_store(L, A_kk_tile, offset=(k, k)) + wp.tile_cholesky_inplace(A_kk_tile, fill_mode="upper") + wp.tile_store(U, A_kk_tile, offset=(k, k)) - # Process the blocks below the current block + # Process the blocks to the right of the current block for i in range(end, matrix_size, block_size): - A_ik_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(i, k), storage="shared") + A_ki_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(k, i), storage="shared") for j in range(0, k, block_size): - L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i, j), storage="shared") - L_2_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(k, j), storage="shared") - wp.tile_matmul(L_tile, wp.tile_transpose(L_2_tile), A_ik_tile, alpha=-1.0) + U_jk_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(j, k), storage="shared") + U_ji_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(j, i), storage="shared") + wp.tile_matmul(wp.tile_transpose(U_jk_tile), U_ji_tile, A_ki_tile, alpha=-1.0) - wp.tile_lower_solve_inplace(A_kk_tile, wp.tile_transpose(A_ik_tile)) - wp.tile_store(L, A_ik_tile, offset=(i, k)) + wp.tile_lower_solve_inplace(wp.tile_transpose(A_kk_tile), A_ki_tile) + wp.tile_store(U, A_ki_tile, offset=(k, i)) return blocked_cholesky_func @@ -68,7 +68,7 @@ def create_blocked_cholesky_solve_func(block_size: int, matrix_size_static: int) @wp.func def blocked_cholesky_solve_func( # In: - L: wp.array2d[float], + U: wp.array2d[float], b: wp.array2d[float], matrix_size: int, # Out: @@ -76,33 +76,33 @@ def blocked_cholesky_solve_func( ): """Block Cholesky factorization and solve. - Solves A x = b given the Cholesky factor L (A = L L^T) using blocked forward and backward + Solves A x = b given the Cholesky factor U (A = U^T U) using blocked forward and backward substitution. """ rhs_tile = wp.tile_load(b, shape=(matrix_size_static, 1), offset=(0, 0), storage="shared", bounds_check=False) - # Forward substitution: solve L y = b + # Forward substitution: solve U^T y = b for i in range(0, matrix_size, block_size): rhs_view = wp.tile_view(rhs_tile, shape=(block_size, 1), offset=(i, 0)) for j in range(0, i, block_size): - L_block = wp.tile_load(L, shape=(block_size, block_size), offset=(i, j), storage="shared") + U_block = wp.tile_load(U, shape=(block_size, block_size), offset=(j, i), storage="shared") y_block = wp.tile_view(rhs_tile, shape=(block_size, 1), offset=(j, 0)) - wp.tile_matmul(L_block, y_block, rhs_view, alpha=-1.0) + wp.tile_matmul(wp.tile_transpose(U_block), y_block, rhs_view, alpha=-1.0) - L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i, i), storage="shared") - wp.tile_lower_solve_inplace(L_tile, rhs_view) + U_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(i, i), storage="shared") + wp.tile_lower_solve_inplace(wp.tile_transpose(U_tile), rhs_view) - # Backward substitution: solve L^T x = y + # Backward substitution: solve U x = y for i in range(matrix_size - block_size, -1, -block_size): i_end = i + block_size tmp_tile = wp.tile_view(rhs_tile, shape=(block_size, 1), offset=(i, 0)) for j in range(i_end, matrix_size, block_size): - L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(j, i), storage="shared") + U_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(i, j), storage="shared") x_tile = wp.tile_view(rhs_tile, shape=(block_size, 1), offset=(j, 0)) - wp.tile_matmul(wp.tile_transpose(L_tile), x_tile, tmp_tile, alpha=-1.0) - L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i, i), storage="shared") + wp.tile_matmul(U_tile, x_tile, tmp_tile, alpha=-1.0) + U_tile = wp.tile_load(U, shape=(block_size, block_size), offset=(i, i), storage="shared") - wp.tile_upper_solve_inplace(wp.tile_transpose(L_tile), tmp_tile) + wp.tile_upper_solve_inplace(U_tile, tmp_tile) wp.tile_store(x, rhs_tile, offset=(0, 0), bounds_check=False) diff --git a/mujoco_warp/_src/forward.py b/mujoco_warp/_src/forward.py index df4e98097..46455bb06 100644 --- a/mujoco_warp/_src/forward.py +++ b/mujoco_warp/_src/forward.py @@ -385,8 +385,8 @@ def euler_dense( qm_integration_tile = wp.tile_diag_add(M_tile, damping_scaled) Ma_tile = wp.tile_load(efc_Ma_in[worldid], shape=(TILE_SIZE,), offset=(dofid,)) - L_tile = wp.tile_cholesky(qm_integration_tile) - qacc_tile = wp.tile_cholesky_solve(L_tile, Ma_tile) + L_tile = wp.tile_cholesky(qm_integration_tile, fill_mode="upper") + qacc_tile = wp.tile_cholesky_solve(L_tile, Ma_tile, fill_mode="upper") wp.tile_store(qacc_out[worldid], qacc_tile, offset=(dofid)) return euler_dense diff --git a/mujoco_warp/_src/io.py b/mujoco_warp/_src/io.py index 67abfa17e..f74c640a1 100644 --- a/mujoco_warp/_src/io.py +++ b/mujoco_warp/_src/io.py @@ -271,7 +271,7 @@ def _check_friction(name: str, id_: int, condim: int, friction, checks): jnt_limited_slide_hinge = mjm.jnt_limited & np.isin(mjm.jnt_type, (mujoco.mjtJoint.mjJNT_SLIDE, mujoco.mjtJoint.mjJNT_HINGE)) m.jnt_limited_slide_hinge_adr = np.nonzero(jnt_limited_slide_hinge)[0] m.jnt_limited_ball_adr = np.nonzero(mjm.jnt_limited & (mjm.jnt_type == mujoco.mjtJoint.mjJNT_BALL))[0] - m.dof_tri_row, m.dof_tri_col = np.tril_indices(mjm.nv) + m.dof_tri_row, m.dof_tri_col = np.triu_indices(mjm.nv) # precompute body_isdofancestor: which DOFs affect each body # TODO: Investigate alternative approach such as bitmap @@ -640,6 +640,12 @@ def _check_margin(name, t1, t2, margin): elemid += 1 j = mjm.dof_parentid[j] m.qM_fullm_elemid = qM_fullm_elemid + upper_j, upper_i = np.triu_indices(mjm.nv) + upper_elemid = qM_fullm_elemid[upper_i, upper_j] + valid_mask = upper_elemid != -1 + m.qM_fullm_upper_i = upper_j[valid_mask].tolist() + m.qM_fullm_upper_j = upper_i[valid_mask].tolist() + m.qM_fullm_upper_elemid = upper_elemid[valid_mask].tolist() # indices for sparse qD_fullm (used in RNE derivatives) # D-structure is the full square sparsity pattern (both upper and lower triangle) @@ -1290,10 +1296,10 @@ def put_data( qM = np.zeros((mjm.nv, mjm.nv)) if check_version("mujoco>=3.8.1.dev910242375"): mujoco.mju_sym2dense(qM, mjd.M, mjm.M_rownnz, mjm.M_rowadr, mjm.M_colind) - qLD = np.linalg.cholesky(qM) if (mjd.M != 0.0).any() and (mjd.qLD != 0.0).any() else np.zeros((mjm.nv, mjm.nv)) + qLD = np.linalg.cholesky(qM).T if (mjd.M != 0.0).any() and (mjd.qLD != 0.0).any() else np.zeros((mjm.nv, mjm.nv)) else: mujoco.mj_fullM(mjm, qM, mjd.qM) - qLD = np.linalg.cholesky(qM) if (mjd.qM != 0.0).any() and (mjd.qLD != 0.0).any() else np.zeros((mjm.nv, mjm.nv)) + qLD = np.linalg.cholesky(qM).T if (mjd.qM != 0.0).any() and (mjd.qLD != 0.0).any() else np.zeros((mjm.nv, mjm.nv)) padding = sizes["nv_pad"] - mjm.nv qM_padded = np.pad(qM, ((0, padding), (0, padding)), mode="constant", constant_values=0.0) d.qM = wp.array(np.full((nworld, sizes["nv_pad"], sizes["nv_pad"]), qM_padded), dtype=float) @@ -2813,13 +2819,7 @@ def create_render_context( mjd = mujoco.MjData(mjm) mujoco.mj_forward(mjm, mjd) - constructor = "sah" - if check_version("warp>=1.13.0.dev20260325"): - # TODO: The cubql constructor and is_cubql_available exist only in - # recent Warp 1.13+ builds, modify this after warp is updated to 1.13+. - _cubql_avail = getattr(wp, "is_cubql_available", None) - if callable(_cubql_avail) and _cubql_avail(): - constructor = "cubql" + constructor = "cubql" # Mesh BVHs – build for all meshes so per-world variants are available nmesh = mjm.nmesh diff --git a/mujoco_warp/_src/smooth.py b/mujoco_warp/_src/smooth.py index a05c250dd..913d31234 100644 --- a/mujoco_warp/_src/smooth.py +++ b/mujoco_warp/_src/smooth.py @@ -1082,7 +1082,7 @@ def cholesky_factorize( dofid = adr[nodeid] M_tile = wp.tile_load(qM_in[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(dofid, dofid)) - L_tile = wp.tile_cholesky(M_tile) + L_tile = wp.tile_cholesky(M_tile, fill_mode="upper") wp.tile_store(L_out[worldid], L_tile, offset=(dofid, dofid)) return cholesky_factorize @@ -2842,14 +2842,14 @@ def cholesky_solve( dofid = adr[nodeid] y_slice = wp.tile_load(y[worldid], shape=(TILE_SIZE,), offset=(dofid,)) L_tile = wp.tile_load(L[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(dofid, dofid)) - x_slice = wp.tile_cholesky_solve(L_tile, y_slice) + x_slice = wp.tile_cholesky_solve(L_tile, y_slice, fill_mode="upper") wp.tile_store(x[worldid], x_slice, offset=(dofid,)) return cholesky_solve def _solve_LD_dense(m: Model, d: Data, L: wp.array3d[float], x: wp.array2d[float], y: wp.array2d[float]): - """Computes dense backsubstitution: x = inv(L'*L)*y.""" + """Computes dense backsubstitution: x = inv(U.T @ U) * y.""" for tile in m.qM_tiles: wp.launch_tiled( _tile_cholesky_solve(tile), @@ -2868,9 +2868,9 @@ def solve_LD( x: wp.array2d[float], y: wp.array2d[float], ): - """Computes backsubstitution to solve a linear system of the form x = inv(L'*D*L) * y. + """Computes backsubstitution for the inertia factorization. - L and D are the factors from the Cholesky factorization of the inertia matrix. + Sparse models use MuJoCo's L'*D*L factors; dense models use an upper Cholesky factor U. This function dispatches to either a sparse or dense solver depending on Model options. @@ -2922,9 +2922,9 @@ def cholesky_factorize_solve( M_tile = wp.tile_load(M[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(dofid, dofid)) y_slice = wp.tile_load(y[worldid], shape=(TILE_SIZE,), offset=(dofid,)) - L_tile = wp.tile_cholesky(M_tile) + L_tile = wp.tile_cholesky(M_tile, fill_mode="upper") wp.tile_store(L[worldid], L_tile, offset=(dofid, dofid)) - x_slice = wp.tile_cholesky_solve(L_tile, y_slice) + x_slice = wp.tile_cholesky_solve(L_tile, y_slice, fill_mode="upper") wp.tile_store(x[worldid], x_slice, offset=(dofid,)) return cholesky_factorize_solve @@ -2949,9 +2949,9 @@ def _factor_solve_i_dense( def factor_solve_i(m, d, M, L, D, x, y): - """Factorizes and solves the linear system: x = inv(L'*D*L) * y or x = inv(L'*L) * y. + """Factorizes and solves the inertia-like linear system. - M is an inertia-like matrix and L, D are its Cholesky-like factors. + Sparse models use MuJoCo's L'*D*L factors; dense models use an upper Cholesky factor U. This function first factorizes the matrix M (sparse or dense depending on model options), then solves the system for x given right-hand side y. @@ -2960,7 +2960,7 @@ def factor_solve_i(m, d, M, L, D, x, y): m: The model containing factorization and sparsity information. d: The data object containing workspace and factorization results. M: The inertia-like matrix to factorize. - L: Output lower-triangular factor from the factorization (sparse or dense). + L: Output sparse factor or dense upper Cholesky factor. D: Output diagonal factor from the factorization (only used for sparse). x: Output array for the solution. y: Input right-hand side array. diff --git a/mujoco_warp/_src/smooth_test.py b/mujoco_warp/_src/smooth_test.py index 8fa64207b..47dc0ccc6 100644 --- a/mujoco_warp/_src/smooth_test.py +++ b/mujoco_warp/_src/smooth_test.py @@ -212,11 +212,16 @@ def test_crb(self, jacobian): @parameterized.parameters(mujoco.mjtJacobian.mjJAC_SPARSE, mujoco.mjtJacobian.mjJAC_DENSE) def test_factor_m(self, jacobian): """Tests factor_m.""" - _, mjd, m, d = test_data.fixture("pendula.xml", overrides={"opt.jacobian": jacobian}) + mjm, mjd, m, d = test_data.fixture("pendula.xml", overrides={"opt.jacobian": jacobian}) - qLD = d.qLD.numpy()[0].copy() d.qLDiagInv.fill_(wp.inf) if jacobian == mujoco.mjtJacobian.mjJAC_DENSE: + qM = np.zeros((mjm.nv, mjm.nv)) + if check_version("mujoco>=3.8.1.dev910242375"): + mujoco.mju_sym2dense(qM, mjd.M, mjm.M_rownnz, mjm.M_rowadr, mjm.M_colind) + else: + mujoco.mj_fullM(mjm, qM, mjd.qM) + qLD = np.linalg.cholesky(qM).T wp_qLD = qLD.copy() wp_qLD[wp_qLD != 0.0] = np.inf wp.copy(d.qLD, wp.array(wp_qLD, dtype=float)) @@ -229,7 +234,7 @@ def test_factor_m(self, jacobian): _assert_eq(d.qLD.numpy()[0, 0], mjd.qLD, "qLD (sparse)") _assert_eq(d.qLDiagInv.numpy()[0], mjd.qLDiagInv, "qLDiagInv") else: - _assert_eq(d.qLD.numpy()[0], qLD, "qLD (dense)") + _assert_eq(d.qLD.numpy()[0], qLD, "qLD (dense upper)") @parameterized.parameters(mujoco.mjtJacobian.mjJAC_SPARSE, mujoco.mjtJacobian.mjJAC_DENSE) def test_solve_m(self, jacobian): @@ -507,8 +512,8 @@ def test_factor_solve_i(self, jacobian): _assert_eq(d.qLD.numpy()[0].reshape(-1), mjd.qLD, "qLD") _assert_eq(d.qLDiagInv.numpy()[0], mjd.qLDiagInv, "qLDiagInv") else: - qLD = np.linalg.cholesky(qM) - _assert_eq(d.qLD.numpy()[0], qLD, "qLD") + qLD = np.linalg.cholesky(qM).T + _assert_eq(d.qLD.numpy()[0], qLD, "qLD upper") def test_tendon_armature(self): mjm, mjd, m, d = test_data.fixture("tendon/armature.xml", keyframe=0) diff --git a/mujoco_warp/_src/solver.py b/mujoco_warp/_src/solver.py index 74f13433a..0c9e2881d 100644 --- a/mujoco_warp/_src/solver.py +++ b/mujoco_warp/_src/solver.py @@ -2063,9 +2063,9 @@ def update_gradient_h_incremental( # Out: ctx_h_out: wp.array3d[float], ): - """Incrementally update lower triangle of H for changed constraints. + """Incrementally update upper triangle of H for changed constraints. - Each thread handles one (i, j) element of the lower triangle. + Each thread handles one unique (i, j) element and writes it to the upper triangle. For each changed constraint, adds or subtracts D * J[i] * J[j]. """ worldid, elementid = wp.tid() @@ -2074,28 +2074,28 @@ def update_gradient_h_incremental( if n_changes == 0: return - # Lower triangle index: elementid -> (i, j) where i >= j - i = (int(wp.sqrt(float(1 + 8 * elementid))) - 1) // 2 - j = elementid - (i * (i + 1)) // 2 + # Upper-triangle enumeration: elementid -> (row, col) where row <= col. + col = (int(wp.sqrt(float(1 + 8 * elementid))) - 1) // 2 + row = elementid - (col * (col + 1)) // 2 delta = float(0.0) for change_idx in range(n_changes): efcid = changed_ids_in[worldid, change_idx] - Ji = efc_J_in[worldid, efcid, i] - if Ji == 0.0: + Jrow = efc_J_in[worldid, efcid, row] + if Jrow == 0.0: continue - Jj = efc_J_in[worldid, efcid, j] - if Jj == 0.0: + Jcol = efc_J_in[worldid, efcid, col] + if Jcol == 0.0: continue D = efc_D_in[worldid, efcid] if efc_state_in[worldid, efcid] == types.ConstraintState.QUADRATIC.value: - delta += D * Ji * Jj + delta += D * Jrow * Jcol else: - delta -= D * Ji * Jj + delta -= D * Jrow * Jcol if delta != 0.0: - ctx_h_out[worldid, i, j] += delta + ctx_h_out[worldid, row, col] += delta @wp.kernel @@ -2113,7 +2113,7 @@ def update_gradient_h_incremental_sparse( # Out: ctx_h_out: wp.array3d[float], ): - """Incrementally update lower triangle of H for changed constraints (sparse J).""" + """Incrementally update upper triangle of H for changed constraints (sparse J).""" worldid, change_idx = wp.tid() n_changes = changed_count_in[worldid] @@ -2144,8 +2144,8 @@ def update_gradient_h_incremental_sparse( continue colindj = efc_J_colind_in[worldid, 0, sparseidj] h = sign * Ji * Jj - # Ensure lower triangle: larger index first - if colindi >= colindj: + # Ensure upper triangle: smaller index first. + if colindi <= colindj: wp.atomic_add(ctx_h_out[worldid, colindi], colindj, h) else: wp.atomic_add(ctx_h_out[worldid, colindj], colindi, h) @@ -2257,10 +2257,11 @@ def update_gradient_grad( @wp.kernel -def update_gradient_set_h_qM_lower_sparse( +def update_gradient_set_h_qM_upper_sparse( # Model: - qM_fullm_i: wp.array[int], - qM_fullm_j: wp.array[int], + qM_fullm_upper_i: wp.array[int], + qM_fullm_upper_j: wp.array[int], + qM_fullm_upper_elemid: wp.array[int], # Data in: qM_in: wp.array3d[float], # In: @@ -2273,9 +2274,10 @@ def update_gradient_set_h_qM_lower_sparse( if ctx_done_in[worldid]: return - i = qM_fullm_i[elementid] - j = qM_fullm_j[elementid] - ctx_h_out[worldid, i, j] += qM_in[worldid, 0, elementid] + i = qM_fullm_upper_i[elementid] + j = qM_fullm_upper_j[elementid] + qM_elemid = qM_fullm_upper_elemid[elementid] + ctx_h_out[worldid, i, j] += qM_in[worldid, 0, qM_elemid] @wp.func @@ -2317,12 +2319,12 @@ def kernel( nefc = nefc_in[worldid] - # get lower diagonal index - i = (int(sqrt(float(1 + 8 * elementid))) - 1) // 2 - j = elementid - (i * (i + 1)) // 2 + # Upper-triangle tile index: elementid -> (row, col) where row <= col. + col = (int(sqrt(float(1 + 8 * elementid))) - 1) // 2 + row = elementid - (col * (col + 1)) // 2 - offset_i = i * TILE_SIZE - offset_j = j * TILE_SIZE + offset_row = row * TILE_SIZE + offset_col = col * TILE_SIZE sum_val = wp.tile_zeros(shape=(TILE_SIZE, TILE_SIZE), dtype=wp.float32) @@ -2334,12 +2336,12 @@ def kernel( # AD: leaving bounds-check disabled here because I'm not entirely sure that # everything always hits the fast path. The padding takes care of any # potential OOB accesses. - J_ki = wp.tile_load(efc_J_in[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(k, offset_i), bounds_check=False) + J_krow = wp.tile_load(efc_J_in[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(k, offset_row), bounds_check=False) - if offset_i != offset_j: - J_kj = wp.tile_load(efc_J_in[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(k, offset_j), bounds_check=False) + if offset_row != offset_col: + J_kcol = wp.tile_load(efc_J_in[worldid], shape=(TILE_SIZE, TILE_SIZE), offset=(k, offset_col), bounds_check=False) else: - wp.tile_assign(J_kj, J_ki, (0, 0)) + wp.tile_assign(J_kcol, J_krow, (0, 0)) D_k = wp.tile_load(efc_D_in[worldid], shape=TILE_SIZE, offset=k, bounds_check=False) state = wp.tile_load(efc_state_in[worldid], shape=TILE_SIZE, offset=k, bounds_check=False) @@ -2353,13 +2355,13 @@ def kernel( active_tile = wp.tile_map(active_check, tid_tile, threshold_tile) D_k = wp.tile_map(wp.mul, active_tile, D_k) - J_ki = wp.tile_map(wp.mul, wp.tile_transpose(J_ki), wp.tile_broadcast(D_k, shape=(TILE_SIZE, TILE_SIZE))) + J_krow = wp.tile_map(wp.mul, wp.tile_transpose(J_krow), wp.tile_broadcast(D_k, shape=(TILE_SIZE, TILE_SIZE))) - sum_val += wp.tile_matmul(J_ki, J_kj) + sum_val += wp.tile_matmul(J_krow, J_kcol) # AD: setting bounds_check to True explicitly here because for some reason it was # slower to disable it. - wp.tile_store(ctx_h_out[worldid], sum_val, offset=(offset_i, offset_j), bounds_check=True) + wp.tile_store(ctx_h_out[worldid], sum_val, offset=(offset_row, offset_col), bounds_check=True) return kernel @@ -2746,9 +2748,9 @@ def kernel( return mat_tile = wp.tile_load(h_in[worldid], shape=(TILE_SIZE, TILE_SIZE)) - fact_tile = wp.tile_cholesky(mat_tile) + fact_tile = wp.tile_cholesky(mat_tile, fill_mode="upper") input_tile = wp.tile_load(ctx_grad_in[worldid], shape=TILE_SIZE) - output_tile = wp.tile_cholesky_solve(fact_tile, input_tile) + output_tile = wp.tile_cholesky_solve(fact_tile, input_tile, fill_mode="upper") wp.tile_store(ctx_Mgrad_out[worldid], output_tile) return kernel @@ -2914,9 +2916,9 @@ def _JTDAJ_sparse( colindj = efc_J_colind_in[worldid, 0, sparseidj] h = Ji * Jj * efc_D - # Store in lower triangle only: ensure row >= col - row = wp.max(colindi, colindj) - col = wp.min(colindi, colindj) + # Store in upper triangle only: ensure row <= col. + row = wp.min(colindi, colindj) + col = wp.max(colindi, colindj) wp.atomic_add(h_out[worldid, row], col, h) @@ -2945,9 +2947,9 @@ def _update_gradient(m: types.Model, d: types.Data, ctx: SolverContext): ) wp.launch( - update_gradient_set_h_qM_lower_sparse, - dim=(d.nworld, m.qM_fullm_i.size), - inputs=[m.qM_fullm_i, m.qM_fullm_j, d.qM, ctx.done], + update_gradient_set_h_qM_upper_sparse, + dim=(d.nworld, m.qM_fullm_upper_i.size), + inputs=[m.qM_fullm_upper_i, m.qM_fullm_upper_j, m.qM_fullm_upper_elemid, d.qM, ctx.done], outputs=[ctx.h], ) else: @@ -3067,7 +3069,7 @@ def _update_gradient_incremental(m: types.Model, d: types.Data, ctx: SolverConte outputs=[ctx.grad, ctx.grad_dot], ) - # Update lower triangle of H with delta from changed constraints + # Update upper triangle of H with delta from changed constraints. if m.is_sparse: wp.launch( update_gradient_h_incremental_sparse, @@ -3085,10 +3087,10 @@ def _update_gradient_incremental(m: types.Model, d: types.Data, ctx: SolverConte outputs=[ctx.h], ) else: - lower_tri_dim = m.nv * (m.nv + 1) // 2 + tri_dim = m.nv * (m.nv + 1) // 2 wp.launch( update_gradient_h_incremental, - dim=(d.nworld, lower_tri_dim), + dim=(d.nworld, tri_dim), inputs=[ d.efc.J, d.efc.D, diff --git a/mujoco_warp/_src/solver_test.py b/mujoco_warp/_src/solver_test.py index a1f0b5ffb..70c62e49e 100644 --- a/mujoco_warp/_src/solver_test.py +++ b/mujoco_warp/_src/solver_test.py @@ -40,6 +40,22 @@ def _assert_eq(a, b, name): class SolverTest(parameterized.TestCase): + def test_qM_fullm_upper_indices_are_row_sorted(self): + """Sparse qM seeding uses upper-triangle row-sorted writes.""" + _, _, m, _ = test_data.fixture("humanoid/humanoid.xml") + + lower_row = m.qM_fullm_i.numpy() + lower_col = m.qM_fullm_j.numpy() + upper_row = m.qM_fullm_upper_i.numpy() + upper_col = m.qM_fullm_upper_j.numpy() + upper_elemid = m.qM_fullm_upper_elemid.numpy() + + self.assertEqual(upper_row.size, lower_row.size) + self.assertTrue(np.all(upper_row <= upper_col)) + self.assertTrue(np.all(upper_row[:-1] <= upper_row[1:])) + np.testing.assert_array_equal(upper_row, lower_col[upper_elemid]) + np.testing.assert_array_equal(upper_col, lower_row[upper_elemid]) + @parameterized.product( cone=tuple(ConeType), solver_=tuple(SolverType), diff --git a/mujoco_warp/_src/support_test.py b/mujoco_warp/_src/support_test.py index ba5b89a3e..86b25596f 100644 --- a/mujoco_warp/_src/support_test.py +++ b/mujoco_warp/_src/support_test.py @@ -265,18 +265,18 @@ def combined_cholesky_kernel( ) wp.synchronize() - L_result = hfactor.numpy()[0] + U_result = hfactor.numpy()[0] x_result = Mgrad.numpy()[0] # Verify padding outside active region doesn't affect active computation # Off-diagonal padding should be zero (active region shouldn't touch padding) np.testing.assert_array_equal( - L_result[nv:, :nv], + U_result[nv:, :nv], 0.0, err_msg="padding rows in active region were overwritten", ) np.testing.assert_array_equal( - L_result[:nv, nv:], + U_result[:nv, nv:], 0.0, err_msg="padding columns in active region were overwritten", ) @@ -284,7 +284,7 @@ def combined_cholesky_kernel( # Check that padding region remains identity after factorization padding_size = nv_pad - nv if padding_size > 0: - padding_square = L_result[nv:, nv:] + padding_square = U_result[nv:, nv:] expected_identity = np.eye(padding_size, dtype=np.float32) np.testing.assert_allclose( padding_square, @@ -295,23 +295,23 @@ def combined_cholesky_kernel( ) # Compare with numpy cholesky on symmetrized Hessian - L_numpy = np.linalg.cholesky(SPD_active_hessian) + U_numpy = np.linalg.cholesky(SPD_active_hessian).T np.testing.assert_allclose( - L_result[:nv, :nv], - L_numpy, + U_result[:nv, :nv], + U_numpy, rtol=1e-4, atol=1e-4, err_msg="Cholesky decomposition mismatch with numpy", ) - # Verify L @ L.T = A (symmetrized Hessian) - A_reconstructed = L_result[:nv, :nv] @ L_result[:nv, :nv].T + # Verify U.T @ U = A (symmetrized Hessian) + A_reconstructed = U_result[:nv, :nv].T @ U_result[:nv, :nv] np.testing.assert_allclose( A_reconstructed, SPD_active_hessian, rtol=1e-5, atol=1e-5, - err_msg="L @ L.T does not equal symmetrized Hessian", + err_msg="U.T @ U does not equal symmetrized Hessian", ) # Verify solution: A @ x = b using the symmetrized Hessian diff --git a/mujoco_warp/_src/types.py b/mujoco_warp/_src/types.py index 44fa4b3c0..d3c4e7696 100644 --- a/mujoco_warp/_src/types.py +++ b/mujoco_warp/_src/types.py @@ -1188,8 +1188,8 @@ class Model: jnt_limited_slide_hinge_adr: limited/slide/hinge jntadr jnt_limited_ball_adr: limited/ball jntadr body_isdofancestor: precomputed mask of which DOFs affect each body - dof_tri_row: dof lower triangle row (used in solver) - dof_tri_col: dof lower triangle col (used in solver) + dof_tri_row: dof upper triangle row (used in solver) + dof_tri_col: dof upper triangle col (used in solver) nxn_geom_pair: collision pair geom ids [-2, ngeom-1] nxn_geom_pair_filtered: valid collision pair geom ids [-1, ngeom - 1] @@ -1251,6 +1251,9 @@ class Model: qM_fullm_i: sparse mass matrix addressing qM_fullm_j: sparse mass matrix addressing qM_fullm_elemid: (row, col) -> elemid into qM_fullm_i/qM_fullm_j; -1 if not a chain ancestor + qM_fullm_upper_i: upper-triangle row indices for solver h seeding + qM_fullm_upper_j: upper-triangle column indices for solver h seeding + qM_fullm_upper_elemid: source elemid into qM_fullm_i/qM_fullm_j qD_fullm_i: D-structure row indices for RNE derivatives qD_fullm_j: D-structure column indices for RNE derivatives qM_mulm_rowadr: sparse matmul row pointers @@ -1647,6 +1650,9 @@ class Model: qM_fullm_i: wp.array[int] qM_fullm_j: wp.array[int] qM_fullm_elemid: wp.array2d[int] # (row, col) -> elemid in qM_fullm_i/j; -1 if col is not a chain ancestor of row + qM_fullm_upper_i: wp.array[int] + qM_fullm_upper_j: wp.array[int] + qM_fullm_upper_elemid: wp.array[int] qD_fullm_i: wp.array[int] # D-structure (full square) row indices for RNE derivatives qD_fullm_j: wp.array[int] # D-structure (full square) column indices for RNE derivatives # Gather-based sparse mul_m indices (thread per DOF, no atomics) @@ -1814,8 +1820,8 @@ class Data: crb: com-based composite inertia and mass (nworld, nbody, 10) qM: total inertia (nworld, nv, nv) if dense (nworld, 1, nM) if sparse - qLD: L'*D*L factorization of M (nworld, nv, nv) if dense - (nworld, 1, nC) if sparse + qLD: upper Cholesky factorization if dense (nworld, nv, nv) + L'*D*L factorization of M if sparse (nworld, 1, nC) qLDiagInv: 1/diag(D) (nworld, nv) flexedge_velocity: flex edge velocities (nworld, nflexedge) ten_velocity: tendon velocities (nworld, ntendon) diff --git a/pyproject.toml b/pyproject.toml index 9032af0ed..a97a5832e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "etils[epath]", "mujoco>=3.8.0", "numpy", - "warp-lang>=1.12", + "warp-lang>=1.13", ] [[tool.uv.index]] diff --git a/uv.lock b/uv.lock index 7375993bc..5f2264ed6 100644 --- a/uv.lock +++ b/uv.lock @@ -1136,7 +1136,7 @@ requires-dist = [ { name = "pytest", marker = "extra == 'dev'" }, { name = "pytest-xdist", marker = "extra == 'dev'" }, { name = "ruff", marker = "extra == 'dev'" }, - { name = "warp-lang", specifier = ">=1.12", index = "https://pypi.nvidia.com/" }, + { name = "warp-lang", specifier = ">=1.13", index = "https://pypi.nvidia.com/" }, { name = "warp-lang", marker = "extra == 'dev'", specifier = ">=1.11.0.dev0", index = "https://pypi.nvidia.com/" }, ] provides-extras = ["dev", "cpu", "cuda"] @@ -2397,17 +2397,17 @@ wheels = [ [[package]] name = "warp-lang" -version = "1.13.0.dev20260227" +version = "1.13.0" source = { registry = "https://pypi.nvidia.com/" } dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] wheels = [ - { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0.dev20260227-py3-none-macosx_11_0_arm64.whl", hash = "sha256:5e0bd6abe4ea0b91ed2539fac04d1b86b6c29bd65304ee81032b420cafe91ae9" }, - { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0.dev20260227-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:382583045cd214e586d249f2391f9fe6abd28feaff0da5bfc6a9c45657b28116" }, - { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0.dev20260227-py3-none-manylinux_2_34_aarch64.whl", hash = "sha256:ab957adfc8df513462cd8e2f62f946650dbd6dc04268ccfbecb9627b3d454466" }, - { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0.dev20260227-py3-none-win_amd64.whl", hash = "sha256:3b07f61e1ac60b1b74dba936bc4f307af36ed0595f18032150e673367fe35f8e" }, + { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4375f572301991fe0fbf0af29fc84d76cd27d531432d6df8452b25088c21ea5a" }, + { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:ac2479c70ad410d58deb088c2a64168792be588efc1bec22dc51f92238e8c8c3" }, + { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0-py3-none-manylinux_2_34_aarch64.whl", hash = "sha256:476b54f0dcf6767f23305a328660a9a74d01e371b6b7c3d77e03e18b4a1bb1a5" }, + { url = "https://pypi.nvidia.com/warp-lang/warp_lang-1.13.0-py3-none-win_amd64.whl", hash = "sha256:47975ea07252d45a4d09d2d1a6cffc55002fa7fbde771c8dceb1baf4b32d4fb8" }, ] [[package]]