From 7daccf6f78043ee3ea0a30919f28a4f482181d1d Mon Sep 17 00:00:00 2001
From: William Zijie Zhang <william@gridmatic.com>
Date: Wed, 15 Apr 2026 14:51:30 -0400
Subject: [PATCH 1/5] fix diff engine converter bugs: 1D matmul dims, reshape,
 transpose, quad form

- helpers.py: fix 1D dimension normalization in dense matmul helpers.
  Right-matmul treated 1D A as (1,n) instead of (n,1), causing segfaults.
- converters.py: reshape 1D child to column vector in left-matmul for
  dot products; add constant-atom fallback for unfolded expressions;
  move QuadForm/SymbolicQuadForm dispatch out of ATOM_CONVERTERS (needs
  n_vars); tolerate 1D dimension mismatches via reshape instead of error.
- registry.py: support C-order reshape via transpose decomposition;
  fix 1D transpose to be a no-op; improve scalar quad form P handling;
  raise NotImplementedError for vector SymbolicQuadForm (TODO: native
  block quadform in SparseDiffPy).
- perspective_canon.py: handle DiffengineConeProgram which stores
  q, d, A, b as separate arrays (vs ParamConeProg sparse tensors).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../canonicalizers/perspective_canon.py       | 20 ++++--
 .../nlp_solvers/diff_engine/converters.py     | 55 ++++++++++++---
 .../nlp_solvers/diff_engine/helpers.py        | 10 ++-
 .../nlp_solvers/diff_engine/registry.py       | 67 ++++++++++++-------
 4 files changed, 114 insertions(+), 38 deletions(-)

diff --git a/cvxpy/reductions/dcp2cone/canonicalizers/perspective_canon.py b/cvxpy/reductions/dcp2cone/canonicalizers/perspective_canon.py
index 973b84f24f..8be83140de 100644
--- a/cvxpy/reductions/dcp2cone/canonicalizers/perspective_canon.py
+++ b/cvxpy/reductions/dcp2cone/canonicalizers/perspective_canon.py
@@ -39,10 +39,22 @@ def perspective_canon(expr, args, solver_context: SolverInfo | None = None):
     prob_canon = chain.apply(aux_prob)[0]  # grab problem instance
     # get cone representation of c, A, and b for some problem.
 
-    q = prob_canon.q.toarray().flatten()[:-1]
-    d = prob_canon.q.toarray().flatten()[-1]
-    Ab = prob_canon.A.toarray().reshape((-1, len(q) + 1), order="F")
-    A, b = Ab[:, :-1], Ab[:, -1]
+    # Extract cone representation q, d, A, b.
+    # ParamConeProg stores q as sparse (n+1, n_params+1) tensor with d
+    # embedded in the last row, and A as a flattened tensor with b embedded.
+    # DiffengineConeProgram stores q, d, A, b as separate concrete arrays.
+    if hasattr(prob_canon, 'd') and not hasattr(prob_canon.q, 'toarray'):
+        # DiffengineConeProgram: concrete matrices, d stored separately.
+        q = prob_canon.q
+        d = prob_canon.d
+        A = prob_canon.A.toarray() if hasattr(prob_canon.A, 'toarray') else prob_canon.A
+        b = prob_canon.b
+    else:
+        # ParamConeProg: sparse tensor with embedded offsets.
+        q = prob_canon.q.toarray().flatten()[:-1]
+        d = prob_canon.q.toarray().flatten()[-1]
+        Ab = prob_canon.A.toarray().reshape((-1, len(q) + 1), order="F")
+        A, b = Ab[:, :-1], Ab[:, -1]
 
     # given f in epigraph form, aka epi f = \{(x,t) | f(x) \leq t\}
     # = \{(x,t) | Fx +tg + e \in K} for K a cone, the epigraph of the
diff --git a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py
index 88f54f73cd..6aea0cad04 100644
--- a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py
+++ b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py
@@ -31,7 +31,18 @@
 
 
 def convert_matmul(expr, children, var_dict, n_vars, param_dict):
-    """Convert matrix multiplication A @ f(x), f(x) @ A, or X @ Y."""
+    """Convert matrix multiplication A @ f(x), f(x) @ A, or X @ Y.
+
+    NumPy matmul semantics for 1D arrays:
+      (n,) @ (m,k) → treat left as (1,n)
+      (m,k) @ (n,) → treat right as (n,1)
+      (n,) @ (n,)  → dot product, treat as (1,n) @ (n,1) → scalar
+
+    The C engine only has 2D nodes. 1D expressions are stored as (1,n) by
+    normalize_shape.  For left-matmul A @ child, if the child is 1D we must
+    reshape it to (n,1) so the inner dimensions agree.  For right-matmul
+    child @ A, the helpers already handle 1D A via the ndim==1 branch.
+    """
     left_arg, right_arg = expr.args
 
     if left_arg.is_constant():
@@ -40,9 +51,15 @@ def convert_matmul(expr, children, var_dict, n_vars, param_dict):
             param_node = param_dict[left_arg.id]
         else:
             param_node = None
+        child = children[1]
+        # If the right operand is 1D, the C node is (1,n) but left-matmul
+        # needs it as (n,1) so inner dims match: A(m,n) @ child(n,1).
+        if len(right_arg.shape) <= 1 and right_arg.size > 1:
+            n = right_arg.size
+            child = _diffengine.make_reshape(child, n, 1)
         if sparse.issparse(A):
-            return make_sparse_left_matmul(param_node, children[1], A)
-        return make_dense_left_matmul(param_node, children[1], A)
+            return make_sparse_left_matmul(param_node, child, A)
+        return make_dense_left_matmul(param_node, child, A)
 
     elif right_arg.is_constant():
         A = right_arg.value
@@ -98,11 +115,22 @@ def convert_expr(expr, var_dict, n_vars, param_dict=None):
         return param_dict[expr.id]
 
     # Base case: constant (in the diff engine, a constant is a parameter with ID -1)
+    # Also handles atoms applied to pure constants (e.g. PnormApprox(Constant), floor(Constant))
+    # that weren't folded by Dcp2Cone.
     if isinstance(expr, cp.Constant):
         c = to_dense_float(expr.value)
         d1, d2 = normalize_shape(expr.shape)
         return _diffengine.make_parameter(d1, d2, -1, n_vars, c.flatten(order='F'))
 
+    # Constant atom: no variables/parameters, so it must have a concrete value.
+    # Handles atoms like floor(NegExpression(Constant(5.0))) after EvalParams.
+    # Check variables()/parameters() before .value to avoid triggering
+    # numeric() on atoms that don't support it (e.g. cached SymbolicQuadForm).
+    if not expr.variables() and not expr.parameters() and expr.value is not None:
+        c = to_dense_float(expr.value)
+        d1, d2 = normalize_shape(expr.shape)
+        return _diffengine.make_parameter(d1, d2, -1, n_vars, c.flatten(order='F'))
+
     # Recursive case: atoms
     atom_name = type(expr).__name__
     children = [convert_expr(arg, var_dict, n_vars, param_dict) for arg in expr.args]
@@ -113,6 +141,11 @@ def convert_expr(expr, var_dict, n_vars, param_dict=None):
         C_expr = convert_matmul(expr, children, var_dict, n_vars, param_dict)
     elif atom_name == "multiply":
         C_expr = convert_multiply(expr, children, var_dict, n_vars, param_dict)
+    elif atom_name in ("QuadForm", "SymbolicQuadForm"):
+        from cvxpy.reductions.solvers.nlp_solvers.diff_engine.registry import (
+            convert_quad_form,
+        )
+        C_expr = convert_quad_form(expr, children, n_vars)
     elif atom_name in ATOM_CONVERTERS:
         C_expr = ATOM_CONVERTERS[atom_name](expr, children)
     else:
@@ -123,10 +156,16 @@ def convert_expr(expr, var_dict, n_vars, param_dict=None):
     d1_Python, d2_Python = normalize_shape(expr.shape)
 
     if d1_C != d1_Python or d2_C != d2_Python:
-        raise ValueError(
-            f"Dimension mismatch for atom '{atom_name}': "
-            f"C dimensions ({d1_C}, {d2_C}) vs Python dimensions ({d1_Python}, {d2_Python})"
-        )
-
+        # 1D Python shapes (n,) normalize to (1, n), but the C engine may
+        # produce (n, 1) — e.g. matrix @ scalar or transpose of a vector.
+        # Both represent the same 1D data; reshape to match Python convention.
+        if len(expr.shape) <= 1 and d1_C * d2_C == d1_Python * d2_Python:
+            C_expr = _diffengine.make_reshape(C_expr, d1_Python, d2_Python)
+        else:
+            raise ValueError(
+                f"Dimension mismatch for atom '{atom_name}': "
+                f"C dimensions ({d1_C}, {d2_C}) vs "
+                f"Python dimensions ({d1_Python}, {d2_Python})"
+            )
 
     return C_expr
diff --git a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/helpers.py b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/helpers.py
index cf240bee87..7805a69887 100644
--- a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/helpers.py
+++ b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/helpers.py
@@ -53,7 +53,10 @@ def make_sparse_left_matmul(param_node, child, A):
 
 
 def make_dense_left_matmul(param_node, child, A):
-    m, n = normalize_shape(A.shape)
+    if A.ndim == 1:
+        m, n = 1, A.shape[0]
+    else:
+        m, n = A.shape
     return _diffengine.make_left_matmul(
         param_node, child, 'dense', A.flatten(order='C'), m, n)
 
@@ -70,7 +73,10 @@ def make_sparse_right_matmul(param_node, child, A):
 
 
 def make_dense_right_matmul(param_node, child, A):
-    m, n = normalize_shape(A.shape)
+    if A.ndim == 1:
+        m, n = A.shape[0], 1
+    else:
+        m, n = A.shape
     return _diffengine.make_right_matmul(
         param_node, child, 'dense', A.flatten(order='C'), m, n)
 
diff --git a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py
index 6035f8ea73..e79731f02d 100644
--- a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py
+++ b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py
@@ -92,42 +92,58 @@ def convert_rel_entr(expr, children):
     return _diffengine.make_rel_entr(children[0], children[1])
 
 
-def convert_quad_form(expr, children):
-    """Convert quadratic form x.T @ P @ x."""
+def convert_quad_form(expr, children, n_vars):
+    """Convert quadratic form x.T @ P @ x.
 
+    Currently only handles scalar quad forms (shape ()). Vector-shaped
+    SymbolicQuadForm (e.g. from huber, quad_over_lin with axis) requires
+    native block quadform support in SparseDiffPy.
+    """
     P = expr.args[1]
 
-    if not isinstance(P, cp.Constant):
+    if not P.is_constant():
         raise NotImplementedError("quad_form requires P to be a constant matrix")
 
-    P = P.value
+    # TODO: vector-shaped SymbolicQuadForm (e.g. from huber, quad_over_lin
+    # with axis) needs native block quadform support in SparseDiffPy rather
+    # than decomposing into diag(P) * power(x, 2).
+    if expr.size > 1:
+        raise NotImplementedError(
+            f"Vector-shaped quad form (shape {expr.shape}) not yet supported "
+            "by the diffengine backend. Needs native block quadform in SparseDiffPy."
+        )
 
-    if not isinstance(P, sparse.csr_matrix):
-          P = sparse.csr_matrix(P)
+    P_val = P.value
+    if sparse.issparse(P_val):
+        P_val = P_val.toarray()
+    P_val = np.asarray(P_val, dtype=np.float64)
 
+    P_csr = sparse.csr_matrix(P_val)
     return _diffengine.make_quad_form(
         children[0],
-        P.data.astype(np.float64),
-        P.indices.astype(np.int32),
-        P.indptr.astype(np.int32),
-        P.shape[0],
-        P.shape[1],
+        P_csr.data.astype(np.float64),
+        P_csr.indices.astype(np.int32),
+        P_csr.indptr.astype(np.int32),
+        P_csr.shape[0],
+        P_csr.shape[1],
     )
 
 
 def convert_reshape(expr, children):
-    """Convert reshape - only Fortran order is supported.
+    """Convert reshape.
 
-    Note: Only order='F' (Fortran/column-major) is supported.
+    F-order (column-major) uses make_reshape directly.
+    C-order (row-major) is decomposed as: transpose(reshape(transpose(x), (n, m), F))
+    since reshape(x, (m, n), C) == transpose(reshape(transpose(x), (n, m), F)).
     """
-    if expr.order != "F":
-        raise NotImplementedError(
-            f"reshape with order='{expr.order}' not supported. "
-            "Only order='F' (Fortran) is currently supported."
-        )
-
     d1, d2 = normalize_shape(expr.shape)
-    return _diffengine.make_reshape(children[0], d1, d2)
+    if expr.order == "F":
+        return _diffengine.make_reshape(children[0], d1, d2)
+    else:
+        # C-order: transpose input, F-reshape to swapped dims, transpose output
+        transposed = _diffengine.make_transpose(children[0])
+        reshaped = _diffengine.make_reshape(transposed, d2, d1)
+        return _diffengine.make_transpose(reshaped)
 
 def convert_broadcast(expr, children):
     d1, d2 = expr.broadcast_shape
@@ -173,9 +189,12 @@ def convert_prod(expr, children):
         return _diffengine.make_prod_axis_one(children[0])
 
 def convert_transpose(expr, children):
-    # If the child is a vector (shape (n,) or (n,1) or (1,n)), use reshape to transpose
-    child_shape = normalize_shape(expr.args[0].shape)
+    # In CVXPY, transposing a 1D vector (n,) is a no-op: (n,).T == (n,).
+    # The C engine stores 1D as (1, n), so we must not flip it to (n, 1).
+    if len(expr.args[0].shape) <= 1:
+        return children[0]
 
+    child_shape = normalize_shape(expr.args[0].shape)
     if 1 in child_shape:
         return _diffengine.make_reshape(children[0], child_shape[1], child_shape[0])
     else:
@@ -229,8 +248,8 @@ def convert_upper_tri(_expr, children):
     # Reductions
     "Sum": convert_sum,
     # Bivariate
-    "QuadForm": convert_quad_form,
-    "SymbolicQuadForm": convert_quad_form,
+    # QuadForm and SymbolicQuadForm are dispatched directly in convert_expr
+    # (they need n_vars for creating constant nodes in the vector case).
     "quad_over_lin": convert_quad_over_lin,
     "rel_entr": convert_rel_entr,
     # Elementwise univariate with parameter

From ee6e20cc5d18372a7a3dec4177edb36942213940 Mon Sep 17 00:00:00 2001
From: William Zijie Zhang <william@gridmatic.com>
Date: Thu, 16 Apr 2026 16:45:54 -0400
Subject: [PATCH 2/5] adds converters changes for right matmul support

---
 .../nlp_solvers/diff_engine/converters.py     | 60 ++++++++++---------
 .../nlp_solvers/diff_engine/helpers.py        | 10 +---
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py
index 6aea0cad04..788a67bfa4 100644
--- a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py
+++ b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py
@@ -30,49 +30,55 @@
 from cvxpy.reductions.solvers.nlp_solvers.diff_engine.registry import ATOM_CONVERTERS
 
 
+def _matmul_normalize_1d(A, side):
+    """Reshape a 1D numpy array to 2D for matmul.
+
+    NumPy matmul treats 1D arrays differently depending on which side:
+      Left 1D:  (k,) → (1, k) — row vector
+      Right 1D: (k,) → (k, 1) — column vector
+    2D input is returned unchanged.
+    """
+    if A.ndim == 1:
+        return A.reshape(1, -1) if side == 'left' else A.reshape(-1, 1)
+    return A
+
+
 def convert_matmul(expr, children, var_dict, n_vars, param_dict):
     """Convert matrix multiplication A @ f(x), f(x) @ A, or X @ Y.
 
     NumPy matmul semantics for 1D arrays:
-      (n,) @ (m,k) → treat left as (1,n)
-      (m,k) @ (n,) → treat right as (n,1)
-      (n,) @ (n,)  → dot product, treat as (1,n) @ (n,1) → scalar
+      (n,) @ (m,k) → treat left as (1,n)   — normalize_shape already does this
+      (m,k) @ (n,) → treat right as (n,1)  — must reshape from (1,n) storage
+      (n,) @ (n,)  → dot product: (1,n) @ (n,1) → scalar
 
     The C engine only has 2D nodes. 1D expressions are stored as (1,n) by
-    normalize_shape.  For left-matmul A @ child, if the child is 1D we must
-    reshape it to (n,1) so the inner dimensions agree.  For right-matmul
-    child @ A, the helpers already handle 1D A via the ndim==1 branch.
+    normalize_shape. All 1D→2D matmul normalization is handled here so that
+    helper functions always receive properly shaped 2D data.
     """
     left_arg, right_arg = expr.args
+    left_child, right_child = children
+
+    # Right 1D child: C stores as (1, n) but matmul needs (n, 1).
+    # Do this once, before branching — used by all three branches.
+    if len(right_arg.shape) <= 1 and right_arg.size > 1:
+        right_child = _diffengine.make_reshape(right_child, right_arg.size, 1)
 
     if left_arg.is_constant():
-        A = left_arg.value
-        if isinstance(left_arg, cp.Parameter):
-            param_node = param_dict[left_arg.id]
-        else:
-            param_node = None
-        child = children[1]
-        # If the right operand is 1D, the C node is (1,n) but left-matmul
-        # needs it as (n,1) so inner dims match: A(m,n) @ child(n,1).
-        if len(right_arg.shape) <= 1 and right_arg.size > 1:
-            n = right_arg.size
-            child = _diffengine.make_reshape(child, n, 1)
+        A = _matmul_normalize_1d(left_arg.value, 'left')
+        param_node = param_dict[left_arg.id] if isinstance(left_arg, cp.Parameter) else None
         if sparse.issparse(A):
-            return make_sparse_left_matmul(param_node, child, A)
-        return make_dense_left_matmul(param_node, child, A)
+            return make_sparse_left_matmul(param_node, right_child, A)
+        return make_dense_left_matmul(param_node, right_child, A)
 
     elif right_arg.is_constant():
-        A = right_arg.value
-        if isinstance(right_arg, cp.Parameter):
-            param_node = param_dict[right_arg.id]
-        else:
-            param_node = None
+        A = _matmul_normalize_1d(right_arg.value, 'right')
+        param_node = param_dict[right_arg.id] if isinstance(right_arg, cp.Parameter) else None
         if sparse.issparse(A):
-            return make_sparse_right_matmul(param_node, children[0], A)
-        return make_dense_right_matmul(param_node, children[0], A)
+            return make_sparse_right_matmul(param_node, left_child, A)
+        return make_dense_right_matmul(param_node, left_child, A)
 
     else:
-        return _diffengine.make_matmul(children[0], children[1])
+        return _diffengine.make_matmul(left_child, right_child)
 
 # TODO we should support sparse elementwise multiply at some point.
 def convert_multiply(expr, children, var_dict, n_vars, param_dict):
diff --git a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/helpers.py b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/helpers.py
index 7805a69887..d1b392fe03 100644
--- a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/helpers.py
+++ b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/helpers.py
@@ -53,10 +53,7 @@ def make_sparse_left_matmul(param_node, child, A):
 
 
 def make_dense_left_matmul(param_node, child, A):
-    if A.ndim == 1:
-        m, n = 1, A.shape[0]
-    else:
-        m, n = A.shape
+    m, n = A.shape
     return _diffengine.make_left_matmul(
         param_node, child, 'dense', A.flatten(order='C'), m, n)
 
@@ -73,10 +70,7 @@ def make_sparse_right_matmul(param_node, child, A):
 
 
 def make_dense_right_matmul(param_node, child, A):
-    if A.ndim == 1:
-        m, n = A.shape[0], 1
-    else:
-        m, n = A.shape
+    m, n = A.shape
     return _diffengine.make_right_matmul(
         param_node, child, 'dense', A.flatten(order='C'), m, n)
 

From 14ec3af77237dacf323dbf2cbe56a7121875eaa2 Mon Sep 17 00:00:00 2001
From: William Zijie Zhang <william@gridmatic.com>
Date: Tue, 21 Apr 2026 16:52:32 -0400
Subject: [PATCH 3/5] fixes crashes for left matmul

---
 cvxpy/cvxcore/python/canonInterface.py             |  7 +++++++
 .../solvers/nlp_solvers/diff_engine/converters.py  | 14 ++++++++++++--
 pyproject.toml                                     |  7 +++++++
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/cvxpy/cvxcore/python/canonInterface.py b/cvxpy/cvxcore/python/canonInterface.py
index ebf2945e6b..ae4bbfae0a 100644
--- a/cvxpy/cvxcore/python/canonInterface.py
+++ b/cvxpy/cvxcore/python/canonInterface.py
@@ -292,6 +292,13 @@ def get_problem_matrix(linOps,
     default_canon_backend = get_default_canon_backend()
     canon_backend = default_canon_backend if not canon_backend else canon_backend
 
+    # DIFFENGINE is a reduction-layer replacement for ConeMatrixStuffing and
+    # has no lin_ops backend of its own. When code paths reach this matrix
+    # builder directly (e.g. tests that bypass the stuffing reduction), fall
+    # through to the CPP backend so the lin_ops pipeline still works.
+    if canon_backend == s.DIFFENGINE_BACKEND:
+        canon_backend = s.CPP_CANON_BACKEND
+
     if canon_backend == s.CPP_CANON_BACKEND:
         from cvxpy.cvxcore.python.cppbackend import build_matrix
         return build_matrix(id_to_col, param_to_size, param_to_col, var_length, constr_length, linOps)
diff --git a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py
index 788a67bfa4..8db23b5bd1 100644
--- a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py
+++ b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py
@@ -65,14 +65,24 @@ def convert_matmul(expr, children, var_dict, n_vars, param_dict):
 
     if left_arg.is_constant():
         A = _matmul_normalize_1d(left_arg.value, 'left')
-        param_node = param_dict[left_arg.id] if isinstance(left_arg, cp.Parameter) else None
+        if isinstance(left_arg, cp.Parameter):
+            param_node = param_dict[left_arg.id]
+        elif left_arg.parameters():
+            param_node = left_child
+        else:
+            param_node = None
         if sparse.issparse(A):
             return make_sparse_left_matmul(param_node, right_child, A)
         return make_dense_left_matmul(param_node, right_child, A)
 
     elif right_arg.is_constant():
         A = _matmul_normalize_1d(right_arg.value, 'right')
-        param_node = param_dict[right_arg.id] if isinstance(right_arg, cp.Parameter) else None
+        if isinstance(right_arg, cp.Parameter):
+            param_node = param_dict[right_arg.id]
+        elif right_arg.parameters():
+            param_node = right_child
+        else:
+            param_node = None
         if sparse.issparse(A):
             return make_sparse_right_matmul(param_node, left_child, A)
         return make_dense_right_matmul(param_node, left_child, A)
diff --git a/pyproject.toml b/pyproject.toml
index 0b4136e758..39f3ba08d8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,6 +59,13 @@ testpaths = [
     "cvxpy/tests/"
 ]
 
+[tool.uv.sources]
+# Local dev override: build sparsediffpy from the adjacent source tree so we
+# pick up bindings (vstack / diag_mat / upper_tri / parameter support) that
+# haven't shipped a PyPI release yet. Remove once sparsediffpy >= 0.3.0 is
+# published.
+sparsediffpy = { path = "../SparseDiffPy", editable = true }
+
 [build-system]
 requires = [
     "numpy >= 2.0.0",

From bc91ef80e01b6378ee6c1e80c1931b1998a54b62 Mon Sep 17 00:00:00 2001
From: William Zijie Zhang <william@gridmatic.com>
Date: Wed, 22 Apr 2026 11:19:46 -0400
Subject: [PATCH 4/5] add changes to registry

---
 .../nlp_solvers/diff_engine/registry.py       | 162 ++++++++++++++++--
 1 file changed, 149 insertions(+), 13 deletions(-)

diff --git a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py
index e79731f02d..5c2134af0c 100644
--- a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py
+++ b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py
@@ -40,6 +40,49 @@ def convert_vstack(expr, children):
     return _diffengine.make_vstack(children)
 
 
+def convert_conv(expr, children):
+    """Convert cp.conv / cp.convolve to _diffengine.make_convolve.
+
+    Both CVXPY atoms take args = [constant_kernel, signal] with the kernel
+    validated as constant. The C node computes full 1D convolution
+    (length m + n - 1) given a length-m kernel parameter capsule and a
+    length-n child. convert_expr has already built both children as
+    appropriate capsules (PARAM_FIXED for Constants, param_dict lookup
+    for Parameters, recursive build for affine-of-parameter).
+    """
+    return _diffengine.make_convolve(children[0], children[1])
+
+
+def convert_concatenate(expr, children):
+    """Convert Concatenate along an existing axis via hstack/vstack.
+
+    Concatenate semantics:
+      ndim=1, axis=0 → flat concat, same as hstack on 1D
+      ndim=2, axis=0 → stack rows, same as vstack
+      ndim=2, axis=1 → stack cols, same as hstack
+    """
+    axis = expr.axis
+    arg_shapes = [arg.shape for arg in expr.args]
+    ndims = {len(s) for s in arg_shapes}
+    if len(ndims) != 1:
+        raise NotImplementedError(
+            "Concatenate across inputs of mixed dimensionality is not "
+            "supported by the diffengine backend."
+        )
+    ndim = ndims.pop()
+
+    if ndim == 1 and axis == 0:
+        return _diffengine.make_hstack(children)
+    if ndim == 2 and axis == 0:
+        return _diffengine.make_vstack(children)
+    if ndim == 2 and axis == 1:
+        return _diffengine.make_hstack(children)
+    raise NotImplementedError(
+        f"Concatenate with ndim={ndim}, axis={axis} is not supported "
+        "by the diffengine backend."
+    )
+
+
 def convert_div(expr, children):
     """Convert x / c by multiplying x by the elementwise reciprocal of c."""
     from cvxpy.reductions.solvers.nlp_solvers.diff_engine.helpers import to_dense_float
@@ -92,33 +135,122 @@ def convert_rel_entr(expr, children):
     return _diffengine.make_rel_entr(children[0], children[1])
 
 
+def _diagonal_weights(P_val):
+    """If P_val is (numerically) diagonal, return its diagonal as a 1D
+    float64 array; otherwise return None."""
+    if sparse.issparse(P_val):
+        P_val = P_val.toarray()
+    P = np.asarray(P_val, dtype=np.float64)
+    if P.ndim != 2 or P.shape[0] != P.shape[1]:
+        return None
+    w = np.diag(P)
+    if np.allclose(P, np.diag(w)):
+        return w
+    return None
+
+
 def convert_quad_form(expr, children, n_vars):
     """Convert quadratic form x.T @ P @ x.
 
-    Currently only handles scalar quad forms (shape ()). Vector-shaped
-    SymbolicQuadForm (e.g. from huber, quad_over_lin with axis) requires
-    native block quadform support in SparseDiffPy.
+    Scalar output (shape ()) uses the native `make_quad_form` C node.
+
+    Vector-shaped SymbolicQuadForm is lowered using existing atoms when P
+    is diagonal (every CVXPY canonicalizer that produces a vector-shaped
+    SymbolicQuadForm today builds it with P = eye or P = α·eye):
+
+      - `block_indices is None`, P = diag(w):
+          y = w ⊙ x**2
+      - `block_indices` given with uniform block size m, P = α·I:
+          y_j = α · Σ_{i ∈ I_j} x_i**2
+        (gather → square → axis-0 sum → scalar-mult by α)
+
+    A genuinely non-diagonal vector quad_form still raises
+    NotImplementedError — that case needs a native block quad_form node
+    in SparseDiffPy.
     """
     P = expr.args[1]
 
     if not P.is_constant():
         raise NotImplementedError("quad_form requires P to be a constant matrix")
 
-    # TODO: vector-shaped SymbolicQuadForm (e.g. from huber, quad_over_lin
-    # with axis) needs native block quadform support in SparseDiffPy rather
-    # than decomposing into diag(P) * power(x, 2).
-    if expr.size > 1:
+    P_val = P.value
+    if P_val is None:
         raise NotImplementedError(
-            f"Vector-shaped quad form (shape {expr.shape}) not yet supported "
-            "by the diffengine backend. Needs native block quadform in SparseDiffPy."
+            "quad_form with a symbolic P (e.g. eye/parameter) is not yet "
+            "supported by the diffengine backend."
         )
 
-    P_val = P.value
+    if expr.size > 1:
+        w = _diagonal_weights(P_val)
+        if w is None:
+            raise NotImplementedError(
+                f"Non-diagonal vector-shaped quad form (shape {expr.shape}) "
+                "is not supported by the diffengine backend. Needs native "
+                "block quadform in SparseDiffPy."
+            )
+
+        block_indices = getattr(expr, "block_indices", None)
+        child = children[0]
+        uniform_alpha = float(w[0]) if np.allclose(w, w[0]) else None
+
+        if block_indices is None:
+            # Case A: y_j = w_j * x_j**2 (elementwise, K == N).
+            x_sq = _diffengine.make_power(child, 2.0)
+            if uniform_alpha is not None and np.isclose(uniform_alpha, 1.0):
+                out = x_sq
+            elif uniform_alpha is not None:
+                alpha_node = _diffengine.make_parameter(
+                    1, 1, -1, n_vars,
+                    np.array([uniform_alpha], dtype=np.float64))
+                out = _diffengine.make_param_scalar_mult(alpha_node, x_sq)
+            else:
+                d1, d2 = normalize_shape(expr.shape)
+                w_node = _diffengine.make_parameter(
+                    d1, d2, -1, n_vars,
+                    np.asarray(w, dtype=np.float64).flatten(order="F"))
+                out = _diffengine.make_param_vector_mult(w_node, x_sq)
+        else:
+            # Case B: y_j = α · Σ_{i ∈ I_j} x_i**2 (requires uniform blocks).
+            if uniform_alpha is None:
+                raise NotImplementedError(
+                    "quad_form with block_indices and non-uniform P diagonal "
+                    "is not supported by the diffengine backend."
+                )
+            block_lens = {len(b) for b in block_indices}
+            if len(block_lens) != 1:
+                raise NotImplementedError(
+                    "quad_form with non-uniform block_indices sizes is not "
+                    "supported by the diffengine backend."
+                )
+            m = block_lens.pop()
+            K = len(block_indices)
+            flat_idx = np.concatenate(
+                [np.asarray(b, dtype=np.int32) for b in block_indices]
+            ).astype(np.int32)
+            gathered = _diffengine.make_index(child, m, K, flat_idx)
+            sq = _diffengine.make_power(gathered, 2.0)
+            summed = _diffengine.make_sum(sq, 0)
+            if np.isclose(uniform_alpha, 1.0):
+                out = summed
+            else:
+                alpha_node = _diffengine.make_parameter(
+                    1, 1, -1, n_vars,
+                    np.array([uniform_alpha], dtype=np.float64))
+                out = _diffengine.make_param_scalar_mult(alpha_node, summed)
+
+        # Match the declared output shape (convert_expr's final dim check
+        # compares C dims to normalize_shape(expr.shape)).
+        d1_out, d2_out = normalize_shape(expr.shape)
+        d1_c, d2_c = _diffengine.get_expr_dimensions(out)
+        if (d1_c, d2_c) != (d1_out, d2_out):
+            out = _diffengine.make_reshape(out, d1_out, d2_out)
+        return out
+
+    # Scalar output: native quad_form node.
     if sparse.issparse(P_val):
         P_val = P_val.toarray()
-    P_val = np.asarray(P_val, dtype=np.float64)
-
-    P_csr = sparse.csr_matrix(P_val)
+    P_arr = np.asarray(P_val, dtype=np.float64)
+    P_csr = sparse.csr_matrix(P_arr)
     return _diffengine.make_quad_form(
         children[0],
         P_csr.data.astype(np.float64),
@@ -280,6 +412,10 @@ def convert_upper_tri(_expr, children):
     # Horizontal/vertical stack
     "Hstack": convert_hstack,
     "Vstack": convert_vstack,
+    "Concatenate": convert_concatenate,
+    # 1D full convolution
+    "conv": convert_conv,
+    "convolve": convert_conv,
     "Trace": convert_trace,
     # Diagonal and triangular
     "diag_vec": convert_diag_vec,

From e71ea9d75c31586f9c974be7930589032dc91904 Mon Sep 17 00:00:00 2001
From: William Zijie Zhang <william@gridmatic.com>
Date: Wed, 22 Apr 2026 11:24:44 -0400
Subject: [PATCH 5/5] run linter

---
 cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py
index 5c2134af0c..5fd7093156 100644
--- a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py
+++ b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py
@@ -23,7 +23,6 @@
 from scipy import sparse
 from sparsediffpy import _sparsediffengine as _diffengine
 
-import cvxpy as cp
 from cvxpy.reductions.solvers.nlp_solvers.diff_engine.helpers import (
     chain_add,
     normalize_shape,