diff --git a/cvxpy/cvxcore/python/canonInterface.py b/cvxpy/cvxcore/python/canonInterface.py
index ebf2945e6b..ae4bbfae0a 100644
--- a/cvxpy/cvxcore/python/canonInterface.py
+++ b/cvxpy/cvxcore/python/canonInterface.py
@@ -292,6 +292,13 @@ def get_problem_matrix(linOps,
     default_canon_backend = get_default_canon_backend()
     canon_backend = default_canon_backend if not canon_backend else canon_backend
 
+    # DIFFENGINE is a reduction-layer replacement for ConeMatrixStuffing and
+    # has no lin_ops backend of its own. When code paths reach this matrix
+    # builder directly (e.g. tests that bypass the stuffing reduction), fall
+    # through to the CPP backend so the lin_ops pipeline still works.
+    if canon_backend == s.DIFFENGINE_BACKEND:
+        canon_backend = s.CPP_CANON_BACKEND
+
     if canon_backend == s.CPP_CANON_BACKEND:
         from cvxpy.cvxcore.python.cppbackend import build_matrix
         return build_matrix(id_to_col, param_to_size, param_to_col, var_length, constr_length, linOps)
diff --git a/cvxpy/reductions/dcp2cone/canonicalizers/perspective_canon.py b/cvxpy/reductions/dcp2cone/canonicalizers/perspective_canon.py
index 973b84f24f..8be83140de 100644
--- a/cvxpy/reductions/dcp2cone/canonicalizers/perspective_canon.py
+++ b/cvxpy/reductions/dcp2cone/canonicalizers/perspective_canon.py
@@ -39,10 +39,22 @@ def perspective_canon(expr, args, solver_context: SolverInfo | None = None):
     prob_canon = chain.apply(aux_prob)[0]  # grab problem instance
     # get cone representation of c, A, and b for some problem.
 
-    q = prob_canon.q.toarray().flatten()[:-1]
-    d = prob_canon.q.toarray().flatten()[-1]
-    Ab = prob_canon.A.toarray().reshape((-1, len(q) + 1), order="F")
-    A, b = Ab[:, :-1], Ab[:, -1]
+    # Extract cone representation q, d, A, b.
+    # ParamConeProg stores q as sparse (n+1, n_params+1) tensor with d
+    # embedded in the last row, and A as a flattened tensor with b embedded.
+    # DiffengineConeProgram stores q, d, A, b as separate concrete arrays.
+    if hasattr(prob_canon, 'd') and not hasattr(prob_canon.q, 'toarray'):
+        # DiffengineConeProgram: concrete matrices, d stored separately.
+        q = prob_canon.q
+        d = prob_canon.d
+        A = prob_canon.A.toarray() if hasattr(prob_canon.A, 'toarray') else prob_canon.A
+        b = prob_canon.b
+    else:
+        # ParamConeProg: sparse tensor with embedded offsets.
+        q = prob_canon.q.toarray().flatten()[:-1]
+        d = prob_canon.q.toarray().flatten()[-1]
+        Ab = prob_canon.A.toarray().reshape((-1, len(q) + 1), order="F")
+        A, b = Ab[:, :-1], Ab[:, -1]
 
     # given f in epigraph form, aka epi f = \{(x,t) | f(x) \leq t\}
     # = \{(x,t) | Fx +tg + e \in K} for K a cone, the epigraph of the
diff --git a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py
index 88f54f73cd..8db23b5bd1 100644
--- a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py
+++ b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/converters.py
@@ -30,32 +30,65 @@
 from cvxpy.reductions.solvers.nlp_solvers.diff_engine.registry import ATOM_CONVERTERS
 
 
+def _matmul_normalize_1d(A, side):
+    """Reshape a 1D numpy array to 2D for matmul.
+
+    NumPy matmul treats 1D arrays differently depending on which side:
+      Left 1D:  (k,) → (1, k) — row vector
+      Right 1D: (k,) → (k, 1) — column vector
+    2D input is returned unchanged.
+    """
+    if A.ndim == 1:
+        return A.reshape(1, -1) if side == 'left' else A.reshape(-1, 1)
+    return A
+
+
 def convert_matmul(expr, children, var_dict, n_vars, param_dict):
-    """Convert matrix multiplication A @ f(x), f(x) @ A, or X @ Y."""
+    """Convert matrix multiplication A @ f(x), f(x) @ A, or X @ Y.
+
+    NumPy matmul semantics for 1D arrays:
+      (n,) @ (m,k) → treat left as (1,n)   — normalize_shape already does this
+      (m,k) @ (n,) → treat right as (n,1)  — must reshape from (1,n) storage
+      (n,) @ (n,)  → dot product: (1,n) @ (n,1) → scalar
+
+    The C engine only has 2D nodes. 1D expressions are stored as (1,n) by
+    normalize_shape. All 1D→2D matmul normalization is handled here so that
+    helper functions always receive properly shaped 2D data.
+    """
     left_arg, right_arg = expr.args
+    left_child, right_child = children
+
+    # Right 1D child: C stores as (1, n) but matmul needs (n, 1).
+    # Do this once, before branching — used by all three branches.
+    if len(right_arg.shape) <= 1 and right_arg.size > 1:
+        right_child = _diffengine.make_reshape(right_child, right_arg.size, 1)
 
     if left_arg.is_constant():
-        A = left_arg.value
+        A = _matmul_normalize_1d(left_arg.value, 'left')
         if isinstance(left_arg, cp.Parameter):
             param_node = param_dict[left_arg.id]
+        elif left_arg.parameters():
+            param_node = left_child
         else:
             param_node = None
         if sparse.issparse(A):
-            return make_sparse_left_matmul(param_node, children[1], A)
-        return make_dense_left_matmul(param_node, children[1], A)
+            return make_sparse_left_matmul(param_node, right_child, A)
+        return make_dense_left_matmul(param_node, right_child, A)
 
     elif right_arg.is_constant():
-        A = right_arg.value
+        A = _matmul_normalize_1d(right_arg.value, 'right')
         if isinstance(right_arg, cp.Parameter):
             param_node = param_dict[right_arg.id]
+        elif right_arg.parameters():
+            param_node = right_child
         else:
             param_node = None
         if sparse.issparse(A):
-            return make_sparse_right_matmul(param_node, children[0], A)
-        return make_dense_right_matmul(param_node, children[0], A)
+            return make_sparse_right_matmul(param_node, left_child, A)
+        return make_dense_right_matmul(param_node, left_child, A)
 
     else:
-        return _diffengine.make_matmul(children[0], children[1])
+        return _diffengine.make_matmul(left_child, right_child)
 
 # TODO we should support sparse elementwise multiply at some point.
 def convert_multiply(expr, children, var_dict, n_vars, param_dict):
@@ -98,11 +131,22 @@ def convert_expr(expr, var_dict, n_vars, param_dict=None):
         return param_dict[expr.id]
 
     # Base case: constant (in the diff engine, a constant is a parameter with ID -1)
+    # Also handles atoms applied to pure constants (e.g. PnormApprox(Constant), floor(Constant))
+    # that weren't folded by Dcp2Cone.
     if isinstance(expr, cp.Constant):
         c = to_dense_float(expr.value)
         d1, d2 = normalize_shape(expr.shape)
         return _diffengine.make_parameter(d1, d2, -1, n_vars, c.flatten(order='F'))
 
+    # Constant atom: no variables/parameters, so it must have a concrete value.
+    # Handles atoms like floor(NegExpression(Constant(5.0))) after EvalParams.
+    # Check variables()/parameters() before .value to avoid triggering
+    # numeric() on atoms that don't support it (e.g. cached SymbolicQuadForm).
+    if not expr.variables() and not expr.parameters() and expr.value is not None:
+        c = to_dense_float(expr.value)
+        d1, d2 = normalize_shape(expr.shape)
+        return _diffengine.make_parameter(d1, d2, -1, n_vars, c.flatten(order='F'))
+
     # Recursive case: atoms
     atom_name = type(expr).__name__
     children = [convert_expr(arg, var_dict, n_vars, param_dict) for arg in expr.args]
@@ -113,6 +157,11 @@ def convert_expr(expr, var_dict, n_vars, param_dict=None):
         C_expr = convert_matmul(expr, children, var_dict, n_vars, param_dict)
     elif atom_name == "multiply":
         C_expr = convert_multiply(expr, children, var_dict, n_vars, param_dict)
+    elif atom_name in ("QuadForm", "SymbolicQuadForm"):
+        from cvxpy.reductions.solvers.nlp_solvers.diff_engine.registry import (
+            convert_quad_form,
+        )
+        C_expr = convert_quad_form(expr, children, n_vars)
     elif atom_name in ATOM_CONVERTERS:
         C_expr = ATOM_CONVERTERS[atom_name](expr, children)
     else:
@@ -123,10 +172,16 @@ def convert_expr(expr, var_dict, n_vars, param_dict=None):
     d1_Python, d2_Python = normalize_shape(expr.shape)
 
     if d1_C != d1_Python or d2_C != d2_Python:
-        raise ValueError(
-            f"Dimension mismatch for atom '{atom_name}': "
-            f"C dimensions ({d1_C}, {d2_C}) vs Python dimensions ({d1_Python}, {d2_Python})"
-        )
-
+        # 1D Python shapes (n,) normalize to (1, n), but the C engine may
+        # produce (n, 1) — e.g. matrix @ scalar or transpose of a vector.
+        # Both represent the same 1D data; reshape to match Python convention.
+        if len(expr.shape) <= 1 and d1_C * d2_C == d1_Python * d2_Python:
+            C_expr = _diffengine.make_reshape(C_expr, d1_Python, d2_Python)
+        else:
+            raise ValueError(
+                f"Dimension mismatch for atom '{atom_name}': "
+                f"C dimensions ({d1_C}, {d2_C}) vs "
+                f"Python dimensions ({d1_Python}, {d2_Python})"
+            )
 
     return C_expr
diff --git a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/helpers.py b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/helpers.py
index cf240bee87..d1b392fe03 100644
--- a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/helpers.py
+++ b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/helpers.py
@@ -53,7 +53,7 @@ def make_sparse_left_matmul(param_node, child, A):
 
 
 def make_dense_left_matmul(param_node, child, A):
-    m, n = normalize_shape(A.shape)
+    m, n = A.shape
     return _diffengine.make_left_matmul(
         param_node, child, 'dense', A.flatten(order='C'), m, n)
 
@@ -70,7 +70,7 @@ def make_sparse_right_matmul(param_node, child, A):
 
 
 def make_dense_right_matmul(param_node, child, A):
-    m, n = normalize_shape(A.shape)
+    m, n = A.shape
     return _diffengine.make_right_matmul(
         param_node, child, 'dense', A.flatten(order='C'), m, n)
 
diff --git a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py
index 6035f8ea73..5fd7093156 100644
--- a/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py
+++ b/cvxpy/reductions/solvers/nlp_solvers/diff_engine/registry.py
@@ -23,7 +23,6 @@
 from scipy import sparse
 from sparsediffpy import _sparsediffengine as _diffengine
 
-import cvxpy as cp
 from cvxpy.reductions.solvers.nlp_solvers.diff_engine.helpers import (
     chain_add,
     normalize_shape,
@@ -40,6 +39,49 @@ def convert_vstack(expr, children):
     return _diffengine.make_vstack(children)
 
 
+def convert_conv(expr, children):
+    """Convert cp.conv / cp.convolve to _diffengine.make_convolve.
+
+    Both CVXPY atoms take args = [constant_kernel, signal] with the kernel
+    validated as constant. The C node computes full 1D convolution
+    (length m + n - 1) given a length-m kernel parameter capsule and a
+    length-n child. convert_expr has already built both children as
+    appropriate capsules (PARAM_FIXED for Constants, param_dict lookup
+    for Parameters, recursive build for affine-of-parameter).
+    """
+    return _diffengine.make_convolve(children[0], children[1])
+
+
+def convert_concatenate(expr, children):
+    """Convert Concatenate along an existing axis via hstack/vstack.
+
+    Concatenate semantics:
+      ndim=1, axis=0 → flat concat, same as hstack on 1D
+      ndim=2, axis=0 → stack rows, same as vstack
+      ndim=2, axis=1 → stack cols, same as hstack
+    """
+    axis = expr.axis
+    arg_shapes = [arg.shape for arg in expr.args]
+    ndims = {len(s) for s in arg_shapes}
+    if len(ndims) != 1:
+        raise NotImplementedError(
+            "Concatenate across inputs of mixed dimensionality is not "
+            "supported by the diffengine backend."
+        )
+    ndim = ndims.pop()
+
+    if ndim == 1 and axis == 0:
+        return _diffengine.make_hstack(children)
+    if ndim == 2 and axis == 0:
+        return _diffengine.make_vstack(children)
+    if ndim == 2 and axis == 1:
+        return _diffengine.make_hstack(children)
+    raise NotImplementedError(
+        f"Concatenate with ndim={ndim}, axis={axis} is not supported "
+        "by the diffengine backend."
+    )
+
+
 def convert_div(expr, children):
     """Convert x / c by multiplying x by the elementwise reciprocal of c."""
     from cvxpy.reductions.solvers.nlp_solvers.diff_engine.helpers import to_dense_float
@@ -92,42 +134,147 @@ def convert_rel_entr(expr, children):
     return _diffengine.make_rel_entr(children[0], children[1])
 
 
-def convert_quad_form(expr, children):
-    """Convert quadratic form x.T @ P @ x."""
+def _diagonal_weights(P_val):
+    """If P_val is (numerically) diagonal, return its diagonal as a 1D
+    float64 array; otherwise return None."""
+    if sparse.issparse(P_val):
+        P_val = P_val.toarray()
+    P = np.asarray(P_val, dtype=np.float64)
+    if P.ndim != 2 or P.shape[0] != P.shape[1]:
+        return None
+    w = np.diag(P)
+    if np.allclose(P, np.diag(w)):
+        return w
+    return None
+
+
+def convert_quad_form(expr, children, n_vars):
+    """Convert quadratic form x.T @ P @ x.
 
+    Scalar output (shape ()) uses the native `make_quad_form` C node.
+
+    Vector-shaped SymbolicQuadForm is lowered using existing atoms when P
+    is diagonal (every CVXPY canonicalizer that produces a vector-shaped
+    SymbolicQuadForm today builds it with P = eye or P = α·eye):
+
+      - `block_indices is None`, P = diag(w):
+          y = w ⊙ x**2
+      - `block_indices` given with uniform block size m, P = α·I:
+          y_j = α · Σ_{i ∈ I_j} x_i**2
+        (gather → square → axis-0 sum → scalar-mult by α)
+
+    A genuinely non-diagonal vector quad_form still raises
+    NotImplementedError — that case needs a native block quad_form node
+    in SparseDiffPy.
+    """
     P = expr.args[1]
 
-    if not isinstance(P, cp.Constant):
+    if not P.is_constant():
         raise NotImplementedError("quad_form requires P to be a constant matrix")
 
-    P = P.value
-
-    if not isinstance(P, sparse.csr_matrix):
-          P = sparse.csr_matrix(P)
+    P_val = P.value
+    if P_val is None:
+        raise NotImplementedError(
+            "quad_form with a symbolic P (e.g. eye/parameter) is not yet "
+            "supported by the diffengine backend."
+        )
 
+    if expr.size > 1:
+        w = _diagonal_weights(P_val)
+        if w is None:
+            raise NotImplementedError(
+                f"Non-diagonal vector-shaped quad form (shape {expr.shape}) "
+                "is not supported by the diffengine backend. Needs native "
+                "block quadform in SparseDiffPy."
+            )
+
+        block_indices = getattr(expr, "block_indices", None)
+        child = children[0]
+        uniform_alpha = float(w[0]) if np.allclose(w, w[0]) else None
+
+        if block_indices is None:
+            # Case A: y_j = w_j * x_j**2 (elementwise, K == N).
+            x_sq = _diffengine.make_power(child, 2.0)
+            if uniform_alpha is not None and np.isclose(uniform_alpha, 1.0):
+                out = x_sq
+            elif uniform_alpha is not None:
+                alpha_node = _diffengine.make_parameter(
+                    1, 1, -1, n_vars,
+                    np.array([uniform_alpha], dtype=np.float64))
+                out = _diffengine.make_param_scalar_mult(alpha_node, x_sq)
+            else:
+                d1, d2 = normalize_shape(expr.shape)
+                w_node = _diffengine.make_parameter(
+                    d1, d2, -1, n_vars,
+                    np.asarray(w, dtype=np.float64).flatten(order="F"))
+                out = _diffengine.make_param_vector_mult(w_node, x_sq)
+        else:
+            # Case B: y_j = α · Σ_{i ∈ I_j} x_i**2 (requires uniform blocks).
+            if uniform_alpha is None:
+                raise NotImplementedError(
+                    "quad_form with block_indices and non-uniform P diagonal "
+                    "is not supported by the diffengine backend."
+                )
+            block_lens = {len(b) for b in block_indices}
+            if len(block_lens) != 1:
+                raise NotImplementedError(
+                    "quad_form with non-uniform block_indices sizes is not "
+                    "supported by the diffengine backend."
+                )
+            m = block_lens.pop()
+            K = len(block_indices)
+            flat_idx = np.concatenate(
+                [np.asarray(b, dtype=np.int32) for b in block_indices]
+            ).astype(np.int32)
+            gathered = _diffengine.make_index(child, m, K, flat_idx)
+            sq = _diffengine.make_power(gathered, 2.0)
+            summed = _diffengine.make_sum(sq, 0)
+            if np.isclose(uniform_alpha, 1.0):
+                out = summed
+            else:
+                alpha_node = _diffengine.make_parameter(
+                    1, 1, -1, n_vars,
+                    np.array([uniform_alpha], dtype=np.float64))
+                out = _diffengine.make_param_scalar_mult(alpha_node, summed)
+
+        # Match the declared output shape (convert_expr's final dim check
+        # compares C dims to normalize_shape(expr.shape)).
+        d1_out, d2_out = normalize_shape(expr.shape)
+        d1_c, d2_c = _diffengine.get_expr_dimensions(out)
+        if (d1_c, d2_c) != (d1_out, d2_out):
+            out = _diffengine.make_reshape(out, d1_out, d2_out)
+        return out
+
+    # Scalar output: native quad_form node.
+    if sparse.issparse(P_val):
+        P_val = P_val.toarray()
+    P_arr = np.asarray(P_val, dtype=np.float64)
+    P_csr = sparse.csr_matrix(P_arr)
     return _diffengine.make_quad_form(
         children[0],
-        P.data.astype(np.float64),
-        P.indices.astype(np.int32),
-        P.indptr.astype(np.int32),
-        P.shape[0],
-        P.shape[1],
+        P_csr.data.astype(np.float64),
+        P_csr.indices.astype(np.int32),
+        P_csr.indptr.astype(np.int32),
+        P_csr.shape[0],
+        P_csr.shape[1],
     )
 
 
 def convert_reshape(expr, children):
-    """Convert reshape - only Fortran order is supported.
+    """Convert reshape.
 
-    Note: Only order='F' (Fortran/column-major) is supported.
+    F-order (column-major) uses make_reshape directly.
+    C-order (row-major) is decomposed as: transpose(reshape(transpose(x), (n, m), F))
+    since reshape(x, (m, n), C) == transpose(reshape(transpose(x), (n, m), F)).
     """
-    if expr.order != "F":
-        raise NotImplementedError(
-            f"reshape with order='{expr.order}' not supported. "
-            "Only order='F' (Fortran) is currently supported."
-        )
-
     d1, d2 = normalize_shape(expr.shape)
-    return _diffengine.make_reshape(children[0], d1, d2)
+    if expr.order == "F":
+        return _diffengine.make_reshape(children[0], d1, d2)
+    else:
+        # C-order: transpose input, F-reshape to swapped dims, transpose output
+        transposed = _diffengine.make_transpose(children[0])
+        reshaped = _diffengine.make_reshape(transposed, d2, d1)
+        return _diffengine.make_transpose(reshaped)
 
 def convert_broadcast(expr, children):
     d1, d2 = expr.broadcast_shape
@@ -173,9 +320,12 @@ def convert_prod(expr, children):
         return _diffengine.make_prod_axis_one(children[0])
 
 def convert_transpose(expr, children):
-    # If the child is a vector (shape (n,) or (n,1) or (1,n)), use reshape to transpose
-    child_shape = normalize_shape(expr.args[0].shape)
+    # In CVXPY, transposing a 1D vector (n,) is a no-op: (n,).T == (n,).
+    # The C engine stores 1D as (1, n), so we must not flip it to (n, 1).
+    if len(expr.args[0].shape) <= 1:
+        return children[0]
 
+    child_shape = normalize_shape(expr.args[0].shape)
     if 1 in child_shape:
         return _diffengine.make_reshape(children[0], child_shape[1], child_shape[0])
     else:
@@ -229,8 +379,8 @@ def convert_upper_tri(_expr, children):
     # Reductions
     "Sum": convert_sum,
     # Bivariate
-    "QuadForm": convert_quad_form,
-    "SymbolicQuadForm": convert_quad_form,
+    # QuadForm and SymbolicQuadForm are dispatched directly in convert_expr
+    # (they need n_vars for creating constant nodes in the vector case).
     "quad_over_lin": convert_quad_over_lin,
     "rel_entr": convert_rel_entr,
     # Elementwise univariate with parameter
@@ -261,6 +411,10 @@ def convert_upper_tri(_expr, children):
     # Horizontal/vertical stack
     "Hstack": convert_hstack,
     "Vstack": convert_vstack,
+    "Concatenate": convert_concatenate,
+    # 1D full convolution
+    "conv": convert_conv,
+    "convolve": convert_conv,
     "Trace": convert_trace,
     # Diagonal and triangular
     "diag_vec": convert_diag_vec,
diff --git a/pyproject.toml b/pyproject.toml
index 0b4136e758..39f3ba08d8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,6 +59,13 @@ testpaths = [
     "cvxpy/tests/"
 ]
 
+[tool.uv.sources]
+# Local dev override: build sparsediffpy from the adjacent source tree so we
+# pick up bindings (vstack / diag_mat / upper_tri / parameter support) that
+# haven't shipped a PyPI release yet. Remove once sparsediffpy >= 0.3.0 is
+# published.
+sparsediffpy = { path = "../SparseDiffPy", editable = true }
+
 [build-system]
 requires = [
     "numpy >= 2.0.0",