
Commit e758def

Make GPU transforms more memory efficient (#887)
1 parent 8163f50 commit e758def

4 files changed

Lines changed: 336 additions & 66 deletions


src/tabpfn/preprocessing/torch/torch_quantile_transformer.py

Lines changed: 98 additions & 6 deletions
@@ -61,20 +61,78 @@ def fit(self, x: torch.Tensor) -> dict[str, torch.Tensor]:
         references = torch.linspace(
             0, 1, n_quantiles_effective, device=x.device, dtype=compute_dtype
         )
-        quantiles = torch.nanquantile(x.to(compute_dtype), references, dim=0)
+
+        x_compute = x.to(compute_dtype)
+        quantiles = self._nanquantile_chunked(x_compute, references)
+
         # Ensure monotonicity (handle floating point issues)
         # Use cumulative maximum along the quantile dimension
         quantiles = torch.cummax(quantiles, dim=0).values

         return {"quantiles": quantiles, "references": references}

+    def _nanquantile_chunked(
+        self,
+        x: torch.Tensor,
+        references: torch.Tensor,
+    ) -> torch.Tensor:
+        """Compute nanquantile in column-chunks to bound peak memory.
+
+        Each column is independent so we process subsets of columns to keep
+        peak memory at O(N * chunk_cols) instead of O(N * F). The chunk size
+        is determined by ``_get_fit_chunk_cols``.
+        """
+        original_shape = x.shape  # [N, ...]
+        n_samples = x.shape[0]
+
+        # Flatten trailing dimensions to [N, F]
+        x_flat = x.reshape(n_samples, -1) if x.ndim > 2 else x
+
+        n_features = x_flat.shape[1] if x_flat.ndim > 1 else 1
+        chunk_cols = self._get_fit_chunk_cols(x_flat)
+        process_all_at_once = n_features <= chunk_cols
+
+        if process_all_at_once:
+            quantiles = torch.nanquantile(x, references, dim=0)
+        else:
+            chunks = []
+            for col_start in range(0, n_features, chunk_cols):
+                q_chunk = torch.nanquantile(
+                    x_flat[:, col_start : col_start + chunk_cols], references, dim=0
+                )
+                chunks.append(q_chunk)
+            quantiles = torch.cat(chunks, dim=-1)
+
+        if x.ndim > 2:
+            quantiles = quantiles.reshape(len(references), *original_shape[1:])
+
+        return quantiles
+
+    def _get_fit_chunk_cols(self, x_flat: torch.Tensor) -> int:
+        """Compute a column-chunk size that keeps nanquantile peak memory bounded.
+
+        ``torch.nanquantile`` internally sorts the data along dim=0, requiring
+        roughly 10x the input size as temporary memory (sort buffer + indexing).
+        We target ~2 GB of intermediates so that 100k x 500 fits in one shot
+        while 1M-row datasets stay bounded at ~2 GB of workspace per chunk.
+        """
+        n_samples = x_flat.shape[0]
+        element_size = x_flat.element_size()
+        overhead_factor = 10
+        target_bytes = 2 * 1024**3  # 2 GB
+        bytes_per_col = n_samples * element_size * overhead_factor
+        return max(1, target_bytes // max(bytes_per_col, 1))
+
     def transform(
         self,
         x: torch.Tensor,
         fitted_cache: dict[str, torch.Tensor],
     ) -> torch.Tensor:
         """Transform the data to uniform distribution using fitted quantiles.

+        Automatically processes in row-chunks when the data is large to keep
+        peak intermediate memory bounded (~2 GB of temporaries).
+
         Args:
             x: Input tensor to transform.
             fitted_cache: Cache returned by fit.
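
For a sense of what the new `_get_fit_chunk_cols` heuristic yields, here is a standalone sketch of the same arithmetic (the 10x overhead factor and 2 GB target are the values from the diff; the helper name `fit_chunk_cols` is only for illustration):

```python
def fit_chunk_cols(n_samples: int, element_size: int,
                   overhead_factor: int = 10,
                   target_bytes: int = 2 * 1024**3) -> int:
    """Same arithmetic as _get_fit_chunk_cols: bound the nanquantile sort workspace."""
    bytes_per_col = n_samples * element_size * overhead_factor
    return max(1, target_bytes // max(bytes_per_col, 1))

# float32 elements are 4 bytes each.
print(fit_chunk_cols(100_000, 4))    # 536 -> a 100k x 500 matrix fits in one call
print(fit_chunk_cols(1_000_000, 4))  # 53  -> 1M-row data is split into ~53-column chunks
```
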
@@ -94,14 +152,48 @@ def transform
         # Compute in the cache dtype, then cast back.
         orig_dtype = x.dtype
         compute_dtype = quantiles.dtype
-        x_compute = x.to(compute_dtype) if x.dtype != compute_dtype else x
+        x_compute = x.to(compute_dtype)
+
+        chunk_size = self._get_transform_chunk_size(x_compute)
+        n_samples = x_compute.shape[0]
+
+        if n_samples <= chunk_size:
+            result = self._transform_chunk(x_compute, quantiles, references)
+        else:
+            chunks = []
+            for start in range(0, n_samples, chunk_size):
+                chunk = x_compute[start : start + chunk_size]
+                chunks.append(self._transform_chunk(chunk, quantiles, references))
+            result = torch.cat(chunks, dim=0)

-        nan_mask = torch.isnan(x_compute)
-        result = self._interpolate(x_compute, quantiles, references)
-        nan_fill = torch.tensor(float("nan"), device=x.device, dtype=compute_dtype)
-        result = torch.where(nan_mask, nan_fill, result)
         return result.to(orig_dtype) if orig_dtype != compute_dtype else result

+    def _transform_chunk(
+        self,
+        x: torch.Tensor,
+        quantiles: torch.Tensor,
+        references: torch.Tensor,
+    ) -> torch.Tensor:
+        """Transform a single chunk, preserving NaN positions."""
+        nan_mask = torch.isnan(x)
+        result = self._interpolate(x, quantiles, references)
+        return torch.where(nan_mask, float("nan"), result)
+
+    def _get_transform_chunk_size(self, x: torch.Tensor) -> int:
+        """Compute a row-chunk size that keeps intermediate memory bounded.
+
+        ``_interpolate`` creates ~15x intermediate memory per input element
+        (forward + backward searchsorted, gather, slope tensors). We target
+        ~2 GB of intermediates so that even very large datasets stay within
+        reasonable GPU memory.
+        """
+        n_features = x.shape[-1] if x.ndim > 1 else 1
+        element_size = x.element_size()
+        overhead_factor = 15
+        target_bytes = 2 * 1024**3  # 2 GB
+        bytes_per_row = n_features * element_size * overhead_factor
+        return max(1_000, target_bytes // max(bytes_per_row, 1))
+
     def _interpolate(
         self,
         x: torch.Tensor,
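
The row-chunking added to `transform` is the usual slice, process, concatenate pattern. A minimal self-contained sketch of that pattern (the quantile interpolation is replaced by `torch.clamp` as a stand-in, since `_interpolate` is not part of this diff):

```python
import torch

def chunked_rows(x: torch.Tensor, fn, chunk_size: int) -> torch.Tensor:
    # Process rows in bounded slices and stitch the results back together,
    # mirroring the chunk loop in TorchQuantileTransformer.transform.
    if x.shape[0] <= chunk_size:
        return fn(x)
    parts = [fn(x[start : start + chunk_size])
             for start in range(0, x.shape[0], chunk_size)]
    return torch.cat(parts, dim=0)

x = torch.randn(10_000, 8)
out = chunked_rows(x, lambda c: torch.clamp(c, -1.0, 1.0), chunk_size=1_000)
assert torch.equal(out, torch.clamp(x, -1.0, 1.0))  # chunking does not change the result
```
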

src/tabpfn/preprocessing/torch/torch_svd.py

Lines changed: 113 additions & 60 deletions
@@ -42,8 +42,9 @@ class TorchTruncatedSVD:
     """Truncated SVD for PyTorch tensors.

     Similar to sklearn's TruncatedSVD but without any implicit state.
-    The state is returned explicitly. Uses full SVD and truncates to
-    n_components (efficient for typical TabPFN dimensions).
+    The state is returned explicitly. Uses randomized SVD
+    (``torch.svd_lowrank``) for large matrices and exact
+    ``torch.linalg.svd`` for small ones.

     Note: Unlike sklearn's TruncatedSVD, this does not center the data.
     If centering is needed, apply it before calling fit.
@@ -60,6 +61,11 @@ def __init__(self, n_components: int) -> None:
     def fit(self, x: torch.Tensor) -> dict[str, torch.Tensor]:
         """Compute the truncated SVD on the training data.

+        Uses randomized SVD (``torch.svd_lowrank``) when ``n_components`` is
+        much smaller than the matrix dimensions. This reduces memory from
+        O(N * min(N, F)) to O(N * n_components) — a large saving when
+        n_components << min(N, F) (e.g. 128 vs 500).
+
         Args:
             x: Input tensor with shape [n_samples, n_features].
@@ -69,6 +75,17 @@ def fit(self, x: torch.Tensor) -> dict[str, torch.Tensor]:
               [n_components, n_features]
             - "singular_values": The singular values [n_components]
         """
+        orig_device = x.device
+
+        if x.device.type == "mps":
+            warnings.warn(
+                "SVD operators ('aten::linalg_svd', 'aten::linalg_qr') are not "
+                "currently supported on the MPS backend and will fall back to "
+                "run on the CPU. This may have performance implications.",
+                stacklevel=2,
+            )
+            x = x.cpu()
+
         n_samples, n_features = x.shape

         # Handle NaN values by replacing with 0 for SVD computation
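
The MPS branch computes on CPU and, as a later hunk shows, moves the fitted tensors back with `.to(orig_device)`. A rough sketch of that round-trip in isolation; the function name and warning text below are illustrative, not from the codebase:

```python
import warnings
import torch

def svd_components_with_cpu_fallback(x: torch.Tensor, k: int) -> torch.Tensor:
    orig_device = x.device
    if x.device.type == "mps":
        # Mirror the diff: warn once, then run the decomposition on CPU.
        warnings.warn("SVD ops are unsupported on MPS; falling back to CPU.",
                      stacklevel=2)
        x = x.cpu()
    _, _, vh = torch.linalg.svd(x, full_matrices=False)
    # Move the result back so callers keep working on their original device.
    return vh[:k, :].to(orig_device)
```
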
@@ -79,25 +96,43 @@ def fit(self, x: torch.Tensor) -> dict[str, torch.Tensor]:
         n_components = min(self.n_components, n_samples, n_features)
         n_components = max(1, n_components)

-        # torch.linalg.svd requires float32 or float64; cast up if needed
+        # torch SVD ops require float32 or float64; cast up if needed
         compute_dtype = x_filled.dtype
         if compute_dtype not in (torch.float32, torch.float64):
             compute_dtype = torch.float32
             x_filled = x_filled.to(compute_dtype)

-        # Compute full SVD: X = U @ diag(S) @ V^T
-        # torch.linalg.svd returns V^T directly (not V)
-        with warnings.catch_warnings():
-            warnings.filterwarnings(
-                "ignore",
-                message=".*linalg_svd.*not currently supported on the MPS backend.*",
-            )
-            u, s, vh = torch.linalg.svd(x_filled, full_matrices=False)
+        # Use randomized SVD only when it is both (a) accurate — the matrix
+        # is large enough that the top components are well-separated — and
+        # (b) faster — the projected rank q is well below the matrix rank.
+        # Benchmark shows svd_lowrank becomes favorable once
+        # min(N, F) >= 2*q; below that, the random-projection overhead
+        # exceeds the savings from avoiding the full decomposition.
+        oversampling = 10  # matches sklearn's TruncatedSVD n_oversamples default
+        q = n_components + oversampling
+        use_lowrank = (
+            n_samples * n_features > 1_000_000 and min(n_samples, n_features) >= 2 * q
+        )

-        # Truncate to n_components
-        u = u[:, :n_components]
-        s = s[:n_components]
-        vh = vh[:n_components, :]
+        if use_lowrank:
+            # torch.svd_lowrank returns (U, S, V) with A ≈ U diag(S) V^T
+            u, s, v = torch.svd_lowrank(x_filled, q=q, niter=2)
+            # Truncate oversampling dimensions
+            u = u[:, :n_components]
+            s = s[:n_components]
+            vh = v[:, :n_components].T  # V [n_features, q] → V^T [n_comp, n_features]
+        else:
+            # Fall back to full SVD for small matrices or when n_components
+            # is close to min(n_samples, n_features).
+            with warnings.catch_warnings():  # warning thrown above already
+                warnings.filterwarnings(
+                    "ignore",
+                    message=".*linalg_svd.*not currently supported on the MPS backend.*",  # noqa: E501
+                )
+                u, s, vh = torch.linalg.svd(x_filled, full_matrices=False)
+            u = u[:, :n_components]
+            s = s[:n_components]
+            vh = vh[:n_components, :]

         # Apply sign flip for deterministic output.
         # We use the same convention as sklearn (u_based_decision=False:
@@ -108,8 +143,8 @@ def fit(self, x: torch.Tensor) -> dict[str, torch.Tensor]:
         u, vh = _svd_flip_stable(u, vh)

         return {
-            "components": vh,
-            "singular_values": s,
+            "components": vh.to(orig_device),
+            "singular_values": s.to(orig_device),
         }

     def transform(
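
To see how close the randomized path stays to the exact one for the leading components, a quick check can be run on synthetic data. The selection rule (matrix larger than 1M elements, min(N, F) >= 2*q) and the oversampling of 10 are taken from the diff; everything else below is an illustrative setup:

```python
import torch

torch.manual_seed(0)
n_samples, n_features, n_components = 5_000, 500, 128

# Low-rank signal plus small noise: a decaying spectrum, the regime where
# randomized SVD approximates the leading components well.
signal = torch.randn(n_samples, n_components) @ torch.randn(n_components, n_features)
x = signal / n_components**0.5 + 0.01 * torch.randn(n_samples, n_features)

q = n_components + 10  # n_components + oversampling, as in the diff
assert n_samples * n_features > 1_000_000 and min(n_samples, n_features) >= 2 * q

# Randomized path: U [N, q], S [q], V [F, q]; truncate and transpose V.
u_r, s_r, v_r = torch.svd_lowrank(x, q=q, niter=2)
vh_r = v_r[:, :n_components].T  # [n_components, n_features]

# Exact path for comparison.
_, s_e, _ = torch.linalg.svd(x, full_matrices=False)

# Worst relative error over the leading singular values (small for this setup).
rel_err = (s_r[:n_components] - s_e[:n_components]).abs() / s_e[:n_components]
print(vh_r.shape, rel_err.max())
```
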
@@ -119,6 +154,9 @@ def transform(
     ) -> torch.Tensor:
         """Project the data onto the SVD components.

+        Automatically processes in row-chunks when the data is large to keep
+        peak intermediate memory bounded.
+
         Args:
             x: Input tensor to transform [n_samples, n_features].
             fitted_cache: Cache returned by fit.
@@ -130,27 +168,36 @@
             raise ValueError("Invalid fitted cache. Must contain 'components'.")

         components = fitted_cache["components"]
-
-        # Components may be in float32 (from fit) while input is float16.
-        # Compute in the components dtype, then cast back.
         orig_dtype = x.dtype
         compute_dtype = components.dtype
+        chunk_size = self._get_transform_chunk_size(x, components)

-        # Handle NaN values: preserve them in output
-        x_compute = x.to(compute_dtype) if x.dtype != compute_dtype else x
-        nan_mask = torch.isnan(x_compute)
-        x_filled = torch.where(nan_mask, torch.zeros_like(x_compute), x_compute)
+        def _per_row(row: torch.Tensor) -> torch.Tensor:
+            x_c = row.to(compute_dtype)
+            nan_mask = torch.isnan(x_c)
+            x_filled = torch.where(nan_mask, torch.zeros_like(x_c), x_c)
+            out = x_filled @ components.T
+            return torch.where(nan_mask.any(), float("nan"), out)

-        # Project: X @ V (V = components.T)
-        result = x_filled @ components.T
-
-        # If any input feature was NaN, the corresponding output should be NaN
-        # Since projection is a linear combination, any NaN input affects all outputs
-        any_nan_per_row = nan_mask.any(dim=-1, keepdim=True)
-        nan_fill = torch.tensor(float("nan"), device=x.device, dtype=compute_dtype)
-        result = torch.where(any_nan_per_row.expand_as(result), nan_fill, result)
+        result = torch.vmap(_per_row, chunk_size=chunk_size)(x)
         return result.to(orig_dtype) if orig_dtype != compute_dtype else result

+    def _get_transform_chunk_size(
+        self, x: torch.Tensor, components: torch.Tensor
+    ) -> int:
+        """Compute a row-chunk size that keeps intermediate memory bounded.
+
+        Transform creates ~3x N*F intermediates (dtype cast, nan_mask,
+        x_filled) plus the N*C result. Target ~2 GB of temporaries.
+        """
+        n_features = x.shape[-1]
+        n_components = components.shape[0]
+        element_size = max(x.element_size(), components.element_size())
+        # x_compute + nan_mask(~1B) + x_filled + result
+        bytes_per_row = (3 * n_features + n_components) * element_size
+        target_bytes = 2 * 1024**3  # 2 GB
+        return max(1_000, target_bytes // max(bytes_per_row, 1))
+
     def __call__(
         self,
         x: torch.Tensor,
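
The projection now leans on `torch.vmap`'s `chunk_size` argument to bound how many rows are materialized at once. A small sketch of that mechanism with a toy per-row function (this assumes a PyTorch version recent enough for `chunk_size` in `torch.vmap`; the shapes are made up):

```python
import torch

components = torch.randn(16, 64)  # [n_components, n_features]

def per_row(row: torch.Tensor) -> torch.Tensor:
    # Rows containing NaN map to all-NaN outputs, as in the diff's _per_row.
    nan_mask = torch.isnan(row)
    filled = torch.where(nan_mask, torch.zeros_like(row), row)
    out = filled @ components.T
    return torch.where(nan_mask.any(), float("nan"), out)

x = torch.randn(10_000, 64)
x[3, 5] = float("nan")  # poison one row

# chunk_size caps how many rows vmap processes per internal batch of work.
result = torch.vmap(per_row, chunk_size=1_000)(x)
print(result.shape)                  # torch.Size([10000, 16])
print(torch.isnan(result[3]).all())  # tensor(True)
```
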
@@ -241,6 +288,9 @@ def transform(
     ) -> torch.Tensor:
         """Apply the fitted scaling to the data (no mean centering).

+        Automatically processes in row-chunks when the data is large to keep
+        peak intermediate memory bounded.
+
         Args:
             x: Input tensor to transform.
             fitted_cache: Cache returned by fit.
@@ -252,40 +302,43 @@
             raise ValueError("Invalid fitted cache. Must contain 'std'.")

         std = fitted_cache["std"]
-
-        # Align dtype: std is in float32+ from fit, input may be float16
         orig_dtype = x.dtype
         compute_dtype = std.dtype
-        x_compute = x.to(compute_dtype) if x.dtype != compute_dtype else x
-
-        # Replace inf with nan before scaling
-        x_safe = torch.where(
-            torch.isinf(x_compute),
-            torch.tensor(float("nan"), device=x.device, dtype=compute_dtype),
-            x_compute,
+        chunk_size = self._get_transform_chunk_size(
+            x, compute_element_size=max(x.element_size(), std.element_size())
+        )
+        col_means = (
+            fitted_cache["mean"].to(device=x.device, dtype=compute_dtype)
+            if "mean" in fitted_cache
+            else None
         )

-        # Impute NaN with column means (matching CPU make_scaler_safe which
-        # wraps the scaler with SimpleImputer(strategy="mean") pre/post).
-        if "mean" in fitted_cache:
-            nan_mask = torch.isnan(x_safe)
-            if nan_mask.any():
-                col_means = fitted_cache["mean"].to(
-                    device=x_safe.device, dtype=x_safe.dtype
-                )
-                x_safe = torch.where(nan_mask, col_means.unsqueeze(0), x_safe)
-
-        x_scaled = x_safe / (std + torch.finfo(std.dtype).eps)
+        def _per_row(row: torch.Tensor) -> torch.Tensor:
+            x_c = row.to(compute_dtype)
+            x_safe = torch.where(torch.isinf(x_c), float("nan"), x_c)
+            # Impute NaN with column means (matching CPU make_scaler_safe).
+            if col_means is not None:
+                nan_mask = torch.isnan(x_safe)
+                x_safe = torch.where(nan_mask, col_means, x_safe)
+            x_scaled = x_safe / (std + torch.finfo(compute_dtype).eps)
+            x_scaled = torch.clip(x_scaled, min=-100, max=100)
+            return torch.where(torch.isfinite(x_scaled), x_scaled, 0)
+
+        result = torch.vmap(_per_row, chunk_size=chunk_size)(x)
+        return result.to(orig_dtype) if orig_dtype != compute_dtype else result

-        # Clip very large values
-        x_scaled = torch.clip(x_scaled, min=-100, max=100)
+    def _get_transform_chunk_size(
+        self, x: torch.Tensor, compute_element_size: int
+    ) -> int:
+        """Compute a row-chunk size that keeps intermediate memory bounded.

-        # Replace any inf that might have been created with nan, then impute
-        # remaining non-finite values (matching CPU post-imputation safety net)
-        result = torch.where(
-            torch.isfinite(x_scaled), x_scaled, torch.zeros_like(x_scaled)
-        )
-        return result.to(orig_dtype) if orig_dtype != compute_dtype else result
+        Transform creates ~5x N*F intermediates (dtype cast, isinf check,
+        nan_mask, imputed, scaled, clipped, finite-check). Target ~2 GB.
+        """
+        n_features = x.shape[-1] if x.ndim > 1 else 1
+        bytes_per_row = n_features * compute_element_size * 5
+        target_bytes = 2 * 1024**3  # 2 GB
+        return max(1_000, target_bytes // max(bytes_per_row, 1))

     def __call__(
         self,