NVIDIA
diff --git a/‎examples/commons/perf_model/csrc/kk_partition.cpp‎
Lines changed: 203 additions & 0 deletions b/‎examples/commons/perf_model/csrc/kk_partition.cpp‎
Lines changed: 203 additions & 0 deletions
diff --git a/‎examples/commons/perf_model/partitioner.py‎
Lines changed: 66 additions & 18 deletions b/‎examples/commons/perf_model/partitioner.py‎
Lines changed: 66 additions & 18 deletions
@@ -0,0 +1,203 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Karmarkar-Karp k-way partitioning in C++ — drop-in replacement for the
+// pure-Python implementation in `partitioner.py`.  The whole compute path
+// releases the GIL so the main Python thread can keep submitting CUDA
+// kernels while the algorithm runs in a background ThreadPoolExecutor.
+//
+// Output is bit-for-bit identical to the Python version (same tie-breaking
+// rules) so it can be swapped in without changing downstream behaviour.
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace py = pybind11;
+
+namespace {
+
+// One partition bucket: the (idx, val) entries assigned to it together with
+// the running total of their values.
+struct Set {
+    int64_t sum = 0;
+    std::vector<std::pair<int64_t, int64_t>> items;  // (idx, val)
+
+    // Appends one (idx, val) entry and folds ``val`` into ``sum``.
+    void add(int64_t idx, int64_t val) {
+        items.push_back(std::make_pair(idx, val));
+        sum += val;
+    }
+
+    // Appends a copy of every entry of ``other`` (in its stored order) and
+    // adds each value to ``sum``.  ``other`` is left untouched.
+    void merge_from(Set& other) {
+        // Snapshot the count first so the loop bound is fixed before we grow.
+        const size_t incoming = other.items.size();
+        items.reserve(items.size() + incoming);
+        for (size_t pos = 0; pos < incoming; ++pos) {
+            const std::pair<int64_t, int64_t> entry = other.items[pos];
+            items.push_back(entry);
+            sum += entry.second;
+        }
+    }
+
+    // Ordering used to keep buckets sorted, mirroring Python `Set.__lt__`:
+    //   1. smaller ``sum`` first;
+    //   2. on equal sums, fewer entries first;
+    //   3. on equal sizes, lexicographic comparison of the entry lists.
+    bool operator<(const Set& other) const {
+        if (sum < other.sum) return true;
+        if (other.sum < sum) return false;
+        const auto mine = items.size();
+        const auto theirs = other.items.size();
+        return mine == theirs ? items < other.items : mine < theirs;
+    }
+
+    // ``a > b`` is defined purely as ``b < a`` so both operators always agree.
+    bool operator>(const Set& other) const { return other.operator<(*this); }
+};
+
+// A partial solution: ``k`` buckets kept in descending order, so sets[0] is
+// always the heaviest bucket and sets.back() the lightest.
+struct State {
+    int k;
+    std::vector<Set> sets;  // maintained in *descending* order (sets[0] largest)
+
+    // Creates ``k_`` empty buckets.
+    // Throws std::invalid_argument when ``k_ <= 0``: an empty ``sets`` would
+    // make ``spread()`` read front()/back() of an empty vector.
+    explicit State(int k_) : k(k_), sets(checked_count(k_)) {}
+
+    // Seeds the buckets: element i of ``items`` (an (idx, val) pair) goes into
+    // sets[i], then the buckets are sorted descending.  The Python original
+    // asserts ``len(items) in [1, k]``; here anything longer than the number
+    // of buckets is rejected with std::invalid_argument instead of writing
+    // past the end of ``sets``.
+    void init_from(const std::vector<std::pair<int64_t, int64_t>>& items) {
+        if (items.size() > sets.size()) {
+            throw std::invalid_argument(
+                "State::init_from: more items than partitions");
+        }
+        for (size_t i = 0; i < items.size(); ++i) {
+            sets[i].add(items[i].first, items[i].second);
+        }
+        std::sort(sets.begin(), sets.end(), std::greater<Set>());
+    }
+
+    // Python `merge`: pair sets[i] ↔ other.sets[k-1-i] (heaviest with
+    // lightest), then resort descending.  Both states must hold the same
+    // number of buckets; a mismatch throws std::invalid_argument rather than
+    // indexing ``other.sets`` out of range.
+    void merge_with(State& other) {
+        const size_t count = sets.size();
+        if (other.sets.size() != count) {
+            throw std::invalid_argument(
+                "State::merge_with: partition counts differ");
+        }
+        for (size_t i = 0; i < count; ++i) {
+            sets[i].merge_from(other.sets[count - 1 - i]);
+        }
+        std::sort(sets.begin(), sets.end(), std::greater<Set>());
+    }
+
+    // Difference between the heaviest and lightest bucket sums.
+    int64_t spread() const { return sets.front().sum - sets.back().sum; }
+
+    // Heap ordering. Python uses a min-heap (`heapq`) with `State.__lt__`
+    // flipped so the state with the LARGEST spread is popped first:
+    //   if spread != other.spread: return spread > other.spread
+    //   return sets[0] > other.sets[0]
+    //
+    // ``std::priority_queue`` / ``std::push_heap`` give a max-heap based on
+    // ``operator<``: the element where ``a < b`` is true for every other ``b``
+    // gets popped LAST.  So define ``operator<`` such that "smaller" means
+    // "lower priority" (popped later), which means we want LARGER spread
+    // (and, on tie, larger ``sets[0]``) to compare as GREATER.
+    bool operator<(const State& other) const {
+        const int64_t s0 = spread();
+        const int64_t s1 = other.spread();
+        if (s0 != s1) return s0 < s1;
+        return sets.front() < other.sets.front();
+    }
+
+   private:
+    // Validates the bucket count before it is used to size ``sets``.
+    static size_t checked_count(int k_) {
+        if (k_ <= 0) {
+            throw std::invalid_argument("State: k must be > 0");
+        }
+        return static_cast<size_t>(k_);
+    }
+};
+
+// Karmarkar-Karp (largest differencing) k-way partitioning.
+//
+// Parameters:
+//   workloads     per-item weights; item identity is its position in the vector.
+//   k_partitions  number of partitions to produce; must be > 0.
+//   equal_size    when true, items are seeded in groups of ``k_partitions`` so
+//                 every partition receives the same number of items; requires
+//                 ``workloads.size()`` to be a multiple of ``k_partitions``.
+// Returns ``k_partitions`` vectors of original item indices.
+// Throws std::invalid_argument when an argument precondition is violated.
+std::vector<std::vector<int64_t>> karmarkar_karp_cpp(
+    std::vector<int64_t> workloads,
+    int k_partitions,
+    bool equal_size) {
+    // ``workloads`` is taken by value, so it is already an owned C++ vector
+    // here and nothing below touches a Python object.  The GIL is therefore
+    // dropped for the whole body and re-acquired when ``release`` goes out of
+    // scope (normal return or exception).
+    py::gil_scoped_release release;
+
+    if (!(k_partitions > 0)) {
+        throw std::invalid_argument("k_partitions must be > 0");
+    }
+    const size_t total = workloads.size();
+    const size_t k = static_cast<size_t>(k_partitions);
+    if (equal_size && total % k != 0) {
+        throw std::invalid_argument(
+            "len(workloads) must be divisible by k_partitions when equal_size=True");
+    }
+    if (total == 0) {
+        return std::vector<std::vector<int64_t>>(k_partitions);
+    }
+
+    // Same order as Python's ``sorted([(workload, i) ...])``: ascending by
+    // workload, ties broken by original index (pair comparison is
+    // lexicographic).
+    std::vector<std::pair<int64_t, int64_t>> by_load;
+    by_load.reserve(total);
+    int64_t position = 0;
+    for (const int64_t load : workloads) {
+        by_load.emplace_back(load, position);
+        ++position;
+    }
+    std::sort(by_load.begin(), by_load.end());
+
+    // Seed one State per chunk of the sorted list: chunks of k items when
+    // equal_size is requested, single items otherwise.
+    const size_t stride = equal_size ? k : 1;
+    std::vector<State> pending;
+    pending.reserve(total / stride);
+
+    std::vector<std::pair<int64_t, int64_t>> seed;
+    seed.reserve(stride);
+    for (size_t start = 0; start < total; start += stride) {
+        seed.clear();
+        for (size_t j = 0; j < stride; ++j) {
+            const auto& entry = by_load[start + j];
+            // Stored as (idx, workload) -- the reverse of the sort key order.
+            seed.emplace_back(entry.second, entry.first);
+        }
+        pending.emplace_back(k_partitions);
+        pending.back().init_from(seed);
+    }
+    std::make_heap(pending.begin(), pending.end());
+
+    // Removes and returns the highest-priority state (largest spread).
+    auto take_top = [&pending]() {
+        std::pop_heap(pending.begin(), pending.end());
+        State top = std::move(pending.back());
+        pending.pop_back();
+        return top;
+    };
+
+    // Repeatedly fold the two widest states together until one remains.
+    while (pending.size() > 1) {
+        State widest = take_top();
+        State runner_up = take_top();
+        widest.merge_with(runner_up);
+        pending.push_back(std::move(widest));
+        std::push_heap(pending.begin(), pending.end());
+    }
+
+    // Keep only the item indices of each bucket of the surviving state.
+    const State& survivor = pending.front();
+    std::vector<std::vector<int64_t>> partitions;
+    partitions.reserve(k);
+    for (const Set& bucket : survivor.sets) {
+        std::vector<int64_t> indices;
+        indices.reserve(bucket.items.size());
+        for (const auto& item : bucket.items) {
+            indices.push_back(item.first);
+        }
+        partitions.push_back(std::move(indices));
+    }
+    return partitions;
+}
+
+}  // namespace
+
+// Python extension entry point: exposes ``karmarkar_karp_cpp`` as
+// ``kk_cpu_ops.karmarkar_karp(workloads, k_partitions, equal_size)``.
+PYBIND11_MODULE(kk_cpu_ops, m) {
+    // Help texts are named up front so the registration below reads as a
+    // plain (name, function, arguments, docstring) list.
+    constexpr const char* kModuleDoc =
+        "C++ Karmarkar-Karp k-way partitioning. Releases the GIL during compute "
+        "so the main Python thread can keep submitting CUDA kernels.";
+    constexpr const char* kFunctionDoc =
+        "Identical output to commons.perf_model.partitioner.karmarkar_karp "
+        "(same tie-breaking rules), but with the GIL released for the entire "
+        "compute.";
+
+    m.doc() = kModuleDoc;
+    m.def("karmarkar_karp",
+          &karmarkar_karp_cpp,
+          py::arg("workloads"),
+          py::arg("k_partitions"),
+          py::arg("equal_size"),
+          kFunctionDoc);
+}
@@ -27,6 +27,7 @@
 # limitations under the License.
 
 import heapq
+import os
 from typing import Any, List, Tuple, Union
 
 import numpy as np
@@ -38,10 +39,74 @@
     Tensor = None
     nvtx = None
 
+# Optional C++ accelerator. Same output as the Python implementation but
+# releases the GIL for the entire compute, so the main thread can keep
+# submitting CUDA kernels while KK runs in a background ThreadPoolExecutor.
+# Set ``KK_FORCE_PYTHON=1`` to bypass the C++ path (useful for parity tests).
+#
+# Resolution order:
+#   1. Honour ``KK_FORCE_PYTHON=1`` → no native module.
+#   2. Top-level import — the location used by ``python setup.py install``
+#      inside the container (``/usr/local/lib/.../dist-packages``).
+#   3. Sibling .so next to the ``perf_model`` package — the location used by
+#      ``python setup.py build_ext --inplace`` during dev iteration.
+_FORCE_PYTHON = os.environ.get("KK_FORCE_PYTHON", "0") == "1"
+_kk_cpu_ops = None
+if not _FORCE_PYTHON:
+    try:
+        import kk_cpu_ops as _kk_cpu_ops  # type: ignore[import-not-found,no-redef]
+    except ImportError:
+        import glob as _glob
+        import importlib.util as _importlib_util
+
+        _so_glob = os.path.join(
+            os.path.dirname(os.path.dirname(__file__)),
+            "kk_cpu_ops*.so",
+        )
+        _matches = sorted(_glob.glob(_so_glob))
+        if _matches:
+            _spec = _importlib_util.spec_from_file_location("kk_cpu_ops", _matches[0])
+            if _spec is not None and _spec.loader is not None:
+                _kk_cpu_ops = _importlib_util.module_from_spec(_spec)
+                _spec.loader.exec_module(_kk_cpu_ops)
+
 
 def karmarkar_karp(
     workloads: Union[np.ndarray, List[int], Tensor], k_partitions: int, equal_size: bool
 ):
+    """K-way load-balanced partitioning via Karmarkar-Karp.
+
+    Returns ``k_partitions`` lists of original indices.  When the C++ accelerator
+    ``kk_cpu_ops`` is importable, the heavy heap traversal runs without the
+    GIL; otherwise the pure-Python fallback below is used.  Output is
+    bit-identical between the two paths (same tie-breaking).
+    """
+    if nvtx is not None:
+        nvtx.range_push("karmarkar_karp")
+    try:
+        # Normalize to a plain Python list of ints.  Tensors / ndarrays both
+        # have ``.tolist()``; built-in lists do not, so a hasattr check picks
+        # the right branch.
+        if hasattr(workloads, "tolist"):
+            workloads = workloads.tolist()
+
+        if _kk_cpu_ops is not None:
+            partitions = _kk_cpu_ops.karmarkar_karp(workloads, k_partitions, equal_size)
+        else:
+            partitions = _karmarkar_karp_python(workloads, k_partitions, equal_size)
+
+        if equal_size:
+            for partition in partitions:
+                assert len(partition) * k_partitions == len(
+                    workloads
+                ), f"{len(partition)} * {k_partitions} != {len(workloads)}"
+        return partitions
+    finally:
+        if nvtx is not None:
+            nvtx.range_pop()  # karmarkar_karp
+
+
+def _karmarkar_karp_python(workloads: List[int], k_partitions: int, equal_size: bool):
     # see: https://en.wikipedia.org/wiki/Largest_differencing_method
     class Set:
         def __init__(self) -> None:
@@ -114,14 +179,6 @@ def __repr__(self) -> str:
             repr_str += "]"
             return repr_str
 
-    if nvtx is not None:
-        nvtx.range_push("karmarkar_karp")
-
-    workloads = (
-        workloads.tolist()
-        if isinstance(workloads, Tensor) and Tensor is not None
-        else workloads
-    )
     sorted_workloads = sorted([(workload, i) for i, workload in enumerate(workloads)])
     states_pq: List[Any] = []
     if equal_size:
@@ -145,16 +202,7 @@ def __repr__(self) -> str:
         state0.merge(state1)
         heapq.heappush(states_pq, state0)
 
-    final_state = states_pq[0]
-    partitions = final_state.get_partitions()
-    if equal_size:
-        for i, partition in enumerate(partitions):
-            assert len(partition) * k_partitions == len(
-                workloads
-            ), f"{len(partition)} * {k_partitions} != {len(workloads)}"
-    if nvtx is not None:
-        nvtx.range_pop()  # karmarkar_karp
-    return partitions
+    return states_pq[0].get_partitions()
 
 
 if __name__ == "__main__":