Support constructing FMBScene with Python data (#34)

horizon-blue · web-flow · commit 8a69ad8bbd03 · 2025-12-05T16:52:19.000-05:00
Part of MET-36

## Summary of Changes

While working on the forward kernel, I noticed that we currently don't
have a way to create an `FMBScene` with data from the Python side. This PR
fills in this missing piece by defining a new constructor for `FMBScene`
that takes in references to `std::vector`s and copies the values.

Sadly, this copy is needed because `FMBScene` is designed to manage its
own memory and we want to avoid double-free issues. In addition,
since a Python list always lives on the host (CPU) side, if we want the scene to
live in GPU memory, then we have to move the data anyway.

### Aside: Using RAII Containers in `FMBScene`

While writing the constructor and the unit test, I started to get a bit
frustrated by having to write similar code twice to specialize for
host/device memory management, so I replaced the underlying containers
in `FMBScene` with `thrust::device_vector`/`thrust::host_vector` :). Another nice side
effect of this is that Thrust's device pointer and device reference types
handle the moving of data for us automatically when we attempt to read
it from the host (see the definition of `FMBScene.__getitem__` in
`bindings.cu` for example).

## Test Plans

I added a unit test to verify that we can create an `FMBScene` on both
the device and the host and read back the values stored in it.

```bash
pixi run test
```
diff --git a/genmetaballs/src/cuda/bindings.cu b/genmetaballs/src/cuda/bindings.cu
@@ -253,10 +253,22 @@ template <MemoryLocation location>
 void bind_fmb_scene(nb::module_& m, const char* name) {
     nb::class_<FMBScene<location>>(m, name)
         .def(nb::init<size_t>(), nb::arg("size"))
+        .def(nb::init<const std::vector<FMB>&, const std::vector<float>&>(), nb::arg("fmbs"),
+             nb::arg("log_weights"),
+             "Construct FMBScene from a list of FMBs and corresponding log weights")
         .def_prop_ro("size", &FMBScene<location>::size)
         .def("__len__", &FMBScene<location>::size)
-        .def("__getitem__", &FMBScene<location>::get_fmb, nb::arg("idx"),
-             "Get the (FMB, log_weight) tuple at index i")
+        .def(
+            "__getitem__",
+            // Convert cuda::std::tuple to std::tuple for nanobind
+            [](const FMBScene<location>& scene, size_t idx) {
+                const auto& [fmb, log_weight] = scene[idx];
+                // for device data, the types would be thrust::device_reference, which cannot be
+                // returned directly to Python. The static cast forces a copy (to host) to be made.
+                return std::make_tuple(static_cast<const FMB&>(fmb),
+                                       static_cast<const float&>(log_weight));
+            },
+            "Get the (FMB, log_weight) tuple at index i")
         .def("__repr__", [=](const FMBScene<location>& scene) {
             return nb::str("{}(size={})").format(name, scene.size());
         });
diff --git a/genmetaballs/src/cuda/core/fmb.cu b/genmetaballs/src/cuda/core/fmb.cu
@@ -15,25 +15,3 @@ CUDA_CALLABLE float FMB::quadratic_form(const Vec3D vec) const {
     const auto shifted_vec = vec - get_mean();
     return dot(shifted_vec, cov_inv_apply(shifted_vec));
 }
-
-template <>
-__host__ FMBScene<MemoryLocation::HOST>::FMBScene(size_t size)
-    : fmbs_{new FMB[size]}, log_weights_{new float[size]}, size_{size} {}
-
-template <>
-__host__ FMBScene<MemoryLocation::DEVICE>::FMBScene(size_t size) : size_{size} {
-    CUDA_CHECK(cudaMalloc(&fmbs_, size * sizeof(FMB)));
-    CUDA_CHECK(cudaMalloc(&log_weights_, size * sizeof(float)));
-}
-
-template <>
-__host__ FMBScene<MemoryLocation::HOST>::~FMBScene() {
-    delete[] fmbs_;
-    delete[] log_weights_;
-}
-
-template <>
-__host__ FMBScene<MemoryLocation::DEVICE>::~FMBScene() {
-    CUDA_CHECK(cudaFree(fmbs_));
-    CUDA_CHECK(cudaFree(log_weights_));
-}
diff --git a/genmetaballs/src/cuda/core/fmb.cuh b/genmetaballs/src/cuda/core/fmb.cuh
@@ -2,7 +2,12 @@
 
 #include <cuda/std/span>
 #include <cuda/std/tuple>
+#include <cuda_runtime.h>
 #include <stdexcept>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <vector>
 
 #include "geometry.cuh"
 #include "utils.cuh"
@@ -46,74 +51,49 @@ public:
 template <MemoryLocation location>
 class FMBScene {
 private:
-    FMB* fmbs_;
-    float* log_weights_;
+    // Host memory -> thrust::host_vector
+    // Device memory -> thrust::device_vector
+    template <typename T>
+    using vector_t = std::conditional_t<location == MemoryLocation::HOST, thrust::host_vector<T>,
+                                        thrust::device_vector<T>>;
+
+    vector_t<FMB> fmbs_;
+    vector_t<float> log_weights_;
     size_t size_;
 
 public:
-    __host__ FMBScene(size_t size);
-
-    __host__ ~FMBScene();
-
-    CUDA_CALLABLE cuda::std::tuple<FMB&, float&> operator[](const uint32_t i) {
-        return cuda::std::tie(fmbs_[i], log_weights_[i]);
+    __host__ FMBScene(size_t size) : size_{size}, fmbs_(size), log_weights_(size) {};
+
+    // Copy constructor from std::vector
+    // This enables easy construction from Python side
+    __host__ FMBScene<location>(const std::vector<FMB>& fmbs, const std::vector<float>& log_weights)
+        : size_{fmbs.size()}, fmbs_(fmbs.begin(), fmbs.end()),
+          log_weights_(log_weights.begin(), log_weights.end()) {
+        if (fmbs.size() != log_weights.size()) {
+            throw std::invalid_argument(
+                "FMBScene constructor: fmbs and log_weights must have the same size");
+        }
     }
 
-    CUDA_CALLABLE cuda::std::tuple<const FMB&, const float&> operator[](const uint32_t i) const {
-        return cuda::std::tie(fmbs_[i], log_weights_[i]);
+    CUDA_CALLABLE auto operator[](const uint32_t i) {
+        return cuda::std::make_tuple(fmbs_[i], log_weights_[i]);
     }
 
-    class Iterator {
-    private:
-        FMB* fmb_ptr_;
-        float* log_weight_ptr_;
-
-    public:
-        CUDA_CALLABLE Iterator(FMB* const fmb_ptr, float* const log_weight_ptr)
-            : fmb_ptr_{fmb_ptr}, log_weight_ptr_{log_weight_ptr} {}
-        CUDA_CALLABLE cuda::std::tuple<FMB&, float&> operator*() {
-            return cuda::std::tie(*fmb_ptr_, *log_weight_ptr_);
-        }
-        CUDA_CALLABLE bool operator!=(const Iterator& other) const {
-            return fmb_ptr_ != other.fmb_ptr_ || log_weight_ptr_ != other.log_weight_ptr_;
-        }
-        CUDA_CALLABLE Iterator& operator++() {
-            fmb_ptr_++, log_weight_ptr_++;
-            return *this;
-        }
-    };
-
-    class ConstIterator {
-    private:
-        const FMB* fmb_ptr_;
-        const float* log_weight_ptr_;
-
-    public:
-        CUDA_CALLABLE ConstIterator(const FMB* const fmb_ptr, const float* const log_weight_ptr)
-            : fmb_ptr_{fmb_ptr}, log_weight_ptr_{log_weight_ptr} {}
-        CUDA_CALLABLE cuda::std::tuple<const FMB&, const float&> operator*() const {
-            return cuda::std::tie(*fmb_ptr_, *log_weight_ptr_);
-        }
-        CUDA_CALLABLE bool operator!=(const ConstIterator& other) const {
-            return fmb_ptr_ != other.fmb_ptr_ || log_weight_ptr_ != other.log_weight_ptr_;
-        }
-        CUDA_CALLABLE ConstIterator& operator++() {
-            fmb_ptr_++, log_weight_ptr_++;
-            return *this;
-        }
-    };
+    CUDA_CALLABLE auto operator[](const uint32_t i) const {
+        return cuda::std::make_tuple(fmbs_[i], log_weights_[i]);
+    }
 
-    CUDA_CALLABLE Iterator begin() {
-        return Iterator(fmbs_, log_weights_);
+    CUDA_CALLABLE auto begin() {
+        return thrust::make_zip_iterator(fmbs_.begin(), log_weights_.begin());
     }
-    CUDA_CALLABLE Iterator end() {
-        return Iterator(fmbs_ + size_, log_weights_ + size_);
+    CUDA_CALLABLE auto end() {
+        return thrust::make_zip_iterator(fmbs_.end(), log_weights_.end());
     }
-    CUDA_CALLABLE ConstIterator begin() const {
-        return ConstIterator(fmbs_, log_weights_);
+    CUDA_CALLABLE auto begin() const {
+        return thrust::make_zip_iterator(fmbs_.begin(), log_weights_.begin());
     }
-    CUDA_CALLABLE ConstIterator end() const {
-        return ConstIterator(fmbs_ + size_, log_weights_ + size_);
+    CUDA_CALLABLE auto end() const {
+        return thrust::make_zip_iterator(fmbs_.end(), log_weights_.end());
     }
     CUDA_CALLABLE const FMB& get_fmb(uint32_t idx) const {
         return fmbs_[idx];
diff --git a/genmetaballs/src/genmetaballs/core/__init__.py b/genmetaballs/src/genmetaballs/core/__init__.py
@@ -10,7 +10,7 @@
     TwoParameterConfidence,
     ZeroParameterConfidence,
 )
-from genmetaballs._genmetaballs_bindings.fmb import CPUFMBScene, GPUFMBScene
+from genmetaballs._genmetaballs_bindings.fmb import FMB, CPUFMBScene, GPUFMBScene
 from genmetaballs._genmetaballs_bindings.image import CPUImage, GPUImage
 from genmetaballs._genmetaballs_bindings.utils import CPUFloatArray2D, GPUFloatArray2D, sigmoid
 
@@ -63,6 +63,19 @@ def make_fmb_scene(size: int, device: DeviceType) -> CPUFMBScene | GPUFMBScene:
         raise ValueError(f"Unsupported device type: {device}")
 
 
+# TODO: create a wrapper class for FMBScene and turn the factory functions into
+# class methods
+def make_fmb_scene_from_values(
+    fmbs: list[fmb.FMB], log_weights: list[float], device: DeviceType
+) -> CPUFMBScene | GPUFMBScene:
+    if device == "cpu":
+        return CPUFMBScene(fmbs, log_weights)
+    elif device == "gpu":
+        return GPUFMBScene(fmbs, log_weights)
+    else:
+        raise ValueError(f"Unsupported device type: {device}")
+
+
 __all__ = [
     "array2d_float",
     "ZeroParameterConfidence",
@@ -74,7 +87,10 @@ def make_fmb_scene(size: int, device: DeviceType) -> CPUFMBScene | GPUFMBScene:
     "intersector",
     "sigmoid",
     "FourParameterBlender",
+    "FMB",
+    "Intrinsics",
     "ThreeParameterBlender",
     "make_image",
     "make_fmb_scene",
+    "make_fmb_scene_from_values",
 ]
diff --git a/tests/python_tests/test_fmb.py b/tests/python_tests/test_fmb.py
@@ -3,7 +3,7 @@
 from scipy.spatial.distance import mahalanobis
 from scipy.spatial.transform import Rotation as Rot
 
-from genmetaballs.core import fmb, geometry, make_fmb_scene
+from genmetaballs.core import fmb, geometry, make_fmb_scene, make_fmb_scene_from_values
 
 FMB = fmb.FMB
 Pose, Vec3D, Rotation = geometry.Pose, geometry.Vec3D, geometry.Rotation
@@ -48,3 +48,33 @@ def test_fmb_scene_creation():
     gpu_scene = make_fmb_scene(20, device="gpu")
     assert isinstance(gpu_scene, fmb.GPUFMBScene)
     assert len(gpu_scene) == 20
+
+
+@pytest.mark.parametrize("device", ["cpu", "gpu"])
+def test_fmb_scene_creation_from_lists(rng, device):
+    fmbs = []
+    log_weights = []
+    gt_translations = []
+    gt_extents = []
+    num_balls = 15
+    for _ in range(num_balls):
+        quat = rng.uniform(size=4).astype(np.float32)
+        tran, extent = rng.uniform(size=(2, 3)).astype(np.float32)
+        pose = Pose.from_components(Rotation.from_quat(*quat), Vec3D(*tran))
+        fmbs.append(FMB(pose, *extent))
+        log_weights.append(rng.uniform())
+        gt_translations.append(tran)
+        gt_extents.append(extent)
+
+    scene = make_fmb_scene_from_values(fmbs, log_weights, device=device)
+
+    assert len(scene) == num_balls
+    # Verify that we can retrieve each FMB and log weight correctly
+    for i in range(num_balls):
+        fmb_i, log_weight = scene[i]
+        translation = fmb_i.pose.tran
+        assert np.allclose([translation.x, translation.y, translation.z], gt_translations[i])
+
+        fmb_extent = fmb_i.extent
+        assert np.allclose(fmb_extent, gt_extents[i])
+        assert np.isclose(log_weight, log_weights[i])