Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/src/python/memory_management.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ Memory Management
get_peak_memory
reset_peak_memory
get_cache_memory
get_active_resource_count
get_cache_count
set_memory_limit
set_cache_limit
set_wired_limit
Expand Down
4 changes: 4 additions & 0 deletions mlx/backend/common/buffer_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ class BufferCache {
return pool_size_;
}

// Number of buffers currently held in the pool (cf. cache_size(), which
// reports cached bytes). Added for buffer-count introspection
// (ml-explore/mlx-lm#1185).
size_t cache_count() const {
  return buffer_pool_.size();
}

// Page size this cache was configured with.
size_t page_size() const {
  return page_size_;
}
Expand Down
10 changes: 10 additions & 0 deletions mlx/backend/cuda/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,16 @@ void clear_cache() {
cu::allocator().clear_cache();
}

// Buffer-handle accounting exists for the Metal backend, where the number of
// live MTLBuffer objects can matter independently of the bytes they hold
// (descriptor pressure on M-series GPUs, see ml-explore/mlx-lm#1185). CUDA
// enforces its limits in bytes rather than handle counts, so there is nothing
// analogous to report here and both probes always return zero.
size_t get_active_resource_count() {
  return 0;
}

size_t get_cache_count() {
  return 0;
}

// Not supported in CUDA.
size_t set_wired_limit(size_t) {
return 0;
Expand Down
6 changes: 6 additions & 0 deletions mlx/backend/metal/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,12 @@ void reset_peak_memory() {
// Bytes currently held in the buffer cache.
size_t get_cache_memory() {
  return metal::allocator().get_cache_memory();
}
// Count of live MTLBuffer objects; per the contract in mlx/memory.h this
// includes both active and cached buffers (see ml-explore/mlx-lm#1185 for
// the descriptor-pressure motivation).
size_t get_active_resource_count() {
  return metal::allocator().get_active_resource_count();
}
// Count of buffers currently sitting in the buffer cache.
size_t get_cache_count() {
  return metal::allocator().get_cache_count();
}
// Empty the buffer cache (public contract documented in mlx/memory.h).
void clear_cache() {
  return metal::allocator().clear_cache();
}
Expand Down
6 changes: 6 additions & 0 deletions mlx/backend/metal/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ class MetalAllocator : public allocator::Allocator {
// Bytes currently held by the buffer cache.
size_t get_cache_memory() {
  return buffer_cache_.cache_size();
};
// Number of buffers tracked by this allocator. num_resources_ is maintained
// elsewhere in this class — presumably incremented on alloc and decremented
// on release (active + cached per mlx/memory.h); verify against malloc/free.
size_t get_active_resource_count() {
  return num_resources_;
};
// Number of buffers currently sitting in the buffer cache.
size_t get_cache_count() {
  return buffer_cache_.cache_count();
};
size_t set_cache_limit(size_t limit);
size_t set_memory_limit(size_t limit);
size_t get_memory_limit();
Expand Down
12 changes: 12 additions & 0 deletions mlx/memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,18 @@ MLX_API void reset_peak_memory();
* */
MLX_API size_t get_cache_memory();

/* Get the total number of GPU buffers currently allocated by mlx. Note that
 * despite the "active" in the name, the count includes cached buffers as
 * well as buffers in active use.
 *
 * On Metal this is the count of MTLBuffer objects, useful for diagnosing
 * descriptor-pressure bugs (see ml-explore/mlx-lm#1185). CUDA returns 0
 * (the resource limit on CUDA is bytes, not handles).
 * */
MLX_API size_t get_active_resource_count();

/* Get the number of GPU buffers currently sitting in the buffer cache. */
MLX_API size_t get_cache_count();

/* Set the memory limit.
* The memory limit is a guideline for the maximum amount of memory to use
* during graph evaluation. If the memory limit is exceeded and there is no
Expand Down
20 changes: 20 additions & 0 deletions python/src/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,24 @@ void init_memory(nb::module_& m) {

After calling this, :func:`get_cache_memory` should return ``0``.
)pbdoc");
m.def(
"get_active_resource_count",
&mx::get_active_resource_count,
R"pbdoc(
Get the total number of GPU buffers (active + cached) currently
allocated by mlx.

On Metal this is the count of MTLBuffer objects, useful for diagnosing
descriptor-pressure bugs such as ml-explore/mlx-lm#1185. CUDA returns
``0`` (the resource limit on CUDA is bytes, not handles).
)pbdoc");
m.def(
"get_cache_count",
&mx::get_cache_count,
R"pbdoc(
Get the number of GPU buffers currently sitting in the buffer cache.

Useful in conjunction with :func:`get_active_resource_count` for
debugging descriptor-pressure issues. Returns ``0`` on CUDA.
)pbdoc");
}
27 changes: 27 additions & 0 deletions tests/allocator_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "doctest/doctest.h"

#include "mlx/allocator.h"
#include "mlx/memory.h"

using namespace mlx::core;

Expand Down Expand Up @@ -39,3 +40,29 @@ TEST_CASE("test large allocations") {
allocator::free(buffer);
}
}

TEST_CASE("buffer-count introspection probes track alloc/free/cache state") {
  // Probes added for ml-explore/mlx-lm#1185, where descriptor pressure was
  // diagnosed only after a crash because there was no way to observe the
  // MTLBuffer count from Python. These checks verify the probes report
  // sensible deltas under a controlled alloc/free pattern. They do not
  // depend on any specific eviction policy — only that the counters are
  // monotonic with respect to alloc/free/cache operations.

  clear_cache();
  size_t active_baseline = get_active_resource_count();
  // After clear_cache() the cache must be empty. Asserting this explicitly
  // also keeps the post-free cache check below from being vacuous: a
  // CHECK_GE against an unsigned baseline of 0 would always pass.
  CHECK_EQ(get_cache_count(), 0u);

  auto buffer = allocator::malloc(1 << 20); // 1 MB
  size_t active_after_alloc = get_active_resource_count();
  // After a fresh malloc the active count must have grown by at least 1.
  CHECK_GE(active_after_alloc, active_baseline + 1);

  allocator::free(buffer);
  // After free, the buffer either landed in the cache or was released.
  // In both cases the total resource count cannot have grown, and anything
  // that was cached must show up in the cache count.
  CHECK_LE(get_active_resource_count(), active_after_alloc);
  size_t cached_after_free = get_cache_count();
  // If the buffer was cached, the cache count reflects it; if it was
  // released outright the count stayed at zero. Either way it is bounded
  // by the number of buffers freed so far in this test.
  CHECK_LE(cached_after_free, active_after_alloc);

  clear_cache();
  // clear_cache must drop cache count to zero.
  CHECK_EQ(get_cache_count(), 0u);
}