Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/src/python/memory_management.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ Memory Management
get_peak_memory
reset_peak_memory
get_cache_memory
get_active_resource_count
get_cache_count
set_memory_limit
set_cache_limit
set_wired_limit
Expand Down
4 changes: 4 additions & 0 deletions mlx/backend/common/buffer_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ class BufferCache {
return pool_size_;
}

// Number of buffers currently held in the pool (cf. cache_size(), which
// reports cached bytes). Added for buffer-count introspection
// (ml-explore/mlx-lm#1185).
size_t cache_count() const {
  return buffer_pool_.size();
}

// Page size this cache was configured with.
size_t page_size() const {
  return page_size_;
}
Expand Down
10 changes: 10 additions & 0 deletions mlx/backend/cuda/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,16 @@ void clear_cache() {
cu::allocator().clear_cache();
}

// Buffer-handle accounting exists for the Metal backend, where the number of
// live MTLBuffer objects can matter independently of the bytes they hold
// (descriptor pressure on M-series GPUs, see ml-explore/mlx-lm#1185). CUDA
// enforces its limits in bytes rather than handle counts, so there is nothing
// analogous to report here and both probes always return zero.
size_t get_active_resource_count() {
  return 0;
}

size_t get_cache_count() {
  return 0;
}

// Not supported in CUDA.
size_t set_wired_limit(size_t) {
return 0;
Expand Down
6 changes: 6 additions & 0 deletions mlx/backend/metal/allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,12 @@ void reset_peak_memory() {
// Bytes currently held in the buffer cache.
size_t get_cache_memory() {
  return metal::allocator().get_cache_memory();
}
// Count of live MTLBuffer objects; per the contract in mlx/memory.h this
// includes both active and cached buffers (see ml-explore/mlx-lm#1185 for
// the descriptor-pressure motivation).
size_t get_active_resource_count() {
  return metal::allocator().get_active_resource_count();
}
// Count of buffers currently sitting in the buffer cache.
size_t get_cache_count() {
  return metal::allocator().get_cache_count();
}
// Empty the buffer cache (public contract documented in mlx/memory.h).
void clear_cache() {
  return metal::allocator().clear_cache();
}
Expand Down
6 changes: 6 additions & 0 deletions mlx/backend/metal/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ class MetalAllocator : public allocator::Allocator {
// Bytes currently held by the buffer cache.
size_t get_cache_memory() {
  return buffer_cache_.cache_size();
};
// Number of buffers tracked by this allocator. num_resources_ is maintained
// elsewhere in this class — presumably incremented on alloc and decremented
// on release (active + cached per mlx/memory.h); verify against malloc/free.
size_t get_active_resource_count() {
  return num_resources_;
};
// Number of buffers currently sitting in the buffer cache.
size_t get_cache_count() {
  return buffer_cache_.cache_count();
};
size_t set_cache_limit(size_t limit);
size_t set_memory_limit(size_t limit);
size_t get_memory_limit();
Expand Down
12 changes: 12 additions & 0 deletions mlx/memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,18 @@ MLX_API void reset_peak_memory();
* */
MLX_API size_t get_cache_memory();

/* Get the total number of GPU buffers currently allocated by mlx. Note that
 * despite the "active" in the name, the count includes cached buffers as
 * well as buffers in active use.
 *
 * On Metal this is the count of MTLBuffer objects, useful for diagnosing
 * descriptor-pressure bugs (see ml-explore/mlx-lm#1185). CUDA returns 0
 * (the resource limit on CUDA is bytes, not handles).
 * */
MLX_API size_t get_active_resource_count();

/* Get the number of GPU buffers currently sitting in the buffer cache. */
MLX_API size_t get_cache_count();

/* Set the memory limit.
* The memory limit is a guideline for the maximum amount of memory to use
* during graph evaluation. If the memory limit is exceeded and there is no
Expand Down
20 changes: 20 additions & 0 deletions python/src/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,24 @@ void init_memory(nb::module_& m) {

After calling this, :func:`get_cache_memory` should return ``0``.
)pbdoc");
m.def(
"get_active_resource_count",
&mx::get_active_resource_count,
R"pbdoc(
Get the total number of GPU buffers (active + cached) currently
allocated by mlx.

On Metal this is the count of MTLBuffer objects, useful for diagnosing
descriptor-pressure bugs such as ml-explore/mlx-lm#1185. CUDA returns
``0`` (the resource limit on CUDA is bytes, not handles).
)pbdoc");
m.def(
"get_cache_count",
&mx::get_cache_count,
R"pbdoc(
Get the number of GPU buffers currently sitting in the buffer cache.

Useful in conjunction with :func:`get_active_resource_count` for
debugging descriptor-pressure issues. Returns ``0`` on CUDA.
)pbdoc");
}
27 changes: 27 additions & 0 deletions tests/allocator_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "doctest/doctest.h"

#include "mlx/allocator.h"
#include "mlx/memory.h"

using namespace mlx::core;

Expand Down Expand Up @@ -39,3 +40,29 @@ TEST_CASE("test large allocations") {
allocator::free(buffer);
}
}

TEST_CASE("buffer-count introspection probes track alloc/free/cache state") {
  // Probes added for ml-explore/mlx-lm#1185, where descriptor pressure was
  // diagnosed only after a crash because there was no way to observe the
  // MTLBuffer count from Python. These checks verify the probes report
  // sensible deltas under a controlled alloc/free pattern. They do not
  // depend on any specific eviction policy — only that the counters are
  // monotonic with respect to alloc/free/cache operations.

  clear_cache();
  size_t active_baseline = get_active_resource_count();
  // After clear_cache() the cache must be empty. Asserting this explicitly
  // also keeps the post-free cache check below from being vacuous: a
  // CHECK_GE against an unsigned baseline of 0 would always pass.
  CHECK_EQ(get_cache_count(), 0u);

  auto buffer = allocator::malloc(1 << 20); // 1 MB
  size_t active_after_alloc = get_active_resource_count();
  // After a fresh malloc the active count must have grown by at least 1.
  CHECK_GE(active_after_alloc, active_baseline + 1);

  allocator::free(buffer);
  // After free, the buffer either landed in the cache or was released.
  // In both cases the total resource count cannot have grown, and anything
  // that was cached must show up in the cache count.
  CHECK_LE(get_active_resource_count(), active_after_alloc);
  size_t cached_after_free = get_cache_count();
  // If the buffer was cached, the cache count reflects it; if it was
  // released outright the count stayed at zero. Either way it is bounded
  // by the number of buffers freed so far in this test.
  CHECK_LE(cached_after_free, active_after_alloc);

  clear_cache();
  // clear_cache must drop cache count to zero.
  CHECK_EQ(get_cache_count(), 0u);
}