diff --git a/CMakeLists.txt b/CMakeLists.txt index 6dbf75efc..7b7aaba15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -568,10 +568,11 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) if (${TEST} MATCHES "release-.*") message(VERBOSE "Adding test: ${TESTNAME} only for release configs") - add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} CONFIGURATIONS "Release") + add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} --smoke + CONFIGURATIONS "Release") else() message(VERBOSE "Adding test: ${TESTNAME}") - add_test(${TESTNAME} ${TESTNAME}) + add_test(NAME ${TESTNAME} COMMAND ${TESTNAME} --smoke) endif() if (${TEST_CATEGORY} MATCHES "perf") message(VERBOSE "Single threaded test: ${TESTNAME}") diff --git a/src/test/func/memory/memory.cc b/src/test/func/memory/memory.cc index 9bf335087..253628282 100644 --- a/src/test/func/memory/memory.cc +++ b/src/test/func/memory/memory.cc @@ -462,7 +462,7 @@ void test_static_sized_alloc() test_static_sized_alloc(); } -template +template void test_static_sized_allocs() { if (max_size < 16) @@ -554,6 +554,11 @@ int main(int, char**) } #endif auto start = std::chrono::steady_clock::now(); + // Most tests below have substantial internal iteration (size-class + // sweeps, per-offset loops, batch alloc/dealloc), so a large outer + // repetition is redundant for coverage. A small outer count still + // catches consolidation/leak issues that only manifest across + // repeated entry to a test. 
#define TEST(testname) \ do \ { \ @@ -561,7 +566,7 @@ int main(int, char**) auto diff_seconds = \ std::chrono::duration_cast(end - start).count(); \ std::cout << "Running " #testname << " @ " << diff_seconds << std::endl; \ - for (size_t i = 0; i < 50; i++) \ + for (size_t i = 0; i < 3; i++) \ testname(); \ } while (0); @@ -574,7 +579,13 @@ int main(int, char**) TEST(test_calloc_large_bug); TEST(test_external_pointer_stack); TEST(test_external_pointer_dealloc_bug); - TEST(test_external_pointer_large); + // test_external_pointer_large allocates ~16MB per object across 32 + // objects (~512MB total) and walks every 16MB-aligned interior + // pointer. It is its own internal stress; running it once is + // enough, so it is invoked outside the TEST(...) outer-repeat + // macro. + std::cout << "Running test_external_pointer_large (single pass)" << std::endl; + test_external_pointer_large(); TEST(test_external_pointer); TEST(test_alloc_16M); TEST(test_calloc_16M); diff --git a/src/test/perf/contention/contention.cc b/src/test/perf/contention/contention.cc index bca2a4889..ac1e6acb5 100644 --- a/src/test/perf/contention/contention.cc +++ b/src/test/perf/contention/contention.cc @@ -162,8 +162,15 @@ int main(int argc, char** argv) opt::Opt opt(argc, argv); size_t cores = opt.is("--cores", 8); - size_t count = opt.is("--swapcount", 1 << 20); - size_t size = opt.is("--swapsize", 1 << 18); + // `--smoke` lowers the *defaults* for the iteration knobs so ctest + // runs at modest cost. Explicit `--swapcount` / `--swapsize` on the + // command line still win. The smoke values must remain large + // enough to cross the remote-deallocation cache thresholds + // (otherwise `mem/remotecache.h` and `mem/remoteallocator.h` + // coverage drops sharply). + bool smoke = opt.has("--smoke"); + size_t count = opt.is("--swapcount", smoke ? 1u << 18 : 1u << 20); + size_t size = opt.is("--swapsize", smoke ? 
1u << 16 : 1u << 18); use_malloc = opt.has("--use_malloc"); std::cout << "Allocator is " << (use_malloc ? "System" : "snmalloc") diff --git a/src/test/perf/external_pointer/externalpointer.cc b/src/test/perf/external_pointer/externalpointer.cc index 07e69cef9..a15c27ffc 100644 --- a/src/test/perf/external_pointer/externalpointer.cc +++ b/src/test/perf/external_pointer/externalpointer.cc @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -50,21 +51,8 @@ namespace test snmalloc::debug_check_empty(); } - void test_external_pointer(xoroshiro::p128r64& r) + void test_external_pointer(xoroshiro::p128r64& r, size_t iterations) { - // This is very slow on Windows at the moment. Until this is fixed, help - // CI terminate. -#if defined(NDEBUG) && !defined(_MSC_VER) - static constexpr size_t iterations = 10000000; -#else -# ifdef _MSC_VER - // Windows Debug build is very slow on this test. - // Reduce complexity to balance CI times. - static constexpr size_t iterations = 50000; -# else - static constexpr size_t iterations = 100000; -# endif -#endif setup(r); { @@ -93,15 +81,38 @@ namespace test } } -int main(int, char**) +int main(int argc, char** argv) { setup(); - xoroshiro::p128r64 r; + opt::Opt opt(argc, argv); + + // Default iteration count varies by build (Release runs many more + // iterations). Smoke mode replaces whichever build default applies with + // one small count meant to exercise every interior-pointer dispatch path. + size_t cli_default; + // This is very slow on Windows at the moment. Until this is fixed, help + // CI terminate. +#if defined(NDEBUG) && !defined(_MSC_VER) + cli_default = 10000000; +#elif defined(_MSC_VER) + // Windows Debug build is very slow on this test. + // Reduce complexity to balance CI times. + cli_default = 50000; +#else + cli_default = 100000; +#endif + size_t iterations = opt.has("--smoke") ? 10000 : cli_default; - size_t nn = snmalloc::Debug ? 30 : 3; + // Outer-repeat count: Debug repeats 30x to amortise setup, Release 3x. 
+ // Smoke mode uses one repeat in both Debug and Release; that is enough to + // hit every path since `test::setup(r)` re-randomises the object table each call. + size_t nn_default = snmalloc::Debug ? 30 : 3; + size_t nn = opt.has("--smoke") ? 1 : nn_default; + + xoroshiro::p128r64 r; for (size_t n = 0; n < nn; n++) - test::test_external_pointer(r); + test::test_external_pointer(r, iterations); return 0; } diff --git a/src/test/perf/large_alloc/large_alloc.cc b/src/test/perf/large_alloc/large_alloc.cc index b0f0f2bc8..5d3db7c64 100644 --- a/src/test/perf/large_alloc/large_alloc.cc +++ b/src/test/perf/large_alloc/large_alloc.cc @@ -1,19 +1,19 @@ #include +#include #include #include using namespace snmalloc; static constexpr size_t ALLOC_SIZE = 800 * 1024; // 800 KB -static constexpr size_t ITERATIONS = 100000; -void test_alloc_dealloc_cycle() +void test_alloc_dealloc_cycle(size_t iterations) { { MeasureTime m; - m << "Alloc/dealloc 800KB x " << ITERATIONS; + m << "Alloc/dealloc 800KB x " << iterations; - for (size_t i = 0; i < ITERATIONS; i++) + for (size_t i = 0; i < iterations; i++) { void* p = snmalloc::alloc(ALLOC_SIZE); SNMALLOC_CHECK(p != nullptr); @@ -24,7 +24,7 @@ void test_alloc_dealloc_cycle() snmalloc::debug_check_empty(); } -void test_batch_alloc_then_dealloc() +void test_batch_alloc_then_dealloc(size_t iterations) { static constexpr size_t BATCH = 128; @@ -32,7 +32,7 @@ void test_batch_alloc_then_dealloc() MeasureTime m; m << "Batch alloc then dealloc 800KB x " << BATCH; - for (size_t j = 0; j < ITERATIONS / BATCH; j++) + for (size_t j = 0; j < iterations / BATCH; j++) { for (size_t i = 0; i < BATCH; i++) { @@ -49,13 +49,13 @@ void test_batch_alloc_then_dealloc() snmalloc::debug_check_empty(); } -void test_alloc_dealloc_with_touch() +void test_alloc_dealloc_with_touch(size_t iterations) { { MeasureTime m; - m << "Alloc/touch/dealloc 800KB x " << ITERATIONS; + m << "Alloc/touch/dealloc 800KB x " << iterations; - for (size_t i = 0; i < ITERATIONS; i++) + for (size_t i = 0; i < 
iterations; i++) { char* p = static_cast(snmalloc::alloc(ALLOC_SIZE)); SNMALLOC_CHECK(p != nullptr); @@ -71,13 +71,20 @@ void test_alloc_dealloc_with_touch() snmalloc::debug_check_empty(); } -int main(int, char**) +int main(int argc, char** argv) { setup(); - test_alloc_dealloc_cycle(); - test_batch_alloc_then_dealloc(); - test_alloc_dealloc_with_touch(); + opt::Opt opt(argc, argv); + // Each test does alloc/dealloc cycles driven by `iterations`. The + // batch test divides by BATCH=128, so the smoke value is chosen so + // that `smoke / 128 >= 1` (i.e. the batch test still runs at least + // one full batch round). + size_t iterations = opt.has("--smoke") ? 8192 : 100000; + + test_alloc_dealloc_cycle(iterations); + test_batch_alloc_then_dealloc(iterations); + test_alloc_dealloc_with_touch(iterations); return 0; } diff --git a/src/test/perf/lotsofthreads/lotsofthread.cc b/src/test/perf/lotsofthreads/lotsofthread.cc index 9705dfff3..b0ab56373 100644 --- a/src/test/perf/lotsofthreads/lotsofthread.cc +++ b/src/test/perf/lotsofthreads/lotsofthread.cc @@ -101,6 +101,14 @@ int main() #else size_t iterations = 200000; #endif +#ifndef NDEBUG + // Debug builds run with full instrumentation enabled and are + // ~10x slower per iteration. The cross-thread batch behaviour + // this benchmark stresses is observable at much lower counts; + // reduce iterations so this test does not dominate Debug ctest + // wall-time. Release builds are unaffected. + iterations /= 10; +#endif int threadcount = 8; vector threads; diff --git a/src/test/perf/memcpy/memcpy.cc b/src/test/perf/memcpy/memcpy.cc index 6a8928c52..e554106c0 100644 --- a/src/test/perf/memcpy/memcpy.cc +++ b/src/test/perf/memcpy/memcpy.cc @@ -25,9 +25,15 @@ size_t my_random() std::vector allocs; +// Number of distinct destination buffers per size class. Each `test()` +// call iterates over every entry in `allocs` and runs the memcpy +// implementation under measurement, so this is the per-size repeat +// count. 
Set by `main()` from `--smoke`. +size_t allocs_per_size = 1000; + void shape(size_t size) { - for (size_t i = 0; i < 1000; i++) + for (size_t i = 0; i < allocs_per_size; i++) { auto rsize = size * 2; auto offset = 0; @@ -70,6 +76,12 @@ void test( { auto src = snmalloc::alloc(size); shape(size); + // The outer loop is a measurement-variance loop, not a coverage knob: + // it gathers ten timing samples per size for the perf statistics. + // Under `--smoke` it still runs ten times, but each `test_memcpy` + // call exercises only `allocs_per_size` (smoke value) memcpys, so the + // total work is small. Coverage is unaffected because every code path + // is hit on the first pass. for (size_t i = 0; i < 10; i++) { MeasureTime m(true); @@ -108,6 +120,12 @@ int main(int argc, char** argv) opt::Opt opt(argc, argv); bool full_test = opt.has("--full_test"); + // Number of destination buffers per size class. Smoke mode shrinks + // it dramatically because each `test()` call already runs ten + // measurement passes per size, which is more than enough to exercise + // every memcpy code path. + allocs_per_size = opt.has("--smoke") ? 100 : 1000; + // size_t size = 0; auto mc_platform_checked = [](void* dst, const void* src, size_t len) { memcpy_platform_checked(dst, src, len); diff --git a/src/test/perf/msgpass/msgpass.cc b/src/test/perf/msgpass/msgpass.cc index e7b455541..b8c0d9d2b 100644 --- a/src/test/perf/msgpass/msgpass.cc +++ b/src/test/perf/msgpass/msgpass.cc @@ -191,10 +191,16 @@ int main(int argc, char** argv) struct params param; opt::Opt opt(argc, argv); + // `--smoke` lowers the *default* per-producer batch count so ctest + // runs at modest cost. Explicit `--batches` on the command line + // still wins. The smoke value must remain large enough for the + // cross-thread remote-deallocation cache thresholds in + // `mem/remotecache.h` / `mem/remoteallocator.h` to fire. + size_t batches_default = opt.has("--smoke") ? 
1u << 18 : 1024 * 1024; param.N_PRODUCER = opt.is("--producers", 3); param.N_CONSUMER = opt.is("--consumers", 3); param.N_PROXY = opt.is("--proxies", 2); - param.N_PRODUCER_BATCH = opt.is("--batches", 1024 * 1024); + param.N_PRODUCER_BATCH = opt.is("--batches", batches_default); param.N_MAX_OUTSTANDING = opt.is("--max-out", 4 * 1024); param.N_MAX_BATCH_SIZE = opt.is("--max-batch", 16); diff --git a/src/test/perf/singlethread/singlethread.cc b/src/test/perf/singlethread/singlethread.cc index bf173969d..b02643e7a 100644 --- a/src/test/perf/singlethread/singlethread.cc +++ b/src/test/perf/singlethread/singlethread.cc @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -62,24 +63,31 @@ void test_alloc_dealloc(size_t count, size_t size, bool write) snmalloc::debug_check_empty(); } -int main(int, char**) +int main(int argc, char** argv) { setup(); + opt::Opt opt(argc, argv); + // Default `count` exercises sizeclass dispatch many times; under + // `--smoke` we keep one alloc/dealloc cycle through every code + // path but cut the bulk repetitions. + size_t count_small = opt.has("--smoke") ? 1u << 12 : 1u << 15; + size_t count_large = opt.has("--smoke") ? 
1u << 8 : 1u << 10; + for (size_t size = 16; size <= 128; size <<= 1) { - test_alloc_dealloc(1 << 15, size, false); - test_alloc_dealloc(1 << 15, size, true); - test_alloc_dealloc(1 << 15, size, false); - test_alloc_dealloc(1 << 15, size, true); + test_alloc_dealloc(count_small, size, false); + test_alloc_dealloc(count_small, size, true); + test_alloc_dealloc(count_small, size, false); + test_alloc_dealloc(count_small, size, true); } for (size_t size = 1 << 12; size <= 1 << 17; size <<= 1) { - test_alloc_dealloc(1 << 10, size, false); - test_alloc_dealloc(1 << 10, size, true); - test_alloc_dealloc(1 << 10, size, false); - test_alloc_dealloc(1 << 10, size, true); + test_alloc_dealloc(count_large, size, false); + test_alloc_dealloc(count_large, size, true); + test_alloc_dealloc(count_large, size, false); + test_alloc_dealloc(count_large, size, true); } return 0;