forked from microsoft/snmalloc
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlotsofthread.cc
More file actions
131 lines (114 loc) · 2.99 KB
/
lotsofthread.cc
File metadata and controls
131 lines (114 loc) · 2.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/**
* This benchmark is based on
* https://github.com/microsoft/mimalloc/issues/1002#issuecomment-2630410617
*
 * It causes large batches of memory to be freed on a remote thread, and puts
 * many aspects of the backend under contention.
*
* The benchmark has a single freeing thread, and many allocating threads. The
* allocating threads communicate using a shared list of memory to free, which
* is protected by a mutex. This causes interesting batch behaviour which
* triggered a bug in the linux backend.
*/
#include <assert.h>
#include <atomic>
#include <mutex>
#include <stdio.h>
#include <stdlib.h>
#include <thread>
#include <vector>
using namespace std;
#include <test/snmalloc_testlib.h>
#define malloc snmalloc::libc::malloc
#define free snmalloc::libc::free
#define malloc_usable_size snmalloc::libc::malloc_usable_size
// Guards global_tofree_list; taken by every allocating thread when it
// publishes a batch, and by the freeing thread when it drains the list.
std::mutex global_tofree_list_mtx;
// Shared hand-off list: allocating threads push pointers here in batches,
// and the single freeing thread (freeloop) frees everything on it.
std::vector<void*> global_tofree_list;
// Set non-zero by main() after all allocating threads have joined;
// signals freeloop to drain one final time and exit.
std::atomic_int mustexit;
/**
 * Consumer loop run by the single freeing thread.
 *
 * Repeatedly takes the shared list lock, frees every pointer currently on
 * global_tofree_list, and clears it. Tracks the largest batch (in bytes,
 * as reported by malloc_usable_size) drained in a single pass and prints
 * each new maximum so the cross-thread batch behaviour is observable.
 *
 * Exits once `mustexit` is observed non-zero; the exit check happens after
 * a drain, so any batch published before the flag was set is still freed.
 */
void freeloop()
{
  size_t max_list_bytes = 0;
  while (1)
  {
    std::lock_guard<std::mutex> guard{global_tofree_list_mtx};
    size_t list_bytes = 0;
    for (auto& p : global_tofree_list)
    {
      list_bytes += malloc_usable_size(p);
      free(p);
    }
    global_tofree_list.clear();
    if (list_bytes > max_list_bytes)
    {
      // list_bytes is size_t (unsigned): the matching conversion is %zu.
      // %zd is the conversion for the signed ssize_t and is undefined
      // behaviour when handed an out-of-range unsigned value.
      printf("%zu bytes\n", list_bytes);
      max_list_bytes = list_bytes;
    }
    if (mustexit)
      return;
  }
}
/**
 * Producer loop run by each allocating thread.
 *
 * Allocates blocks of int whose element counts cycle through powers of
 * two (2^0 .. 2^19), writes to each block to force it to be touched, and
 * stashes the pointers locally. The stash is published to the shared
 * to-free list in batches (whenever it exceeds 100 entries, and once at
 * the end), so the freeing thread sees bursty, batched remote frees.
 */
void looper(size_t iterations)
{
  // Thread-local stash of pointers awaiting hand-off to the freer.
  std::vector<void*> pending;

  // Publish the whole stash to the shared list under the lock, then
  // empty the stash (clearing happens outside the critical section).
  auto publish_batch = [&]() {
    {
      std::lock_guard<std::mutex> hold{global_tofree_list_mtx};
      for (auto& ptr : pending)
        global_tofree_list.push_back(ptr);
    }
    pending.clear();
  };

  // Stash one pointer; publish as soon as the batch grows past 100.
  auto enqueue_free = [&](void* ptr) {
    pending.push_back(ptr);
    if (pending.size() > 100)
      publish_batch();
  };

  for (size_t round = 0; round < iterations; ++round)
  {
    // Element count cycles 1, 2, 4, ..., 2^19, then wraps.
    size_t elems = snmalloc::bits::one_at_bit(round % 20);
    for (size_t rep = 0; rep < 8; rep++)
    {
      auto block = (int*)malloc(elems * sizeof(int));
      if (block == nullptr)
        continue;
      *block = 1523;
      enqueue_free(block);
    }
  }
  publish_batch();
}
int main()
{
#ifdef SNMALLOC_THREAD_SANITIZER_ENABLED
size_t iterations = 50000;
#elif defined(__APPLE__) && !defined(SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS)
size_t iterations = 50000;
#elif defined(WIN32)
size_t iterations = 50000;
#else
size_t iterations = 200000;
#endif
#ifndef NDEBUG
// Debug builds run with full instrumentation enabled and are
// ~10x slower per iteration. The cross-thread batch behaviour
// this benchmark stresses is observable at much lower counts;
// reduce iterations so this test does not dominate Debug ctest
// wall-time. Release builds are unaffected.
iterations /= 10;
#endif
int threadcount = 8;
vector<thread> threads;
for (int i = 0; i < threadcount; ++i)
threads.emplace_back(looper, iterations);
std::thread freeloop_thread(freeloop);
for (auto& thread : threads)
{
thread.join();
}
mustexit.store(1);
freeloop_thread.join();
puts("Done!");
return 0;
}