From 4f847581b1f31cc171d6fbdcea9a3c605566a89d Mon Sep 17 00:00:00 2001
From: Francesco Montorsi <francesco.montorsi@gmail.com>
Date: Tue, 13 Aug 2019 11:19:36 +0200
Subject: [PATCH 01/11] Implement a very simple zero-lock message pool to test
 performance gains by completely removing malloc/free from the hot path of
 benchmark util

---
 include/atomicops.h         | 676 +++++++++++++++++++++++++++
 include/readerwriterqueue.h | 906 ++++++++++++++++++++++++++++++++++++
 perf/remote_thr.cpp         | 108 +++++
 src/msg.cpp                 |  28 +-
 4 files changed, 1717 insertions(+), 1 deletion(-)
 create mode 100644 include/atomicops.h
 create mode 100644 include/readerwriterqueue.h
diff --git a/include/atomicops.h b/include/atomicops.h
new file mode 100644
index 0000000000..4fd1748293
--- /dev/null
+++ b/include/atomicops.h
@@ -0,0 +1,676 @@
+﻿// ©2013-2016 Cameron Desrochers.
+// Distributed under the simplified BSD license (see the license file that
+// should have come with this header).
+// Uses Jeff Preshing's semaphore implementation (under the terms of its
+// separate zlib license, embedded below).
+
+#pragma once
+
+// Provides portable (VC++2010+, Intel ICC 13, GCC 4.7+, and anything C++11 compliant) implementation
+// of low-level memory barriers, plus a few semi-portable utility macros (for inlining and alignment).
+// Also has a basic atomic type (limited to hardware-supported atomics with no memory ordering guarantees).
+// Uses the AE_* prefix for macros (historical reasons), and the "moodycamel" namespace for symbols.
+
+#include <cassert>
+#include <type_traits>
+#include <cerrno>
+#include <cstdint>
+#include <ctime>
+
+// Platform detection
+#if defined(__INTEL_COMPILER)
+#define AE_ICC
+#elif defined(_MSC_VER)
+#define AE_VCPP
+#elif defined(__GNUC__)
+#define AE_GCC
+#endif
+
+#if defined(_M_IA64) || defined(__ia64__)
+#define AE_ARCH_IA64
+#elif defined(_WIN64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64__)
+#define AE_ARCH_X64
+#elif defined(_M_IX86) || defined(__i386__)
+#define AE_ARCH_X86
+#elif defined(_M_PPC) || defined(__powerpc__)
+#define AE_ARCH_PPC
+#else
+#define AE_ARCH_UNKNOWN
+#endif
+
+
+// AE_UNUSED: evaluates and discards its argument; the argument is parenthesized so expressions such as `a + b` expand safely
+#define AE_UNUSED(x) ((void)(x))
+
+// AE_NO_TSAN
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define AE_NO_TSAN __attribute__((no_sanitize("thread")))
+#else
+#define AE_NO_TSAN
+#endif
+#else
+#define AE_NO_TSAN
+#endif
+
+
+// AE_FORCEINLINE
+#if defined(AE_VCPP) || defined(AE_ICC)
+#define AE_FORCEINLINE __forceinline
+#elif defined(AE_GCC)
+//#define AE_FORCEINLINE __attribute__((always_inline)) 
+#define AE_FORCEINLINE inline
+#else
+#define AE_FORCEINLINE inline
+#endif
+
+
+// AE_ALIGN
+#if defined(AE_VCPP) || defined(AE_ICC)
+#define AE_ALIGN(x) __declspec(align(x))
+#elif defined(AE_GCC)
+#define AE_ALIGN(x) __attribute__((aligned(x)))
+#else
+// Assume GCC compliant syntax...
+#define AE_ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+
+// Portable atomic fences implemented below:
+
+namespace moodycamel {
+
+enum memory_order {
+	memory_order_relaxed,
+	memory_order_acquire,
+	memory_order_release,
+	memory_order_acq_rel,
+	memory_order_seq_cst,
+
+	// memory_order_sync: Forces a full sync:
+	// #LoadLoad, #LoadStore, #StoreStore, and most significantly, #StoreLoad
+	memory_order_sync = memory_order_seq_cst
+};
+
+}    // end namespace moodycamel
+
+#if (defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))) || (defined(AE_ICC) && __INTEL_COMPILER < 1600)
+// VS2010 and ICC13 don't support std::atomic_*_fence, implement our own fences
+
+#include <intrin.h>
+
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+#define AeFullSync _mm_mfence
+#define AeLiteSync _mm_mfence
+#elif defined(AE_ARCH_IA64)
+#define AeFullSync __mf
+#define AeLiteSync __mf
+#elif defined(AE_ARCH_PPC)
+#include <ppcintrinsics.h>
+#define AeFullSync __sync
+#define AeLiteSync __lwsync
+#endif
+
+
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4365)		// Disable erroneous 'conversion from long to unsigned int, signed/unsigned mismatch' error when using `assert`
+#ifdef __cplusplus_cli
+#pragma managed(push, off)
+#endif
+#endif
+
+namespace moodycamel {
+
+AE_FORCEINLINE void compiler_fence(memory_order order) AE_NO_TSAN
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: _ReadBarrier(); break;
+		case memory_order_release: _WriteBarrier(); break;
+		case memory_order_acq_rel: _ReadWriteBarrier(); break;
+		case memory_order_seq_cst: _ReadWriteBarrier(); break;
+		default: assert(false);
+	}
+}
+
+// x86/x64 have a strong memory model -- all loads and stores have
+// acquire and release semantics automatically (so only need compiler
+// barriers for those).
+#if defined(AE_ARCH_X86) || defined(AE_ARCH_X64)
+AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN
+{
+	switch (order) {
+		case memory_order_relaxed: break;
+		case memory_order_acquire: _ReadBarrier(); break;
+		case memory_order_release: _WriteBarrier(); break;
+		case memory_order_acq_rel: _ReadWriteBarrier(); break;
+		case memory_order_seq_cst:
+			_ReadWriteBarrier();
+			AeFullSync();
+			_ReadWriteBarrier();
+			break;
+		default: assert(false);
+	}
+}
+#else
+AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN
+{
+	// Non-specialized arch, use heavier memory barriers everywhere just in case :-(
+	switch (order) {
+		case memory_order_relaxed:
+			break;
+		case memory_order_acquire:
+			_ReadBarrier();
+			AeLiteSync();
+			_ReadBarrier();
+			break;
+		case memory_order_release:
+			_WriteBarrier();
+			AeLiteSync();
+			_WriteBarrier();
+			break;
+		case memory_order_acq_rel:
+			_ReadWriteBarrier();
+			AeLiteSync();
+			_ReadWriteBarrier();
+			break;
+		case memory_order_seq_cst:
+			_ReadWriteBarrier();
+			AeFullSync();
+			_ReadWriteBarrier();
+			break;
+		default: assert(false);
+	}
+}
+#endif
+}    // end namespace moodycamel
+#else
+// Use standard library of atomics
+#include <atomic>
+
+namespace moodycamel {
+
+AE_FORCEINLINE void compiler_fence(memory_order order) AE_NO_TSAN    // Maps a moodycamel::memory_order onto std::atomic_signal_fence.
+{
+	switch (order) {
+		case memory_order_relaxed: break;    // relaxed: no fence is issued
+		case memory_order_acquire: std::atomic_signal_fence(std::memory_order_acquire); break;
+		case memory_order_release: std::atomic_signal_fence(std::memory_order_release); break;
+		case memory_order_acq_rel: std::atomic_signal_fence(std::memory_order_acq_rel); break;
+		case memory_order_seq_cst: std::atomic_signal_fence(std::memory_order_seq_cst); break;    // memory_order_sync aliases this value
+		default: assert(false);    // not a known memory_order enumerator
+	}
+}
+
+AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN    // Maps a moodycamel::memory_order onto std::atomic_thread_fence.
+{
+	switch (order) {
+		case memory_order_relaxed: break;    // relaxed: no fence is issued
+		case memory_order_acquire: std::atomic_thread_fence(std::memory_order_acquire); break;
+		case memory_order_release: std::atomic_thread_fence(std::memory_order_release); break;
+		case memory_order_acq_rel: std::atomic_thread_fence(std::memory_order_acq_rel); break;
+		case memory_order_seq_cst: std::atomic_thread_fence(std::memory_order_seq_cst); break;    // memory_order_sync aliases this value
+		default: assert(false);    // not a known memory_order enumerator
+	}
+}
+
+}    // end namespace moodycamel
+
+#endif
+
+
+#if !defined(AE_VCPP) || (_MSC_VER >= 1700 && !defined(__cplusplus_cli))
+#define AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+#endif
+
+#ifdef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+#include <atomic>
+#endif
+#include <utility>
+
+// WARNING: *NOT* A REPLACEMENT FOR std::atomic. READ CAREFULLY:
+// Provides basic support for atomic variables -- no memory ordering guarantees are provided.
+// The guarantee of atomicity is only made for types that already have atomic load and store guarantees
+// at the hardware level -- on most platforms this generally means aligned pointers and integers (only).
+namespace moodycamel {
+template<typename T>
+class weak_atomic
+{
+public:
+	AE_NO_TSAN weak_atomic() { }
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4100)		// Get rid of (erroneous) 'unreferenced formal parameter' warning
+#endif
+	template<typename U> AE_NO_TSAN weak_atomic(U&& x) : value(std::forward<U>(x)) {  }
+#ifdef __cplusplus_cli
+	// Work around bug with universal reference/nullptr combination that only appears when /clr is on
+	AE_NO_TSAN weak_atomic(nullptr_t) : value(nullptr) {  }
+#endif
+	AE_NO_TSAN weak_atomic(weak_atomic const& other) : value(other.load()) {  }
+	AE_NO_TSAN weak_atomic(weak_atomic&& other) : value(std::move(other.load())) {  }
+#ifdef AE_VCPP
+#pragma warning(pop)
+#endif
+
+	AE_FORCEINLINE operator T() const AE_NO_TSAN { return load(); }
+
+	
+#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+	template<typename U> AE_FORCEINLINE weak_atomic const& operator=(U&& x) AE_NO_TSAN { value = std::forward<U>(x); return *this; }
+	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) AE_NO_TSAN { value = other.value; return *this; }
+	
+	AE_FORCEINLINE T load() const AE_NO_TSAN { return value; }
+	
+	AE_FORCEINLINE T fetch_add_acquire(T increment) AE_NO_TSAN
+	{
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+		if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
+#if defined(_M_AMD64)
+		else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
+#endif
+#else
+#error Unsupported platform
+#endif
+		assert(false && "T must be either a 32 or 64 bit type");
+		return value;
+	}
+	
+	AE_FORCEINLINE T fetch_add_release(T increment) AE_NO_TSAN
+	{
+#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
+		if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
+#if defined(_M_AMD64)
+		else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
+#endif
+#else
+#error Unsupported platform
+#endif
+		assert(false && "T must be either a 32 or 64 bit type");
+		return value;
+	}
+#else
+	template<typename U>
+	AE_FORCEINLINE weak_atomic const& operator=(U&& x) AE_NO_TSAN
+	{
+		value.store(std::forward<U>(x), std::memory_order_relaxed);
+		return *this;
+	}
+	
+	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) AE_NO_TSAN
+	{
+		value.store(other.value.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		return *this;
+	}
+
+	AE_FORCEINLINE T load() const AE_NO_TSAN { return value.load(std::memory_order_relaxed); }
+	
+	AE_FORCEINLINE T fetch_add_acquire(T increment) AE_NO_TSAN
+	{
+		return value.fetch_add(increment, std::memory_order_acquire);
+	}
+	
+	AE_FORCEINLINE T fetch_add_release(T increment) AE_NO_TSAN
+	{
+		return value.fetch_add(increment, std::memory_order_release);
+	}
+#endif
+	
+
+private:
+#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
+	// No std::atomic support, but still need to circumvent compiler optimizations.
+	// `volatile` will make memory access slow, but is guaranteed to be reliable.
+	volatile T value;
+#else
+	std::atomic<T> value;
+#endif
+};
+
+}	// end namespace moodycamel
+
+
+
+// Portable single-producer, single-consumer semaphore below:
+
+#if defined(_WIN32)
+// Avoid including windows.h in a header; we only need a handful of
+// items, so we'll redeclare them here (this is relatively safe since
+// the API generally has to remain stable between Windows versions).
+// I know this is an ugly hack but it still beats polluting the global
+// namespace with thousands of generic names or adding a .cpp for nothing.
+extern "C" {
+	struct _SECURITY_ATTRIBUTES;
+	__declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
+	__declspec(dllimport) int __stdcall CloseHandle(void* hObject);
+	__declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
+	__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
+}
+#elif defined(__MACH__)
+#include <mach/mach.h>
+#elif defined(__unix__)
+#include <semaphore.h>
+#endif
+
+namespace moodycamel
+{
+	// Code in the spsc_sema namespace below is an adaptation of Jeff Preshing's
+	// portable + lightweight semaphore implementations, originally from
+	// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
+	// LICENSE:
+	// Copyright (c) 2015 Jeff Preshing
+	//
+	// This software is provided 'as-is', without any express or implied
+	// warranty. In no event will the authors be held liable for any damages
+	// arising from the use of this software.
+	//
+	// Permission is granted to anyone to use this software for any purpose,
+	// including commercial applications, and to alter it and redistribute it
+	// freely, subject to the following restrictions:
+	//
+	// 1. The origin of this software must not be misrepresented; you must not
+	//    claim that you wrote the original software. If you use this software
+	//    in a product, an acknowledgement in the product documentation would be
+	//    appreciated but is not required.
+	// 2. Altered source versions must be plainly marked as such, and must not be
+	//    misrepresented as being the original software.
+	// 3. This notice may not be removed or altered from any source distribution.
+	namespace spsc_sema
+	{
+#if defined(_WIN32)
+		class Semaphore
+		{
+		private:
+		    void* m_hSema;
+		    
+		    Semaphore(const Semaphore& other);
+		    Semaphore& operator=(const Semaphore& other);
+
+		public:
+		    AE_NO_TSAN Semaphore(int initialCount = 0)
+		    {
+		        assert(initialCount >= 0);
+		        const long maxLong = 0x7fffffff;
+		        m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
+		    }
+
+		    AE_NO_TSAN ~Semaphore()
+		    {
+		        CloseHandle(m_hSema);
+		    }
+
+		    void wait() AE_NO_TSAN
+		    {
+		    	const unsigned long infinite = 0xffffffff;
+		        WaitForSingleObject(m_hSema, infinite);
+		    }
+
+			bool try_wait() AE_NO_TSAN
+			{
+				// Zero-timeout poll: only WAIT_OBJECT_0 (0) means acquired; WAIT_TIMEOUT and WAIT_FAILED do not.
+				return WaitForSingleObject(m_hSema, 0) == 0;
+			}
+
+			bool timed_wait(std::uint64_t usecs) AE_NO_TSAN
+			{
+				// Waits up to usecs/1000 ms: only WAIT_OBJECT_0 (0) means acquired; WAIT_TIMEOUT and WAIT_FAILED do not.
+				return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) == 0;
+			}
+
+		    void signal(int count = 1) AE_NO_TSAN
+		    {
+		        ReleaseSemaphore(m_hSema, count, nullptr);
+		    }
+		};
+#elif defined(__MACH__)
+		//---------------------------------------------------------
+		// Semaphore (Apple iOS and OSX)
+		// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html
+		//---------------------------------------------------------
+		class Semaphore
+		{
+		private:
+		    semaphore_t m_sema;
+
+		    Semaphore(const Semaphore& other);
+		    Semaphore& operator=(const Semaphore& other);
+
+		public:
+		    AE_NO_TSAN Semaphore(int initialCount = 0)
+		    {
+		        assert(initialCount >= 0);
+		        semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount);
+		    }
+
+		    AE_NO_TSAN ~Semaphore()
+		    {
+		        semaphore_destroy(mach_task_self(), m_sema);
+		    }
+
+		    void wait() AE_NO_TSAN
+		    {
+		        semaphore_wait(m_sema);
+		    }
+
+			bool try_wait() AE_NO_TSAN
+			{
+				return timed_wait(0);
+			}
+
+			bool timed_wait(std::int64_t timeout_usecs) AE_NO_TSAN
+			{
+				// Split the relative timeout (microseconds) into the seconds/nanoseconds pair passed to Mach.
+				mach_timespec_t ts;
+				ts.tv_sec = static_cast<unsigned int>(timeout_usecs / 1000000);
+				ts.tv_nsec = (timeout_usecs % 1000000) * 1000;
+				// added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html
+				kern_return_t rc = semaphore_timedwait(m_sema, ts);
+				// Only KERN_SUCCESS means the semaphore was taken; a timeout, an abort or any other error did not acquire it.
+				return rc == KERN_SUCCESS;
+			}
+
+		    void signal() AE_NO_TSAN
+		    {
+		        semaphore_signal(m_sema);
+		    }
+
+		    void signal(int count) AE_NO_TSAN
+		    {
+		        while (count-- > 0)
+		        {
+		            semaphore_signal(m_sema);
+		        }
+		    }
+		};
+#elif defined(__unix__)
+		//---------------------------------------------------------
+		// Semaphore (POSIX, Linux)
+		//---------------------------------------------------------
+		class Semaphore
+		{
+		private:
+		    sem_t m_sema;
+
+		    Semaphore(const Semaphore& other);
+		    Semaphore& operator=(const Semaphore& other);
+
+		public:
+		    AE_NO_TSAN Semaphore(int initialCount = 0)
+		    {
+		        assert(initialCount >= 0);
+		        sem_init(&m_sema, 0, initialCount);
+		    }
+
+		    AE_NO_TSAN ~Semaphore()
+		    {
+		        sem_destroy(&m_sema);
+		    }
+
+		    void wait() AE_NO_TSAN
+		    {
+		        // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error
+		        int rc;
+		        do
+		        {
+		            rc = sem_wait(&m_sema);
+		        }
+		        while (rc == -1 && errno == EINTR);
+		    }
+
+			bool try_wait() AE_NO_TSAN
+			{
+				// Non-blocking acquire; the call is retried when it is interrupted by a signal (EINTR).
+				int rc;
+				do { rc = sem_trywait(&m_sema); } while (rc == -1 && errno == EINTR);
+				// Only rc == 0 decremented the semaphore; EAGAIN or any other error means not acquired.
+				return rc == 0;
+			}
+
+			bool timed_wait(std::uint64_t usecs) AE_NO_TSAN
+			{
+				// Waits up to `usecs` microseconds; returns true only if the semaphore was actually acquired.
+				struct timespec ts;
+				const int usecs_in_1_sec = 1000000;
+				const int nsecs_in_1_sec = 1000000000;
+				clock_gettime(CLOCK_REALTIME, &ts);    // sem_timedwait takes an absolute CLOCK_REALTIME deadline
+				ts.tv_sec += usecs / usecs_in_1_sec;
+				ts.tv_nsec += (usecs % usecs_in_1_sec) * 1000;
+				// sem_timedwait rejects tv_nsec >= 1e9, so carry the overflow into tv_sec first
+				if (ts.tv_nsec >= nsecs_in_1_sec) {
+					ts.tv_nsec -= nsecs_in_1_sec;
+					++ts.tv_sec;
+				}
+
+				int rc;
+				do {
+					rc = sem_timedwait(&m_sema, &ts);
+				} while (rc == -1 && errno == EINTR);
+				return rc == 0;    // ETIMEDOUT and every other failure both mean "not acquired"
+			}
+
+		    void signal() AE_NO_TSAN
+		    {
+		        sem_post(&m_sema);
+		    }
+
+		    void signal(int count) AE_NO_TSAN
+		    {
+		        while (count-- > 0)
+		        {
+		            sem_post(&m_sema);
+		        }
+		    }
+		};
+#else
+#error Unsupported platform! (No semaphore wrapper available)
+#endif
+
+		//---------------------------------------------------------
+		// LightweightSemaphore
+		//---------------------------------------------------------
+		class LightweightSemaphore
+		{
+		public:
+			typedef std::make_signed<std::size_t>::type ssize_t;
+			
+		private:
+		    weak_atomic<ssize_t> m_count;
+		    Semaphore m_sema;
+
+		    bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) AE_NO_TSAN
+		    {
+		        ssize_t oldCount;
+		        // Is there a better way to set the initial spin count?
+		        // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC,
+		        // as threads start hitting the kernel semaphore.
+		        int spin = 10000;
+		        while (--spin >= 0)
+		        {
+		            if (m_count.load() > 0)
+		            {
+		                m_count.fetch_add_acquire(-1);
+		                return true;
+		            }
+		            compiler_fence(memory_order_acquire);     // Prevent the compiler from collapsing the loop.
+		        }
+		        oldCount = m_count.fetch_add_acquire(-1);
+				if (oldCount > 0)
+					return true;
+		        if (timeout_usecs < 0)
+				{
+					m_sema.wait();
+					return true;
+				}
+				if (m_sema.timed_wait(timeout_usecs))
+					return true;
+				// At this point, we've timed out waiting for the semaphore, but the
+				// count is still decremented indicating we may still be waiting on
+				// it. So we have to re-adjust the count, but only if the semaphore
+				// wasn't signaled enough times for us too since then. If it was, we
+				// need to release the semaphore too.
+				while (true)
+				{
+					oldCount = m_count.fetch_add_release(1);
+					if (oldCount < 0)
+						return false;    // successfully restored things to the way they were
+					// Oh, the producer thread just signaled the semaphore after all. Try again:
+					oldCount = m_count.fetch_add_acquire(-1);
+					if (oldCount > 0 && m_sema.try_wait())
+						return true;
+				}
+		    }
+
+		public:
+		    AE_NO_TSAN LightweightSemaphore(ssize_t initialCount = 0) : m_count(initialCount)
+		    {
+		        assert(initialCount >= 0);
+		    }
+
+		    bool tryWait() AE_NO_TSAN    // Non-blocking acquire: takes one unit only when the counter is currently positive.
+		    {
+		        if (m_count.load() > 0)    // NOTE(review): load and decrement are separate steps; looks safe only under the single-consumer use this code is documented for
+		        {
+		        	m_count.fetch_add_acquire(-1);
+		        	return true;
+		        }
+		        return false;    // nothing available; never touches the kernel semaphore
+		    }
+
+		    void wait() AE_NO_TSAN
+		    {
+		        if (!tryWait())
+		            waitWithPartialSpinning();
+		    }
+
+			bool wait(std::int64_t timeout_usecs) AE_NO_TSAN
+			{
+				return tryWait() || waitWithPartialSpinning(timeout_usecs);
+			}
+
+		    void signal(ssize_t count = 1) AE_NO_TSAN    // Adds `count` units to the counter and posts m_sema once if the old value was negative.
+		    {
+		    	assert(count >= 0);
+		        ssize_t oldCount = m_count.fetch_add_release(count);
+		        assert(oldCount >= -1);    // the counter is expected never to drop below -1
+		        if (oldCount < 0)    // negative: waitWithPartialSpinning decremented the counter before blocking on m_sema
+		        {
+		            m_sema.signal(1);
+		        }
+		    }
+		    
+		    ssize_t availableApprox() const AE_NO_TSAN
+		    {
+		    	const ssize_t snapshot = m_count.load();    // can be negative after a waiter has decremented it
+		    	return snapshot < 0 ? 0 : snapshot;         // clamp so callers never see a negative count
+		    }
+		};
+	}	// end namespace spsc_sema
+}	// end namespace moodycamel
+
+#if defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))
+#pragma warning(pop)
+#ifdef __cplusplus_cli
+#pragma managed(pop)
+#endif
+#endif
diff --git a/include/readerwriterqueue.h b/include/readerwriterqueue.h
new file mode 100644
index 0000000000..071147c3e1
--- /dev/null
+++ b/include/readerwriterqueue.h
@@ -0,0 +1,906 @@
+// ©2013-2016 Cameron Desrochers.
+// Distributed under the simplified BSD license (see the license file that
+// should have come with this header).
+
+#pragma once
+
+#include "atomicops.h"
+#include <type_traits>
+#include <utility>
+#include <cassert>
+#include <stdexcept>
+#include <new>
+#include <cstdint>
+#include <cstdlib>		// For malloc/free/abort & size_t
+#include <memory>
+#if __cplusplus > 199711L || _MSC_VER >= 1700 // C++11 or VS2012
+#include <chrono>
+#endif
+
+
+// A lock-free queue for a single-consumer, single-producer architecture.
+// The queue is also wait-free in the common path (except if more memory
+// needs to be allocated, in which case malloc is called).
+// Allocates memory sparingly (O(lg(n)) times, amortized), and only once if
+// the original maximum size estimate is never exceeded.
+// Tested on x86/x64 processors, but semantics should be correct for all
+// architectures (given the right implementations in atomicops.h), provided
+// that aligned integer and pointer accesses are naturally atomic.
+// Note that there should only be one consumer thread and producer thread;
+// Switching roles of the threads, or using multiple consecutive threads for
+// one role, is not safe unless properly synchronized.
+// Using the queue exclusively from one thread is fine, though a bit silly.
+
+#ifndef MOODYCAMEL_CACHE_LINE_SIZE
+#define MOODYCAMEL_CACHE_LINE_SIZE 64
+#endif
+
+#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__))
+#define MOODYCAMEL_EXCEPTIONS_ENABLED
+#endif
+#endif
+
+#ifndef MOODYCAMEL_HAS_EMPLACE
+#if !defined(_MSC_VER) || _MSC_VER >= 1800 // variadic templates: either a non-MS compiler or VS >= 2013
+#define MOODYCAMEL_HAS_EMPLACE    1
+#endif
+#endif
+
+#ifdef AE_VCPP
+#pragma warning(push)
+#pragma warning(disable: 4324)	// structure was padded due to __declspec(align())
+#pragma warning(disable: 4820)	// padding was added
+#pragma warning(disable: 4127)	// conditional expression is constant
+#endif
+
+namespace moodycamel {
+
+template<typename T, size_t MAX_BLOCK_SIZE = 512>
+class ReaderWriterQueue
+{
+	// Design: Based on a queue-of-queues. The low-level queues are just
+	// circular buffers with front and tail indices indicating where the
+	// next element to dequeue is and where the next element can be enqueued,
+	// respectively. Each low-level queue is called a "block". Each block
+	// wastes exactly one element's worth of space to keep the design simple
+	// (if front == tail then the queue is empty, and can't be full).
+	// The high-level queue is a circular linked list of blocks; again there
+	// is a front and tail, but this time they are pointers to the blocks.
+	// The front block is where the next element to be dequeued is, provided
+	// the block is not empty. The back block is where elements are to be
+	// enqueued, provided the block is not full.
+	// The producer thread owns all the tail indices/pointers. The consumer
+	// thread owns all the front indices/pointers. Both threads read each
+	// other's variables, but only the owning thread updates them. E.g. After
+	// the consumer reads the producer's tail, the tail may change before the
+	// consumer is done dequeuing an object, but the consumer knows the tail
+	// will never go backwards, only forwards.
+	// If there is no room to enqueue an object, an additional block (of
+	// equal size to the last block) is added. Blocks are never removed.
+
+public:
+	typedef T value_type;
+
+	// Constructs a queue that can hold maxSize elements without further
+	// allocations. If more than MAX_BLOCK_SIZE elements are requested,
+	// then several blocks of MAX_BLOCK_SIZE each are reserved (including
+	// at least one extra buffer block).
+	AE_NO_TSAN explicit ReaderWriterQueue(size_t maxSize = 15)
+#ifndef NDEBUG
+		: enqueuing(false)
+		,dequeuing(false)
+#endif
+	{
+		assert(maxSize > 0);
+		assert(MAX_BLOCK_SIZE == ceilToPow2(MAX_BLOCK_SIZE) && "MAX_BLOCK_SIZE must be a power of 2");
+		assert(MAX_BLOCK_SIZE >= 2 && "MAX_BLOCK_SIZE must be at least 2");
+		
+		Block* firstBlock = nullptr;
+		
+		largestBlockSize = ceilToPow2(maxSize + 1);		// We need a spare slot to fit maxSize elements in the block
+		if (largestBlockSize > MAX_BLOCK_SIZE * 2) {
+			// We need a spare block in case the producer is writing to a different block the consumer is reading from, and
+			// wants to enqueue the maximum number of elements. We also need a spare element in each block to avoid the ambiguity
+			// between front == tail meaning "empty" and "full".
+			// So the effective number of slots that are guaranteed to be usable at any time is the block size - 1 times the
+			// number of blocks - 1. Solving for maxSize and applying a ceiling to the division gives us (after simplifying):
+			size_t initialBlockCount = (maxSize + MAX_BLOCK_SIZE * 2 - 3) / (MAX_BLOCK_SIZE - 1);
+			largestBlockSize = MAX_BLOCK_SIZE;
+			Block* lastBlock = nullptr;
+			for (size_t i = 0; i != initialBlockCount; ++i) {
+				auto block = make_block(largestBlockSize);
+				if (block == nullptr) {
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+					throw std::bad_alloc();
+#else
+					abort();
+#endif
+				}
+				if (firstBlock == nullptr) {
+					firstBlock = block;
+				}
+				else {
+					lastBlock->next = block;
+				}
+				lastBlock = block;
+				block->next = firstBlock;
+			}
+		}
+		else {
+			firstBlock = make_block(largestBlockSize);
+			if (firstBlock == nullptr) {
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+				throw std::bad_alloc();
+#else
+				abort();
+#endif
+			}
+			firstBlock->next = firstBlock;
+		}
+		frontBlock = firstBlock;
+		tailBlock = firstBlock;
+		
+		// Make sure the reader/writer threads will have the initialized memory setup above:
+		fence(memory_order_sync);
+	}
+
+	// Move constructor: takes over other's front/tail block pointers and gives other a new block from
+	// make_block(32). Neither queue may be accessed concurrently during the move; the caller must synchronize.
+	AE_NO_TSAN ReaderWriterQueue(ReaderWriterQueue&& other)
+		: frontBlock(other.frontBlock.load()),
+		tailBlock(other.tailBlock.load()),
+		largestBlockSize(other.largestBlockSize)
+#ifndef NDEBUG
+		,enqueuing(false)
+		,dequeuing(false)
+#endif
+	{
+		other.largestBlockSize = 32;
+		Block* b = other.make_block(other.largestBlockSize);
+		if (b == nullptr) {
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+			throw std::bad_alloc();
+#else
+			abort();
+#endif
+		}
+		b->next = b;    // a single block linked to itself
+		other.frontBlock = b;
+		other.tailBlock = b;
+	}
+
+	// Move assignment: swaps the front/tail block pointers and largestBlockSize with other, so other ends up
+	// holding this queue's previous blocks. Neither queue may be accessed concurrently; the caller must synchronize.
+	ReaderWriterQueue& operator=(ReaderWriterQueue&& other) AE_NO_TSAN
+	{
+		Block* b = frontBlock.load();
+		frontBlock = other.frontBlock.load();
+		other.frontBlock = b;
+		b = tailBlock.load();
+		tailBlock = other.tailBlock.load();
+		other.tailBlock = b;
+		std::swap(largestBlockSize, other.largestBlockSize);
+		return *this;
+	}
+
+	// Note: The queue should not be accessed concurrently while it's
+	// being deleted. It's up to the user to synchronize this.
+	AE_NO_TSAN ~ReaderWriterQueue()
+	{
+		// Make sure we get the latest version of all variables from other CPUs:
+		fence(memory_order_sync);
+
+		// Destroy any remaining objects in queue and free memory
+		Block* frontBlock_ = frontBlock;
+		Block* block = frontBlock_;
+		do {
+			Block* nextBlock = block->next;
+			size_t blockFront = block->front;
+			size_t blockTail = block->tail;
+
+			for (size_t i = blockFront; i != blockTail; i = (i + 1) & block->sizeMask) {
+				auto element = reinterpret_cast<T*>(block->data + i * sizeof(T));
+				element->~T();
+				(void)element;
+			}
+			
+			auto rawBlock = block->rawThis;
+			block->~Block();
+			std::free(rawBlock);
+			block = nextBlock;
+		} while (block != frontBlock_);
+	}
+
+
+	// Enqueues a copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T const& element) AE_NO_TSAN
+	{
+		return inner_enqueue<CannotAlloc>(element);
+	}
+
+	// Enqueues a moved copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T&& element) AE_NO_TSAN
+	{
+		return inner_enqueue<CannotAlloc>(std::forward<T>(element));
+	}
+
+#if MOODYCAMEL_HAS_EMPLACE
+	// Like try_enqueue() but with emplace semantics (i.e. construct-in-place).
+	template<typename... Args>
+	AE_FORCEINLINE bool try_emplace(Args&&... args) AE_NO_TSAN
+	{
+		return inner_enqueue<CannotAlloc>(std::forward<Args>(args)...);
+	}
+#endif
+
+	// Enqueues a copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T const& element) AE_NO_TSAN
+	{
+		return inner_enqueue<CanAlloc>(element);
+	}
+
+	// Enqueues a moved copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T&& element) AE_NO_TSAN
+	{
+		return inner_enqueue<CanAlloc>(std::forward<T>(element));
+	}
+
+#if MOODYCAMEL_HAS_EMPLACE
+	// Like enqueue() but with emplace semantics (i.e. construct-in-place).
+	template<typename... Args>
+	AE_FORCEINLINE bool emplace(Args&&... args) AE_NO_TSAN
+	{
+		return inner_enqueue<CanAlloc>(std::forward<Args>(args)...);
+	}
+#endif
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// returns false instead. If the queue has at least one element,
+	// moves front to result using operator=, then returns true.
+	template<typename U>
+	bool try_dequeue(U& result) AE_NO_TSAN
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+
+		// High-level pseudocode:
+		// Remember where the tail block is
+		// If the front block has an element in it, dequeue it
+		// Else
+		//     If front block was the tail block when we entered the function, return false
+		//     Else advance to next block and dequeue the item there
+
+		// Note that we have to use the value of the tail block from before we check if the front
+		// block is full or not, in case the front block is empty and then, before we check if the
+		// tail block is at the front block or not, the producer fills up the front block *and
+		// moves on*, which would make us skip a filled block. Seems unlikely, but was consistently
+		// reproducible in practice.
+		// In order to avoid overhead in the common case, though, we do a double-checked pattern
+		// where we have the fast path if the front block is not empty, then read the tail block,
+		// then re-read the front block and check if it's not empty again, then check if the tail
+		// block has advanced.
+		
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+			
+		non_empty_front_block:
+			// Front block not empty, dequeue from here
+			auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+			result = std::move(*element);
+			element->~T();
+
+			blockFront = (blockFront + 1) & frontBlock_->sizeMask;
+
+			fence(memory_order_release);
+			frontBlock_->front = blockFront;
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				// Oh look, the front block isn't empty after all
+				goto non_empty_front_block;
+			}
+			
+			// Front block is empty but there's another block ahead, advance to it
+			Block* nextBlock = frontBlock_->next;
+			// Don't need an acquire fence here since next can only ever be set on the tailBlock,
+			// and we're not the tailBlock, and we did an acquire earlier after reading tailBlock which
+			// ensures next is up-to-date on this CPU in case we recently were at tailBlock.
+
+			size_t nextBlockFront = nextBlock->front.load();
+			size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load();
+			fence(memory_order_acquire);
+
+			// Since the tailBlock is only ever advanced after being written to,
+			// we know there's for sure an element to dequeue on it
+			assert(nextBlockFront != nextBlockTail);
+			AE_UNUSED(nextBlockTail);
+
+			// We're done with this block, let the producer use it if it needs
+			fence(memory_order_release);		// Expose possibly pending changes to frontBlock->front from last dequeue
+			frontBlock = frontBlock_ = nextBlock;
+
+			compiler_fence(memory_order_release);	// Not strictly needed
+
+			auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
+			
+			result = std::move(*element);
+			element->~T();
+
+			nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask;
+			
+			fence(memory_order_release);
+			frontBlock_->front = nextBlockFront;
+		}
+		else {
+			// No elements in current block and no other block to advance to
+			return false;
+		}
+
+		return true;
+	}
+
+
+	// Returns a pointer to the front element in the queue (the one that
+	// would be removed next by a call to `try_dequeue` or `pop`). If the
+	// queue appears empty at the time the method is called, nullptr is
+	// returned instead.
+	// Must be called only from the consumer thread.
+	T* peek() AE_NO_TSAN
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+		// See try_dequeue() for reasoning
+
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+		non_empty_front_block:
+			return reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				goto non_empty_front_block;
+			}
+			
+			Block* nextBlock = frontBlock_->next;
+			
+			size_t nextBlockFront = nextBlock->front.load();
+			fence(memory_order_acquire);
+
+			assert(nextBlockFront != nextBlock->tail.load());
+			return reinterpret_cast<T*>(nextBlock->data + nextBlockFront * sizeof(T));
+		}
+		
+		return nullptr;
+	}
+	
+	// Removes the front element from the queue, if any, without returning it.
+	// Returns true on success, or false if the queue appeared empty at the time
+	// `pop` was called.
+	bool pop() AE_NO_TSAN
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->dequeuing);
+#endif
+		// See try_dequeue() for reasoning
+		
+		Block* frontBlock_ = frontBlock.load();
+		size_t blockTail = frontBlock_->localTail;
+		size_t blockFront = frontBlock_->front.load();
+		
+		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
+			fence(memory_order_acquire);
+			
+		non_empty_front_block:
+			auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
+			element->~T();
+
+			blockFront = (blockFront + 1) & frontBlock_->sizeMask;
+
+			fence(memory_order_release);
+			frontBlock_->front = blockFront;
+		}
+		else if (frontBlock_ != tailBlock.load()) {
+			fence(memory_order_acquire);
+			frontBlock_ = frontBlock.load();
+			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
+			blockFront = frontBlock_->front.load();
+			fence(memory_order_acquire);
+			
+			if (blockFront != blockTail) {
+				goto non_empty_front_block;
+			}
+			
+			// Front block is empty but there's another block ahead, advance to it
+			Block* nextBlock = frontBlock_->next;
+			
+			size_t nextBlockFront = nextBlock->front.load();
+			size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load();
+			fence(memory_order_acquire);
+
+			assert(nextBlockFront != nextBlockTail);
+			AE_UNUSED(nextBlockTail);
+
+			fence(memory_order_release);
+			frontBlock = frontBlock_ = nextBlock;
+
+			compiler_fence(memory_order_release);
+
+			auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
+			element->~T();
+
+			nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask;
+			
+			fence(memory_order_release);
+			frontBlock_->front = nextBlockFront;
+		}
+		else {
+			// No elements in current block and no other block to advance to
+			return false;
+		}
+
+		return true;
+	}
+	
+	// Returns the approximate number of items currently in the queue.
+	// Safe to call from both the producer and consumer threads.
+	inline size_t size_approx() const AE_NO_TSAN
+	{
+		size_t result = 0;
+		Block* frontBlock_ = frontBlock.load();
+		Block* block = frontBlock_;
+		do {
+			fence(memory_order_acquire);
+			size_t blockFront = block->front.load();
+			size_t blockTail = block->tail.load();
+			result += (blockTail - blockFront) & block->sizeMask;
+			block = block->next.load();
+		} while (block != frontBlock_);
+		return result;
+	}
+
+
+private:
+	enum AllocationMode { CanAlloc, CannotAlloc };
+
+#if MOODYCAMEL_HAS_EMPLACE
+	template<AllocationMode canAlloc, typename... Args>
+	bool inner_enqueue(Args&&... args) AE_NO_TSAN
+#else
+	template<AllocationMode canAlloc, typename U>
+	bool inner_enqueue(U&& element) AE_NO_TSAN
+#endif
+	{
+#ifndef NDEBUG
+		ReentrantGuard guard(this->enqueuing);
+#endif
+
+		// High-level pseudocode (assuming we're allowed to alloc a new block):
+		// If room in tail block, add to tail
+		// Else check next block
+		//     If next block is not the head block, enqueue on next block
+		//     Else create a new block and enqueue there
+		//     Advance tail to the block we just enqueued to
+
+		Block* tailBlock_ = tailBlock.load();
+		size_t blockFront = tailBlock_->localFront;
+		size_t blockTail = tailBlock_->tail.load();
+
+		size_t nextBlockTail = (blockTail + 1) & tailBlock_->sizeMask;
+		if (nextBlockTail != blockFront || nextBlockTail != (tailBlock_->localFront = tailBlock_->front.load())) {
+			fence(memory_order_acquire);
+			// This block has room for at least one more element
+			char* location = tailBlock_->data + blockTail * sizeof(T);
+#if MOODYCAMEL_HAS_EMPLACE
+			new (location) T(std::forward<Args>(args)...);
+#else
+			new (location) T(std::forward<U>(element));
+#endif
+
+			fence(memory_order_release);
+			tailBlock_->tail = nextBlockTail;
+		}
+		else {
+			fence(memory_order_acquire);
+			if (tailBlock_->next.load() != frontBlock) {
+				// Note that the reason we can't advance to the frontBlock and start adding new entries there
+				// is because if we did, then dequeue would stay in that block, eventually reading the new values,
+				// instead of advancing to the next full block (whose values were enqueued first and so should be
+				// consumed first).
+
+				fence(memory_order_acquire);		// Ensure we get latest writes if we got the latest frontBlock
+
+				// tailBlock is full, but there's a free block ahead, use it
+				Block* tailBlockNext = tailBlock_->next.load();
+				size_t nextBlockFront = tailBlockNext->localFront = tailBlockNext->front.load();
+				nextBlockTail = tailBlockNext->tail.load();
+				fence(memory_order_acquire);
+
+				// This block must be empty since it's not the head block and we
+				// go through the blocks in a circle
+				assert(nextBlockFront == nextBlockTail);
+				tailBlockNext->localFront = nextBlockFront;
+
+				char* location = tailBlockNext->data + nextBlockTail * sizeof(T);
+#if MOODYCAMEL_HAS_EMPLACE
+				new (location) T(std::forward<Args>(args)...);
+#else
+				new (location) T(std::forward<U>(element));
+#endif
+
+				tailBlockNext->tail = (nextBlockTail + 1) & tailBlockNext->sizeMask;
+
+				fence(memory_order_release);
+				tailBlock = tailBlockNext;
+			}
+			else if (canAlloc == CanAlloc) {
+				// tailBlock is full and there's no free block ahead; create a new block
+				auto newBlockSize = largestBlockSize >= MAX_BLOCK_SIZE ? largestBlockSize : largestBlockSize * 2;
+				auto newBlock = make_block(newBlockSize);
+				if (newBlock == nullptr) {
+					// Could not allocate a block!
+					return false;
+				}
+				largestBlockSize = newBlockSize;
+
+#if MOODYCAMEL_HAS_EMPLACE
+				new (newBlock->data) T(std::forward<Args>(args)...);
+#else
+				new (newBlock->data) T(std::forward<U>(element));
+#endif
+				assert(newBlock->front == 0);
+				newBlock->tail = newBlock->localTail = 1;
+
+				newBlock->next = tailBlock_->next.load();
+				tailBlock_->next = newBlock;
+
+				// Might be possible for the dequeue thread to see the new tailBlock->next
+				// *without* seeing the new tailBlock value, but this is OK since it can't
+				// advance to the next block until tailBlock is set anyway (because the only
+				// case where it could try to read the next is if it's already at the tailBlock,
+				// and it won't advance past tailBlock in any circumstance).
+
+				fence(memory_order_release);
+				tailBlock = newBlock;
+			}
+			else if (canAlloc == CannotAlloc) {
+				// Would have had to allocate a new block to enqueue, but not allowed
+				return false;
+			}
+			else {
+				assert(false && "Should be unreachable code");
+				return false;
+			}
+		}
+
+		return true;
+	}
+
+
+	// Disable copying (private and intentionally left undefined, so an accidental copy cannot build)
+	ReaderWriterQueue(ReaderWriterQueue const&);
+
+	// Disable assignment (private and intentionally left undefined, so an accidental assignment cannot build)
+	ReaderWriterQueue& operator=(ReaderWriterQueue const&);
+
+
+
+	AE_FORCEINLINE static size_t ceilToPow2(size_t x)
+	{
+		// From http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		--x;
+		x |= x >> 1;
+		x |= x >> 2;
+		x |= x >> 4;
+		for (size_t i = 1; i < sizeof(size_t); i <<= 1) {
+			x |= x >> (i << 3);
+		}
+		++x;
+		return x;
+	}
+	
+	template<typename U>
+	static AE_FORCEINLINE char* align_for(char* ptr) AE_NO_TSAN
+	{
+		const std::size_t alignment = std::alignment_of<U>::value;
+		return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment;
+	}
+private:
+#ifndef NDEBUG
+	struct ReentrantGuard
+	{
+		AE_NO_TSAN ReentrantGuard(bool& _inSection)
+			: inSection(_inSection)
+		{
+			assert(!inSection && "Concurrent (or re-entrant) enqueue or dequeue operation detected (only one thread at a time may hold the producer or consumer role)");
+			inSection = true;
+		}
+
+		AE_NO_TSAN ~ReentrantGuard() { inSection = false; }
+
+	private:
+		ReentrantGuard& operator=(ReentrantGuard const&);
+
+	private:
+		bool& inSection;
+	};
+#endif
+
+	struct Block
+	{
+		// Avoid false-sharing by putting highly contended variables on their own cache lines
+		weak_atomic<size_t> front;	// (Atomic) Elements are read from here
+		size_t localTail;			// An uncontended shadow copy of tail, owned by the consumer
+		
+		char cachelineFiller0[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<size_t>) - sizeof(size_t)];
+		weak_atomic<size_t> tail;	// (Atomic) Elements are enqueued here
+		size_t localFront;			// An uncontended shadow copy of front, owned by the producer
+		
+		char cachelineFiller1[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<size_t>) - sizeof(size_t)];	// next isn't very contended, but we don't want it on the same cache line as tail (which is)
+		weak_atomic<Block*> next;	// (Atomic) Next block in the circular list of blocks
+		
+		char* data;		// Contents (on heap) are aligned to T's alignment
+
+		const size_t sizeMask;	// Capacity minus one; element indices wrap around with `& sizeMask`
+
+
+		// size must be a power of two (and greater than 0)
+		AE_NO_TSAN Block(size_t const& _size, char* _rawThis, char* _data)
+			: front(0), localTail(0), tail(0), localFront(0), next(nullptr), data(_data), sizeMask(_size - 1), rawThis(_rawThis)
+		{
+		}
+
+	private:
+		// C4512 - Assignment operator could not be generated
+		Block& operator=(Block const&);
+
+	public:
+		char* rawThis;	// Start of the raw malloc'ed buffer (see make_block); the Block sits at an aligned offset inside it
+	};
+	
+	
+	static Block* make_block(size_t capacity) AE_NO_TSAN
+	{
+		// Allocate enough memory for the block itself, as well as all the elements it will contain
+		auto size = sizeof(Block) + std::alignment_of<Block>::value - 1;
+		size += sizeof(T) * capacity + std::alignment_of<T>::value - 1;
+		auto newBlockRaw = static_cast<char*>(std::malloc(size));
+		if (newBlockRaw == nullptr) {
+			return nullptr;
+		}
+		
+		auto newBlockAligned = align_for<Block>(newBlockRaw);
+		auto newBlockData = align_for<T>(newBlockAligned + sizeof(Block));
+		return new (newBlockAligned) Block(capacity, newBlockRaw, newBlockData);
+	}
+
+private:
+	weak_atomic<Block*> frontBlock;		// (Atomic) Elements are dequeued from this block
+	
+	char cachelineFiller[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<Block*>)];
+	weak_atomic<Block*> tailBlock;		// (Atomic) Elements are enqueued to this block
+
+	size_t largestBlockSize;	// Size requested for the newest block; doubled for each new block until it reaches MAX_BLOCK_SIZE
+
+#ifndef NDEBUG
+	bool enqueuing;	// Debug only: true while inner_enqueue() is running (see ReentrantGuard)
+	bool dequeuing;	// Debug only: true while try_dequeue(), peek() or pop() is running (see ReentrantGuard)
+#endif
+};
+
+// Like ReaderWriterQueue, but also provides blocking operations
+template<typename T, size_t MAX_BLOCK_SIZE = 512>
+class BlockingReaderWriterQueue
+{
+private:
+	typedef ::moodycamel::ReaderWriterQueue<T, MAX_BLOCK_SIZE> ReaderWriterQueue;
+	
+public:
+	explicit BlockingReaderWriterQueue(size_t maxSize = 15) AE_NO_TSAN
+		: inner(maxSize), sema(new spsc_sema::LightweightSemaphore())
+	{ }
+
+	BlockingReaderWriterQueue(BlockingReaderWriterQueue&& other) AE_NO_TSAN
+		: inner(std::move(other.inner)), sema(std::move(other.sema))
+	{ }
+
+	BlockingReaderWriterQueue& operator=(BlockingReaderWriterQueue&& other) AE_NO_TSAN
+	{
+		std::swap(sema, other.sema);
+		std::swap(inner, other.inner);
+		return *this;
+	}
+
+
+	// Enqueues a copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T const& element) AE_NO_TSAN
+	{
+		if (inner.try_enqueue(element)) {
+			sema->signal();
+			return true;
+		}
+		return false;
+	}
+
+	// Enqueues a moved copy of element if there is room in the queue.
+	// Returns true if the element was enqueued, false otherwise.
+	// Does not allocate memory.
+	AE_FORCEINLINE bool try_enqueue(T&& element) AE_NO_TSAN
+	{
+		if (inner.try_enqueue(std::forward<T>(element))) {
+			sema->signal();
+			return true;
+		}
+		return false;
+	}
+
+
+	// Enqueues a copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T const& element) AE_NO_TSAN
+	{
+		if (inner.enqueue(element)) {
+			sema->signal();
+			return true;
+		}
+		return false;
+	}
+
+	// Enqueues a moved copy of element on the queue.
+	// Allocates an additional block of memory if needed.
+	// Only fails (returns false) if memory allocation fails.
+	AE_FORCEINLINE bool enqueue(T&& element) AE_NO_TSAN
+	{
+		if (inner.enqueue(std::forward<T>(element))) {
+			sema->signal();
+			return true;
+		}
+		return false;
+	}
+
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// returns false instead. If the queue has at least one element,
+	// moves front to result using operator=, then returns true.
+	template<typename U>
+	bool try_dequeue(U& result) AE_NO_TSAN
+	{
+		if (sema->tryWait()) {
+			bool success = inner.try_dequeue(result);
+			assert(success);
+			AE_UNUSED(success);
+			return true;
+		}
+		return false;
+	}
+	
+	
+	// Attempts to dequeue an element; if the queue is empty,
+	// waits until an element is available, then dequeues it.
+	template<typename U>
+	void wait_dequeue(U& result) AE_NO_TSAN
+	{
+		sema->wait();
+		bool success = inner.try_dequeue(result);
+		AE_UNUSED(result);
+		assert(success);
+		AE_UNUSED(success);
+	}
+
+
+	// Attempts to dequeue an element; if the queue is empty,
+	// waits until an element is available up to the specified timeout,
+	// then dequeues it and returns true, or returns false if the timeout
+	// expires before an element can be dequeued.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue.
+	template<typename U>
+	bool wait_dequeue_timed(U& result, std::int64_t timeout_usecs) AE_NO_TSAN
+	{
+		if (!sema->wait(timeout_usecs)) {
+			return false;
+		}
+		bool success = inner.try_dequeue(result);
+		AE_UNUSED(result);
+		assert(success);
+		AE_UNUSED(success);
+		return true;
+	}
+
+
+#if __cplusplus > 199711L || _MSC_VER >= 1700
+	// Attempts to dequeue an element; if the queue is empty,
+	// waits until an element is available up to the specified timeout,
+	// then dequeues it and returns true, or returns false if the timeout
+	// expires before an element can be dequeued.
+	// Using a negative timeout indicates an indefinite timeout,
+	// and is thus functionally equivalent to calling wait_dequeue.
+	template<typename U, typename Rep, typename Period>
+	inline bool wait_dequeue_timed(U& result, std::chrono::duration<Rep, Period> const& timeout) AE_NO_TSAN
+	{
+        return wait_dequeue_timed(result, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
+	}
+#endif
+
+
+	// Returns a pointer to the front element in the queue (the one that
+	// would be removed next by a call to `try_dequeue` or `pop`). If the
+	// queue appears empty at the time the method is called, nullptr is
+	// returned instead.
+	// Must be called only from the consumer thread.
+	AE_FORCEINLINE T* peek() AE_NO_TSAN
+	{
+		return inner.peek();
+	}
+	
+	// Removes the front element from the queue, if any, without returning it.
+	// Returns true on success, or false if the queue appeared empty at the time
+	// `pop` was called.
+	AE_FORCEINLINE bool pop() AE_NO_TSAN
+	{
+		if (sema->tryWait()) {
+			bool result = inner.pop();
+			assert(result);
+			AE_UNUSED(result);
+			return true;
+		}
+		return false;
+	}
+	
+	// Returns the approximate number of items currently in the queue.
+	// Safe to call from both the producer and consumer threads.
+	AE_FORCEINLINE size_t size_approx() const AE_NO_TSAN
+	{
+		return sema->availableApprox();
+	}
+
+
+private:
+	// Disable copying & assignment (private and intentionally left undefined, so accidental use cannot build)
+	BlockingReaderWriterQueue(BlockingReaderWriterQueue const&);
+	BlockingReaderWriterQueue& operator=(BlockingReaderWriterQueue const&);
+	
+private:
+	ReaderWriterQueue inner;
+	std::unique_ptr<spsc_sema::LightweightSemaphore> sema;
+};
+
+}    // end namespace moodycamel
+
+#ifdef AE_VCPP
+#pragma warning(pop)
+#endif
diff --git a/perf/remote_thr.cpp b/perf/remote_thr.cpp
index 8378950cd9..6969e8c25f 100644
--- a/perf/remote_thr.cpp
+++ b/perf/remote_thr.cpp
@@ -31,12 +31,119 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <assert.h>
+
+#include "readerwriterqueue.h"
 
 // keys are arbitrary but must match local_lat.cpp
 const char server_pubkey[] = "DX4nh=yUn{-9ugra0X3Src4SU-4xTgqxcYY.+<SH";
 const char client_pubkey[] = "<n^oA}I:66W+*ds3tAmi1+KJzv-}k&fC2aA5Bj0K";
 const char client_prvkey[] = "9R9bV}[6z6DC-%$!jTVTKvWc=LEL{4i4gzUe$@Zx";
 
+#define SIZE_OF_CONTENT_T_USED_BY_ZMQ_VLM (40)
+#define MAX_ACTIVE_MESSAGES (8192)
+#define MSG_BLOCK_SIZE (256)
+
+#define MAX_MESSAGE_SIZE                                                       \
+    (MSG_BLOCK_SIZE - SIZE_OF_CONTENT_T_USED_BY_ZMQ_VLM - 1 /* canary */)
+
+typedef struct
+{
+    uint8_t content_block
+      [SIZE_OF_CONTENT_T_USED_BY_ZMQ_VLM]; // will be used by ZMQ internally
+    uint8_t raw_data[MAX_MESSAGE_SIZE];
+    uint8_t canary;
+} msg_block_t;
+
+
+// Fixed-size pool of message buffers that are handed to ZMQ through
+// zmq_msg_init_data(), so that creating a message needs no malloc/free.
+// The free list is a single-producer/single-consumer queue: the constructor
+// fills it, then allocate_msg() must stay on one thread (the consumer) and
+// deallocate_msg() on one thread (the producer).
+class ZmqMessagePool
+{
+  public:
+    ZmqMessagePool ()
+    {
+        // enqueue all available blocks in the free list:
+        for (int i = 0; i < MAX_ACTIVE_MESSAGES; i++) {
+            m_storage[i].canary = 0xAB;
+            const bool enqueued = m_free_list.enqueue (&m_storage[i]);
+            assert (enqueued); // enqueue() only fails when out of memory
+            (void) enqueued;
+        }
+    }
+    ~ZmqMessagePool () {}
+
+
+    // Takes one free block and initialises 'out' as a message with a payload
+    // of 'len' bytes stored in that block. Returns false when 'len' does not
+    // fit in a block, when no free block is left or when ZMQ rejects the
+    // buffer; 'out' must not be used in that case.
+    bool allocate_msg (zmq_msg_t *out,
+                       size_t len) // consumer thread: user app thread
+    {
+        // Checked in release builds too: a payload larger than raw_data would
+        // run past the end of the block, and zmq::msg_t::init_data asserts
+        // that the buffer is strictly larger than its content_t header.
+        if (len == 0 || len > MAX_MESSAGE_SIZE)
+            return false;
+
+        // consume 1 block from the list of free msg blocks
+        msg_block_t *next_avail = nullptr;
+        if (!m_free_list.try_dequeue (next_avail)) {
+            assert (0); // I want to find out if this ever happens
+            return false;
+        }
+
+        assert (next_avail);
+        int rc = zmq_msg_init_data (
+          out, next_avail, len + SIZE_OF_CONTENT_T_USED_BY_ZMQ_VLM,
+          &ZmqMessagePool::deallocate_msg, this);
+        if (rc != 0) {
+            // The block cannot be given back from here: only the producer
+            // thread may enqueue on the free list.
+            assert (0);
+            return false;
+        }
+
+        assert (zmq_msg_size (out) == len);
+        assert (zmq_msg_data (out) == next_avail->raw_data);
+
+        return true;
+    }
+
+    // zmq_free_fn callback: 'data_' is the payload pointer of a message built
+    // by allocate_msg() and 'hint_' is the owning pool; puts the enclosing
+    // block back on the free list.
+    static void
+    deallocate_msg (void *data_,
+                    void *hint_) // producer thread: ZMQ background IO thread
+    {
+        ZmqMessagePool *pPool = static_cast<ZmqMessagePool *> (hint_);
+
+        // recover the beginning of this msg_block:
+        uint8_t *data_ptr_ = static_cast<uint8_t *> (data_);
+        msg_block_t *to_return = reinterpret_cast<msg_block_t *> (
+          data_ptr_ - SIZE_OF_CONTENT_T_USED_BY_ZMQ_VLM);
+        assert (to_return->canary == 0xAB);
+
+        // produce a new free msg block:
+        const bool enqueued = pPool->m_free_list.enqueue (to_return);
+        assert (enqueued); // enqueue() only fails when out of memory
+        (void) enqueued;
+    }
+
+    // Approximate number of blocks currently on the free list.
+    size_t size () const { return m_free_list.size_approx (); }
+
+  private:
+    msg_block_t m_storage[MAX_ACTIVE_MESSAGES];
+    moodycamel::ReaderWriterQueue<msg_block_t *> m_free_list;
+};
+
+
 int main (int argc, char *argv[])
 {
     const char *connect_to;
@@ -104,6 +185,7 @@ int main (int argc, char *argv[])
         return -1;
     }
 
+#if 0
     for (i = 0; i != message_count; i++) {
         rc = zmq_msg_init_size (&msg, message_size);
         if (rc != 0) {
@@ -121,6 +203,38 @@ int main (int argc, char *argv[])
             return -1;
         }
     }
+#else
+    printf ("msg block size: %zu; max msg size: %d\n", sizeof (msg_block_t),
+            MAX_MESSAGE_SIZE);
+    // static storage: the pool embeds MAX_ACTIVE_MESSAGES blocks of
+    // MSG_BLOCK_SIZE bytes (2 MiB), which may not fit on the stack of main()
+    static ZmqMessagePool pool;
+    for (i = 0; i != message_count; i++) {
+        // on failure 'msg' is not a valid message and must not be sent
+        if (!pool.allocate_msg (&msg, message_size)) {
+            printf ("error in ZmqMessagePool::allocate_msg\n");
+            return -1;
+        }
+
+        // to be fair when comparing the results generated by the other #if/#endif branch
+        // avoid any kind of initialization of message memory:
+        //memset (zmq_msg_data (&msg), 0xAB, message_size);
+
+        rc = zmq_sendmsg (s, &msg, 0);
+        if (rc < 0) {
+            printf ("error in zmq_sendmsg: %s\n", zmq_strerror (errno));
+            return -1;
+        }
+        rc = zmq_msg_close (&msg);
+        if (rc != 0) {
+            printf ("error in zmq_msg_close: %s\n", zmq_strerror (errno));
+            return -1;
+        }
+
+        //if ((i % 1000) == 0)
+        //    printf ("mempool msg size: %zu\n", pool.size ());
+    }
+#endif
 
     rc = zmq_close (s);
     if (rc != 0) {
diff --git a/src/msg.cpp b/src/msg.cpp
index 5e32341bd0..dc1081c4c2 100644
--- a/src/msg.cpp
+++ b/src/msg.cpp
@@ -47,6 +47,9 @@ typedef char
   zmq_msg_size_check[2 * ((sizeof (zmq::msg_t) == sizeof (zmq_msg_t)) != 0)
                      - 1];
 
+#define ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER (1)
+
+
 bool zmq::msg_t::check () const
 {
     return _u.base.type >= type_min && _u.base.type <= type_max;
@@ -166,15 +169,26 @@ int zmq::msg_t::init_data (void *data_,
         _u.lmsg.flags = 0;
         _u.lmsg.group[0] = '\0';
         _u.lmsg.routing_id = 0;
+#if ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER
+        zmq_assert (size_ > sizeof (content_t));
+        _u.lmsg.content = reinterpret_cast<content_t *> (data_);
+#else
         _u.lmsg.content =
           static_cast<content_t *> (malloc (sizeof (content_t)));
+#endif
         if (!_u.lmsg.content) {
             errno = ENOMEM;
             return -1;
         }
 
+#if ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER
+        uint8_t *data_bytes = (uint8_t *) data_;
+        _u.lmsg.content->data = data_bytes + sizeof (content_t);
+        _u.lmsg.content->size = size_ - sizeof (content_t);
+#else
         _u.lmsg.content->data = data_;
         _u.lmsg.content->size = size_;
+#endif
         _u.lmsg.content->ffn = ffn_;
         _u.lmsg.content->hint = hint_;
         new (&_u.lmsg.content->refcnt) zmq::atomic_counter_t ();
@@ -228,11 +242,23 @@ int zmq::msg_t::close ()
             //  We used "placement new" operator to initialize the reference
             //  counter so we call the destructor explicitly now.
             _u.lmsg.content->refcnt.~atomic_counter_t ();
-
+#if ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER
+            // take a local copy since we are going to remove (through the user-provided deallocator)
+            // the whole malloc'ed buffer, including the content_t block itself!
+            // NOTE: this copy should not be strictly needed but it's here just to help debugging:
+            content_t content;
+            content.data = _u.lmsg.content->data;
+            content.size = _u.lmsg.content->size;
+            content.ffn = _u.lmsg.content->ffn;
+            content.hint = _u.lmsg.content->hint;
+            if (content.ffn)
+                content.ffn (content.data, content.hint);
+#else
             if (_u.lmsg.content->ffn)
                 _u.lmsg.content->ffn (_u.lmsg.content->data,
                                       _u.lmsg.content->hint);
             free (_u.lmsg.content);
+#endif
         }
     }
 

From a24f2af2579a7a402c9576e683f18d9b75ae7308 Mon Sep 17 00:00:00 2001
From: Francesco Montorsi <francesco.montorsi@gmail.com>
Date: Tue, 13 Aug 2019 11:39:00 +0200
Subject: [PATCH 02/11] Allow to choose message sizes as well

---
 perf/generate_csv.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/perf/generate_csv.sh b/perf/generate_csv.sh
index d307f29e49..cf23ed5640 100755
--- a/perf/generate_csv.sh
+++ b/perf/generate_csv.sh
@@ -10,6 +10,7 @@
 #    export LOCAL_TEST_ENDPOINT="tcp://192.168.1.1:1234"
 #    export REMOTE_TEST_ENDPOINT="tcp://192.168.1.2:1234"
 #    export REMOTE_LIBZMQ_PATH="/home/fmontorsi/libzmq/perf"
+#    export MESSAGE_SIZE_LIST="8 16 32 64 128 210"
 #    ./generate_csv.sh
 #
 
@@ -22,7 +23,7 @@ LOCAL_TEST_ENDPOINT=${LOCAL_TEST_ENDPOINT:-tcp://192.168.1.1:1234}
 REMOTE_TEST_ENDPOINT=${REMOTE_TEST_ENDPOINT:-tcp://192.168.1.2:1234}
 
 # constant values:
-MESSAGE_SIZE_LIST="8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072"
+MESSAGE_SIZE_LIST="${MESSAGE_SIZE_LIST:-8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072}"
 OUTPUT_DIR="results"
 OUTPUT_FILE_PREFIX="results.txt"
 OUTPUT_FILE_CSV_PREFIX="results.csv"

From 1bd2ae1530380937c69bd64262f42cb8710c6b7b Mon Sep 17 00:00:00 2001
From: Francesco Montorsi <francesco.montorsi@gmail.com>
Date: Tue, 13 Aug 2019 11:50:49 +0200
Subject: [PATCH 03/11] Allow using env variables to do some basic overriding

---
 perf/generate_graphs.py | 49 ++++++++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/perf/generate_graphs.py b/perf/generate_graphs.py
index 20651b7160..77fbb88ff3 100755
--- a/perf/generate_graphs.py
+++ b/perf/generate_graphs.py
@@ -1,19 +1,13 @@
 #!/usr/bin/python3
 
 #
-# This script assumes that the set of CSV files produced by "generate_csv.sh" is provided as input
-# and that locally there is the "results" folder.
+# This script assumes that the set of CSV files produced by "generate_csv.sh" is provided as input.
+#
+# Usage example:
+#   export RESULT_DIRECTORY="./results"
+#   export TCP_LINK_SPEED_GBPS="10"     # or 1 or 100 as you like
+#   ./generate_graphs.py
 #
-
-# results for TCP:
-INPUT_FILE_PUSHPULL_TCP_THROUGHPUT="results/pushpull_tcp_thr_results.csv"
-INPUT_FILE_REQREP_TCP_LATENCY="results/reqrep_tcp_lat_results.csv"
-TCP_LINK_GPBS=100
-
-# results for INPROC:
-INPUT_FILE_PUSHPULL_INPROC_THROUGHPUT="results/pushpull_inproc_thr_results.csv"
-INPUT_FILE_PUBSUBPROXY_INPROC_THROUGHPUT="results/pubsubproxy_inproc_thr_results.csv"
-
 
 # dependencies
 #
@@ -22,13 +16,15 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
+import os
 
 
 # functions
 
-def plot_throughput(csv_filename, title, is_tcp=False):
+def plot_throughput(csv_filename, title, is_tcp=False, tcp_link_speed_gbps=10):
     message_size_bytes, message_count, pps, mbps = np.loadtxt(csv_filename, delimiter=',', unpack=True)
 
+    print("Generating PNG image file [%s] from CSV results '%s'" % (title, csv_filename))
     fig, ax1 = plt.subplots()
 
     # PPS axis
@@ -44,7 +40,7 @@ def plot_throughput(csv_filename, title, is_tcp=False):
     ax2.set_ylabel('Throughput [Gb/s]', color=color)
     ax2.semilogx(message_size_bytes, mbps / 1e3, label='Throughput [Gb/s]', marker='o')
     if is_tcp:
-        ax2.set_yticks(np.arange(0, TCP_LINK_GPBS + 1, TCP_LINK_GPBS/10)) 
+        ax2.set_yticks(np.arange(0, tcp_link_speed_gbps + 1, tcp_link_speed_gbps/10)) 
     ax2.tick_params(axis='y', labelcolor=color)
     ax2.grid(True)
     
@@ -55,6 +51,8 @@ def plot_throughput(csv_filename, title, is_tcp=False):
 
 def plot_latency(csv_filename, title):
     message_size_bytes, message_count, lat = np.loadtxt(csv_filename, delimiter=',', unpack=True)
+
+    print("Generating PNG image file [%s] from CSV results '%s'" % (title, csv_filename))
     plt.semilogx(message_size_bytes, lat, label='Latency [us]', marker='o')
     
     plt.xlabel('Message size [B]')
@@ -67,7 +65,28 @@ def plot_latency(csv_filename, title):
 
 # main
 
-plot_throughput(INPUT_FILE_PUSHPULL_TCP_THROUGHPUT, 'ZeroMQ PUSH/PULL socket throughput, TCP transport', is_tcp=True)
+try:
+    result_dir = os.environ['RESULT_DIRECTORY']
+except KeyError: # RESULT_DIRECTORY is not set in the environment
+    result_dir = "results" # default value
+
+try:
+    tcp_link_speed_gbps = os.environ['TCP_LINK_SPEED_GBPS']
+except:
+    tcp_link_speed_gbps = "10" # default value
+    
+    
+
+# result files for TCP:
+INPUT_FILE_PUSHPULL_TCP_THROUGHPUT = result_dir + "/pushpull_tcp_thr_results.csv"
+INPUT_FILE_REQREP_TCP_LATENCY = result_dir + "/reqrep_tcp_lat_results.csv"
+
+# results for INPROC:
+INPUT_FILE_PUSHPULL_INPROC_THROUGHPUT = result_dir + "/pushpull_inproc_thr_results.csv"
+INPUT_FILE_PUBSUBPROXY_INPROC_THROUGHPUT = result_dir + "/pubsubproxy_inproc_thr_results.csv"
+
+# generate plots
+plot_throughput(INPUT_FILE_PUSHPULL_TCP_THROUGHPUT, 'ZeroMQ PUSH/PULL socket throughput, TCP transport', is_tcp=True, tcp_link_speed_gbps=tcp_link_speed_gbps)
 plot_throughput(INPUT_FILE_PUSHPULL_INPROC_THROUGHPUT, 'ZeroMQ PUSH/PULL socket throughput, INPROC transport')
 plot_throughput(INPUT_FILE_PUBSUBPROXY_INPROC_THROUGHPUT, 'ZeroMQ PUB/SUB PROXY socket throughput, INPROC transport')
 plot_latency(INPUT_FILE_REQREP_TCP_LATENCY, 'ZeroMQ REQ/REP socket latency, TCP transport')

From 252e8d449c6c40919f81de351d34be8a02af6ed2 Mon Sep 17 00:00:00 2001
From: Francesco Montorsi <francesco.montorsi@gmail.com>
Date: Tue, 13 Aug 2019 11:51:39 +0200
Subject: [PATCH 04/11] fix typo

---
 perf/generate_graphs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/perf/generate_graphs.py b/perf/generate_graphs.py
index 77fbb88ff3..c323e4cce6 100755
--- a/perf/generate_graphs.py
+++ b/perf/generate_graphs.py
@@ -71,9 +71,9 @@ def plot_latency(csv_filename, title):
     result_dir = "results" # default value
 
 try:
-    tcp_link_speed_gbps = os.environ['TCP_LINK_SPEED_GBPS']
+    tcp_link_speed_gbps = int(os.environ['TCP_LINK_SPEED_GBPS'])
 except:
-    tcp_link_speed_gbps = "10" # default value
+    tcp_link_speed_gbps = 10 # default value
     
     
 

From 4a3079560b6be68a9a1ad5291ed469cf1b35379a Mon Sep 17 00:00:00 2001
From: Francesco Montorsi <francesco.montorsi@gmail.com>
Date: Tue, 13 Aug 2019 12:05:50 +0200
Subject: [PATCH 05/11] add TCP kernel socket buffer setting

---
 perf/generate_csv.sh | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/perf/generate_csv.sh b/perf/generate_csv.sh
index cf23ed5640..da8ff0a4cd 100755
--- a/perf/generate_csv.sh
+++ b/perf/generate_csv.sh
@@ -48,6 +48,35 @@ function verify_ssh()
     echo "SSH connection to the remote $REMOTE_IP_SSH is working fine."
 }
 
+# Pin the kernel socket-buffer sysctls to fixed values on this host and on
+# $REMOTE_IP_SSH so that TCP runs start from the same buffer configuration.
+# Exits with status 2 if any setting cannot be applied on either side.
+function set_reproducible_tcp_kernel_buff_size()
+{
+    # Single copy of the settings: the same command string runs on both hosts,
+    # so the local and remote values cannot drift apart. It stops at the first
+    # failing sysctl. (sysctl -w normally needs root -- confirm for your setup.)
+    local SYSCTL_SETTINGS="sysctl -w net.core.rmem_max=8388608 && \
+        sysctl -w net.core.wmem_max=8388608 && \
+        sysctl -w net.core.rmem_default=65536 && \
+        sysctl -w net.core.wmem_default=65536 && \
+        sysctl -w net.ipv4.tcp_rmem='4096 87380 8388608' && \
+        sysctl -w net.ipv4.tcp_wmem='4096 65536 8388608' && \
+        sysctl -w net.ipv4.tcp_mem='8388608 8388608 8388608' && \
+        sysctl -w net.ipv4.route.flush=1"
+
+    if ! bash -c "$SYSCTL_SETTINGS"; then
+        echo "Failed setting kernel socket buffer sizes LOCALLY"
+        exit 2
+    fi
+
+    # Quoted so an empty or odd host value fails in ssh instead of word-splitting.
+    if ! ssh "$REMOTE_IP_SSH" "$SYSCTL_SETTINGS"; then
+        echo "Failed setting kernel socket buffer sizes on the REMOTE system $REMOTE_IP_SSH"
+        exit 2
+    fi
+}
+
 function run_remote_perf_util()
 {
     local MESSAGE_SIZE_BYTES="$1"
@@ -112,6 +141,7 @@ function generate_output_file()
 # main:
 
 verify_ssh
+set_reproducible_tcp_kernel_buff_size
 
 THROUGHPUT_CSV_HEADER_LINE="# message_size,message_count,PPS[msg/s],throughput[Mb/s]"
 

From 00e514e2c9b8ae0373a2b9d0e594310efd152fe5 Mon Sep 17 00:00:00 2001
From: Francesco Montorsi <francesco.montorsi@gmail.com>
Date: Thu, 29 Aug 2019 00:39:31 +0200
Subject: [PATCH 06/11] First implementation of global memory pool for ZMQ

---
 Makefile.am                 |    2 +
 include/atomicops.h         |  676 -------
 include/readerwriterqueue.h |  906 ---------
 include/zmq.h               |   14 +
 perf/remote_thr.cpp         |  124 +-
 src/allocator.cpp           |   97 +
 src/allocator.hpp           |  181 ++
 src/concurrentqueue.h       | 3636 +++++++++++++++++++++++++++++++++++
 src/ctx.cpp                 |   16 +
 src/ctx.hpp                 |    4 +
 src/msg.cpp                 |   79 +-
 src/msg.hpp                 |    6 +-
 src/zmq.cpp                 |   38 +
 13 files changed, 4058 insertions(+), 1721 deletions(-)
 delete mode 100644 include/atomicops.h
 delete mode 100644 include/readerwriterqueue.h
 create mode 100644 src/allocator.cpp
 create mode 100644 src/allocator.hpp
 create mode 100644 src/concurrentqueue.h

diff --git a/Makefile.am b/Makefile.am
index 4c4abc4415..e81a4ca7a9 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -22,6 +22,8 @@ include_HEADERS = \
 src_libzmq_la_SOURCES = \
 	src/address.cpp \
 	src/address.hpp \
+	src/allocator.cpp \
+	src/allocator.hpp \
 	src/array.hpp \
 	src/atomic_counter.hpp \
 	src/atomic_ptr.hpp \
diff --git a/include/atomicops.h b/include/atomicops.h
deleted file mode 100644
index 4fd1748293..0000000000
--- a/include/atomicops.h
+++ /dev/null
@@ -1,676 +0,0 @@
-﻿// ©2013-2016 Cameron Desrochers.
-// Distributed under the simplified BSD license (see the license file that
-// should have come with this header).
-// Uses Jeff Preshing's semaphore implementation (under the terms of its
-// separate zlib license, embedded below).
-
-#pragma once
-
-// Provides portable (VC++2010+, Intel ICC 13, GCC 4.7+, and anything C++11 compliant) implementation
-// of low-level memory barriers, plus a few semi-portable utility macros (for inlining and alignment).
-// Also has a basic atomic type (limited to hardware-supported atomics with no memory ordering guarantees).
-// Uses the AE_* prefix for macros (historical reasons), and the "moodycamel" namespace for symbols.
-
-#include <cassert>
-#include <type_traits>
-#include <cerrno>
-#include <cstdint>
-#include <ctime>
-
-// Platform detection
-#if defined(__INTEL_COMPILER)
-#define AE_ICC
-#elif defined(_MSC_VER)
-#define AE_VCPP
-#elif defined(__GNUC__)
-#define AE_GCC
-#endif
-
-#if defined(_M_IA64) || defined(__ia64__)
-#define AE_ARCH_IA64
-#elif defined(_WIN64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64__)
-#define AE_ARCH_X64
-#elif defined(_M_IX86) || defined(__i386__)
-#define AE_ARCH_X86
-#elif defined(_M_PPC) || defined(__powerpc__)
-#define AE_ARCH_PPC
-#else
-#define AE_ARCH_UNKNOWN
-#endif
-
-
-// AE_UNUSED
-#define AE_UNUSED(x) ((void)x)
-
-// AE_NO_TSAN
-#if defined(__has_feature)
-#if __has_feature(thread_sanitizer)
-#define AE_NO_TSAN __attribute__((no_sanitize("thread")))
-#else
-#define AE_NO_TSAN
-#endif
-#else
-#define AE_NO_TSAN
-#endif
-
-
-// AE_FORCEINLINE
-#if defined(AE_VCPP) || defined(AE_ICC)
-#define AE_FORCEINLINE __forceinline
-#elif defined(AE_GCC)
-//#define AE_FORCEINLINE __attribute__((always_inline)) 
-#define AE_FORCEINLINE inline
-#else
-#define AE_FORCEINLINE inline
-#endif
-
-
-// AE_ALIGN
-#if defined(AE_VCPP) || defined(AE_ICC)
-#define AE_ALIGN(x) __declspec(align(x))
-#elif defined(AE_GCC)
-#define AE_ALIGN(x) __attribute__((aligned(x)))
-#else
-// Assume GCC compliant syntax...
-#define AE_ALIGN(x) __attribute__((aligned(x)))
-#endif
-
-
-// Portable atomic fences implemented below:
-
-namespace moodycamel {
-
-enum memory_order {
-	memory_order_relaxed,
-	memory_order_acquire,
-	memory_order_release,
-	memory_order_acq_rel,
-	memory_order_seq_cst,
-
-	// memory_order_sync: Forces a full sync:
-	// #LoadLoad, #LoadStore, #StoreStore, and most significantly, #StoreLoad
-	memory_order_sync = memory_order_seq_cst
-};
-
-}    // end namespace moodycamel
-
-#if (defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))) || (defined(AE_ICC) && __INTEL_COMPILER < 1600)
-// VS2010 and ICC13 don't support std::atomic_*_fence, implement our own fences
-
-#include <intrin.h>
-
-#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
-#define AeFullSync _mm_mfence
-#define AeLiteSync _mm_mfence
-#elif defined(AE_ARCH_IA64)
-#define AeFullSync __mf
-#define AeLiteSync __mf
-#elif defined(AE_ARCH_PPC)
-#include <ppcintrinsics.h>
-#define AeFullSync __sync
-#define AeLiteSync __lwsync
-#endif
-
-
-#ifdef AE_VCPP
-#pragma warning(push)
-#pragma warning(disable: 4365)		// Disable erroneous 'conversion from long to unsigned int, signed/unsigned mismatch' error when using `assert`
-#ifdef __cplusplus_cli
-#pragma managed(push, off)
-#endif
-#endif
-
-namespace moodycamel {
-
-AE_FORCEINLINE void compiler_fence(memory_order order) AE_NO_TSAN
-{
-	switch (order) {
-		case memory_order_relaxed: break;
-		case memory_order_acquire: _ReadBarrier(); break;
-		case memory_order_release: _WriteBarrier(); break;
-		case memory_order_acq_rel: _ReadWriteBarrier(); break;
-		case memory_order_seq_cst: _ReadWriteBarrier(); break;
-		default: assert(false);
-	}
-}
-
-// x86/x64 have a strong memory model -- all loads and stores have
-// acquire and release semantics automatically (so only need compiler
-// barriers for those).
-#if defined(AE_ARCH_X86) || defined(AE_ARCH_X64)
-AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN
-{
-	switch (order) {
-		case memory_order_relaxed: break;
-		case memory_order_acquire: _ReadBarrier(); break;
-		case memory_order_release: _WriteBarrier(); break;
-		case memory_order_acq_rel: _ReadWriteBarrier(); break;
-		case memory_order_seq_cst:
-			_ReadWriteBarrier();
-			AeFullSync();
-			_ReadWriteBarrier();
-			break;
-		default: assert(false);
-	}
-}
-#else
-AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN
-{
-	// Non-specialized arch, use heavier memory barriers everywhere just in case :-(
-	switch (order) {
-		case memory_order_relaxed:
-			break;
-		case memory_order_acquire:
-			_ReadBarrier();
-			AeLiteSync();
-			_ReadBarrier();
-			break;
-		case memory_order_release:
-			_WriteBarrier();
-			AeLiteSync();
-			_WriteBarrier();
-			break;
-		case memory_order_acq_rel:
-			_ReadWriteBarrier();
-			AeLiteSync();
-			_ReadWriteBarrier();
-			break;
-		case memory_order_seq_cst:
-			_ReadWriteBarrier();
-			AeFullSync();
-			_ReadWriteBarrier();
-			break;
-		default: assert(false);
-	}
-}
-#endif
-}    // end namespace moodycamel
-#else
-// Use standard library of atomics
-#include <atomic>
-
-namespace moodycamel {
-
-AE_FORCEINLINE void compiler_fence(memory_order order) AE_NO_TSAN
-{
-	switch (order) {
-		case memory_order_relaxed: break;
-		case memory_order_acquire: std::atomic_signal_fence(std::memory_order_acquire); break;
-		case memory_order_release: std::atomic_signal_fence(std::memory_order_release); break;
-		case memory_order_acq_rel: std::atomic_signal_fence(std::memory_order_acq_rel); break;
-		case memory_order_seq_cst: std::atomic_signal_fence(std::memory_order_seq_cst); break;
-		default: assert(false);
-	}
-}
-
-AE_FORCEINLINE void fence(memory_order order) AE_NO_TSAN
-{
-	switch (order) {
-		case memory_order_relaxed: break;
-		case memory_order_acquire: std::atomic_thread_fence(std::memory_order_acquire); break;
-		case memory_order_release: std::atomic_thread_fence(std::memory_order_release); break;
-		case memory_order_acq_rel: std::atomic_thread_fence(std::memory_order_acq_rel); break;
-		case memory_order_seq_cst: std::atomic_thread_fence(std::memory_order_seq_cst); break;
-		default: assert(false);
-	}
-}
-
-}    // end namespace moodycamel
-
-#endif
-
-
-#if !defined(AE_VCPP) || (_MSC_VER >= 1700 && !defined(__cplusplus_cli))
-#define AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
-#endif
-
-#ifdef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
-#include <atomic>
-#endif
-#include <utility>
-
-// WARNING: *NOT* A REPLACEMENT FOR std::atomic. READ CAREFULLY:
-// Provides basic support for atomic variables -- no memory ordering guarantees are provided.
-// The guarantee of atomicity is only made for types that already have atomic load and store guarantees
-// at the hardware level -- on most platforms this generally means aligned pointers and integers (only).
-namespace moodycamel {
-template<typename T>
-class weak_atomic
-{
-public:
-	AE_NO_TSAN weak_atomic() { }
-#ifdef AE_VCPP
-#pragma warning(push)
-#pragma warning(disable: 4100)		// Get rid of (erroneous) 'unreferenced formal parameter' warning
-#endif
-	template<typename U> AE_NO_TSAN weak_atomic(U&& x) : value(std::forward<U>(x)) {  }
-#ifdef __cplusplus_cli
-	// Work around bug with universal reference/nullptr combination that only appears when /clr is on
-	AE_NO_TSAN weak_atomic(nullptr_t) : value(nullptr) {  }
-#endif
-	AE_NO_TSAN weak_atomic(weak_atomic const& other) : value(other.load()) {  }
-	AE_NO_TSAN weak_atomic(weak_atomic&& other) : value(std::move(other.load())) {  }
-#ifdef AE_VCPP
-#pragma warning(pop)
-#endif
-
-	AE_FORCEINLINE operator T() const AE_NO_TSAN { return load(); }
-
-	
-#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
-	template<typename U> AE_FORCEINLINE weak_atomic const& operator=(U&& x) AE_NO_TSAN { value = std::forward<U>(x); return *this; }
-	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) AE_NO_TSAN { value = other.value; return *this; }
-	
-	AE_FORCEINLINE T load() const AE_NO_TSAN { return value; }
-	
-	AE_FORCEINLINE T fetch_add_acquire(T increment) AE_NO_TSAN
-	{
-#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
-		if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
-#if defined(_M_AMD64)
-		else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
-#endif
-#else
-#error Unsupported platform
-#endif
-		assert(false && "T must be either a 32 or 64 bit type");
-		return value;
-	}
-	
-	AE_FORCEINLINE T fetch_add_release(T increment) AE_NO_TSAN
-	{
-#if defined(AE_ARCH_X64) || defined(AE_ARCH_X86)
-		if (sizeof(T) == 4) return _InterlockedExchangeAdd((long volatile*)&value, (long)increment);
-#if defined(_M_AMD64)
-		else if (sizeof(T) == 8) return _InterlockedExchangeAdd64((long long volatile*)&value, (long long)increment);
-#endif
-#else
-#error Unsupported platform
-#endif
-		assert(false && "T must be either a 32 or 64 bit type");
-		return value;
-	}
-#else
-	template<typename U>
-	AE_FORCEINLINE weak_atomic const& operator=(U&& x) AE_NO_TSAN
-	{
-		value.store(std::forward<U>(x), std::memory_order_relaxed);
-		return *this;
-	}
-	
-	AE_FORCEINLINE weak_atomic const& operator=(weak_atomic const& other) AE_NO_TSAN
-	{
-		value.store(other.value.load(std::memory_order_relaxed), std::memory_order_relaxed);
-		return *this;
-	}
-
-	AE_FORCEINLINE T load() const AE_NO_TSAN { return value.load(std::memory_order_relaxed); }
-	
-	AE_FORCEINLINE T fetch_add_acquire(T increment) AE_NO_TSAN
-	{
-		return value.fetch_add(increment, std::memory_order_acquire);
-	}
-	
-	AE_FORCEINLINE T fetch_add_release(T increment) AE_NO_TSAN
-	{
-		return value.fetch_add(increment, std::memory_order_release);
-	}
-#endif
-	
-
-private:
-#ifndef AE_USE_STD_ATOMIC_FOR_WEAK_ATOMIC
-	// No std::atomic support, but still need to circumvent compiler optimizations.
-	// `volatile` will make memory access slow, but is guaranteed to be reliable.
-	volatile T value;
-#else
-	std::atomic<T> value;
-#endif
-};
-
-}	// end namespace moodycamel
-
-
-
-// Portable single-producer, single-consumer semaphore below:
-
-#if defined(_WIN32)
-// Avoid including windows.h in a header; we only need a handful of
-// items, so we'll redeclare them here (this is relatively safe since
-// the API generally has to remain stable between Windows versions).
-// I know this is an ugly hack but it still beats polluting the global
-// namespace with thousands of generic names or adding a .cpp for nothing.
-extern "C" {
-	struct _SECURITY_ATTRIBUTES;
-	__declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
-	__declspec(dllimport) int __stdcall CloseHandle(void* hObject);
-	__declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
-	__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
-}
-#elif defined(__MACH__)
-#include <mach/mach.h>
-#elif defined(__unix__)
-#include <semaphore.h>
-#endif
-
-namespace moodycamel
-{
-	// Code in the spsc_sema namespace below is an adaptation of Jeff Preshing's
-	// portable + lightweight semaphore implementations, originally from
-	// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
-	// LICENSE:
-	// Copyright (c) 2015 Jeff Preshing
-	//
-	// This software is provided 'as-is', without any express or implied
-	// warranty. In no event will the authors be held liable for any damages
-	// arising from the use of this software.
-	//
-	// Permission is granted to anyone to use this software for any purpose,
-	// including commercial applications, and to alter it and redistribute it
-	// freely, subject to the following restrictions:
-	//
-	// 1. The origin of this software must not be misrepresented; you must not
-	//    claim that you wrote the original software. If you use this software
-	//    in a product, an acknowledgement in the product documentation would be
-	//    appreciated but is not required.
-	// 2. Altered source versions must be plainly marked as such, and must not be
-	//    misrepresented as being the original software.
-	// 3. This notice may not be removed or altered from any source distribution.
-	namespace spsc_sema
-	{
-#if defined(_WIN32)
-		class Semaphore
-		{
-		private:
-		    void* m_hSema;
-		    
-		    Semaphore(const Semaphore& other);
-		    Semaphore& operator=(const Semaphore& other);
-
-		public:
-		    AE_NO_TSAN Semaphore(int initialCount = 0)
-		    {
-		        assert(initialCount >= 0);
-		        const long maxLong = 0x7fffffff;
-		        m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
-		    }
-
-		    AE_NO_TSAN ~Semaphore()
-		    {
-		        CloseHandle(m_hSema);
-		    }
-
-		    void wait() AE_NO_TSAN
-		    {
-		    	const unsigned long infinite = 0xffffffff;
-		        WaitForSingleObject(m_hSema, infinite);
-		    }
-
-			bool try_wait() AE_NO_TSAN
-			{
-				const unsigned long RC_WAIT_TIMEOUT = 0x00000102;
-				return WaitForSingleObject(m_hSema, 0) != RC_WAIT_TIMEOUT;
-			}
-
-			bool timed_wait(std::uint64_t usecs) AE_NO_TSAN
-			{
-				const unsigned long RC_WAIT_TIMEOUT = 0x00000102;
-				return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) != RC_WAIT_TIMEOUT;
-			}
-
-		    void signal(int count = 1) AE_NO_TSAN
-		    {
-		        ReleaseSemaphore(m_hSema, count, nullptr);
-		    }
-		};
-#elif defined(__MACH__)
-		//---------------------------------------------------------
-		// Semaphore (Apple iOS and OSX)
-		// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html
-		//---------------------------------------------------------
-		class Semaphore
-		{
-		private:
-		    semaphore_t m_sema;
-
-		    Semaphore(const Semaphore& other);
-		    Semaphore& operator=(const Semaphore& other);
-
-		public:
-		    AE_NO_TSAN Semaphore(int initialCount = 0)
-		    {
-		        assert(initialCount >= 0);
-		        semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount);
-		    }
-
-		    AE_NO_TSAN ~Semaphore()
-		    {
-		        semaphore_destroy(mach_task_self(), m_sema);
-		    }
-
-		    void wait() AE_NO_TSAN
-		    {
-		        semaphore_wait(m_sema);
-		    }
-
-			bool try_wait() AE_NO_TSAN
-			{
-				return timed_wait(0);
-			}
-
-			bool timed_wait(std::int64_t timeout_usecs) AE_NO_TSAN
-			{
-				mach_timespec_t ts;
-				ts.tv_sec = static_cast<unsigned int>(timeout_usecs / 1000000);
-				ts.tv_nsec = (timeout_usecs % 1000000) * 1000;
-
-				// added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html
-				kern_return_t rc = semaphore_timedwait(m_sema, ts);
-
-				return rc != KERN_OPERATION_TIMED_OUT && rc != KERN_ABORTED;
-			}
-
-		    void signal() AE_NO_TSAN
-		    {
-		        semaphore_signal(m_sema);
-		    }
-
-		    void signal(int count) AE_NO_TSAN
-		    {
-		        while (count-- > 0)
-		        {
-		            semaphore_signal(m_sema);
-		        }
-		    }
-		};
-#elif defined(__unix__)
-		//---------------------------------------------------------
-		// Semaphore (POSIX, Linux)
-		//---------------------------------------------------------
-		class Semaphore
-		{
-		private:
-		    sem_t m_sema;
-
-		    Semaphore(const Semaphore& other);
-		    Semaphore& operator=(const Semaphore& other);
-
-		public:
-		    AE_NO_TSAN Semaphore(int initialCount = 0)
-		    {
-		        assert(initialCount >= 0);
-		        sem_init(&m_sema, 0, initialCount);
-		    }
-
-		    AE_NO_TSAN ~Semaphore()
-		    {
-		        sem_destroy(&m_sema);
-		    }
-
-		    void wait() AE_NO_TSAN
-		    {
-		        // http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error
-		        int rc;
-		        do
-		        {
-		            rc = sem_wait(&m_sema);
-		        }
-		        while (rc == -1 && errno == EINTR);
-		    }
-
-			bool try_wait() AE_NO_TSAN
-			{
-				int rc;
-				do {
-					rc = sem_trywait(&m_sema);
-				} while (rc == -1 && errno == EINTR);
-				return !(rc == -1 && errno == EAGAIN);
-			}
-
-			bool timed_wait(std::uint64_t usecs) AE_NO_TSAN
-			{
-				struct timespec ts;
-				const int usecs_in_1_sec = 1000000;
-				const int nsecs_in_1_sec = 1000000000;
-				clock_gettime(CLOCK_REALTIME, &ts);
-				ts.tv_sec += usecs / usecs_in_1_sec;
-				ts.tv_nsec += (usecs % usecs_in_1_sec) * 1000;
-				// sem_timedwait bombs if you have more than 1e9 in tv_nsec
-				// so we have to clean things up before passing it in
-				if (ts.tv_nsec >= nsecs_in_1_sec) {
-					ts.tv_nsec -= nsecs_in_1_sec;
-					++ts.tv_sec;
-				}
-
-				int rc;
-				do {
-					rc = sem_timedwait(&m_sema, &ts);
-				} while (rc == -1 && errno == EINTR);
-				return !(rc == -1 && errno == ETIMEDOUT);
-			}
-
-		    void signal() AE_NO_TSAN
-		    {
-		        sem_post(&m_sema);
-		    }
-
-		    void signal(int count) AE_NO_TSAN
-		    {
-		        while (count-- > 0)
-		        {
-		            sem_post(&m_sema);
-		        }
-		    }
-		};
-#else
-#error Unsupported platform! (No semaphore wrapper available)
-#endif
-
-		//---------------------------------------------------------
-		// LightweightSemaphore
-		//---------------------------------------------------------
-		class LightweightSemaphore
-		{
-		public:
-			typedef std::make_signed<std::size_t>::type ssize_t;
-			
-		private:
-		    weak_atomic<ssize_t> m_count;
-		    Semaphore m_sema;
-
-		    bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1) AE_NO_TSAN
-		    {
-		        ssize_t oldCount;
-		        // Is there a better way to set the initial spin count?
-		        // If we lower it to 1000, testBenaphore becomes 15x slower on my Core i7-5930K Windows PC,
-		        // as threads start hitting the kernel semaphore.
-		        int spin = 10000;
-		        while (--spin >= 0)
-		        {
-		            if (m_count.load() > 0)
-		            {
-		                m_count.fetch_add_acquire(-1);
-		                return true;
-		            }
-		            compiler_fence(memory_order_acquire);     // Prevent the compiler from collapsing the loop.
-		        }
-		        oldCount = m_count.fetch_add_acquire(-1);
-				if (oldCount > 0)
-					return true;
-		        if (timeout_usecs < 0)
-				{
-					m_sema.wait();
-					return true;
-				}
-				if (m_sema.timed_wait(timeout_usecs))
-					return true;
-				// At this point, we've timed out waiting for the semaphore, but the
-				// count is still decremented indicating we may still be waiting on
-				// it. So we have to re-adjust the count, but only if the semaphore
-				// wasn't signaled enough times for us too since then. If it was, we
-				// need to release the semaphore too.
-				while (true)
-				{
-					oldCount = m_count.fetch_add_release(1);
-					if (oldCount < 0)
-						return false;    // successfully restored things to the way they were
-					// Oh, the producer thread just signaled the semaphore after all. Try again:
-					oldCount = m_count.fetch_add_acquire(-1);
-					if (oldCount > 0 && m_sema.try_wait())
-						return true;
-				}
-		    }
-
-		public:
-		    AE_NO_TSAN LightweightSemaphore(ssize_t initialCount = 0) : m_count(initialCount)
-		    {
-		        assert(initialCount >= 0);
-		    }
-
-		    bool tryWait() AE_NO_TSAN
-		    {
-		        if (m_count.load() > 0)
-		        {
-		        	m_count.fetch_add_acquire(-1);
-		        	return true;
-		        }
-		        return false;
-		    }
-
-		    void wait() AE_NO_TSAN
-		    {
-		        if (!tryWait())
-		            waitWithPartialSpinning();
-		    }
-
-			bool wait(std::int64_t timeout_usecs) AE_NO_TSAN
-			{
-				return tryWait() || waitWithPartialSpinning(timeout_usecs);
-			}
-
-		    void signal(ssize_t count = 1) AE_NO_TSAN
-		    {
-		    	assert(count >= 0);
-		        ssize_t oldCount = m_count.fetch_add_release(count);
-		        assert(oldCount >= -1);
-		        if (oldCount < 0)
-		        {
-		            m_sema.signal(1);
-		        }
-		    }
-		    
-		    ssize_t availableApprox() const AE_NO_TSAN
-		    {
-		    	ssize_t count = m_count.load();
-		    	return count > 0 ? count : 0;
-		    }
-		};
-	}	// end namespace spsc_sema
-}	// end namespace moodycamel
-
-#if defined(AE_VCPP) && (_MSC_VER < 1700 || defined(__cplusplus_cli))
-#pragma warning(pop)
-#ifdef __cplusplus_cli
-#pragma managed(pop)
-#endif
-#endif
diff --git a/include/readerwriterqueue.h b/include/readerwriterqueue.h
deleted file mode 100644
index 071147c3e1..0000000000
--- a/include/readerwriterqueue.h
+++ /dev/null
@@ -1,906 +0,0 @@
-// ©2013-2016 Cameron Desrochers.
-// Distributed under the simplified BSD license (see the license file that
-// should have come with this header).
-
-#pragma once
-
-#include "atomicops.h"
-#include <type_traits>
-#include <utility>
-#include <cassert>
-#include <stdexcept>
-#include <new>
-#include <cstdint>
-#include <cstdlib>		// For malloc/free/abort & size_t
-#include <memory>
-#if __cplusplus > 199711L || _MSC_VER >= 1700 // C++11 or VS2012
-#include <chrono>
-#endif
-
-
-// A lock-free queue for a single-consumer, single-producer architecture.
-// The queue is also wait-free in the common path (except if more memory
-// needs to be allocated, in which case malloc is called).
-// Allocates memory sparingly (O(lg(n) times, amortized), and only once if
-// the original maximum size estimate is never exceeded.
-// Tested on x86/x64 processors, but semantics should be correct for all
-// architectures (given the right implementations in atomicops.h), provided
-// that aligned integer and pointer accesses are naturally atomic.
-// Note that there should only be one consumer thread and producer thread;
-// Switching roles of the threads, or using multiple consecutive threads for
-// one role, is not safe unless properly synchronized.
-// Using the queue exclusively from one thread is fine, though a bit silly.
-
-#ifndef MOODYCAMEL_CACHE_LINE_SIZE
-#define MOODYCAMEL_CACHE_LINE_SIZE 64
-#endif
-
-#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
-#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__))
-#define MOODYCAMEL_EXCEPTIONS_ENABLED
-#endif
-#endif
-
-#ifndef MOODYCAMEL_HAS_EMPLACE
-#if !defined(_MSC_VER) || _MSC_VER >= 1800 // variadic templates: either a non-MS compiler or VS >= 2013
-#define MOODYCAMEL_HAS_EMPLACE    1
-#endif
-#endif
-
-#ifdef AE_VCPP
-#pragma warning(push)
-#pragma warning(disable: 4324)	// structure was padded due to __declspec(align())
-#pragma warning(disable: 4820)	// padding was added
-#pragma warning(disable: 4127)	// conditional expression is constant
-#endif
-
-namespace moodycamel {
-
-template<typename T, size_t MAX_BLOCK_SIZE = 512>
-class ReaderWriterQueue
-{
-	// Design: Based on a queue-of-queues. The low-level queues are just
-	// circular buffers with front and tail indices indicating where the
-	// next element to dequeue is and where the next element can be enqueued,
-	// respectively. Each low-level queue is called a "block". Each block
-	// wastes exactly one element's worth of space to keep the design simple
-	// (if front == tail then the queue is empty, and can't be full).
-	// The high-level queue is a circular linked list of blocks; again there
-	// is a front and tail, but this time they are pointers to the blocks.
-	// The front block is where the next element to be dequeued is, provided
-	// the block is not empty. The back block is where elements are to be
-	// enqueued, provided the block is not full.
-	// The producer thread owns all the tail indices/pointers. The consumer
-	// thread owns all the front indices/pointers. Both threads read each
-	// other's variables, but only the owning thread updates them. E.g. After
-	// the consumer reads the producer's tail, the tail may change before the
-	// consumer is done dequeuing an object, but the consumer knows the tail
-	// will never go backwards, only forwards.
-	// If there is no room to enqueue an object, an additional block (of
-	// equal size to the last block) is added. Blocks are never removed.
-
-public:
-	typedef T value_type;
-
-	// Constructs a queue that can hold maxSize elements without further
-	// allocations. If more than MAX_BLOCK_SIZE elements are requested,
-	// then several blocks of MAX_BLOCK_SIZE each are reserved (including
-	// at least one extra buffer block).
-	AE_NO_TSAN explicit ReaderWriterQueue(size_t maxSize = 15)
-#ifndef NDEBUG
-		: enqueuing(false)
-		,dequeuing(false)
-#endif
-	{
-		assert(maxSize > 0);
-		assert(MAX_BLOCK_SIZE == ceilToPow2(MAX_BLOCK_SIZE) && "MAX_BLOCK_SIZE must be a power of 2");
-		assert(MAX_BLOCK_SIZE >= 2 && "MAX_BLOCK_SIZE must be at least 2");
-		
-		Block* firstBlock = nullptr;
-		
-		largestBlockSize = ceilToPow2(maxSize + 1);		// We need a spare slot to fit maxSize elements in the block
-		if (largestBlockSize > MAX_BLOCK_SIZE * 2) {
-			// We need a spare block in case the producer is writing to a different block the consumer is reading from, and
-			// wants to enqueue the maximum number of elements. We also need a spare element in each block to avoid the ambiguity
-			// between front == tail meaning "empty" and "full".
-			// So the effective number of slots that are guaranteed to be usable at any time is the block size - 1 times the
-			// number of blocks - 1. Solving for maxSize and applying a ceiling to the division gives us (after simplifying):
-			size_t initialBlockCount = (maxSize + MAX_BLOCK_SIZE * 2 - 3) / (MAX_BLOCK_SIZE - 1);
-			largestBlockSize = MAX_BLOCK_SIZE;
-			Block* lastBlock = nullptr;
-			for (size_t i = 0; i != initialBlockCount; ++i) {
-				auto block = make_block(largestBlockSize);
-				if (block == nullptr) {
-#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
-					throw std::bad_alloc();
-#else
-					abort();
-#endif
-				}
-				if (firstBlock == nullptr) {
-					firstBlock = block;
-				}
-				else {
-					lastBlock->next = block;
-				}
-				lastBlock = block;
-				block->next = firstBlock;
-			}
-		}
-		else {
-			firstBlock = make_block(largestBlockSize);
-			if (firstBlock == nullptr) {
-#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
-				throw std::bad_alloc();
-#else
-				abort();
-#endif
-			}
-			firstBlock->next = firstBlock;
-		}
-		frontBlock = firstBlock;
-		tailBlock = firstBlock;
-		
-		// Make sure the reader/writer threads will have the initialized memory setup above:
-		fence(memory_order_sync);
-	}
-
-	// Note: The queue should not be accessed concurrently while it's
-	// being moved. It's up to the user to synchronize this.
-	AE_NO_TSAN ReaderWriterQueue(ReaderWriterQueue&& other)
-		: frontBlock(other.frontBlock.load()),
-		tailBlock(other.tailBlock.load()),
-		largestBlockSize(other.largestBlockSize)
-#ifndef NDEBUG
-		,enqueuing(false)
-		,dequeuing(false)
-#endif
-	{
-		other.largestBlockSize = 32;
-		Block* b = other.make_block(other.largestBlockSize);
-		if (b == nullptr) {
-#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
-			throw std::bad_alloc();
-#else
-			abort();
-#endif
-		}
-		b->next = b;
-		other.frontBlock = b;
-		other.tailBlock = b;
-	}
-
-	// Note: The queue should not be accessed concurrently while it's
-	// being moved. It's up to the user to synchronize this.
-	ReaderWriterQueue& operator=(ReaderWriterQueue&& other) AE_NO_TSAN
-	{
-		Block* b = frontBlock.load();
-		frontBlock = other.frontBlock.load();
-		other.frontBlock = b;
-		b = tailBlock.load();
-		tailBlock = other.tailBlock.load();
-		other.tailBlock = b;
-		std::swap(largestBlockSize, other.largestBlockSize);
-		return *this;
-	}
-
-	// Note: The queue should not be accessed concurrently while it's
-	// being deleted. It's up to the user to synchronize this.
-	AE_NO_TSAN ~ReaderWriterQueue()
-	{
-		// Make sure we get the latest version of all variables from other CPUs:
-		fence(memory_order_sync);
-
-		// Destroy any remaining objects in queue and free memory
-		Block* frontBlock_ = frontBlock;
-		Block* block = frontBlock_;
-		do {
-			Block* nextBlock = block->next;
-			size_t blockFront = block->front;
-			size_t blockTail = block->tail;
-
-			for (size_t i = blockFront; i != blockTail; i = (i + 1) & block->sizeMask) {
-				auto element = reinterpret_cast<T*>(block->data + i * sizeof(T));
-				element->~T();
-				(void)element;
-			}
-			
-			auto rawBlock = block->rawThis;
-			block->~Block();
-			std::free(rawBlock);
-			block = nextBlock;
-		} while (block != frontBlock_);
-	}
-
-
-	// Enqueues a copy of element if there is room in the queue.
-	// Returns true if the element was enqueued, false otherwise.
-	// Does not allocate memory.
-	AE_FORCEINLINE bool try_enqueue(T const& element) AE_NO_TSAN
-	{
-		return inner_enqueue<CannotAlloc>(element);
-	}
-
-	// Enqueues a moved copy of element if there is room in the queue.
-	// Returns true if the element was enqueued, false otherwise.
-	// Does not allocate memory.
-	AE_FORCEINLINE bool try_enqueue(T&& element) AE_NO_TSAN
-	{
-		return inner_enqueue<CannotAlloc>(std::forward<T>(element));
-	}
-
-#if MOODYCAMEL_HAS_EMPLACE
-	// Like try_enqueue() but with emplace semantics (i.e. construct-in-place).
-	template<typename... Args>
-	AE_FORCEINLINE bool try_emplace(Args&&... args) AE_NO_TSAN
-	{
-		return inner_enqueue<CannotAlloc>(std::forward<Args>(args)...);
-	}
-#endif
-
-	// Enqueues a copy of element on the queue.
-	// Allocates an additional block of memory if needed.
-	// Only fails (returns false) if memory allocation fails.
-	AE_FORCEINLINE bool enqueue(T const& element) AE_NO_TSAN
-	{
-		return inner_enqueue<CanAlloc>(element);
-	}
-
-	// Enqueues a moved copy of element on the queue.
-	// Allocates an additional block of memory if needed.
-	// Only fails (returns false) if memory allocation fails.
-	AE_FORCEINLINE bool enqueue(T&& element) AE_NO_TSAN
-	{
-		return inner_enqueue<CanAlloc>(std::forward<T>(element));
-	}
-
-#if MOODYCAMEL_HAS_EMPLACE
-	// Like enqueue() but with emplace semantics (i.e. construct-in-place).
-	template<typename... Args>
-	AE_FORCEINLINE bool emplace(Args&&... args) AE_NO_TSAN
-	{
-		return inner_enqueue<CanAlloc>(std::forward<Args>(args)...);
-	}
-#endif
-
-	// Attempts to dequeue an element; if the queue is empty,
-	// returns false instead. If the queue has at least one element,
-	// moves front to result using operator=, then returns true.
-	template<typename U>
-	bool try_dequeue(U& result) AE_NO_TSAN
-	{
-#ifndef NDEBUG
-		ReentrantGuard guard(this->dequeuing);
-#endif
-
-		// High-level pseudocode:
-		// Remember where the tail block is
-		// If the front block has an element in it, dequeue it
-		// Else
-		//     If front block was the tail block when we entered the function, return false
-		//     Else advance to next block and dequeue the item there
-
-		// Note that we have to use the value of the tail block from before we check if the front
-		// block is full or not, in case the front block is empty and then, before we check if the
-		// tail block is at the front block or not, the producer fills up the front block *and
-		// moves on*, which would make us skip a filled block. Seems unlikely, but was consistently
-		// reproducible in practice.
-		// In order to avoid overhead in the common case, though, we do a double-checked pattern
-		// where we have the fast path if the front block is not empty, then read the tail block,
-		// then re-read the front block and check if it's not empty again, then check if the tail
-		// block has advanced.
-		
-		Block* frontBlock_ = frontBlock.load();
-		size_t blockTail = frontBlock_->localTail;
-		size_t blockFront = frontBlock_->front.load();
-		
-		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
-			fence(memory_order_acquire);
-			
-		non_empty_front_block:
-			// Front block not empty, dequeue from here
-			auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
-			result = std::move(*element);
-			element->~T();
-
-			blockFront = (blockFront + 1) & frontBlock_->sizeMask;
-
-			fence(memory_order_release);
-			frontBlock_->front = blockFront;
-		}
-		else if (frontBlock_ != tailBlock.load()) {
-			fence(memory_order_acquire);
-
-			frontBlock_ = frontBlock.load();
-			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
-			blockFront = frontBlock_->front.load();
-			fence(memory_order_acquire);
-			
-			if (blockFront != blockTail) {
-				// Oh look, the front block isn't empty after all
-				goto non_empty_front_block;
-			}
-			
-			// Front block is empty but there's another block ahead, advance to it
-			Block* nextBlock = frontBlock_->next;
-			// Don't need an acquire fence here since next can only ever be set on the tailBlock,
-			// and we're not the tailBlock, and we did an acquire earlier after reading tailBlock which
-			// ensures next is up-to-date on this CPU in case we recently were at tailBlock.
-
-			size_t nextBlockFront = nextBlock->front.load();
-			size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load();
-			fence(memory_order_acquire);
-
-			// Since the tailBlock is only ever advanced after being written to,
-			// we know there's for sure an element to dequeue on it
-			assert(nextBlockFront != nextBlockTail);
-			AE_UNUSED(nextBlockTail);
-
-			// We're done with this block, let the producer use it if it needs
-			fence(memory_order_release);		// Expose possibly pending changes to frontBlock->front from last dequeue
-			frontBlock = frontBlock_ = nextBlock;
-
-			compiler_fence(memory_order_release);	// Not strictly needed
-
-			auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
-			
-			result = std::move(*element);
-			element->~T();
-
-			nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask;
-			
-			fence(memory_order_release);
-			frontBlock_->front = nextBlockFront;
-		}
-		else {
-			// No elements in current block and no other block to advance to
-			return false;
-		}
-
-		return true;
-	}
-
-
-	// Returns a pointer to the front element in the queue (the one that
-	// would be removed next by a call to `try_dequeue` or `pop`). If the
-	// queue appears empty at the time the method is called, nullptr is
-	// returned instead.
-	// Must be called only from the consumer thread.
-	T* peek() AE_NO_TSAN
-	{
-#ifndef NDEBUG
-		ReentrantGuard guard(this->dequeuing);
-#endif
-		// See try_dequeue() for reasoning
-
-		Block* frontBlock_ = frontBlock.load();
-		size_t blockTail = frontBlock_->localTail;
-		size_t blockFront = frontBlock_->front.load();
-		
-		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
-			fence(memory_order_acquire);
-		non_empty_front_block:
-			return reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
-		}
-		else if (frontBlock_ != tailBlock.load()) {
-			fence(memory_order_acquire);
-			frontBlock_ = frontBlock.load();
-			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
-			blockFront = frontBlock_->front.load();
-			fence(memory_order_acquire);
-			
-			if (blockFront != blockTail) {
-				goto non_empty_front_block;
-			}
-			
-			Block* nextBlock = frontBlock_->next;
-			
-			size_t nextBlockFront = nextBlock->front.load();
-			fence(memory_order_acquire);
-
-			assert(nextBlockFront != nextBlock->tail.load());
-			return reinterpret_cast<T*>(nextBlock->data + nextBlockFront * sizeof(T));
-		}
-		
-		return nullptr;
-	}
-	
-	// Removes the front element from the queue, if any, without returning it.
-	// Returns true on success, or false if the queue appeared empty at the time
-	// `pop` was called.
-	bool pop() AE_NO_TSAN
-	{
-#ifndef NDEBUG
-		ReentrantGuard guard(this->dequeuing);
-#endif
-		// See try_dequeue() for reasoning
-		
-		Block* frontBlock_ = frontBlock.load();
-		size_t blockTail = frontBlock_->localTail;
-		size_t blockFront = frontBlock_->front.load();
-		
-		if (blockFront != blockTail || blockFront != (frontBlock_->localTail = frontBlock_->tail.load())) {
-			fence(memory_order_acquire);
-			
-		non_empty_front_block:
-			auto element = reinterpret_cast<T*>(frontBlock_->data + blockFront * sizeof(T));
-			element->~T();
-
-			blockFront = (blockFront + 1) & frontBlock_->sizeMask;
-
-			fence(memory_order_release);
-			frontBlock_->front = blockFront;
-		}
-		else if (frontBlock_ != tailBlock.load()) {
-			fence(memory_order_acquire);
-			frontBlock_ = frontBlock.load();
-			blockTail = frontBlock_->localTail = frontBlock_->tail.load();
-			blockFront = frontBlock_->front.load();
-			fence(memory_order_acquire);
-			
-			if (blockFront != blockTail) {
-				goto non_empty_front_block;
-			}
-			
-			// Front block is empty but there's another block ahead, advance to it
-			Block* nextBlock = frontBlock_->next;
-			
-			size_t nextBlockFront = nextBlock->front.load();
-			size_t nextBlockTail = nextBlock->localTail = nextBlock->tail.load();
-			fence(memory_order_acquire);
-
-			assert(nextBlockFront != nextBlockTail);
-			AE_UNUSED(nextBlockTail);
-
-			fence(memory_order_release);
-			frontBlock = frontBlock_ = nextBlock;
-
-			compiler_fence(memory_order_release);
-
-			auto element = reinterpret_cast<T*>(frontBlock_->data + nextBlockFront * sizeof(T));
-			element->~T();
-
-			nextBlockFront = (nextBlockFront + 1) & frontBlock_->sizeMask;
-			
-			fence(memory_order_release);
-			frontBlock_->front = nextBlockFront;
-		}
-		else {
-			// No elements in current block and no other block to advance to
-			return false;
-		}
-
-		return true;
-	}
-	
-	// Returns the approximate number of items currently in the queue.
-	// Safe to call from both the producer and consumer threads.
-	inline size_t size_approx() const AE_NO_TSAN
-	{
-		size_t result = 0;
-		Block* frontBlock_ = frontBlock.load();
-		Block* block = frontBlock_;
-		do {
-			fence(memory_order_acquire);
-			size_t blockFront = block->front.load();
-			size_t blockTail = block->tail.load();
-			result += (blockTail - blockFront) & block->sizeMask;
-			block = block->next.load();
-		} while (block != frontBlock_);
-		return result;
-	}
-
-
-private:
-	enum AllocationMode { CanAlloc, CannotAlloc };
-
-#if MOODYCAMEL_HAS_EMPLACE
-	template<AllocationMode canAlloc, typename... Args>
-	bool inner_enqueue(Args&&... args) AE_NO_TSAN
-#else
-	template<AllocationMode canAlloc, typename U>
-	bool inner_enqueue(U&& element) AE_NO_TSAN
-#endif
-	{
-#ifndef NDEBUG
-		ReentrantGuard guard(this->enqueuing);
-#endif
-
-		// High-level pseudocode (assuming we're allowed to alloc a new block):
-		// If room in tail block, add to tail
-		// Else check next block
-		//     If next block is not the head block, enqueue on next block
-		//     Else create a new block and enqueue there
-		//     Advance tail to the block we just enqueued to
-
-		Block* tailBlock_ = tailBlock.load();
-		size_t blockFront = tailBlock_->localFront;
-		size_t blockTail = tailBlock_->tail.load();
-
-		size_t nextBlockTail = (blockTail + 1) & tailBlock_->sizeMask;
-		if (nextBlockTail != blockFront || nextBlockTail != (tailBlock_->localFront = tailBlock_->front.load())) {
-			fence(memory_order_acquire);
-			// This block has room for at least one more element
-			char* location = tailBlock_->data + blockTail * sizeof(T);
-#if MOODYCAMEL_HAS_EMPLACE
-			new (location) T(std::forward<Args>(args)...);
-#else
-			new (location) T(std::forward<U>(element));
-#endif
-
-			fence(memory_order_release);
-			tailBlock_->tail = nextBlockTail;
-		}
-		else {
-			fence(memory_order_acquire);
-			if (tailBlock_->next.load() != frontBlock) {
-				// Note that the reason we can't advance to the frontBlock and start adding new entries there
-				// is because if we did, then dequeue would stay in that block, eventually reading the new values,
-				// instead of advancing to the next full block (whose values were enqueued first and so should be
-				// consumed first).
-
-				fence(memory_order_acquire);		// Ensure we get latest writes if we got the latest frontBlock
-
-				// tailBlock is full, but there's a free block ahead, use it
-				Block* tailBlockNext = tailBlock_->next.load();
-				size_t nextBlockFront = tailBlockNext->localFront = tailBlockNext->front.load();
-				nextBlockTail = tailBlockNext->tail.load();
-				fence(memory_order_acquire);
-
-				// This block must be empty since it's not the head block and we
-				// go through the blocks in a circle
-				assert(nextBlockFront == nextBlockTail);
-				tailBlockNext->localFront = nextBlockFront;
-
-				char* location = tailBlockNext->data + nextBlockTail * sizeof(T);
-#if MOODYCAMEL_HAS_EMPLACE
-				new (location) T(std::forward<Args>(args)...);
-#else
-				new (location) T(std::forward<U>(element));
-#endif
-
-				tailBlockNext->tail = (nextBlockTail + 1) & tailBlockNext->sizeMask;
-
-				fence(memory_order_release);
-				tailBlock = tailBlockNext;
-			}
-			else if (canAlloc == CanAlloc) {
-				// tailBlock is full and there's no free block ahead; create a new block
-				auto newBlockSize = largestBlockSize >= MAX_BLOCK_SIZE ? largestBlockSize : largestBlockSize * 2;
-				auto newBlock = make_block(newBlockSize);
-				if (newBlock == nullptr) {
-					// Could not allocate a block!
-					return false;
-				}
-				largestBlockSize = newBlockSize;
-
-#if MOODYCAMEL_HAS_EMPLACE
-				new (newBlock->data) T(std::forward<Args>(args)...);
-#else
-				new (newBlock->data) T(std::forward<U>(element));
-#endif
-				assert(newBlock->front == 0);
-				newBlock->tail = newBlock->localTail = 1;
-
-				newBlock->next = tailBlock_->next.load();
-				tailBlock_->next = newBlock;
-
-				// Might be possible for the dequeue thread to see the new tailBlock->next
-				// *without* seeing the new tailBlock value, but this is OK since it can't
-				// advance to the next block until tailBlock is set anyway (because the only
-				// case where it could try to read the next is if it's already at the tailBlock,
-				// and it won't advance past tailBlock in any circumstance).
-
-				fence(memory_order_release);
-				tailBlock = newBlock;
-			}
-			else if (canAlloc == CannotAlloc) {
-				// Would have had to allocate a new block to enqueue, but not allowed
-				return false;
-			}
-			else {
-				assert(false && "Should be unreachable code");
-				return false;
-			}
-		}
-
-		return true;
-	}
-
-
-	// Disable copying
-	ReaderWriterQueue(ReaderWriterQueue const&) {  }
-
-	// Disable assignment
-	ReaderWriterQueue& operator=(ReaderWriterQueue const&) {  }
-
-
-
-	AE_FORCEINLINE static size_t ceilToPow2(size_t x)
-	{
-		// From http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
-		--x;
-		x |= x >> 1;
-		x |= x >> 2;
-		x |= x >> 4;
-		for (size_t i = 1; i < sizeof(size_t); i <<= 1) {
-			x |= x >> (i << 3);
-		}
-		++x;
-		return x;
-	}
-	
-	template<typename U>
-	static AE_FORCEINLINE char* align_for(char* ptr) AE_NO_TSAN
-	{
-		const std::size_t alignment = std::alignment_of<U>::value;
-		return ptr + (alignment - (reinterpret_cast<std::uintptr_t>(ptr) % alignment)) % alignment;
-	}
-private:
-#ifndef NDEBUG
-	struct ReentrantGuard
-	{
-		AE_NO_TSAN ReentrantGuard(bool& _inSection)
-			: inSection(_inSection)
-		{
-			assert(!inSection && "Concurrent (or re-entrant) enqueue or dequeue operation detected (only one thread at a time may hold the producer or consumer role)");
-			inSection = true;
-		}
-
-		AE_NO_TSAN ~ReentrantGuard() { inSection = false; }
-
-	private:
-		ReentrantGuard& operator=(ReentrantGuard const&);
-
-	private:
-		bool& inSection;
-	};
-#endif
-
-	struct Block
-	{
-		// Avoid false-sharing by putting highly contended variables on their own cache lines
-		weak_atomic<size_t> front;	// (Atomic) Elements are read from here
-		size_t localTail;			// An uncontended shadow copy of tail, owned by the consumer
-		
-		char cachelineFiller0[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<size_t>) - sizeof(size_t)];
-		weak_atomic<size_t> tail;	// (Atomic) Elements are enqueued here
-		size_t localFront;
-		
-		char cachelineFiller1[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<size_t>) - sizeof(size_t)];	// next isn't very contended, but we don't want it on the same cache line as tail (which is)
-		weak_atomic<Block*> next;	// (Atomic)
-		
-		char* data;		// Contents (on heap) are aligned to T's alignment
-
-		const size_t sizeMask;
-
-
-		// size must be a power of two (and greater than 0)
-		AE_NO_TSAN Block(size_t const& _size, char* _rawThis, char* _data)
-			: front(0), localTail(0), tail(0), localFront(0), next(nullptr), data(_data), sizeMask(_size - 1), rawThis(_rawThis)
-		{
-		}
-
-	private:
-		// C4512 - Assignment operator could not be generated
-		Block& operator=(Block const&);
-
-	public:
-		char* rawThis;
-	};
-	
-	
-	static Block* make_block(size_t capacity) AE_NO_TSAN
-	{
-		// Allocate enough memory for the block itself, as well as all the elements it will contain
-		auto size = sizeof(Block) + std::alignment_of<Block>::value - 1;
-		size += sizeof(T) * capacity + std::alignment_of<T>::value - 1;
-		auto newBlockRaw = static_cast<char*>(std::malloc(size));
-		if (newBlockRaw == nullptr) {
-			return nullptr;
-		}
-		
-		auto newBlockAligned = align_for<Block>(newBlockRaw);
-		auto newBlockData = align_for<T>(newBlockAligned + sizeof(Block));
-		return new (newBlockAligned) Block(capacity, newBlockRaw, newBlockData);
-	}
-
-private:
-	weak_atomic<Block*> frontBlock;		// (Atomic) Elements are enqueued to this block
-	
-	char cachelineFiller[MOODYCAMEL_CACHE_LINE_SIZE - sizeof(weak_atomic<Block*>)];
-	weak_atomic<Block*> tailBlock;		// (Atomic) Elements are dequeued from this block
-
-	size_t largestBlockSize;
-
-#ifndef NDEBUG
-	bool enqueuing;
-	bool dequeuing;
-#endif
-};
-
-// Like ReaderWriterQueue, but also providees blocking operations
-template<typename T, size_t MAX_BLOCK_SIZE = 512>
-class BlockingReaderWriterQueue
-{
-private:
-	typedef ::moodycamel::ReaderWriterQueue<T, MAX_BLOCK_SIZE> ReaderWriterQueue;
-	
-public:
-	explicit BlockingReaderWriterQueue(size_t maxSize = 15) AE_NO_TSAN
-		: inner(maxSize), sema(new spsc_sema::LightweightSemaphore())
-	{ }
-
-	BlockingReaderWriterQueue(BlockingReaderWriterQueue&& other) AE_NO_TSAN
-		: inner(std::move(other.inner)), sema(std::move(other.sema))
-	{ }
-
-	BlockingReaderWriterQueue& operator=(BlockingReaderWriterQueue&& other) AE_NO_TSAN
-	{
-		std::swap(sema, other.sema);
-		std::swap(inner, other.inner);
-		return *this;
-	}
-
-
-	// Enqueues a copy of element if there is room in the queue.
-	// Returns true if the element was enqueued, false otherwise.
-	// Does not allocate memory.
-	AE_FORCEINLINE bool try_enqueue(T const& element) AE_NO_TSAN
-	{
-		if (inner.try_enqueue(element)) {
-			sema->signal();
-			return true;
-		}
-		return false;
-	}
-
-	// Enqueues a moved copy of element if there is room in the queue.
-	// Returns true if the element was enqueued, false otherwise.
-	// Does not allocate memory.
-	AE_FORCEINLINE bool try_enqueue(T&& element) AE_NO_TSAN
-	{
-		if (inner.try_enqueue(std::forward<T>(element))) {
-			sema->signal();
-			return true;
-		}
-		return false;
-	}
-
-
-	// Enqueues a copy of element on the queue.
-	// Allocates an additional block of memory if needed.
-	// Only fails (returns false) if memory allocation fails.
-	AE_FORCEINLINE bool enqueue(T const& element) AE_NO_TSAN
-	{
-		if (inner.enqueue(element)) {
-			sema->signal();
-			return true;
-		}
-		return false;
-	}
-
-	// Enqueues a moved copy of element on the queue.
-	// Allocates an additional block of memory if needed.
-	// Only fails (returns false) if memory allocation fails.
-	AE_FORCEINLINE bool enqueue(T&& element) AE_NO_TSAN
-	{
-		if (inner.enqueue(std::forward<T>(element))) {
-			sema->signal();
-			return true;
-		}
-		return false;
-	}
-
-
-	// Attempts to dequeue an element; if the queue is empty,
-	// returns false instead. If the queue has at least one element,
-	// moves front to result using operator=, then returns true.
-	template<typename U>
-	bool try_dequeue(U& result) AE_NO_TSAN
-	{
-		if (sema->tryWait()) {
-			bool success = inner.try_dequeue(result);
-			assert(success);
-			AE_UNUSED(success);
-			return true;
-		}
-		return false;
-	}
-	
-	
-	// Attempts to dequeue an element; if the queue is empty,
-	// waits until an element is available, then dequeues it.
-	template<typename U>
-	void wait_dequeue(U& result) AE_NO_TSAN
-	{
-		sema->wait();
-		bool success = inner.try_dequeue(result);
-		AE_UNUSED(result);
-		assert(success);
-		AE_UNUSED(success);
-	}
-
-
-	// Attempts to dequeue an element; if the queue is empty,
-	// waits until an element is available up to the specified timeout,
-	// then dequeues it and returns true, or returns false if the timeout
-	// expires before an element can be dequeued.
-	// Using a negative timeout indicates an indefinite timeout,
-	// and is thus functionally equivalent to calling wait_dequeue.
-	template<typename U>
-	bool wait_dequeue_timed(U& result, std::int64_t timeout_usecs) AE_NO_TSAN
-	{
-		if (!sema->wait(timeout_usecs)) {
-			return false;
-		}
-		bool success = inner.try_dequeue(result);
-		AE_UNUSED(result);
-		assert(success);
-		AE_UNUSED(success);
-		return true;
-	}
-
-
-#if __cplusplus > 199711L || _MSC_VER >= 1700
-	// Attempts to dequeue an element; if the queue is empty,
-	// waits until an element is available up to the specified timeout,
-	// then dequeues it and returns true, or returns false if the timeout
-	// expires before an element can be dequeued.
-	// Using a negative timeout indicates an indefinite timeout,
-	// and is thus functionally equivalent to calling wait_dequeue.
-	template<typename U, typename Rep, typename Period>
-	inline bool wait_dequeue_timed(U& result, std::chrono::duration<Rep, Period> const& timeout) AE_NO_TSAN
-	{
-        return wait_dequeue_timed(result, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
-	}
-#endif
-
-
-	// Returns a pointer to the front element in the queue (the one that
-	// would be removed next by a call to `try_dequeue` or `pop`). If the
-	// queue appears empty at the time the method is called, nullptr is
-	// returned instead.
-	// Must be called only from the consumer thread.
-	AE_FORCEINLINE T* peek() AE_NO_TSAN
-	{
-		return inner.peek();
-	}
-	
-	// Removes the front element from the queue, if any, without returning it.
-	// Returns true on success, or false if the queue appeared empty at the time
-	// `pop` was called.
-	AE_FORCEINLINE bool pop() AE_NO_TSAN
-	{
-		if (sema->tryWait()) {
-			bool result = inner.pop();
-			assert(result);
-			AE_UNUSED(result);
-			return true;
-		}
-		return false;
-	}
-	
-	// Returns the approximate number of items currently in the queue.
-	// Safe to call from both the producer and consumer threads.
-	AE_FORCEINLINE size_t size_approx() const AE_NO_TSAN
-	{
-		return sema->availableApprox();
-	}
-
-
-private:
-	// Disable copying & assignment
-	BlockingReaderWriterQueue(BlockingReaderWriterQueue const&) {  }
-	BlockingReaderWriterQueue& operator=(BlockingReaderWriterQueue const&) {  }
-	
-private:
-	ReaderWriterQueue inner;
-	std::unique_ptr<spsc_sema::LightweightSemaphore> sema;
-};
-
-}    // end namespace moodycamel
-
-#ifdef AE_VCPP
-#pragma warning(pop)
-#endif
diff --git a/include/zmq.h b/include/zmq.h
index edf28efd2b..d0174c5a1a 100644
--- a/include/zmq.h
+++ b/include/zmq.h
@@ -268,6 +268,8 @@ typedef void(zmq_free_fn) (void *data_, void *hint_);
 
 ZMQ_EXPORT int zmq_msg_init (zmq_msg_t *msg_);
 ZMQ_EXPORT int zmq_msg_init_size (zmq_msg_t *msg_, size_t size_);
+ZMQ_EXPORT int
+zmq_msg_init_allocator (zmq_msg_t *msg_, size_t size_, void *allocator_);
 ZMQ_EXPORT int zmq_msg_init_data (
   zmq_msg_t *msg_, void *data_, size_t size_, zmq_free_fn *ffn_, void *hint_);
 ZMQ_EXPORT int zmq_msg_send (zmq_msg_t *msg_, void *s_, int flags_);
@@ -669,6 +671,7 @@ ZMQ_EXPORT void zmq_threadclose (void *thread_);
 
 /*  DRAFT Context options                                                     */
 #define ZMQ_ZERO_COPY_RECV 10
+//#define ZMQ_MSG_ALLOCATOR 11
 
 /*  DRAFT Context methods.                                                    */
 ZMQ_EXPORT int zmq_ctx_set_ext (void *context_,
@@ -680,6 +683,17 @@ ZMQ_EXPORT int zmq_ctx_get_ext (void *context_,
                                 void *optval_,
                                 size_t *optvallen_);
 
+/* ZMQ-provided message-pool implementations.                                 */
+// default allocator using malloc/free
+#define ZMQ_MSG_ALLOCATOR_DEFAULT 0
+// internally uses an SPSC queue (may not be usable with inproc?), or perhaps an MPMC queue anyway
+#define ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL 1
+// internally uses an MPMC queue
+#define ZMQ_MSG_ALLOCATOR_GLOBAL_POOL 2
+
+ZMQ_EXPORT void *zmq_msg_allocator_new (int type_);
+ZMQ_EXPORT int zmq_msg_allocator_destroy (void **allocator_);
+
 /*  DRAFT Socket methods.                                                     */
 ZMQ_EXPORT int zmq_join (void *s, const char *group);
 ZMQ_EXPORT int zmq_leave (void *s, const char *group);
diff --git a/perf/remote_thr.cpp b/perf/remote_thr.cpp
index 6969e8c25f..3f47234622 100644
--- a/perf/remote_thr.cpp
+++ b/perf/remote_thr.cpp
@@ -27,97 +27,17 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+#include "../src/platform.hpp"
 #include "../include/zmq.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <assert.h>
-
-#include "readerwriterqueue.h"
 
 // keys are arbitrary but must match local_lat.cpp
 const char server_pubkey[] = "DX4nh=yUn{-9ugra0X3Src4SU-4xTgqxcYY.+<SH";
 const char client_pubkey[] = "<n^oA}I:66W+*ds3tAmi1+KJzv-}k&fC2aA5Bj0K";
 const char client_prvkey[] = "9R9bV}[6z6DC-%$!jTVTKvWc=LEL{4i4gzUe$@Zx";
 
-#define SIZE_OF_CONTENT_T_USED_BY_ZMQ_VLM (40)
-#define MAX_ACTIVE_MESSAGES (8192)
-#define MSG_BLOCK_SIZE (256)
-
-#define MAX_MESSAGE_SIZE                                                       \
-    (MSG_BLOCK_SIZE - SIZE_OF_CONTENT_T_USED_BY_ZMQ_VLM - 1 /* canary */)
-
-typedef struct
-{
-    uint8_t content_block
-      [SIZE_OF_CONTENT_T_USED_BY_ZMQ_VLM]; // will be used by ZMQ internally
-    uint8_t raw_data[MAX_MESSAGE_SIZE];
-    uint8_t canary;
-} msg_block_t;
-
-
-class ZmqMessagePool
-{
-  public:
-    ZmqMessagePool ()
-    {
-        // enqueue all available blocks in the free list:
-        for (int i = 0; i < MAX_ACTIVE_MESSAGES; i++) {
-            m_storage[i].canary = 0xAB;
-            m_free_list.enqueue (&m_storage[i]);
-        }
-    }
-    ~ZmqMessagePool () {}
-
-
-    bool allocate_msg (zmq_msg_t *out,
-                       size_t len) // consumer thread: user app thread
-    {
-        assert (len < MAX_MESSAGE_SIZE);
-
-        // consume 1 block from the list of free msg blocks
-        msg_block_t *next_avail = nullptr;
-        if (!m_free_list.try_dequeue (next_avail)) {
-            assert (0); // I want to find out if this ever happens
-            return false;
-        }
-
-        assert (next_avail);
-        int rc = zmq_msg_init_data (
-          out, next_avail, len + SIZE_OF_CONTENT_T_USED_BY_ZMQ_VLM,
-          (zmq_free_fn *) ZmqMessagePool::deallocate_msg, this);
-        assert (rc == 0);
-
-        assert (zmq_msg_size (out) == len);
-        assert (zmq_msg_data (out) == next_avail->raw_data);
-
-        return true;
-    }
-
-    static void
-    deallocate_msg (void *data_,
-                    void *hint_) // producer thread: ZMQ background IO thread
-    {
-        ZmqMessagePool *pPool = reinterpret_cast<ZmqMessagePool *> (hint_);
-
-        // recover the beginning of this msg_block:
-        uint8_t *data_ptr_ = (uint8_t *) data_;
-        msg_block_t *to_return =
-          (msg_block_t *) (data_ptr_ - SIZE_OF_CONTENT_T_USED_BY_ZMQ_VLM);
-        assert (to_return->canary == 0xAB);
-
-        // produce a new free msg block:
-        pPool->m_free_list.enqueue (to_return);
-    }
-
-    size_t size () const { return m_free_list.size_approx (); }
-
-  private:
-    msg_block_t m_storage[MAX_ACTIVE_MESSAGES];
-    moodycamel::ReaderWriterQueue<msg_block_t *> m_free_list;
-};
-
-
 int main (int argc, char *argv[])
 {
     const char *connect_to;
@@ -148,6 +68,11 @@ int main (int argc, char *argv[])
         return -1;
     }
 
+#ifdef ZMQ_BUILD_DRAFT_API
+    // EXPERIMENTAL ALLOCATOR FOR MSG_T
+    void *allocator = zmq_msg_allocator_new (ZMQ_MSG_ALLOCATOR_GLOBAL_POOL);
+#endif
+
     s = zmq_socket (ctx, ZMQ_PUSH);
     if (!s) {
         printf ("error in zmq_socket: %s\n", zmq_strerror (errno));
@@ -185,9 +110,12 @@ int main (int argc, char *argv[])
         return -1;
     }
 
-#if 0
     for (i = 0; i != message_count; i++) {
+#ifdef ZMQ_BUILD_DRAFT_API
+        rc = zmq_msg_init_allocator (&msg, message_size, allocator);
+#else
         rc = zmq_msg_init_size (&msg, message_size);
+#endif
         if (rc != 0) {
             printf ("error in zmq_msg_init_size: %s\n", zmq_strerror (errno));
             return -1;
@@ -203,32 +131,6 @@ int main (int argc, char *argv[])
             return -1;
         }
     }
-#else
-    printf ("msg block size: %zu; max msg size: %d\n", sizeof (msg_block_t),
-            MAX_MESSAGE_SIZE);
-    ZmqMessagePool pool;
-    for (i = 0; i != message_count; i++) {
-        pool.allocate_msg (&msg, message_size);
-
-        // to be fair when comparing the results generated by the other #if/#endif branch
-        // avoid any kind of initialization of message memory:
-        //memset (zmq_msg_data (&msg), message_size, 0xAB);
-
-        rc = zmq_sendmsg (s, &msg, 0);
-        if (rc < 0) {
-            printf ("error in zmq_sendmsg: %s\n", zmq_strerror (errno));
-            return -1;
-        }
-        rc = zmq_msg_close (&msg);
-        if (rc != 0) {
-            printf ("error in zmq_msg_close: %s\n", zmq_strerror (errno));
-            return -1;
-        }
-
-        //if ((i % 1000) == 0)
-        //    printf ("mempool msg size: %zu\n", pool.size ());
-    }
-#endif
 
     rc = zmq_close (s);
     if (rc != 0) {
@@ -242,5 +144,11 @@ int main (int argc, char *argv[])
         return -1;
     }
 
+#ifdef ZMQ_BUILD_DRAFT_API
+    // IMPORTANT: destroy the allocator only after zmq_ctx_term() since otherwise
+    // some zmq_msg_t may still be "in flight"
+    zmq_msg_allocator_destroy (&allocator);
+#endif
+
     return 0;
 }
diff --git a/src/allocator.cpp b/src/allocator.cpp
new file mode 100644
index 0000000000..ff6b6320fa
--- /dev/null
+++ b/src/allocator.cpp
@@ -0,0 +1,97 @@
+/*
+    Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file
+
+    This file is part of libzmq, the ZeroMQ core engine in C++.
+
+    libzmq is free software; you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.
+
+    As a special exception, the Contributors give you permission to link
+    this library with independent modules to produce an executable,
+    regardless of the license terms of these independent modules, and to
+    copy and distribute the resulting executable under terms of your choice,
+    provided that you also meet, for each linked independent module, the
+    terms and conditions of the license of that module. An independent
+    module is a module which is not derived from or based on this library.
+    If you modify this library, you must extend this exception to your
+    version of the library.
+
+    libzmq is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+    License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "precompiled.hpp"
+#include "allocator.hpp"
+
+
+//  New allocators use plain malloc/free and carry the tag check_tag () expects.
+zmq::allocator_t::allocator_t () :
+    _type (ZMQ_MSG_ALLOCATOR_DEFAULT), _tag (0xCAFEEBEB)
+{
+}
+
+//  Reports how many free blocks the allocator currently holds.
+//
+//  Only ZMQ_MSG_ALLOCATOR_GLOBAL_POOL keeps a pool that can be counted; the
+//  figure comes from global_memory_pool_t::size () and is approximate.
+//  Every other allocator type reports 0.
+size_t zmq::allocator_t::size () const
+{
+    // using internally a MPMC queue
+    const bool is_global_pool = (_type == ZMQ_MSG_ALLOCATOR_GLOBAL_POOL);
+    if (is_global_pool)
+        return _global_pool.size ();
+
+    //  ZMQ_MSG_ALLOCATOR_DEFAULT: malloc/free, nothing is pooled.
+    //  ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL: not implemented yet (would use a
+    //  SPSC queue, or perhaps an MPMC queue anyway).
+    //  Any unrecognised type: nothing to report either.
+    return 0;
+}
+
+
+//  Hands out a buffer meant to hold len bytes, according to the allocator
+//  type. Returns NULL when no buffer can be provided.
+void *zmq::allocator_t::allocate (size_t len)
+{
+    if (_type == ZMQ_MSG_ALLOCATOR_DEFAULT)
+        return malloc (len);
+
+    // using internally a MPMC queue
+    if (_type == ZMQ_MSG_ALLOCATOR_GLOBAL_POOL)
+        return _global_pool.allocate_msg (len);
+
+    // ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL would use a SPSC queue (cannot be
+    // used with inproc maybe?) or perhaps an MPMC queue anyway.
+    // FIXME: not implemented yet, so it yields NULL just like an unknown
+    // allocator type does.
+    return NULL;
+}
+
+//  Callback that gives data_ back to the allocator found in hint_. hint_ is
+//  cast to allocator_t * and dereferenced, so it must be a live allocator.
+void zmq::allocator_t::deallocate_msg (void *data_, void *hint_)
+{
+    allocator_t *const self = static_cast<allocator_t *> (hint_);
+
+    if (self->_type == ZMQ_MSG_ALLOCATOR_DEFAULT) {
+        free (data_);
+    } else if (self->_type == ZMQ_MSG_ALLOCATOR_GLOBAL_POOL) {
+        // using internally a MPMC queue; the buffer is read as a message
+        // content header, whose size field selects the pool size class.
+        // NOTE(review): confirm this is the same length that was passed to
+        // allocate (), otherwise the block lands in a different size class.
+        zmq::msg_t::content_t *const content =
+          static_cast<zmq::msg_t::content_t *> (data_);
+        self->_global_pool.deallocate_msg (content, content->size);
+    }
+    // FIXME: ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL is not implemented yet, so
+    // such buffers (and those of unknown types) are left untouched.
+}
diff --git a/src/allocator.hpp b/src/allocator.hpp
new file mode 100644
index 0000000000..8cac7e8584
--- /dev/null
+++ b/src/allocator.hpp
@@ -0,0 +1,181 @@
+/*
+    Copyright (c) 2007-2016 Contributors as noted in the AUTHORS file
+
+    This file is part of libzmq, the ZeroMQ core engine in C++.
+
+    libzmq is free software; you can redistribute it and/or modify it under
+    the terms of the GNU Lesser General Public License (LGPL) as published
+    by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version.
+
+    As a special exception, the Contributors give you permission to link
+    this library with independent modules to produce an executable,
+    regardless of the license terms of these independent modules, and to
+    copy and distribute the resulting executable under terms of your choice,
+    provided that you also meet, for each linked independent module, the
+    terms and conditions of the license of that module. An independent
+    module is a module which is not derived from or based on this library.
+    If you modify this library, you must extend this exception to your
+    version of the library.
+
+    libzmq is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+    License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __ZMQ_MEMORYPOOL_HPP_INCLUDED__
+#define __ZMQ_MEMORYPOOL_HPP_INCLUDED__
+
+#include <vector>
+#include "msg.hpp"
+#include "concurrentqueue.h"
+
+// FIXME: we need to grow dynamically the mempool
+#define MAX_ACTIVE_MESSAGES (8192)
+
+namespace zmq
+{
+//  Fixed-capacity pool of message buffers, grouped in power-of-two size classes.
+class global_memory_pool_t
+{
+    typedef struct
+    {
+        size_t num_msgs;
+        // actual user data
+        uint8_t *raw_data;
+    } msg_block_t;
+
+    typedef enum
+    {
+        MsgBlock_SizeClass_256 = 0, // for messages up to 256B long
+        MsgBlock_SizeClass_512,
+        MsgBlock_SizeClass_1024,
+        MsgBlock_SizeClass_2048,
+        MsgBlock_SizeClass_4096,
+        MsgBlock_SizeClass_8192,
+        MsgBlock_NumSizeClasses
+    } MsgBlock_e;
+
+    //  Block size in bytes of a size class: 256 << class, 0 if out of range.
+    inline size_t MsgBlockToBytes (MsgBlock_e block_class)
+    {
+        const int idx = (int) block_class;
+        if (idx < 0 || idx >= MsgBlock_NumSizeClasses)
+            return 0;
+        return (size_t) 256 << idx;
+    }
+
+    //  Smallest class able to hold n bytes; MsgBlock_NumSizeClasses if none.
+    inline MsgBlock_e BytesToMsgBlock (size_t n)
+    {
+        for (int i = 0; i < MsgBlock_NumSizeClasses; i++)
+            if (n <= MsgBlockToBytes ((MsgBlock_e) i))
+                return (MsgBlock_e) i;
+        return MsgBlock_NumSizeClasses;
+    }
+
+  public:
+    //  Pre-allocates MAX_ACTIVE_MESSAGES blocks for every size class.
+    global_memory_pool_t ()
+    {
+        for (int i = 0; i < MsgBlock_NumSizeClasses; i++) {
+            const size_t msg_size = MsgBlockToBytes ((MsgBlock_e) i);
+            m_storage[i].raw_data =
+              (uint8_t *) malloc (MAX_ACTIVE_MESSAGES * msg_size);
+            m_storage[i].num_msgs =
+              m_storage[i].raw_data ? MAX_ACTIVE_MESSAGES : 0;
+            // enqueue all available blocks in the free list (none if malloc failed):
+            uint8_t *msg_memory = m_storage[i].raw_data;
+            for (size_t j = 0; j < m_storage[i].num_msgs; j++) {
+                m_free_list[i].enqueue (msg_memory);
+                msg_memory += msg_size;
+            }
+        }
+    }
+
+    //  Releases the slabs; no block handed out may still be in use.
+    ~global_memory_pool_t ()
+    {
+        for (int i = 0; i < MsgBlock_NumSizeClasses; i++)
+            free (m_storage[i].raw_data);
+    }
+
+    //  Takes a free block able to hold len bytes; NULL if none is available.
+    void *allocate_msg (size_t len) // consumer thread: user app thread
+    {
+        const MsgBlock_e bl = BytesToMsgBlock (len);
+        assert (bl != MsgBlock_NumSizeClasses);
+        if (bl == MsgBlock_NumSizeClasses)
+            return NULL; //  release builds: never index out of bounds
+        // consume 1 block from the list of free msg
+        uint8_t *next_avail = nullptr;
+        if (!m_free_list[bl].try_dequeue (next_avail)) {
+            assert (0); // I want to find out if this ever happens
+            return NULL;
+        }
+        assert (next_avail);
+        return next_avail;
+    }
+
+    //  Puts a block obtained from allocate_msg () back on a free list; len
+    //  must map to the same size class that was used to allocate the block.
+    void
+    deallocate_msg (void *data_,
+                    size_t len) // producer thread: ZMQ background IO thread
+    {
+        const MsgBlock_e bl = BytesToMsgBlock (len);
+        assert (bl != MsgBlock_NumSizeClasses);
+        if (bl == MsgBlock_NumSizeClasses)
+            return; //  release builds: never index out of bounds
+        // produce a new free msg:
+        m_free_list[bl].enqueue ((uint8_t *) data_);
+    }
+
+    //  Approximate number of free blocks across all size classes.
+    size_t size () const
+    {
+        size_t acc = 0;
+        for (int i = 0; i < MsgBlock_NumSizeClasses; i++)
+            acc += m_free_list[i].size_approx ();
+        return acc;
+    }
+
+  private:
+    msg_block_t m_storage[MsgBlock_NumSizeClasses];
+    moodycamel::ConcurrentQueue<uint8_t *> m_free_list[MsgBlock_NumSizeClasses];
+};
+
+//  Message buffer allocator; init () selects one of the ZMQ_MSG_ALLOCATOR_* strategies.
+class allocator_t
+{
+  public:
+    allocator_t ();
+    ~allocator_t ()
+    {
+        //  Mark this instance as dead
+        _tag = 0xdeadbeef;
+    }
+
+    void init (int type_) { _type = type_; } //  stored as-is, not validated
+
+    // allocate() gets called by the consumer thread: the user app thread
+    void *allocate (size_t len);
+
+    // deallocate_msg() gets called by the producer thread: the ZMQ background IO thread
+    static void deallocate_msg (void *data_, void *hint_);
+
+    size_t size () const;
+    bool check_tag () const { return _tag == 0xCAFEEBEB; } //  tag is overwritten by the destructor
+
+  private:
+    int _type;
+    uint32_t _tag;
+    global_memory_pool_t _global_pool;
+};
+}
+
+#endif
diff --git a/src/concurrentqueue.h b/src/concurrentqueue.h
new file mode 100644
index 0000000000..21cb9375aa
--- /dev/null
+++ b/src/concurrentqueue.h
@@ -0,0 +1,3636 @@
+// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue.
+// An overview, including benchmark results, is provided here:
+//     http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++
+// The full design is also described in excruciating detail at:
+//    http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue
+
+// Simplified BSD license:
+// Copyright (c) 2013-2016, Cameron Desrochers.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// - Redistributions of source code must retain the above copyright notice, this list of
+// conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice, this list of
+// conditions and the following disclaimer in the documentation and/or other materials
+// provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+#pragma once
+
+#if defined(__GNUC__)
+// Disable -Wconversion warnings (spuriously triggered when Traits::size_t and
+// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings
+// upon assigning any computed values)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wconversion"
+
+#ifdef MCDBGQ_USE_RELACY
+#pragma GCC diagnostic ignored "-Wint-to-pointer-cast"
+#endif
+#endif
+
+#if defined(__APPLE__)
+#include "TargetConditionals.h"
+#endif
+
+#ifdef MCDBGQ_USE_RELACY
+#include "relacy/relacy_std.hpp"
+#include "relacy_shims.h"
+// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations.
+// We'll override the default trait malloc ourselves without a macro.
+#undef new
+#undef delete
+#undef malloc
+#undef free
+#else
+#include <atomic>		// Requires C++11. Sorry VS2010.
+#include <cassert>
+#endif
+#include <cstddef>              // for max_align_t
+#include <cstdint>
+#include <cstdlib>
+#include <type_traits>
+#include <algorithm>
+#include <utility>
+#include <limits>
+#include <climits>		// for CHAR_BIT
+#include <array>
+#include <thread>		// partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading
+
+// Platform-specific definitions of a numeric thread ID type and an invalid value
+namespace moodycamel { namespace details {
+	template<typename thread_id_t> struct thread_id_converter {	// default: the ID type is its own numeric and hash type
+		using thread_id_numeric_size_t = thread_id_t;
+		using thread_id_hash_t = thread_id_t;
+		static thread_id_hash_t prehash(thread_id_t const& id) { return id; }	// identity: nothing to pre-hash
+	};
+} }
+#if defined(MCDBGQ_USE_RELACY)
+namespace moodycamel { namespace details {
+	typedef std::uint32_t thread_id_t;	// holds the value returned by rl::thread_index()
+	static const thread_id_t invalid_thread_id  = 0xFFFFFFFFU;	// presumably never a real Relacy thread index -- TODO confirm
+	static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU;	// second reserved sentinel value
+	static inline thread_id_t thread_id() { return rl::thread_index(); }
+} }
+#elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__)
+// No sense pulling in windows.h in a header, we'll manually declare the function
+// we use and rely on backwards-compatibility for this not to break
+extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void);
+namespace moodycamel { namespace details {
+	static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows");
+	typedef std::uint32_t thread_id_t;	// same width as GetCurrentThreadId()'s unsigned long, per the static_assert
+	static const thread_id_t invalid_thread_id  = 0;			// See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx
+	static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU;	// Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4.
+	static inline thread_id_t thread_id() { return static_cast<thread_id_t>(::GetCurrentThreadId()); }	// ID of the calling thread
+} }
+#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE)
+namespace moodycamel { namespace details {
+	static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes");
+	
+	typedef std::thread::id thread_id_t;
+	static const thread_id_t invalid_thread_id;         // Default ctor creates invalid ID
+
+	// Note we don't define an invalid_thread_id2 since std::thread::id doesn't have one; it's
+	// only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't
+	// be.
+	static inline thread_id_t thread_id() { return std::this_thread::get_id(); }
+
+	template<std::size_t> struct thread_id_size { };	// maps a byte count to an unsigned integer type of that size
+	template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; };
+	template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; };
+
+	template<> struct thread_id_converter<thread_id_t> {	// std::thread::id is not numeric, so it needs its own conversion
+		typedef thread_id_size<sizeof(thread_id_t)>::numeric_t thread_id_numeric_size_t;
+#ifndef __APPLE__
+		typedef std::size_t thread_id_hash_t;
+#else
+		typedef thread_id_numeric_size_t thread_id_hash_t;
+#endif
+
+		static thread_id_hash_t prehash(thread_id_t const& x)
+		{
+#ifndef __APPLE__
+			return std::hash<std::thread::id>()(x);	// non-Apple: rely on the standard hash
+#else
+			return *reinterpret_cast<thread_id_hash_t const*>(&x);	// Apple: read the ID's bytes as an integer of the same size
+#endif
+		}
+	};
+} }
+#else
+// Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475
+// In order to get a numeric thread ID in a platform-independent way, we use a thread-local
+// static variable's address as a thread identifier :-)
+#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+#define MOODYCAMEL_THREADLOCAL __thread
+#elif defined(_MSC_VER)
+#define MOODYCAMEL_THREADLOCAL __declspec(thread)
+#else
+// Assume C++11 compliant compiler
+#define MOODYCAMEL_THREADLOCAL thread_local
+#endif
+namespace moodycamel { namespace details {
+	typedef std::uintptr_t thread_id_t;	// holds the address of a thread-local variable
+	static const thread_id_t invalid_thread_id  = 0;		// Address can't be nullptr
+	static const thread_id_t invalid_thread_id2 = 1;		// Member accesses off a null pointer are also generally invalid. Plus it's not aligned.
+	static inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast<thread_id_t>(&x); }
+} }
+#endif
+
+// Exceptions
+#ifndef MOODYCAMEL_EXCEPTIONS_ENABLED
+#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__))
+#define MOODYCAMEL_EXCEPTIONS_ENABLED
+#endif
+#endif
+#ifdef MOODYCAMEL_EXCEPTIONS_ENABLED
+#define MOODYCAMEL_TRY try
+#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__)
+#define MOODYCAMEL_RETHROW throw
+#define MOODYCAMEL_THROW(expr) throw (expr)
+#else
+#define MOODYCAMEL_TRY if (true)
+#define MOODYCAMEL_CATCH(...) else if (false)
+#define MOODYCAMEL_RETHROW
+#define MOODYCAMEL_THROW(expr)
+#endif
+
+#ifndef MOODYCAMEL_NOEXCEPT
+#if !defined(MOODYCAMEL_EXCEPTIONS_ENABLED)
+#define MOODYCAMEL_NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800
+// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-(
+// We have to assume *all* non-trivial constructors may throw on VS2012!
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900
+#define MOODYCAMEL_NOEXCEPT _NOEXCEPT
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference<valueType>::value && std::is_move_constructible<type>::value ? std::is_trivially_move_constructible<type>::value || std::is_nothrow_move_constructible<type>::value : std::is_trivially_copy_constructible<type>::value || std::is_nothrow_copy_constructible<type>::value)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference<valueType>::value && std::is_move_assignable<type>::value ? std::is_trivially_move_assignable<type>::value || std::is_nothrow_move_assignable<type>::value : std::is_trivially_copy_assignable<type>::value || std::is_nothrow_copy_assignable<type>::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr))
+#else
+#define MOODYCAMEL_NOEXCEPT noexcept
+#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr)
+#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) noexcept(expr)
+#endif
+#endif
+
+#ifndef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#else
+// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445
+// g++ <=4.7 doesn't support thread_local either.
+// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work
+#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
+// Assume `thread_local` is fully supported in all other C++11 compilers/platforms
+//#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED    // always disabled for now since several users report having problems with it on
+#endif
+#endif
+#endif
+
+// VS2012 doesn't support deleted functions. 
+// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called.
+#ifndef MOODYCAMEL_DELETE_FUNCTION
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define MOODYCAMEL_DELETE_FUNCTION
+#else
+#define MOODYCAMEL_DELETE_FUNCTION = delete
+#endif
+#endif
+
+// Compiler-specific likely/unlikely hints
+namespace moodycamel { namespace details {
+#if defined(__GNUC__)	// GCC-compatible compilers: pass the expected outcome to the optimizer
+	static inline bool (likely)(bool cond) { return __builtin_expect(cond, true); }
+	static inline bool (unlikely)(bool cond) { return __builtin_expect(cond, false); }
+#else	// no hint available: plain pass-through
+	static inline bool (likely)(bool cond) { return cond; }
+	static inline bool (unlikely)(bool cond) { return cond; }
+#endif
+} }
+
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+#include "internal/concurrentqueue_internal_debug.h"
+#endif
+
+namespace moodycamel {
+namespace details {
+	template<typename T>
+	struct const_numeric_max {	// constant meant to be the largest value of integer type T
+		static_assert(std::is_integral<T>::value, "const_numeric_max can only be used with integers");
+		static const T value = std::numeric_limits<T>::is_signed
+			? (static_cast<T>(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast<T>(1)	// signed: top bit, minus one
+			: static_cast<T>(-1);	// unsigned: all bits set
+	};
+
+#if defined(__GLIBCXX__)
+	typedef ::max_align_t std_max_align_t;      // libstdc++ forgot to add it to std:: for a while
+#else
+	typedef std::max_align_t std_max_align_t;   // Others (e.g. MSVC) insist it can *only* be accessed via std::
+#endif
+
+	// Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting
+	// 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64.
+	typedef union {	// a union is aligned as strictly as its most demanding member
+		std_max_align_t x;
+		long long y;
+		void* z;
+	} max_align_t;
+}
+
+// Default traits for the ConcurrentQueue. To change some of the
+// traits without re-implementing all of them, inherit from this
+// struct and shadow the declarations you wish to be different;
+// since the traits are used as a template type parameter, the
+// shadowed declarations will be used where defined, and the defaults
+// otherwise.
+struct ConcurrentQueueDefaultTraits
+{
+	// General-purpose size type. std::size_t is strongly recommended.
+	typedef std::size_t size_t;
+	
+	// The type used for the enqueue and dequeue indices. Must be at least as
+	// large as size_t. Should be significantly larger than the number of elements
+	// you expect to hold at once, especially if you have a high turnover rate;
+	// for example, on 32-bit x86, if you expect to have over a hundred million
+	// elements or pump several million elements through your queue in a very
+	// short space of time, using a 32-bit type *may* trigger a race condition.
+	// A 64-bit int type is recommended in that case, and in practice will
+	// prevent a race condition no matter the usage of the queue. Note that
+	// whether the queue is lock-free with a 64-bit int type depends on whether
+	// std::atomic<std::uint64_t> is lock-free, which is platform-specific.
+	typedef std::size_t index_t;
+	
+	// Internally, all elements are enqueued and dequeued from multi-element
+	// blocks; this is the smallest controllable unit. If you expect few elements
+	// but many producers, a smaller block size should be favoured. For few producers
+	// and/or many elements, a larger block size is preferred. A sane default
+	// is provided. Must be a power of 2.
+	static const size_t BLOCK_SIZE = 32;
+	
+	// For explicit producers (i.e. when using a producer token), the block is
+	// checked for being empty by iterating through a list of flags, one per element.
+	// For large block sizes, this is too inefficient, and switching to an atomic
+	// counter-based approach is faster. The switch is made for block sizes strictly
+	// larger than this threshold.
+	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32;
+	
+	// How many full blocks can be expected for a single explicit producer? This should
+	// reflect that number's maximum for optimal performance. Must be a power of 2.
+	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32;
+	
+	// How many full blocks can be expected for a single implicit producer? This should
+	// reflect that number's maximum for optimal performance. Must be a power of 2.
+	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32;
+	
+	// The initial size of the hash table mapping thread IDs to implicit producers.
+	// Note that the hash is resized every time it becomes half full.
+	// Must be either 0 or a power of two. If 0, implicit production
+	// (using the enqueue methods without an explicit producer token) is disabled.
+	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32;
+	
+	// Controls the number of items that an explicit consumer (i.e. one with a token)
+	// must consume before it causes all consumers to rotate and move on to the next
+	// internal queue.
+	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256;
+	
+	// The maximum number of elements (inclusive) that can be enqueued to a sub-queue.
+	// Enqueue operations that would cause this limit to be surpassed will fail. Note
+	// that this limit is enforced at the block level (for performance reasons), i.e.
+	// it's rounded up to the nearest block size.
+	static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max<size_t>::value;
+	
+	
+#ifndef MCDBGQ_USE_RELACY
+	// Memory allocation can be customized if needed.
+	// malloc should return nullptr on failure, and handle alignment like std::malloc.
+#if defined(malloc) || defined(free)
+	// Gah, this is 2015, stop defining macros that break standard code already!
+	// Work around malloc/free being special macros:
+	static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); }
+	static inline void WORKAROUND_free(void* ptr) { return free(ptr); }
+	static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); }
+	static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); }
+#else
+	static inline void* malloc(size_t size) { return std::malloc(size); }
+	static inline void free(void* ptr) { return std::free(ptr); }
+#endif
+#else
+	// Debug versions when running under the Relacy race detector (ignore
+	// these in user code)
+	static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); }
+	static inline void free(void* ptr) { return rl::rl_free(ptr, $); }
+#endif
+};
+
+
+// When producing or consuming many elements, the most efficient way is to:
+//    1) Use one of the bulk-operation methods of the queue with a token
+//    2) Failing that, use the bulk-operation methods without a token
+//    3) Failing that, create a token and use that with the single-item methods
+//    4) Failing that, use the single-parameter methods of the queue
+// Having said that, don't create tokens willy-nilly -- ideally there should be
+// a maximum of one token per thread (of each kind).
+struct ProducerToken;
+struct ConsumerToken;
+
+template<typename T, typename Traits> class ConcurrentQueue;
+template<typename T, typename Traits> class BlockingConcurrentQueue;
+class ConcurrentQueueTests;
+
+
+namespace details
+{
+	struct ConcurrentQueueProducerTypelessBase
+	{
+		ConcurrentQueueProducerTypelessBase* next;	// link to another producer -- presumably an intrusive list, confirm in ConcurrentQueue
+		std::atomic<bool> inactive;	// starts out false
+		ProducerToken* token;	// starts out null, i.e. no token attached
+		
+		ConcurrentQueueProducerTypelessBase()
+			: next(nullptr), inactive(false), token(nullptr)
+		{
+		}
+	};
+	
+	template<bool use32> struct _hash_32_or_64 {
+		// 32-bit mixer: the MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
+		// Thread IDs are already unique, so all the mixing has to do is spread that
+		// uniqueness evenly across all the bits; a subset of the bits can then be
+		// used while still reducing collisions significantly.
+		static inline std::uint32_t hash(std::uint32_t h)
+		{
+			h = (h ^ (h >> 16)) * 0x85ebca6b;
+			h = (h ^ (h >> 13)) * 0xc2b2ae35;
+			h ^= h >> 16;
+			return h;
+		}
+	};
+	template<> struct _hash_32_or_64<1> {
+		// 64-bit counterpart: same xor-shift/multiply shape as the primary template
+		static inline std::uint64_t hash(std::uint64_t h)
+		{
+			h = (h ^ (h >> 33)) * 0xff51afd7ed558ccd;
+			h = (h ^ (h >> 33)) * 0xc4ceb9fe1a85ec53;
+			h ^= h >> 33;
+			return h;
+		}
+	};
+	// Selects the 64-bit mixer for sizes above 4 bytes, the 32-bit one otherwise
+	template<std::size_t size> struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {  };
+	
+	static inline size_t hash_thread_id(thread_id_t id)
+	{
+		static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values");
+		typedef thread_id_converter<thread_id_t> converter_t;	// pre-hash the ID, then run it through the mixer of matching width
+		return static_cast<size_t>(hash_32_or_64<sizeof(converter_t::thread_id_hash_t)>::hash(converter_t::prehash(id)));
+	}
+	
+	template<typename T> static inline bool circular_less_than(T a, T b)	// "a < b" on a wrapping (modular) counter
+	{
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4554)
+#endif
+		static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "circular_less_than is intended to be used only with unsigned integer types");
+		const T half_range = static_cast<T>(static_cast<T>(1) << static_cast<T>(sizeof(T) * CHAR_BIT - 1));	// 2^(bits - 1)
+		return static_cast<T>(a - b) > half_range;	// a precedes b when the wrapped difference exceeds half the range
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+	}
+	
+	template<typename U>
+	static inline char* align_for(char* ptr)	// first address at or after ptr that is suitably aligned for U
+	{
+		const std::size_t misalignment = reinterpret_cast<std::uintptr_t>(ptr) % std::alignment_of<U>::value;
+		return misalignment == 0 ? ptr : ptr + (std::alignment_of<U>::value - misalignment);
+	}
+
+	template<typename T>
+	static inline T ceil_to_pow_2(T x)
+	{
+		static_assert(std::is_integral<T>::value && !std::numeric_limits<T>::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types");
+
+		// Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		// Copy the highest set bit of (x - 1) into every lower bit position by
+		// doubling the shift each round, then add one to reach the power of two.
+		// Powers of two map to themselves; 0 wraps around and comes back as 0.
+		--x;
+		for (std::size_t shift = 1; shift < sizeof(T) * CHAR_BIT; shift <<= 1) {
+			x |= x >> shift;
+		}
+		++x;
+		return x;
+	}
+	
+	template<typename T>
+	static inline void swap_relaxed(std::atomic<T>& left, std::atomic<T>& right)	// three separate relaxed accesses, not one atomic swap
+	{
+		T old_left = left.load(std::memory_order_relaxed);
+		left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		right.store(std::move(old_left), std::memory_order_relaxed);
+	}
+	
+	// Identity function returning a const lvalue reference: what comes out
+	// cannot bind to an rvalue-reference parameter, so the consumer copies
+	// instead of moving.
+	template<typename T>
+	static inline T const& nomove(T const& value) { return value; }
+	
+	// Enable == true: eval() returns a const lvalue reference, like nomove(),
+	// so the argument can only be copied from.
+	template<bool Enable>
+	struct nomove_if
+	{
+		template<typename T>
+		static inline T const& eval(T const& value) { return value; }
+	};
+	
+	// Enable == false: eval() forwards the argument with its original value
+	// category, so rvalues stay movable.
+	template<>
+	struct nomove_if<false>
+	{
+		template<typename U>
+		static inline auto eval(U&& value) -> decltype(std::forward<U>(value))
+		{
+			return std::forward<U>(value);
+		}
+	};
+	
+	// Dereferences `iter` from within a function that is itself declared
+	// MOODYCAMEL_NOEXCEPT, regardless of how the iterator's own operator* is
+	// declared.
+	template<typename It>
+	static inline auto deref_noexcept(It& iter) MOODYCAMEL_NOEXCEPT -> decltype(*iter) { return *iter; }
+	
+#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+	template<typename T> struct is_trivially_destructible : std::is_trivially_destructible<T> { };	// standard C++11 trait
+#else
+	template<typename T> struct is_trivially_destructible : std::has_trivial_destructor<T> { };	// GCC before 4.8: fall back to the older trait name
+#endif
+	
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+#ifdef MCDBGQ_USE_RELACY
+	typedef RelacyThreadExitListener ThreadExitListener;
+	typedef RelacyThreadExitNotifier ThreadExitNotifier;
+#else
+	struct ThreadExitListener	// one callback registration, linked into a ThreadExitNotifier's list
+	{
+		typedef void (*callback_t)(void*);
+		callback_t callback;	// invoked by ~ThreadExitNotifier() as callback(userData)
+		void* userData;
+		
+		ThreadExitListener* next;		// reserved for use by the ThreadExitNotifier
+	};
+	
+	
+	class ThreadExitNotifier	// thread-local singleton that runs the registered callbacks from its destructor
+	{
+	public:
+		static void subscribe(ThreadExitListener* listener)	// pushes listener onto the calling thread's list
+		{
+			auto& tlsInst = instance();
+			listener->next = tlsInst.tail;
+			tlsInst.tail = listener;
+		}
+		
+		static void unsubscribe(ThreadExitListener* listener)	// unlinks listener from the calling thread's list, if present
+		{
+			auto& tlsInst = instance();
+			ThreadExitListener** prev = &tlsInst.tail;
+			for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) {
+				if (ptr == listener) {
+					*prev = ptr->next;
+					break;
+				}
+				prev = &ptr->next;
+			}
+		}
+		
+	private:
+		ThreadExitNotifier() : tail(nullptr) { }
+		ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
+		ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION;
+		
+		~ThreadExitNotifier()
+		{
+			// This thread is about to exit, let everyone know!
+			assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined.");
+			for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) {
+				ptr->callback(ptr->userData);	// note: ptr->next is read only after the callback returns
+			}
+		}
+		
+		// Thread-local
+		static inline ThreadExitNotifier& instance()
+		{
+			static thread_local ThreadExitNotifier notifier;
+			return notifier;
+		}
+		
+	private:
+		ThreadExitListener* tail;	// most recently subscribed listener; the list is walked through next
+	};
+#endif
+#endif
+	
+	template<typename T> struct static_is_lock_free_num { enum { value = 0 }; };		// fallback: types without a specialization below report 0
+	template<> struct static_is_lock_free_num<signed char> { enum { value = ATOMIC_CHAR_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<short> { enum { value = ATOMIC_SHORT_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<int> { enum { value = ATOMIC_INT_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<long> { enum { value = ATOMIC_LONG_LOCK_FREE }; };
+	template<> struct static_is_lock_free_num<long long> { enum { value = ATOMIC_LLONG_LOCK_FREE }; };
+	template<typename T> struct static_is_lock_free : static_is_lock_free_num<typename std::make_signed<T>::type> {  };		// integral types are looked up through their signed counterpart
+	template<> struct static_is_lock_free<bool> { enum { value = ATOMIC_BOOL_LOCK_FREE }; };
+	template<typename U> struct static_is_lock_free<U*> { enum { value = ATOMIC_POINTER_LOCK_FREE }; };		// every pointer type shares one answer
+}
+
+
+struct ProducerToken
+{
+	template<typename T, typename Traits>
+	explicit ProducerToken(ConcurrentQueue<T, Traits>& queue);
+	
+	template<typename T, typename Traits>
+	explicit ProducerToken(BlockingConcurrentQueue<T, Traits>& queue);
+	
+	ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+		: producer(other.producer)
+	{
+		// Adopt the source's producer; the source is left without one (invalid)
+		other.producer = nullptr;
+		if (producer != nullptr)
+			producer->token = this;
+	}
+	
+	inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap(other);		// move-assignment is an exchange: `other` ends up with our previous producer
+		return *this;
+	}
+	
+	void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT
+	{
+		std::swap(producer, other.producer);
+		// Each non-null producer stores a pointer back to the token holding it;
+		// re-point both so they follow the exchanged ownership.
+		if (producer != nullptr)
+			producer->token = this;
+		if (other.producer != nullptr)
+			other.producer->token = &other;
+	}
+	
+	// valid() reports whether this token currently refers to a producer.
+	// A token is not valid when:
+	//     1) Memory allocation failed during construction
+	//     2) It was the source of a move construction
+	//        (Note: assignment does a swap, leaving both potentially valid)
+	//     3) The associated queue was destroyed
+	// A true result does not identify *which* queue the token belongs to;
+	// keeping track of that is up to the user.
+	inline bool valid() const { return producer != nullptr; }
+	
+	~ProducerToken()
+	{
+		if (producer == nullptr) return;		// invalid token: nothing to detach
+		producer->token = nullptr;
+		// Flag the producer as inactive (release store)
+		producer->inactive.store(true, std::memory_order_release);
+	}
+	
+	// Copy construction and copy assignment are disabled
+	ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+	ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+	
+private:
+	template<typename T, typename Traits> friend class ConcurrentQueue;
+	friend class ConcurrentQueueTests;
+	
+protected:
+	details::ConcurrentQueueProducerTypelessBase* producer;
+};
+
+
+struct ConsumerToken
+{
+	template<typename T, typename Traits>
+	explicit ConsumerToken(ConcurrentQueue<T, Traits>& q);
+	
+	template<typename T, typename Traits>
+	explicit ConsumerToken(BlockingConcurrentQueue<T, Traits>& q);
+	
+	ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+		: initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer)
+	{		// member-wise copy of the source's state; `other` itself is not modified
+	}
+	
+	inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap(other);		// move-assignment exchanges state with `other`
+		return *this;
+	}
+	
+	void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT
+	{
+		std::swap(currentProducer, other.currentProducer);
+		std::swap(desiredProducer, other.desiredProducer);
+		std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent);
+		std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset);
+		std::swap(initialOffset, other.initialOffset);
+	}
+	
+	// Copy construction and copy assignment are disabled
+	ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+	ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION;
+
+private:
+	template<typename T, typename Traits> friend class ConcurrentQueue;
+	friend class ConcurrentQueueTests;
+	
+private: // state below is read and written directly by ConcurrentQueue (a friend)
+	std::uint32_t initialOffset;
+	std::uint32_t lastKnownGlobalOffset;
+	std::uint32_t itemsConsumedFromCurrent;
+	details::ConcurrentQueueProducerTypelessBase* currentProducer;
+	details::ConcurrentQueueProducerTypelessBase* desiredProducer;
+};
+
+// Forward declaration of the swap overload for ConcurrentQueue's ImplicitProducerKVP; it is needed because the function lives in a namespace.
+// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT;
+
+
+template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
+class ConcurrentQueue
+{
+public:
+	typedef ::moodycamel::ProducerToken producer_token_t;		// token type taken by the explicit-producer overloads
+	typedef ::moodycamel::ConsumerToken consumer_token_t;		// token type taken by the token-based dequeue overloads
+	
+	typedef typename Traits::index_t index_t;
+	typedef typename Traits::size_t size_t;		// note: inside this class, `size_t` means Traits::size_t
+	
+	static const size_t BLOCK_SIZE = static_cast<size_t>(Traits::BLOCK_SIZE);		// a power of 2, at least 2 (static_assert below)
+	static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast<size_t>(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD);
+	static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::EXPLICIT_INITIAL_INDEX_SIZE);
+	static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast<size_t>(Traits::IMPLICIT_INITIAL_INDEX_SIZE);
+	static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast<size_t>(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE);		// 0 makes the token-less enqueue methods return false
+	static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast<std::uint32_t>(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE);		// token dequeues from one producer before the global consumer offset is incremented
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4307)		// + integral constant overflow (that's what the ternary expression is for!)
+#pragma warning(disable: 4309)		// static_cast: Truncation of constant value
+#endif
+	static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max<size_t>::value - static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max<size_t>::value : ((static_cast<size_t>(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE);		// Traits::MAX_SUBQUEUE_SIZE rounded up to a multiple of BLOCK_SIZE, or the maximum size_t when rounding up would overflow
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+	static_assert(!std::numeric_limits<size_t>::is_signed && std::is_integral<size_t>::value, "Traits::size_t must be an unsigned integral type");		// compile-time validation of the Traits parameters starts here
+	static_assert(!std::numeric_limits<index_t>::is_signed && std::is_integral<index_t>::value, "Traits::index_t must be an unsigned integral type");
+	static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t");
+	static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)");
+	static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)");
+	static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+	static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)");
+	static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2");
+	static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)");		// NOTE(review): cannot fail for an unsigned size_t (0 or >= 1 covers every value)
+
+public:
+	// Creates a queue with at least `capacity` element slots; note that the
+	// actual number of elements that can be inserted without additional memory
+	// allocation depends on the number of producers and the block size (e.g. if
+	// the block size is equal to `capacity`, only a single block will be allocated
+	// up-front, which means only a single producer will be able to enqueue elements
+	// without an extra allocation -- blocks aren't shared between producers).
+	// This method is not thread safe -- it is up to the user to ensure that the
+	// queue is fully constructed before it starts being used by other threads (this
+	// includes making the memory effects of construction visible, possibly with a
+	// memory barrier).
+	explicit ConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE)
+		: producerListTail(nullptr),
+		producerCount(0),
+		initialBlockPoolIndex(0),
+		nextExplicitConsumerId(0),
+		globalExplicitConsumerOffset(0)
+	{
+		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+		populate_initial_implicit_producer_hash();
+		// Round the requested capacity up to a whole number of blocks
+		size_t blockCount = capacity / BLOCK_SIZE;
+		if ((capacity & (BLOCK_SIZE - 1)) != 0) ++blockCount;
+		populate_initial_block_list(blockCount);
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		// Debug builds keep a fully-typed producer list per kind so producers can be
+		// inspected from the queue object without casts; both lists start out empty.
+		explicitProducers.store(nullptr, std::memory_order_relaxed);
+		implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+	}
+	
+	// Computes the correct amount of pre-allocated blocks for you based
+	// on the minimum number of elements you want available at any given
+	// time, and the maximum concurrent number of each type of producer.
+	ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers)
+		: producerListTail(nullptr),
+		producerCount(0),
+		initialBlockPoolIndex(0),
+		nextExplicitConsumerId(0),
+		globalExplicitConsumerOffset(0)
+	{
+		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+		populate_initial_implicit_producer_hash();
+		const auto capacityBlocks = (minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE;		// minCapacity rounded up to whole blocks
+		size_t blocks = (capacityBlocks - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers);
+		populate_initial_block_list(blocks);
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		explicitProducers.store(nullptr, std::memory_order_relaxed);
+		implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+	}
+	
+	// Note: The queue should not be accessed concurrently while it's
+	// being deleted. It's up to the user to synchronize this.
+	// This method is not thread safe.
+	~ConcurrentQueue()
+	{
+		// Destroy producers
+		auto ptr = producerListTail.load(std::memory_order_relaxed);
+		while (ptr != nullptr) {
+			auto next = ptr->next_prod();		// fetch the successor before this node is destroyed
+			if (ptr->token != nullptr) {
+				ptr->token->producer = nullptr;		// a surviving ProducerToken now reports valid() == false
+			}
+			destroy(ptr);
+			ptr = next;
+		}
+		
+		// Destroy implicit producer hash tables
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) {
+			auto hash = implicitProducerHash.load(std::memory_order_relaxed);
+			while (hash != nullptr) {
+				auto prev = hash->prev;
+				if (prev != nullptr) {		// The last hash is part of this object and was not allocated dynamically
+					for (size_t i = 0; i != hash->capacity; ++i) {
+						hash->entries[i].~ImplicitProducerKVP();
+					}
+					hash->~ImplicitProducerHash();		// explicit destructor call, then the raw storage is released below
+					(Traits::free)(hash);
+				}
+				hash = prev;
+			}
+		}
+		
+		// Destroy global free list
+		auto block = freeList.head_unsafe();
+		while (block != nullptr) {
+			auto next = block->freeListNext.load(std::memory_order_relaxed);
+			if (block->dynamicallyAllocated) {		// other blocks are left alone here (presumably they live in initialBlockPool, released below)
+				destroy(block);
+			}
+			block = next;
+		}
+		
+		// Destroy initial free list
+		destroy_array(initialBlockPool, initialBlockPoolSize);
+	}
+
+	// Disable copying and copy assignment
+	ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+	ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
+	
+	// Moving is supported, but note that it is *not* a thread-safe operation.
+	// Nobody can use the queue while it's being moved, and the memory effects
+	// of that move must be propagated to other threads before they can use it.
+	// Note: When a queue is moved, its tokens are still valid but can only be
+	// used with the destination queue (i.e. semantically they are moved along
+	// with the queue itself).
+	ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+		: producerListTail(other.producerListTail.load(std::memory_order_relaxed)),
+		producerCount(other.producerCount.load(std::memory_order_relaxed)),
+		initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)),
+		initialBlockPool(other.initialBlockPool),
+		initialBlockPoolSize(other.initialBlockPoolSize),
+		freeList(std::move(other.freeList)),
+		nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)),
+		globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed))
+	{
+		// Move the other one into this, and leave the other one as an empty queue
+		implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+		populate_initial_implicit_producer_hash();		// give this queue a fresh hash first...
+		swap_implicit_producer_hashes(other);		// ...then exchange it for the source's
+		
+		other.producerListTail.store(nullptr, std::memory_order_relaxed);		// the source keeps no producers
+		other.producerCount.store(0, std::memory_order_relaxed);
+		other.nextExplicitConsumerId.store(0, std::memory_order_relaxed);
+		other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed);
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		other.explicitProducers.store(nullptr, std::memory_order_relaxed);
+		implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		other.implicitProducers.store(nullptr, std::memory_order_relaxed);
+#endif
+		
+		other.initialBlockPoolIndex.store(0, std::memory_order_relaxed);		// the initial block pool now belongs to this queue only
+		other.initialBlockPoolSize = 0;
+		other.initialBlockPool = nullptr;
+		
+		reown_producers();		// defined elsewhere; presumably re-points the moved producers at this queue
+	}
+	
+	inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
+	{
+		return swap_internal(other);		// move-assignment is a swap: `other` ends up holding this queue's previous state
+	}
+	
+	// Swaps this queue's state with the other's. Not thread-safe.
+	// Swapping two queues does not invalidate their tokens, however
+	// the tokens that were created for one queue must be used with
+	// only the swapped queue (i.e. the tokens are tied to the
+	// queue's movable state, not the object itself).
+	inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
+	{
+		swap_internal(other);		// same exchange as move-assignment; the returned reference is not needed
+	}
+	
+private:
+	ConcurrentQueue& swap_internal(ConcurrentQueue& other)
+	{
+		if (this == &other) {		// self-swap: nothing to exchange
+			return *this;
+		}
+		
+		details::swap_relaxed(producerListTail, other.producerListTail);		// atomics are exchanged with relaxed loads/stores (callers must not race with this)
+		details::swap_relaxed(producerCount, other.producerCount);
+		details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex);
+		std::swap(initialBlockPool, other.initialBlockPool);
+		std::swap(initialBlockPoolSize, other.initialBlockPoolSize);
+		freeList.swap(other.freeList);
+		details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId);
+		details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset);
+		
+		swap_implicit_producer_hashes(other);
+		
+		reown_producers();		// run on both queues once the producer lists have changed hands
+		other.reown_producers();
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		details::swap_relaxed(explicitProducers, other.explicitProducers);
+		details::swap_relaxed(implicitProducers, other.implicitProducers);
+#endif
+		
+		return *this;
+	}
+	
+public:
+	// Enqueues a single item (by copying it).
+	// Allocates memory if required. Only fails if memory allocation fails (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(T const& item)
+	{
+		const bool implicitEnabled = (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0);		// fixed at compile time by Traits
+		return implicitEnabled && inner_enqueue<CanAlloc>(item);
+	}
+	
+	// Enqueues a single item (by moving it, if possible).
+	// Allocates memory if required. Only fails if memory allocation fails (or implicit
+	// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
+	// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(T&& item)
+	{
+		const bool implicitEnabled = (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0);		// fixed at compile time by Traits
+		return implicitEnabled && inner_enqueue<CanAlloc>(std::move(item));
+	}
+	
+	// Enqueues a single item (by copying it) using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(producer_token_t const& token, T const& item)
+	{
+		return inner_enqueue<CanAlloc>(token, item);		// goes to the token's explicit producer; allocation permitted
+	}
+	
+	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Thread-safe.
+	inline bool enqueue(producer_token_t const& token, T&& item)
+	{
+		return inner_enqueue<CanAlloc>(token, std::move(item));		// goes to the token's explicit producer; allocation permitted
+	}
+	
+	// Enqueues several items.
+	// Allocates memory if required. Only fails if memory allocation fails (or
+	// implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
+	// is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Note: Use std::make_move_iterator if the elements should be moved instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool enqueue_bulk(It itemFirst, size_t count)
+	{
+		const bool implicitEnabled = (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0);		// fixed at compile time by Traits
+		return implicitEnabled && inner_enqueue_bulk<CanAlloc>(itemFirst, count);
+	}
+	
+	// Enqueues several items using an explicit producer token.
+	// Allocates memory if required. Only fails if memory allocation fails
+	// (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		return inner_enqueue_bulk<CanAlloc>(token, itemFirst, count);		// goes to the token's explicit producer; allocation permitted
+	}
+	
+	// Enqueues a single item (by copying it).
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Thread-safe.
+	inline bool try_enqueue(T const& item)
+	{
+		const bool implicitEnabled = (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0);		// fixed at compile time by Traits
+		return implicitEnabled && inner_enqueue<CannotAlloc>(item);
+	}
+	
+	// Enqueues a single item (by moving it, if possible).
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Thread-safe.
+	inline bool try_enqueue(T&& item)
+	{
+		const bool implicitEnabled = (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0);		// fixed at compile time by Traits
+		return implicitEnabled && inner_enqueue<CannotAlloc>(std::move(item));
+	}
+	
+	// Enqueues a single item (by copying it) using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Thread-safe.
+	inline bool try_enqueue(producer_token_t const& token, T const& item)
+	{
+		return inner_enqueue<CannotAlloc>(token, item);		// goes to the token's explicit producer in CannotAlloc mode
+	}
+	
+	// Enqueues a single item (by moving it, if possible) using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Thread-safe.
+	inline bool try_enqueue(producer_token_t const& token, T&& item)
+	{
+		return inner_enqueue<CannotAlloc>(token, std::move(item));		// goes to the token's explicit producer in CannotAlloc mode
+	}
+	
+	// Enqueues several items.
+	// Does not allocate memory (except for one-time implicit producer).
+	// Fails if not enough room to enqueue (or implicit production is
+	// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool try_enqueue_bulk(It itemFirst, size_t count)
+	{
+		const bool implicitEnabled = (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0);		// fixed at compile time by Traits
+		return implicitEnabled && inner_enqueue_bulk<CannotAlloc>(itemFirst, count);
+	}
+	
+	// Enqueues several items using an explicit producer token.
+	// Does not allocate memory. Fails if not enough room to enqueue.
+	// Note: Use std::make_move_iterator if the elements should be moved
+	// instead of copied.
+	// Thread-safe.
+	template<typename It>
+	bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
+	{
+		return inner_enqueue_bulk<CannotAlloc>(token, itemFirst, count);		// goes to the token's explicit producer in CannotAlloc mode
+	}
+	
+	
+	
+	// Attempts to dequeue from the queue.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue(U& item)
+	{
+		// Sample producers until three non-empty ones have been seen, and start with the
+		// fullest of them instead of always hitting the first producer (needless contention).
+		size_t candidates = 0;
+		size_t largest = 0;
+		ProducerBase* favourite = nullptr;
+		for (auto prod = producerListTail.load(std::memory_order_acquire); candidates < 3 && prod != nullptr; prod = prod->next_prod()) {
+			auto approx = prod->size_approx();
+			if (!(approx > 0)) {
+				continue;
+			}
+			++candidates;
+			if (approx > largest) {
+				largest = approx;
+				favourite = prod;
+			}
+		}
+		if (candidates == 0) {
+			return false;		// every producer that was sampled looked empty
+		}
+		
+		// The favourite may have been drained since it was sampled; if its dequeue
+		// fails, every other producer is tried in list order before giving up.
+		if ((details::likely)(favourite->dequeue(item))) {
+			return true;
+		}
+		for (auto prod = producerListTail.load(std::memory_order_acquire); prod != nullptr; prod = prod->next_prod()) {
+			if (prod != favourite && prod->dequeue(item)) return true;
+		}
+		return false;
+	}
+	
+	// Attempts to dequeue from the queue.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// This differs from the try_dequeue(item) method in that this one does
+	// not attempt to reduce contention by interleaving the order that producer
+	// streams are dequeued from. So, using this method can reduce overall throughput
+	// under contention, but will give more predictable results in single-threaded
+	// consumer scenarios. This is mostly only useful for internal unit tests.
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue_non_interleaved(U& item)
+	{
+		auto prod = producerListTail.load(std::memory_order_acquire);
+		while (prod != nullptr) {
+			if (prod->dequeue(item)) return true;		// first producer that yields an item wins
+			prod = prod->next_prod();
+		}
+		return false;
+	}
+	
+	// Attempts to dequeue from the queue using an explicit consumer token.
+	// Returns false if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	bool try_dequeue(consumer_token_t& token, U& item)
+	{
+		// The idea is roughly as follows:
+		// Every EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less
+		// If you see that the global offset has changed, you must reset your consumption counter and move to your designated place
+		// If there's no items where you're supposed to be, keep moving until you find a producer with some items
+		// If the global offset has not changed but you've run out of items to consume, move over from your current position until you find a producer with something in it
+		
+		if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+			if (!update_current_producer_after_rotation(token)) {
+				return false;
+			}
+		}
+		
+		// Fast path: try the producer this token is currently positioned on. Reaching
+		// the consumption quota on it increments the global offset (a rotation).
+		if (static_cast<ProducerBase*>(token.currentProducer)->dequeue(item)) {
+			if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+				globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);
+			}
+			return true;
+		}
+		
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+		if (ptr == nullptr) {
+			ptr = tail;		// end of the list reached: wrap around to the list's entry point
+		}
+		while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {		// one full lap over the other producers
+			if (ptr->dequeue(item)) {
+				token.currentProducer = ptr;
+				token.itemsConsumedFromCurrent = 1;		// restart the quota count on the newly chosen producer
+				return true;
+			}
+			ptr = ptr->next_prod();
+			if (ptr == nullptr) {
+				ptr = tail;
+			}
+		}
+		return false;
+	}
+	
+	// Attempts to dequeue several elements from the queue.
+	// Returns the number of items actually dequeued.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	size_t try_dequeue_bulk(It itemFirst, size_t max)
+	{
+		size_t total = 0;
+		auto prod = producerListTail.load(std::memory_order_acquire);
+		while (prod != nullptr) {
+			total += prod->dequeue_bulk(itemFirst, max - total);		// ask each producer only for what is still missing
+			if (total == max) return total;
+			prod = prod->next_prod();
+		}
+		return total;
+	}
+	
+	// Attempts to dequeue several elements from the queue using an explicit consumer token.
+	// Returns the number of items actually dequeued.
+	// Returns 0 if all producer streams appeared empty at the time they
+	// were checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
+	{
+		if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) {
+			if (!update_current_producer_after_rotation(token)) {
+				return 0;
+			}
+		}
+		
+		size_t count = static_cast<ProducerBase*>(token.currentProducer)->dequeue_bulk(itemFirst, max);		// start with the token's current producer
+		if (count == max) {
+			if ((token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) {
+				globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed);		// quota reached: increment the global offset
+			}
+			return max;
+		}
+		token.itemsConsumedFromCurrent += static_cast<std::uint32_t>(count);
+		max -= count;		// from here on `max` is the number of items still wanted
+		
+		auto tail = producerListTail.load(std::memory_order_acquire);
+		auto ptr = static_cast<ProducerBase*>(token.currentProducer)->next_prod();
+		if (ptr == nullptr) {
+			ptr = tail;		// end of the list reached: wrap around to the list's entry point
+		}
+		while (ptr != static_cast<ProducerBase*>(token.currentProducer)) {
+			auto dequeued = ptr->dequeue_bulk(itemFirst, max);
+			count += dequeued;
+			if (dequeued != 0) {
+				token.currentProducer = ptr;		// the token moves to the producer that supplied items
+				token.itemsConsumedFromCurrent = static_cast<std::uint32_t>(dequeued);
+			}
+			if (dequeued == max) {
+				break;
+			}
+			max -= dequeued;
+			ptr = ptr->next_prod();
+			if (ptr == nullptr) {
+				ptr = tail;
+			}
+		}
+		return count;
+	}
+	
+	
+	
+	// Attempts to dequeue from a specific producer's inner queue.
+	// If you happen to know which producer you want to dequeue from, this
+	// is significantly faster than using the general-case try_dequeue methods.
+	// Returns false if the producer's queue appeared empty at the time it
+	// was checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename U>
+	inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item)
+	{
+		return static_cast<ExplicitProducer*>(producer.producer)->dequeue(item);		// no null check here: the token must be valid()
+	}
+	
+	// Attempts to dequeue several elements from a specific producer's inner queue.
+	// Returns the number of items actually dequeued.
+	// If you happen to know which producer you want to dequeue from, this
+	// is significantly faster than using the general-case try_dequeue methods.
+	// Returns 0 if the producer's queue appeared empty at the time it
+	// was checked (so, the queue is likely but not guaranteed to be empty).
+	// Never allocates. Thread-safe.
+	template<typename It>
+	inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max)
+	{
+		return static_cast<ExplicitProducer*>(producer.producer)->dequeue_bulk(itemFirst, max);		// no null check here: the token must be valid()
+	}
+	
+	
+	// Returns an estimate of the total number of elements currently in the queue. This
+	// estimate is only accurate if the queue has completely stabilized before it is called
+	// (i.e. all enqueue and dequeue operations have completed and their memory effects are
+	// visible on the calling thread, and no further operations start while this method is
+	// being called).
+	// Thread-safe.
+	size_t size_approx() const
+	{
+		size_t total = 0;
+		auto prod = producerListTail.load(std::memory_order_acquire);
+		for (; prod != nullptr; prod = prod->next_prod())
+			total += prod->size_approx();		// sum of the per-producer estimates
+		return total;
+	}
+	
+	
+	// Returns true if the underlying atomic variables used by
+	// the queue are lock-free (they should be on most platforms).
+	// Thread-safe.
+	static bool is_lock_free()
+	{
+		// Each trait carries an ATOMIC_*_LOCK_FREE macro value; 2 means "always lock-free"
+		return details::static_is_lock_free<bool>::value == 2
+			&& details::static_is_lock_free<size_t>::value == 2
+			&& details::static_is_lock_free<std::uint32_t>::value == 2
+			&& details::static_is_lock_free<index_t>::value == 2
+			&& details::static_is_lock_free<void*>::value == 2
+			&& details::static_is_lock_free<typename details::thread_id_converter<details::thread_id_t>::thread_id_numeric_size_t>::value == 2;
+	}
+
+
+private:
+	friend struct ProducerToken;
+	friend struct ConsumerToken;
+	struct ExplicitProducer;		// producer type reached through a ProducerToken
+	friend struct ExplicitProducer;
+	struct ImplicitProducer;		// producer type obtained via get_or_add_implicit_producer()
+	friend struct ImplicitProducer;
+	friend class ConcurrentQueueTests;
+		
+	enum AllocationMode { CanAlloc, CannotAlloc };		// template argument: enqueue*() passes CanAlloc, try_enqueue*() passes CannotAlloc
+	
+	
+	///////////////////////////////
+	// Queue methods
+	///////////////////////////////
+	
+	template<AllocationMode canAlloc, typename U>
+	inline bool inner_enqueue(producer_token_t const& token, U&& element)
+	{
+		return static_cast<ExplicitProducer*>(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));		// qualified call on the token's producer; token.producer is not checked for null
+	}
+	
+	template<AllocationMode canAlloc, typename U>
+	inline bool inner_enqueue(U&& element)
+	{
+		auto implicitProd = get_or_add_implicit_producer();		// null means no implicit producer could be obtained
+		return implicitProd != nullptr && implicitProd->ConcurrentQueue::ImplicitProducer::template enqueue<canAlloc>(std::forward<U>(element));
+	}
+	
+	template<AllocationMode canAlloc, typename It>
+	inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) {
+		auto explicitProducer = static_cast<ExplicitProducer*>(token.producer);		// The token's producer is treated as an explicit producer
+		return explicitProducer->ConcurrentQueue::ExplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);
+	}
+	
+	template<AllocationMode canAlloc, typename It>
+	inline bool inner_enqueue_bulk(It itemFirst, size_t count)
+	{
+		auto implicitProducer = get_or_add_implicit_producer();		// A null result means the enqueue fails
+		return implicitProducer != nullptr && implicitProducer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk<canAlloc>(itemFirst, count);
+	}
+	
+	// Re-derives which producer `token` should be dequeuing from after the global explicit-consumer
+	// offset has moved. Returns false only when there are no producers and the token has never been
+	// positioned; otherwise updates the token's producer/offset fields and returns true.
+	inline bool update_current_producer_after_rotation(consumer_token_t& token)
+	{
+		// Snapshot the producer list and the rotation state
+		auto listTail = producerListTail.load(std::memory_order_acquire);
+		if (listTail == nullptr && token.desiredProducer == nullptr) {
+			return false;
+		}
+		auto producers = producerCount.load(std::memory_order_relaxed);
+		auto currentGlobalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed);
+		if ((details::unlikely)(token.desiredProducer == nullptr)) {
+			// First dequeue through this token: derive its starting producer from initialOffset.
+			// initialOffset counts from the start of the list, but the list is walked from its
+			// tail, so convert it into a number of steps from the tail first.
+			std::uint32_t stepsFromTail = producers - 1 - (token.initialOffset % producers);
+			token.desiredProducer = listTail;
+			for (std::uint32_t step = 0; step != stepsFromTail; ++step) {
+				auto following = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+				token.desiredProducer = following != nullptr ? following : listTail;
+			}
+		}
+		
+		// Advance by however far the global offset moved since this token last synced (reduced modulo the producer count)
+		std::uint32_t pendingSteps = currentGlobalOffset - token.lastKnownGlobalOffset;
+		if (pendingSteps >= producers) {
+			pendingSteps %= producers;
+		}
+		for (std::uint32_t step = 0; step != pendingSteps; ++step) {
+			auto following = static_cast<ProducerBase*>(token.desiredProducer)->next_prod();
+			token.desiredProducer = following != nullptr ? following : listTail;
+		}
+		
+		token.lastKnownGlobalOffset = currentGlobalOffset;
+		token.currentProducer = token.desiredProducer;
+		token.itemsConsumedFromCurrent = 0;
+		return true;
+	}
+	
+	
+	///////////////////////////
+	// Free list
+	///////////////////////////
+	
+	// The two fields FreeList<N> manipulates on every node it manages.
+	template <typename N>
+	struct FreeListNode {
+		std::atomic<std::uint32_t> freeListRefs;		// Starts at zero
+		std::atomic<N*> freeListNext;		// Starts out null
+		
+		FreeListNode() : freeListRefs(0), freeListNext(nullptr) { }
+	};
+	
+	// A simple CAS-based lock-free free list (a stack of nodes linked through their freeListNext fields).
+	// Not the fastest thing in the world under heavy contention, but simple and correct (assuming nodes are
+	// never freed until after the free list is destroyed), and fairly speedy under low contention.
+	template<typename N>		// N must inherit FreeListNode or have the same fields (and initialization of them)
+	struct FreeList
+	{
+		FreeList() : freeListHead(nullptr) { }
+		FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); }		// Takes over other's head and leaves other empty
+		void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); }
+		
+		FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
+		FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION;
+		
+		inline void add(N* node)		// Flags the node as wanted on the list; links it in immediately only if its refcount was zero
+		{
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+			debug::DebugLock lock(mutex);
+#endif		
+			// We know that the should-be-on-freelist bit is 0 at this point, so it's safe to
+			// set it using a fetch_add
+			if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) {
+				// Oh look! We were the last ones referencing this node, and we know
+				// we want to add it to the free list, so let's do it!
+		 		add_knowing_refcount_is_zero(node);
+			}
+		}
+		
+		inline N* try_get()		// Pops and returns a node, or returns nullptr once the head is observed to be null
+		{
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+			debug::DebugLock lock(mutex);
+#endif		
+			auto head = freeListHead.load(std::memory_order_acquire);
+			while (head != nullptr) {
+				auto prevHead = head;
+				auto refs = head->freeListRefs.load(std::memory_order_relaxed);
+				if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire, std::memory_order_relaxed)) {		// Couldn't take a reference (count is zero, or the CAS failed): retry from the current head
+					head = freeListHead.load(std::memory_order_acquire);
+					continue;
+				}
+				
+				// Good, reference count has been incremented (it wasn't at zero), which means we can read the
+				// next and not worry about it changing between now and the time we do the CAS
+				auto next = head->freeListNext.load(std::memory_order_relaxed);
+				if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) {
+					// Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no
+					// matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on).
+					assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0);
+					
+					// Decrease refcount twice, once for our ref, and once for the list's ref
+					head->freeListRefs.fetch_sub(2, std::memory_order_release);
+					return head;
+				}
+				
+				// OK, the head must have changed on us (the failed CAS stored the new head into `head`), but we
+				// still need to decrease the refcount we increased on the old one. Note that we don't need to release
+				// any memory effects, but we do need to ensure that the reference count decrement happens-after the CAS on the head.
+				refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel);
+				if (refs == SHOULD_BE_ON_FREELIST + 1) {		// We held the last reference and the node is flagged for (re-)insertion
+					add_knowing_refcount_is_zero(prevHead);
+				}
+			}
+			
+			return nullptr;
+		}
+		
+		// Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes)
+		N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); }
+		
+	private:
+		inline void add_knowing_refcount_is_zero(N* node)
+		{
+			// Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run
+			// only one copy of this method per node at a time, i.e. the single thread case), then we know
+			// we can safely change the next pointer of the node; however, once the refcount is back above
+			// zero, then other threads could increase it (happens under heavy contention, when the refcount
+			// goes to zero in between a load and a refcount increment of a node in try_get, then back up to
+			// something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS
+			// to add the node to the actual list fails, decrease the refcount and leave the add operation to
+			// the next thread who puts the refcount back at zero (which could be us, hence the loop).
+			auto head = freeListHead.load(std::memory_order_relaxed);
+			while (true) {
+				node->freeListNext.store(head, std::memory_order_relaxed);
+				node->freeListRefs.store(1, std::memory_order_release);		// 1 == the list's own reference, with the should-be-on-freelist bit cleared
+				if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) {
+					// Hmm, the add failed, but we can only try again when the refcount goes back to zero
+					if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_release) == 1) {		// Drops the list's reference and sets the flag again in a single step
+						continue;
+					}
+				}
+				return;
+			}
+		}
+		
+	private:
+		// Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention)
+		std::atomic<N*> freeListHead;
+	
+	static const std::uint32_t REFS_MASK = 0x7FFFFFFF;		// Low 31 bits of freeListRefs: the reference count
+	static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000;		// Top bit of freeListRefs: node is to be added once its count drops to zero
+		
+#ifdef MCDBGQ_NOLOCKFREE_FREELIST
+		debug::DebugMutex mutex;
+#endif
+	};
+	
+	
+	///////////////////////////
+	// Block
+	///////////////////////////
+	
+	enum InnerQueueContext { implicit_context = 0, explicit_context = 1 };		// Compile-time tag passed to Block's emptiness helpers; only explicit_context can select the per-slot flag path
+	
+	struct Block		// Raw storage for BLOCK_SIZE elements of T plus the bookkeeping that tracks which slots have been dequeued
+	{
+		Block()
+			: next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), shouldBeOnFreeList(false), dynamicallyAllocated(true)
+		{
+#ifdef MCDBGQ_TRACKMEM
+			owner = nullptr;
+#endif
+		}
+		
+		template<InnerQueueContext context>
+		inline bool is_empty() const		// True once every slot is marked empty (all flags set, or counter == BLOCK_SIZE)
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Check flags
+				for (size_t i = 0; i < BLOCK_SIZE; ++i) {
+					if (!emptyFlags[i].load(std::memory_order_relaxed)) {
+						return false;
+					}
+				}
+				
+				// Aha, empty; make sure we have all other memory effects that happened before the empty flags were set
+				std::atomic_thread_fence(std::memory_order_acquire);
+				return true;
+			}
+			else {
+				// Check counter
+				if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) {
+					std::atomic_thread_fence(std::memory_order_acquire);
+					return true;
+				}
+				assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE);
+				return false;
+			}
+		}
+		
+		// Marks one slot as empty. Returns true if the block is now empty; always returns false on the per-slot flag path
+		template<InnerQueueContext context>
+		inline bool set_empty(index_t i)
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set flag (flags are stored in reverse: slot s uses emptyFlags[BLOCK_SIZE - 1 - s])
+				assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].load(std::memory_order_relaxed));
+				emptyFlags[BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1))].store(true, std::memory_order_release);
+				return false;
+			}
+			else {
+				// Increment counter
+				auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_release);
+				assert(prevVal < BLOCK_SIZE);
+				return prevVal == BLOCK_SIZE - 1;
+			}
+		}
+		
+		// Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0).
+		// Returns true if the block is now empty; always returns false on the per-slot flag path.
+		template<InnerQueueContext context>
+		inline bool set_many_empty(index_t i, size_t count)
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set flags (a single release fence up front covers the relaxed stores below)
+				std::atomic_thread_fence(std::memory_order_release);
+				i = BLOCK_SIZE - 1 - static_cast<size_t>(i & static_cast<index_t>(BLOCK_SIZE - 1)) - count + 1;		// i becomes the lowest flag index of the range (flags are stored in reverse slot order)
+				for (size_t j = 0; j != count; ++j) {
+					assert(!emptyFlags[i + j].load(std::memory_order_relaxed));
+					emptyFlags[i + j].store(true, std::memory_order_relaxed);
+				}
+				return false;
+			}
+			else {
+				// Increment counter
+				auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_release);
+				assert(prevVal + count <= BLOCK_SIZE);
+				return prevVal + count == BLOCK_SIZE;
+			}
+		}
+		
+		template<InnerQueueContext context>
+		inline void set_all_empty()		// Marks every slot as empty (relaxed stores only)
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Set all flags
+				for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+					emptyFlags[i].store(true, std::memory_order_relaxed);
+				}
+			}
+			else {
+				// Saturate counter: every slot counts as completely dequeued
+				elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed);
+			}
+		}
+		
+		template<InnerQueueContext context>
+		inline void reset_empty()		// Marks every slot as not empty (relaxed stores only)
+		{
+			if (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) {
+				// Reset flags
+				for (size_t i = 0; i != BLOCK_SIZE; ++i) {
+					emptyFlags[i].store(false, std::memory_order_relaxed);
+				}
+			}
+			else {
+				// Reset counter
+				elementsCompletelyDequeued.store(0, std::memory_order_relaxed);
+			}
+		}
+		
+		inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast<T*>(static_cast<void*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); }		// Pointer into the raw `elements` buffer for idx masked with BLOCK_SIZE - 1
+		inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast<T const*>(static_cast<void const*>(elements)) + static_cast<size_t>(idx & static_cast<index_t>(BLOCK_SIZE - 1)); }
+		
+	private:
+		// IMPORTANT: This must be the first member in Block, so that if T depends on the alignment of
+		// addresses returned by malloc, that alignment will be preserved. Apparently clang actually
+		// generates code that uses this assumption for AVX instructions in some cases. Ideally, we
+		// should also align Block to the alignment of T in case it's higher than malloc's 16-byte
+		// alignment, but this is hard to do in a cross-platform way. Assert for this case:
+		static_assert(std::alignment_of<T>::value <= std::alignment_of<details::max_align_t>::value, "The queue does not support super-aligned types at this time");
+		// Additionally, we need the alignment of Block itself to be a multiple of max_align_t since
+		// otherwise the appropriate padding will not be added at the end of Block in order to make
+		// arrays of Blocks all be properly aligned (not just the first one). We use a union to force
+		// this.
+		union {
+			char elements[sizeof(T) * BLOCK_SIZE];
+			details::max_align_t dummy;
+		};
+	public:
+		Block* next;
+		std::atomic<size_t> elementsCompletelyDequeued;		// Counter path: number of slots marked empty so far
+		std::atomic<bool> emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? BLOCK_SIZE : 1];		// Flag path: one flag per slot; shrinks to a single entry when BLOCK_SIZE exceeds the threshold
+	public:
+		std::atomic<std::uint32_t> freeListRefs;
+		std::atomic<Block*> freeListNext;
+		std::atomic<bool> shouldBeOnFreeList;
+		bool dynamicallyAllocated;		// Perhaps a better name for this would be 'isNotPartOfInitialBlockPool'
+		
+#ifdef MCDBGQ_TRACKMEM
+		void* owner;
+#endif
+	};
+	static_assert(std::alignment_of<Block>::value >= std::alignment_of<details::max_align_t>::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping");
+
+
+#ifdef MCDBGQ_TRACKMEM
+public:
+	struct MemStats;		// Forward declaration, made public only when MCDBGQ_TRACKMEM is defined
+private:
+#endif
+	
+	///////////////////////////
+	// Producer base
+	///////////////////////////
+	
+	struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase
+	{
+		ProducerBase(ConcurrentQueue* parent_, bool isExplicit_)
+			: tailIndex(0), headIndex(0),
+			  dequeueOptimisticCount(0), dequeueOvercommit(0),
+			  tailBlock(nullptr),
+			  isExplicit(isExplicit_), parent(parent_)
+		{
+		}
+		
+		virtual ~ProducerBase() { }
+		
+		// Forwards to the concrete producer's dequeue, selected by the isExplicit flag.
+		template<typename U>
+		inline bool dequeue(U& element)
+		{
+			if (!isExplicit) {
+				return static_cast<ImplicitProducer*>(this)->dequeue(element);
+			}
+			return static_cast<ExplicitProducer*>(this)->dequeue(element);
+		}
+		
+		// Forwards to the concrete producer's dequeue_bulk, selected by the isExplicit flag.
+		template<typename It>
+		inline size_t dequeue_bulk(It& itemFirst, size_t max)
+		{
+			if (!isExplicit) {
+				return static_cast<ImplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
+			}
+			return static_cast<ExplicitProducer*>(this)->dequeue_bulk(itemFirst, max);
+		}
+		
+		// The `next` link, cast to ProducerBase*.
+		inline ProducerBase* next_prod() const { return static_cast<ProducerBase*>(next); }
+		
+		// Approximate element count: tail - head, or 0 unless head is circularly less than tail.
+		inline size_t size_approx() const
+		{
+			auto enqueuedUpTo = tailIndex.load(std::memory_order_relaxed);
+			auto dequeuedUpTo = headIndex.load(std::memory_order_relaxed);
+			if (!details::circular_less_than(dequeuedUpTo, enqueuedUpTo)) {
+				return 0;
+			}
+			return static_cast<size_t>(enqueuedUpTo - dequeuedUpTo);
+		}
+		
+		inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); }
+	protected:
+		std::atomic<index_t> tailIndex;		// Where to enqueue to next
+		std::atomic<index_t> headIndex;		// Where to dequeue from next
+		
+		std::atomic<index_t> dequeueOptimisticCount;
+		std::atomic<index_t> dequeueOvercommit;
+		
+		Block* tailBlock;
+		
+	public:
+		bool isExplicit;
+		ConcurrentQueue* parent;
+		
+	protected:
+#ifdef MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+	
+	
+	///////////////////////////
+	// Explicit queue
+	///////////////////////////
+		
+	struct ExplicitProducer : public ProducerBase
+	{
+		explicit ExplicitProducer(ConcurrentQueue* parent)
+			: ProducerBase(parent, true)
+			, blockIndex(nullptr)
+			, pr_blockIndexSlotsUsed(0)
+			, pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1)
+			, pr_blockIndexFront(0)
+			, pr_blockIndexEntries(nullptr)
+			, pr_blockIndexRaw(nullptr)
+		{
+			// Use half of whichever is larger: EXPLICIT_INITIAL_INDEX_SIZE, or the parent's initial
+			// block pool size rounded up by ceil_to_pow_2.
+			size_t const halfPoolSize = details::ceil_to_pow_2(parent->initialBlockPoolSize) >> 1;
+			pr_blockIndexSize = halfPoolSize > pr_blockIndexSize ? halfPoolSize : pr_blockIndexSize;
+			
+			new_block_index(0);		// This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE
+		}
+		
+		~ExplicitProducer()		// Destroys elements still in the queue, then releases the blocks and the chain of block indices
+		{
+			// Destruct any elements not yet dequeued.
+			// Since we're in the destructor, we can assume all elements
+			// are either completely dequeued or completely not (no halfways).
+			if (this->tailBlock != nullptr) {		// Note this means there must be a block index too
+				// First find the block that's partially dequeued, if any
+				Block* halfDequeuedBlock = nullptr;
+				if ((this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) != 0) {
+					// The head's not on a block boundary, meaning a block somewhere is partially dequeued
+					// (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary)
+					size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1);		// Start at the oldest slot in use and scan forward
+					while (details::circular_less_than<index_t>(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) {
+						i = (i + 1) & (pr_blockIndexSize - 1);
+					}
+					assert(details::circular_less_than<index_t>(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed)));
+					halfDequeuedBlock = pr_blockIndexEntries[i].block;
+				}
+				
+				// Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration)
+				auto block = this->tailBlock;
+				do {
+					block = block->next;
+					if (block->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+						continue;
+					}
+					
+					size_t i = 0;	// Offset into block
+					if (block == halfDequeuedBlock) {
+						i = static_cast<size_t>(this->headIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));
+					}
+					
+					// Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index
+					auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 ? BLOCK_SIZE : static_cast<size_t>(this->tailIndex.load(std::memory_order_relaxed) & static_cast<index_t>(BLOCK_SIZE - 1));
+					while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) {
+						(*block)[i++]->~T();
+					}
+				} while (block != this->tailBlock);
+			}
+			
+			// Destroy all blocks that we own (blocks not flagged dynamicallyAllocated go back to the parent's free list instead)
+			if (this->tailBlock != nullptr) {
+				auto block = this->tailBlock;
+				do {
+					auto nextBlock = block->next;
+					if (block->dynamicallyAllocated) {
+						destroy(block);
+					}
+					else {
+						this->parent->add_block_to_free_list(block);
+					}
+					block = nextBlock;
+				} while (block != this->tailBlock);
+			}
+			
+			// Destroy the block indices (each header points at the previous one through `prev`)
+			auto header = static_cast<BlockIndexHeader*>(pr_blockIndexRaw);
+			while (header != nullptr) {
+				auto prev = static_cast<BlockIndexHeader*>(header->prev);
+				header->~BlockIndexHeader();
+				(Traits::free)(header);
+				header = prev;
+			}
+		}
+		
+		template<AllocationMode allocMode, typename U>
+		inline bool enqueue(U&& element)		// Constructs one element at the tail; returns false if no block (or block index room) could be obtained
+		{
+			index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			index_t newTailIndex = 1 + currentTailIndex;
+			if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+				// We reached the end of a block, start a new one
+				auto startBlock = this->tailBlock;
+				auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+				if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+					// We can re-use the block ahead of us, it's empty!
+					this->tailBlock = this->tailBlock->next;
+					this->tailBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					
+					// We'll put the block on the block index (guaranteed to be room since we're conceptually removing the
+					// last block from it first -- except instead of removing then adding, we can just overwrite).
+					// Note that there must be a valid block index here, since even if allocation failed in the ctor,
+					// it would have been re-attempted when adding the first block to the queue; since there is such
+					// a block, a block index must have been successfully allocated.
+				}
+				else {
+					// Whatever head value we see here is >= the last value we saw here (relatively),
+					// and <= its current value. Since we have the most recent tail, the head must be
+					// <= to it.
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE)
+						|| (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+						// We can't enqueue in another block because there's not enough leeway -- the
+						// tail could surpass the head by the time the block fills up! (Or we'll exceed
+						// the size limit, if the second part of the condition was true.)
+						return false;
+					}
+					// We're going to need a new block; check that the block index has room
+					if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) {
+						// Hmm, the circular block index is already full -- we'll need
+						// to allocate a new index. Note pr_blockIndexRaw can only be nullptr if
+						// the initial allocation failed in the constructor.
+						
+						if (allocMode == CannotAlloc || !new_block_index(pr_blockIndexSlotsUsed)) {
+							return false;
+						}
+					}
+					
+					// Insert a new block in the circular linked list
+					auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+					if (newBlock == nullptr) {
+						return false;
+					}
+#ifdef MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					if (this->tailBlock == nullptr) {
+						newBlock->next = newBlock;		// First block: a ring of one
+					}
+					else {
+						newBlock->next = this->tailBlock->next;
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					++pr_blockIndexSlotsUsed;
+				}
+
+				if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward<U>(element)))) {
+					// The constructor may throw. We want the element not to appear in the queue in
+					// that case (without corrupting the queue):
+					MOODYCAMEL_TRY {
+						new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+					}
+					MOODYCAMEL_CATCH (...) {
+						// Revert change to the current block, but leave the new block available
+						// for next time
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock;
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				else {
+					(void)startBlock;		// Only needed by the catch handler above; the casts silence unused-variable warnings
+					(void)originalBlockIndexSlotsUsed;
+				}
+				
+				// Add block to block index (the release store to `front` publishes the entry written just before it)
+				auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+				entry.base = currentTailIndex;
+				entry.block = this->tailBlock;
+				blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release);
+				pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				
+				if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward<U>(element)))) {		// Same condition as above: the element was already constructed inside the try block
+					this->tailIndex.store(newTailIndex, std::memory_order_release);
+					return true;
+				}
+			}
+			
+			// Enqueue
+			new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+			
+			this->tailIndex.store(newTailIndex, std::memory_order_release);		// Publish the new tail after the element has been constructed
+			return true;
+		}
+		
+		template<typename U>
+		bool dequeue(U& element)		// Moves the next element into `element` and returns true, or returns false if nothing could be claimed
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+				// Might be something to dequeue, let's give it a try
+				
+				// Note that this if is purely for performance purposes in the common case when the queue is
+				// empty and the values are eventually consistent -- we may enter here spuriously.
+				
+				// Note that whatever the values of overcommit and tail are, they are not going to change (unless we
+				// change them) and must be the same value at this point (inside the if) as when the if condition was
+				// evaluated.
+
+				// We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below.
+				// This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptimisticCount in
+				// the fetch_add below will result in a value at least as recent as that (and therefore at least as large).
+				// Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all
+				// read-modify-write operations are guaranteed to work on the latest value in the modification order), but
+				// unfortunately that can't be shown to be correct using only the C++11 standard.
+				// See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case
+				std::atomic_thread_fence(std::memory_order_acquire);
+				
+				// Increment optimistic counter, then check if it went over the boundary
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+				
+				// Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever
+				// incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now
+				// have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon
+				// incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount.
+				// However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently)
+				// overflow; in such a case, though, the logic still holds since the difference between the two is maintained.
+				
+				// Note that we reload tail here in case it changed; it will be the same value as before or greater, since
+				// this load is sequenced after (happens after) the earlier load above. This is supported by read-read
+				// coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				if ((details::likely)(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+					// Guaranteed to be at least one element to dequeue!
+					
+					// Get the index. Note that since there's guaranteed to be at least one element, this
+					// will never exceed tail. We need to do an acquire-release fence here since it's possible
+					// that whatever condition got us to this point was for an earlier enqueued element (that
+					// we already see the memory effects for), but that by the time we increment somebody else
+					// has incremented it, and we need to see the memory effects for *that* element, which
+					// in such a case is necessarily visible on the thread that incremented it in the first
+					// place with the more current condition (they must have acquired a tail that is at least
+					// as recent).
+					auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+					
+					
+					// Determine which block the element is in
+					
+					auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+					auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+					
+					// We need to be careful here about subtracting and dividing because of index wrap-around.
+					// When an index wraps, we need to preserve the sign of the offset when dividing it by the
+					// block size (in order to get a correct signed block count offset in all cases):
+					auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+					auto blockBaseIndex = index & ~static_cast<index_t>(BLOCK_SIZE - 1);
+					auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(blockBaseIndex - headBase) / BLOCK_SIZE);
+					auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block;
+					
+					// Dequeue
+					auto& el = *((*block)[index]);
+					if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
+						// Make sure the element is still fully dequeued and destroyed even if the assignment
+						// throws (the guard's destructor runs on both the normal and the exceptional path)
+						struct Guard {
+							Block* block;
+							index_t index;
+							
+							~Guard()
+							{
+								(*block)[index]->~T();
+								block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+							}
+						} guard = { block, index };
+
+						element = std::move(el); // NOLINT
+					}
+					else {
+						element = std::move(el); // NOLINT
+						el.~T(); // NOLINT
+						block->ConcurrentQueue::Block::template set_empty<explicit_context>(index);
+					}
+					
+					return true;
+				}
+				else {
+					// Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+					this->dequeueOvercommit.fetch_add(1, std::memory_order_release);		// Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write
+				}
+			}
+		
+			return false;
+		}
+		
+		template<AllocationMode allocMode, typename It>
+		bool enqueue_bulk(It itemFirst, size_t count)	// Enqueues `count` items read from `itemFirst`; returns false if the blocks/index slots it needs cannot be obtained
+		{
+			// First, we need to make sure we have enough room to enqueue all of the elements;
+			// this means pre-allocating blocks and putting them in the block index (but only if
+			// all the allocations succeeded).
+			index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			auto startBlock = this->tailBlock;
+			auto originalBlockIndexFront = pr_blockIndexFront;
+			auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed;
+			// startBlock and the two pr_blockIndex* snapshots above are what the failure and exception paths below restore.
+			Block* firstAllocatedBlock = nullptr;	// First block claimed for this call (reused or newly requisitioned); stays null if none is needed
+			
+			// Figure out how many blocks we'll need to allocate, and do so
+			size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
+			index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);	// Base of the block containing index startTailIndex - 1; advanced by BLOCK_SIZE per block claimed
+			if (blockBaseDiff > 0) {
+				// Allocate as many blocks as possible from ahead
+				while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty<explicit_context>()) {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+					// Step onto the next block of the list and remember the first one taken
+					this->tailBlock = this->tailBlock->next;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
+					// Record the block in the producer-side index; the header's `front` is only stored further down
+					auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+					entry.base = currentTailIndex;
+					entry.block = this->tailBlock;
+					pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				}
+				
+				// Now allocate as many blocks as necessary from the block pool
+				while (blockBaseDiff > 0) {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+					// Using the current head, decide whether one more block still fits (circular index check and MAX_SUBQUEUE_SIZE limit)
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					bool full = !details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+					if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) {
+						if (allocMode == CannotAlloc || full || !new_block_index(originalBlockIndexSlotsUsed)) {
+							// Failed to allocate, undo changes (but keep injected blocks)
+							pr_blockIndexFront = originalBlockIndexFront;
+							pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+							this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+							return false;
+						}
+						
+						// pr_blockIndexFront is updated inside new_block_index, so we need to
+						// update our fallback value too (since we keep the new index even if we
+						// later fail)
+						originalBlockIndexFront = originalBlockIndexSlotsUsed;
+					}
+					
+					// Insert a new block in the circular linked list
+					auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+					if (newBlock == nullptr) {
+						pr_blockIndexFront = originalBlockIndexFront;
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+						return false;
+					}
+					
+#ifdef MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template set_all_empty<explicit_context>();
+					if (this->tailBlock == nullptr) {
+						newBlock->next = newBlock;
+					}
+					else {
+						newBlock->next = this->tailBlock->next;
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock;
+					
+					++pr_blockIndexSlotsUsed;
+					
+					auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront];
+					entry.base = currentTailIndex;
+					entry.block = this->tailBlock;
+					pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1);
+				}
+				
+				// Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and
+				// publish the new block index front
+				auto block = firstAllocatedBlock;
+				while (true) {
+					block->ConcurrentQueue::Block::template reset_empty<explicit_context>();
+					if (block == this->tailBlock) {
+						break;
+					}
+					block = block->next;
+				}
+				// If MOODYCAMEL_NOEXCEPT_CTOR is true for T built from *itemFirst, store the index front now; otherwise that store is deferred to the end of this function
+				if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) {
+					blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release);
+				}
+			}
+			
+			// Enqueue, one block at a time
+			index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+			currentTailIndex = startTailIndex;
+			auto endBlock = this->tailBlock;
+			this->tailBlock = startBlock;
+			assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
+			if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
+				this->tailBlock = firstAllocatedBlock;
+			}
+			while (true) {
+				auto stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+				if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+					stopIndex = newTailIndex;
+				}
+				if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) {
+					while (currentTailIndex != stopIndex) {
+						new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+					}
+				}
+				else {
+					MOODYCAMEL_TRY {
+						while (currentTailIndex != stopIndex) {
+							// Must use copy constructor even if move constructor is available
+							// because we may have to revert if there's an exception.
+							// Sorry about the horrible templated next line, but it was the only way
+							// to disable moving *at compile time*, which is important because a type
+							// may only define a (noexcept) move constructor, and so calls to the
+							// cctor will not compile, even if they are in an if branch that will never
+							// be executed
+							new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst));
+							++currentTailIndex;
+							++itemFirst;
+						}
+					}
+					MOODYCAMEL_CATCH (...) {
+						// Oh dear, an exception's been thrown -- destroy the elements that
+						// were enqueued so far and revert the entire bulk operation (we'll keep
+						// any allocated blocks in our linked list for later, though).
+						auto constructedStopIndex = currentTailIndex;
+						auto lastBlockEnqueued = this->tailBlock;
+						// Restore the producer-side bookkeeping captured at entry
+						pr_blockIndexFront = originalBlockIndexFront;
+						pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed;
+						this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock;
+						// Destroy the elements constructed so far, block by block, starting from the first slot written
+						if (!details::is_trivially_destructible<T>::value) {
+							auto block = startBlock;
+							if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+								block = firstAllocatedBlock;
+							}
+							currentTailIndex = startTailIndex;
+							while (true) {
+								stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+								if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
+									stopIndex = constructedStopIndex;
+								}
+								while (currentTailIndex != stopIndex) {
+									(*block)[currentTailIndex++]->~T();
+								}
+								if (block == lastBlockEnqueued) {
+									break;
+								}
+								block = block->next;
+							}
+						}
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				
+				if (this->tailBlock == endBlock) {
+					assert(currentTailIndex == newTailIndex);
+					break;
+				}
+				this->tailBlock = this->tailBlock->next;
+			}
+			// Deferred store of the index front for the MOODYCAMEL_TRY path (only when blocks were claimed by this call)
+			if (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst))) && firstAllocatedBlock != nullptr) {
+				blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release);
+			}
+			// Publish the new tail with release ordering
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+		
+		template<typename It>
+		size_t dequeue_bulk(It& itemFirst, size_t max)	// Moves up to `max` elements out through `itemFirst`; returns how many were dequeued (0 if none)
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));
+			if (details::circular_less_than<size_t>(0, desiredCount)) {
+				desiredCount = desiredCount < max ? desiredCount : max;
+				std::atomic_thread_fence(std::memory_order_acquire);
+				// Optimistically claim desiredCount elements; any over-claim is added to dequeueOvercommit below
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);;
+				// Re-read the tail to determine how many of the claimed elements are really available
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
+				if (details::circular_less_than<size_t>(0, actualCount)) {
+					actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+					if (actualCount < desiredCount) {
+						this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);
+					}
+					
+					// Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
+					// will never exceed tail.
+					auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
+					
+					// Determine which block the first element is in
+					auto localBlockIndex = blockIndex.load(std::memory_order_acquire);
+					auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire);
+					// The target entry is found from its distance, in blocks, to the entry at the loaded front
+					auto headBase = localBlockIndex->entries[localBlockIndexHead].base;
+					auto firstBlockBaseIndex = firstIndex & ~static_cast<index_t>(BLOCK_SIZE - 1);
+					auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(firstBlockBaseIndex - headBase) / BLOCK_SIZE);
+					auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1);
+					
+					// Iterate the blocks and dequeue
+					auto index = firstIndex;
+					do {
+						auto firstIndexInBlock = index;
+						auto endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+						endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+						auto block = localBlockIndex->entries[indexIndex].block;
+						if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {
+							while (index != endIndex) {
+								auto& el = *((*block)[index]);
+								*itemFirst++ = std::move(el);
+								el.~T();
+								++index;
+							}
+						}
+						else {
+							MOODYCAMEL_TRY {
+								while (index != endIndex) {
+									auto& el = *((*block)[index]);
+									*itemFirst = std::move(el);
+									++itemFirst;
+									el.~T();
+									++index;
+								}
+							}
+							MOODYCAMEL_CATCH (...) {
+								// It's too late to revert the dequeue, but we can make sure that all
+								// the dequeued objects are properly destroyed and the block index
+								// (and empty count) are properly updated before we propagate the exception
+								do {
+									block = localBlockIndex->entries[indexIndex].block;
+									while (index != endIndex) {
+										(*block)[index++]->~T();
+									}
+									block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock));
+									indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+									// Set up the range covered by the next block of the claimed span
+									firstIndexInBlock = index;
+									endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+									endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+								} while (index != firstIndex + actualCount);
+								
+								MOODYCAMEL_RETHROW;
+							}
+						}
+						block->ConcurrentQueue::Block::template set_many_empty<explicit_context>(firstIndexInBlock, static_cast<size_t>(endIndex - firstIndexInBlock));
+						indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1);
+					} while (index != firstIndex + actualCount);
+					
+					return actualCount;
+				}
+				else {
+					// Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent
+					this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);
+				}
+			}
+			
+			return 0;
+		}
+		
+	private:
+		struct BlockIndexEntry	// One slot of the explicit producer's block index
+		{
+			index_t base;	// Element index of the block's first slot (a multiple of BLOCK_SIZE as written by enqueue_bulk)
+			Block* block;	// The block that holds the elements starting at `base`
+		};
+		
+		struct BlockIndexHeader	// Header placed at the start of each block index allocation (see new_block_index)
+		{
+			size_t size;	// Number of entries; `size - 1` is used as a wrap-around mask
+			std::atomic<size_t> front;		// Current slot (not next, like pr_blockIndexFront)
+			BlockIndexEntry* entries;	// Entry array, located in the same allocation after this header
+			void* prev;	// Raw pointer to the previous (smaller) index allocation, kept so it can be freed later
+		};
+		
+		
+		bool new_block_index(size_t numberOfFilledSlotsToExpose)	// Replaces the block index with one twice as large; returns false (old index untouched) if allocation fails
+		{
+			auto prevBlockSizeMask = pr_blockIndexSize - 1;
+			
+			// Create the new block
+			pr_blockIndexSize <<= 1;
+			auto newRawPtr = static_cast<char*>((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize));
+			if (newRawPtr == nullptr) {
+				pr_blockIndexSize >>= 1;		// Reset to allow graceful retry
+				return false;
+			}
+			// The entry array shares the allocation with the header and starts after it (the size above leaves room for alignment padding)
+			auto newBlockIndexEntries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(newRawPtr + sizeof(BlockIndexHeader)));
+			
+			// Copy in all the old indices, if any
+			size_t j = 0;
+			if (pr_blockIndexSlotsUsed != 0) {
+				auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask;	// Oldest used slot of the old circular index
+				do {
+					newBlockIndexEntries[j++] = pr_blockIndexEntries[i];
+					i = (i + 1) & prevBlockSizeMask;
+				} while (i != pr_blockIndexFront);
+			}
+			
+			// Update everything
+			auto header = new (newRawPtr) BlockIndexHeader;
+			header->size = pr_blockIndexSize;
+			header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed);	// `front` names the current slot, hence the - 1
+			header->entries = newBlockIndexEntries;
+			header->prev = pr_blockIndexRaw;		// we link the new block to the old one so we can free it later
+			// Switch the producer-side state over to the new index, then publish the header with a release store
+			pr_blockIndexFront = j;
+			pr_blockIndexEntries = newBlockIndexEntries;
+			pr_blockIndexRaw = newRawPtr;
+			blockIndex.store(header, std::memory_order_release);
+			
+			return true;
+		}
+		
+	private:
+		std::atomic<BlockIndexHeader*> blockIndex;	// Current index header; stored with release in new_block_index and loaded with acquire in dequeue_bulk
+		
+		// To be used by producer only -- consumer must use the ones referenced by blockIndex
+		size_t pr_blockIndexSlotsUsed;	// Number of index slots currently in use
+		size_t pr_blockIndexSize;	// Capacity of the index in entries; doubled by new_block_index
+		size_t pr_blockIndexFront;		// Next slot (not current)
+		BlockIndexEntry* pr_blockIndexEntries;	// Entry array of the current index
+		void* pr_blockIndexRaw;	// Start of the current index allocation (header followed by entries)
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	public:
+		ExplicitProducer* nextExplicitProducer;
+	private:
+#endif
+		
+#ifdef MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+	
+	
+	//////////////////////////////////
+	// Implicit queue
+	//////////////////////////////////
+	
+	struct ImplicitProducer : public ProducerBase
+	{			
+		ImplicitProducer(ConcurrentQueue* parent) :	// Builds an implicit producer for `parent` and tries to create its initial block index
+			ProducerBase(parent, false),	// NOTE(review): `false` presumably flags this producer as non-explicit -- confirm against ProducerBase
+			nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE),
+			blockIndex(nullptr)
+		{
+			new_block_index();	// Return value is ignored here; insert_block_index_entry bails out while blockIndex is still null
+		}
+		
+		~ImplicitProducer()	// Destroys remaining elements, returns their blocks to the parent's free list, and frees the block index chain
+		{
+			// Note that since we're in the destructor we can assume that all enqueue/dequeue operations
+			// completed already; this means that all undequeued elements are placed contiguously across
+			// contiguous blocks, and that only the first and last remaining blocks can be only partially
+			// empty (all other remaining blocks must be completely full).
+			
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+			// Unregister ourselves for thread termination notification
+			if (!this->inactive.load(std::memory_order_relaxed)) {
+				details::ThreadExitNotifier::unsubscribe(&threadExitListener);
+			}
+#endif
+			
+			// Destroy all remaining elements!
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto index = this->headIndex.load(std::memory_order_relaxed);
+			Block* block = nullptr;
+			assert(index == tail || details::circular_less_than(index, tail));
+			bool forceFreeLastBlock = index != tail;		// If we enter the loop, then the last (tail) block will not be freed
+			while (index != tail) {
+				if ((index & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 || block == nullptr) {
+					if (block != nullptr) {
+						// Free the old block
+						this->parent->add_block_to_free_list(block);
+					}
+					// Look up the block that holds this index
+					block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed);
+				}
+				
+				((*block)[index])->~T();
+				++index;
+			}
+			// Even if the queue is empty, there's still one block that's not on the free list
+			// (unless the head index reached the end of it, in which case the tail will be poised
+			// to create a new block).
+			if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast<index_t>(BLOCK_SIZE - 1)) != 0)) {
+				this->parent->add_block_to_free_list(this->tailBlock);
+			}
+			
+			// Destroy block index
+			auto localBlockIndex = blockIndex.load(std::memory_order_relaxed);
+			if (localBlockIndex != nullptr) {
+				for (size_t i = 0; i != localBlockIndex->capacity; ++i) {	// Entries are destroyed through the newest header's pointer array only
+					localBlockIndex->index[i]->~BlockIndexEntry();
+				}
+				do {	// Free every header along the `prev` chain, starting from the current one
+					auto prev = localBlockIndex->prev;
+					localBlockIndex->~BlockIndexHeader();
+					(Traits::free)(localBlockIndex);
+					localBlockIndex = prev;
+				} while (localBlockIndex != nullptr);
+			}
+		}
+		
+		template<AllocationMode allocMode, typename U>
+		inline bool enqueue(U&& element)	// Appends one element; returns false if the subqueue is full or an index entry/block cannot be obtained
+		{
+			index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			index_t newTailIndex = 1 + currentTailIndex;
+			if ((currentTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+				// We reached the end of a block, start a new one
+				auto head = this->headIndex.load(std::memory_order_relaxed);
+				assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+				if (!details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) {
+					return false;
+				}
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+				debug::DebugLock lock(mutex);
+#endif
+				// Find out where we'll be inserting this block in the block index
+				BlockIndexEntry* idxEntry;
+				if (!insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) {
+					return false;
+				}
+				
+				// Get ahold of a new block
+				auto newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>();
+				if (newBlock == nullptr) {
+					rewind_block_index_tail();	// Undo the index insertion made just above
+					idxEntry->value.store(nullptr, std::memory_order_relaxed);
+					return false;
+				}
+#ifdef MCDBGQ_TRACKMEM
+				newBlock->owner = this;
+#endif
+				newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+				
+				if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward<U>(element)))) {
+					// May throw, try to insert now before we publish the fact that we have this new block
+					MOODYCAMEL_TRY {
+						new ((*newBlock)[currentTailIndex]) T(std::forward<U>(element));
+					}
+					MOODYCAMEL_CATCH (...) {
+						rewind_block_index_tail();
+						idxEntry->value.store(nullptr, std::memory_order_relaxed);
+						this->parent->add_block_to_free_list(newBlock);
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				
+				// Insert the new block into the index
+				idxEntry->value.store(newBlock, std::memory_order_relaxed);
+				
+				this->tailBlock = newBlock;
+				// In the may-throw case the element was already constructed above, so only the tail remains to be published
+				if (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new ((T*)nullptr) T(std::forward<U>(element)))) {
+					this->tailIndex.store(newTailIndex, std::memory_order_release);
+					return true;
+				}
+			}
+			
+			// Enqueue
+			new ((*this->tailBlock)[currentTailIndex]) T(std::forward<U>(element));
+			
+			this->tailIndex.store(newTailIndex, std::memory_order_release);
+			return true;
+		}
+		
+		template<typename U>
+		bool dequeue(U& element)	// Moves the next element into `element`; returns false if nothing could be claimed
+		{
+			// See ExplicitProducer::dequeue for rationale and explanation
+			index_t tail = this->tailIndex.load(std::memory_order_relaxed);
+			index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			if (details::circular_less_than<index_t>(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) {
+				std::atomic_thread_fence(std::memory_order_acquire);
+				// Optimistically claim one element, then re-check the claim against a freshly loaded tail
+				index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed);
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				if ((details::likely)(details::circular_less_than<index_t>(myDequeueCount - overcommit, tail))) {
+					index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel);
+					
+					// Determine which block the element is in
+					auto entry = get_block_index_entry_for_index(index);
+					
+					// Dequeue
+					auto block = entry->value.load(std::memory_order_relaxed);
+					auto& el = *((*block)[index]);
+					
+					if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+						// Note: Acquiring the mutex with every dequeue instead of only when a block
+						// is released is very sub-optimal, but it is, after all, purely debug code.
+						debug::DebugLock lock(mutex);	// This class's own debug mutex, as at every other DebugLock in ImplicitProducer (no `producer` exists in this scope)
+#endif
+						struct Guard {
+							Block* block;
+							index_t index;
+							BlockIndexEntry* entry;
+							ConcurrentQueue* parent;
+							// The destructor also runs if the assignment below throws: it destroys the slot and, if set_empty returns true, unlinks and frees the block
+							~Guard()
+							{
+								(*block)[index]->~T();
+								if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
+									entry->value.store(nullptr, std::memory_order_relaxed);
+									parent->add_block_to_free_list(block);
+								}
+							}
+						} guard = { block, index, entry, this->parent };
+
+						element = std::move(el); // NOLINT
+					}
+					else {
+						element = std::move(el); // NOLINT
+						el.~T(); // NOLINT
+
+						if (block->ConcurrentQueue::Block::template set_empty<implicit_context>(index)) {
+							{
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+								debug::DebugLock lock(mutex);
+#endif
+								// Add the block back into the global free pool (and remove from block index)
+								entry->value.store(nullptr, std::memory_order_relaxed);
+							}
+							this->parent->add_block_to_free_list(block);		// releases the above store
+						}
+					}
+					
+					return true;
+				}
+				else {
+					this->dequeueOvercommit.fetch_add(1, std::memory_order_release);	// Claim failed: record the over-claim made by the fetch_add above
+				}
+			}
+		
+			return false;
+		}
+		
+		template<AllocationMode allocMode, typename It>
+		bool enqueue_bulk(It itemFirst, size_t count)	// Enqueues `count` items read from `itemFirst`; on failure returns false after undoing its index insertions and freeing the blocks it took
+		{
+			// First, we need to make sure we have enough room to enqueue all of the elements;
+			// this means pre-allocating blocks and putting them in the block index (but only if
+			// all the allocations succeeded).
+			
+			// Note that the tailBlock we start off with may not be owned by us any more;
+			// this happens if it was filled up exactly to the top (setting tailIndex to
+			// the first index of the next block which is not yet allocated), then dequeued
+			// completely (putting it on the free list) before we enqueue again.
+			
+			index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed);
+			auto startBlock = this->tailBlock;
+			Block* firstAllocatedBlock = nullptr;	// Head of the chain (linked through `next`) of blocks acquired by this call
+			auto endBlock = this->tailBlock;	// Last block that will receive elements; updated as blocks are acquired
+			
+			// Figure out how many blocks we'll need to allocate, and do so
+			size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1));
+			index_t currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+			if (blockBaseDiff > 0) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+				debug::DebugLock lock(mutex);
+#endif
+				do {
+					blockBaseDiff -= static_cast<index_t>(BLOCK_SIZE);
+					currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+					
+					// Find out where we'll be inserting this block in the block index
+					BlockIndexEntry* idxEntry = nullptr;  // initialization here unnecessary but compiler can't always tell
+					Block* newBlock;
+					bool indexInserted = false;
+					auto head = this->headIndex.load(std::memory_order_relaxed);
+					assert(!details::circular_less_than<index_t>(currentTailIndex, head));
+					bool full = !details::circular_less_than<index_t>(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max<size_t>::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head));
+					if (full || !(indexInserted = insert_block_index_entry<allocMode>(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block<allocMode>()) == nullptr) {
+						// Index allocation or block allocation failed; revert any other allocations
+						// and index insertions done so far for this operation
+						if (indexInserted) {
+							rewind_block_index_tail();
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+						}
+						currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+						for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
+							currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+							idxEntry = get_block_index_entry_for_index(currentTailIndex);
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+							rewind_block_index_tail();
+						}
+						this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+						this->tailBlock = startBlock;
+						
+						return false;
+					}
+					
+#ifdef MCDBGQ_TRACKMEM
+					newBlock->owner = this;
+#endif
+					newBlock->ConcurrentQueue::Block::template reset_empty<implicit_context>();
+					newBlock->next = nullptr;	// Null-terminates the chain that the undo loops walk
+					
+					// Insert the new block into the index
+					idxEntry->value.store(newBlock, std::memory_order_relaxed);
+					
+					// Store the chain of blocks so that we can undo if later allocations fail,
+					// and so that we can find the blocks when we do the actual enqueueing
+					if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) {
+						assert(this->tailBlock != nullptr);
+						this->tailBlock->next = newBlock;
+					}
+					this->tailBlock = newBlock;
+					endBlock = newBlock;
+					firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock;
+				} while (blockBaseDiff > 0);
+			}
+			
+			// Enqueue, one block at a time
+			index_t newTailIndex = startTailIndex + static_cast<index_t>(count);
+			currentTailIndex = startTailIndex;
+			this->tailBlock = startBlock;
+			assert((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0);
+			if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) {
+				this->tailBlock = firstAllocatedBlock;
+			}
+			while (true) {
+				auto stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+				if (details::circular_less_than<index_t>(newTailIndex, stopIndex)) {
+					stopIndex = newTailIndex;
+				}
+				if (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))) {
+					while (currentTailIndex != stopIndex) {
+						new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++);
+					}
+				}
+				else {
+					MOODYCAMEL_TRY {
+						while (currentTailIndex != stopIndex) {
+							new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if<(bool)!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new ((T*)nullptr) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst));
+							++currentTailIndex;
+							++itemFirst;
+						}
+					}
+					MOODYCAMEL_CATCH (...) {
+						auto constructedStopIndex = currentTailIndex;
+						auto lastBlockEnqueued = this->tailBlock;
+						// Destroy the elements constructed so far, block by block, starting from the first slot written
+						if (!details::is_trivially_destructible<T>::value) {
+							auto block = startBlock;
+							if ((startTailIndex & static_cast<index_t>(BLOCK_SIZE - 1)) == 0) {
+								block = firstAllocatedBlock;
+							}
+							currentTailIndex = startTailIndex;
+							while (true) {
+								stopIndex = (currentTailIndex & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+								if (details::circular_less_than<index_t>(constructedStopIndex, stopIndex)) {
+									stopIndex = constructedStopIndex;
+								}
+								while (currentTailIndex != stopIndex) {
+									(*block)[currentTailIndex++]->~T();
+								}
+								if (block == lastBlockEnqueued) {
+									break;
+								}
+								block = block->next;
+							}
+						}
+						// Clear the index entries added for this call and hand the acquired blocks back to the free list
+						currentTailIndex = (startTailIndex - 1) & ~static_cast<index_t>(BLOCK_SIZE - 1);
+						for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) {
+							currentTailIndex += static_cast<index_t>(BLOCK_SIZE);
+							auto idxEntry = get_block_index_entry_for_index(currentTailIndex);
+							idxEntry->value.store(nullptr, std::memory_order_relaxed);
+							rewind_block_index_tail();
+						}
+						this->parent->add_blocks_to_free_list(firstAllocatedBlock);
+						this->tailBlock = startBlock;
+						MOODYCAMEL_RETHROW;
+					}
+				}
+				
+				if (this->tailBlock == endBlock) {
+					assert(currentTailIndex == newTailIndex);
+					break;
+				}
+				this->tailBlock = this->tailBlock->next;
+			}
+			this->tailIndex.store(newTailIndex, std::memory_order_release);	// Publish the new tail with release ordering
+			return true;
+		}
+		
+		template<typename It>
+		size_t dequeue_bulk(It& itemFirst, size_t max)	// Moves up to `max` elements out through `itemFirst`; returns how many were dequeued (0 if none)
+		{
+			auto tail = this->tailIndex.load(std::memory_order_relaxed);
+			auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed);
+			auto desiredCount = static_cast<size_t>(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit));
+			if (details::circular_less_than<size_t>(0, desiredCount)) {
+				desiredCount = desiredCount < max ? desiredCount : max;
+				std::atomic_thread_fence(std::memory_order_acquire);
+				// Optimistically claim desiredCount elements; any over-claim is added to dequeueOvercommit below
+				auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed);
+				// Re-read the tail to determine how many of the claimed elements are really available
+				tail = this->tailIndex.load(std::memory_order_acquire);
+				auto actualCount = static_cast<size_t>(tail - (myDequeueCount - overcommit));
+				if (details::circular_less_than<size_t>(0, actualCount)) {
+					actualCount = desiredCount < actualCount ? desiredCount : actualCount;
+					if (actualCount < desiredCount) {
+						this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release);
+					}
+					
+					// Get the first index. Note that since there's guaranteed to be at least actualCount elements, this
+					// will never exceed tail.
+					auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel);
+					
+					// Iterate the blocks and dequeue
+					auto index = firstIndex;
+					BlockIndexHeader* localBlockIndex;
+					auto indexIndex = get_block_index_index_for_index(index, localBlockIndex);
+					do {
+						auto blockStartIndex = index;
+						auto endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+						endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+						// Fetch the index entry for this position and the block it currently maps to
+						auto entry = localBlockIndex->index[indexIndex];
+						auto block = entry->value.load(std::memory_order_relaxed);
+						if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) {
+							while (index != endIndex) {
+								auto& el = *((*block)[index]);
+								*itemFirst++ = std::move(el);
+								el.~T();
+								++index;
+							}
+						}
+						else {
+							MOODYCAMEL_TRY {
+								while (index != endIndex) {
+									auto& el = *((*block)[index]);
+									*itemFirst = std::move(el);
+									++itemFirst;
+									el.~T();
+									++index;
+								}
+							}
+							MOODYCAMEL_CATCH (...) {	// Destroy every remaining claimed element and update each block's emptiness before rethrowing
+								do {
+									entry = localBlockIndex->index[indexIndex];
+									block = entry->value.load(std::memory_order_relaxed);
+									while (index != endIndex) {
+										(*block)[index++]->~T();
+									}
+									
+									if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) {
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+										debug::DebugLock lock(mutex);
+#endif
+										entry->value.store(nullptr, std::memory_order_relaxed);
+										this->parent->add_block_to_free_list(block);
+									}
+									indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
+									// Set up the range covered by the next block of the claimed span
+									blockStartIndex = index;
+									endIndex = (index & ~static_cast<index_t>(BLOCK_SIZE - 1)) + static_cast<index_t>(BLOCK_SIZE);
+									endIndex = details::circular_less_than<index_t>(firstIndex + static_cast<index_t>(actualCount), endIndex) ? firstIndex + static_cast<index_t>(actualCount) : endIndex;
+								} while (index != firstIndex + actualCount);
+								
+								MOODYCAMEL_RETHROW;
+							}
+						}
+						if (block->ConcurrentQueue::Block::template set_many_empty<implicit_context>(blockStartIndex, static_cast<size_t>(endIndex - blockStartIndex))) {
+							{
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+								debug::DebugLock lock(mutex);
+#endif
+								// Note that the set_many_empty above did a release, meaning that anybody who acquires the block
+								// we're about to free can use it safely since our writes (and reads!) will have happened-before then.
+								entry->value.store(nullptr, std::memory_order_relaxed);
+							}
+							this->parent->add_block_to_free_list(block);		// releases the above store
+						}
+						indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1);
+					} while (index != firstIndex + actualCount);
+					
+					return actualCount;
+				}
+				else {
+					this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release);	// Nothing was available after all: record the whole claim as over-commit
+				}
+			}
+			
+			return 0;
+		}
+		
+	private:
+		// The block size must be > 1, so any number with the low bit set is an invalid block base index
+		static const index_t INVALID_BLOCK_BASE = 1;
+		
+		struct BlockIndexEntry		// One slot of the block index: associates a block's base index with the block itself
+		{
+			std::atomic<index_t> key;		// base index of the block, or INVALID_BLOCK_BASE while the slot has never been used
+			std::atomic<Block*> value;		// the block; set back to nullptr when the block is handed to the free list
+		};
+		
+		struct BlockIndexHeader
+		{
+			size_t capacity;		// number of slots in `index`; (capacity - 1) is used as a wrap-around mask
+			std::atomic<size_t> tail;		// position in `index` of the most recently inserted entry
+			BlockIndexEntry* entries;		// the entries that were allocated together with this header
+			BlockIndexEntry** index;		// circular array of entry pointers, including those carried over from `prev`
+			BlockIndexHeader* prev;		// the header this one replaced (nullptr for the very first one)
+		};
+		
+		template<AllocationMode allocMode>
+		inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex)
+		{
+			// Claims the slot following the current tail for the block that starts at blockStartIndex and
+			// returns it through idxEntry. We're the only writer thread, so relaxed loads are sufficient.
+			auto header = blockIndex.load(std::memory_order_relaxed);
+			if (header == nullptr) {
+				return false;  // no index at all; this can happen if new_block_index failed in the constructor
+			}
+			auto slot = (header->tail.load(std::memory_order_relaxed) + 1) & (header->capacity - 1);
+			idxEntry = header->index[slot];
+			bool slotIsFree = idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE ||
+				idxEntry->value.load(std::memory_order_relaxed) == nullptr;
+			if (!slotIsFree) {
+				// No room in the old block index, try to allocate another one!
+				if (allocMode == CannotAlloc || !new_block_index()) {
+					return false;
+				}
+				header = blockIndex.load(std::memory_order_relaxed);
+				slot = (header->tail.load(std::memory_order_relaxed) + 1) & (header->capacity - 1);
+				idxEntry = header->index[slot];
+				assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE);
+			}
+			
+			// Write the key first, then make the slot visible by storing the new tail with release ordering
+			idxEntry->key.store(blockStartIndex, std::memory_order_relaxed);
+			header->tail.store(slot, std::memory_order_release);
+			return true;
+		}
+		
+		inline void rewind_block_index_tail()
+		{
+			auto header = blockIndex.load(std::memory_order_relaxed);		// step the tail back by one slot, modulo capacity
+			header->tail.store((header->tail.load(std::memory_order_relaxed) - 1) & (header->capacity - 1), std::memory_order_relaxed);
+		}
+		
+		inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const
+		{
+			BlockIndexHeader* header;		// set by the lookup below to the block index it consulted
+			auto slot = get_block_index_index_for_index(index, header);
+			return header->index[slot];
+		}
+		
+		inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const
+		{
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+			debug::DebugLock lock(mutex);
+#endif
+			index &= ~static_cast<index_t>(BLOCK_SIZE - 1);		// round down to the base index of the containing block
+			localBlockIndex = blockIndex.load(std::memory_order_acquire);		// also returned to the caller via the out-parameter
+			auto tail = localBlockIndex->tail.load(std::memory_order_acquire);
+			auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed);
+			assert(tailBase != INVALID_BLOCK_BASE);
+			// Note: Must use division instead of shift because the index may wrap around, causing a negative offset, whose
+			// negativity we want to preserve. Both operands are cast to the signed type so the division really is signed.
+			auto offset = static_cast<size_t>(static_cast<typename std::make_signed<index_t>::type>(index - tailBase) / static_cast<typename std::make_signed<index_t>::type>(BLOCK_SIZE));
+			size_t idx = (tail + offset) & (localBlockIndex->capacity - 1);		// position relative to the tail, wrapped to the index size
+			assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr);
+			return idx;		// position within localBlockIndex->index of the entry for this block
+		}
+		
+		bool new_block_index()		// Allocates a bigger block index and publishes it; returns false if the allocation fails
+		{
+			auto prev = blockIndex.load(std::memory_order_relaxed);
+			size_t prevCapacity = prev == nullptr ? 0 : prev->capacity;
+			auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity;		// fresh entries are only created for slots not carried over from prev
+			auto raw = static_cast<char*>((Traits::malloc)(
+				sizeof(BlockIndexHeader) +
+				std::alignment_of<BlockIndexEntry>::value - 1 + sizeof(BlockIndexEntry) * entryCount +
+				std::alignment_of<BlockIndexEntry*>::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity));
+			if (raw == nullptr) {
+				return false;
+			}
+			// The single allocation holds, in order: the header, the new entries, and the array of entry pointers
+			auto header = new (raw) BlockIndexHeader;
+			auto entries = reinterpret_cast<BlockIndexEntry*>(details::align_for<BlockIndexEntry>(raw + sizeof(BlockIndexHeader)));
+			auto index = reinterpret_cast<BlockIndexEntry**>(details::align_for<BlockIndexEntry*>(reinterpret_cast<char*>(entries) + sizeof(BlockIndexEntry) * entryCount));
+			if (prev != nullptr) {
+				auto prevTail = prev->tail.load(std::memory_order_relaxed);
+				auto prevPos = prevTail;
+				size_t i = 0;
+				do {		// copy the old entry pointers, starting just after the old tail and finishing with the old tail itself
+					prevPos = (prevPos + 1) & (prev->capacity - 1);
+					index[i++] = prev->index[prevPos];
+				} while (prevPos != prevTail);
+				assert(i == prevCapacity);
+			}
+			for (size_t i = 0; i != entryCount; ++i) {
+				new (entries + i) BlockIndexEntry;
+				entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed);		// mark the new slot as unused
+				index[prevCapacity + i] = entries + i;		// new slots follow the ones copied from prev
+			}
+			header->prev = prev;
+			header->entries = entries;
+			header->index = index;
+			header->capacity = nextBlockIndexCapacity;
+			header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed);		// old tail entry now sits at prevCapacity - 1 (wraps to the last slot when there was no prev)
+			
+			blockIndex.store(header, std::memory_order_release);
+			
+			nextBlockIndexCapacity <<= 1;		// a subsequent index will be twice as large
+			
+			return true;
+		}
+		
+	private:
+		size_t nextBlockIndexCapacity;
+		std::atomic<BlockIndexHeader*> blockIndex;
+
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+	public:
+		details::ThreadExitListener threadExitListener;
+	private:
+#endif
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	public:
+		ImplicitProducer* nextImplicitProducer;
+	private:
+#endif
+
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX
+		mutable debug::DebugMutex mutex;
+#endif
+#ifdef MCDBGQ_TRACKMEM
+		friend struct MemStats;
+#endif
+	};
+	
+	
+	//////////////////////////////////
+	// Block pool manipulation
+	//////////////////////////////////
+	
+	void populate_initial_block_list(size_t blockCount)
+	{
+		// Sets up the pre-allocated pool with blockCount blocks, each flagged as not dynamically
+		// allocated. A zero count, or a failed allocation, leaves the pool empty (nullptr, size 0).
+		initialBlockPool = nullptr;
+		initialBlockPoolSize = 0;
+		if (blockCount != 0) {
+			initialBlockPool = create_array<Block>(blockCount);
+			if (initialBlockPool != nullptr) {
+				initialBlockPoolSize = blockCount;
+			}
+		}
+		for (size_t i = 0; i != initialBlockPoolSize; ++i) {
+			initialBlockPool[i].dynamicallyAllocated = false;
+		}
+	}
+	
+	inline Block* try_get_block_from_initial_pool()
+	{
+		if (initialBlockPoolIndex.load(std::memory_order_relaxed) < initialBlockPoolSize) {		// pre-check: skip the increment once the pool is used up
+			auto claimed = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed);
+			if (claimed < initialBlockPoolSize) {
+				return initialBlockPool + claimed;
+			}
+		}
+		return nullptr;		// every block of the initial pool has already been handed out
+	}
+	
+	inline void add_block_to_free_list(Block* block)		// Hands a single block over to the free list
+	{
+#ifdef MCDBGQ_TRACKMEM
+		block->owner = nullptr;		// memory tracking only: the block no longer has an owner
+#endif
+		freeList.add(block);
+	}
+	
+	inline void add_blocks_to_free_list(Block* block)
+	{
+		// Walks the chain through `next`; the link is read before the block is given away
+		for (Block* following; block != nullptr; block = following) {
+			following = block->next;
+			add_block_to_free_list(block);
+		}
+	}
+	
+	inline Block* try_get_block_from_free_list()
+	{
+		return freeList.try_get();		// forwards whatever the free list's try_get() yields
+	}
+	
+	// Gets a free block from one of the memory pools, or allocates a new one (if applicable)
+	template<AllocationMode canAlloc>
+	Block* requisition_block()
+	{
+		// The sources are tried in this order:
+		//   1. the pre-allocated initial pool
+		//   2. the free list
+		//   3. a fresh allocation, but only when canAlloc == CanAlloc
+		Block* result = try_get_block_from_initial_pool();
+		if (result == nullptr) {
+			result = try_get_block_from_free_list();
+		}
+		if (result == nullptr && canAlloc == CanAlloc) {
+			result = create<Block>();
+		}
+		
+		// Still nullptr at this point if every source came up empty (or if
+		// allocating was not permitted, or the allocation itself failed)
+		return result;
+	}
+	
+
+#ifdef MCDBGQ_TRACKMEM
+	public:
+		struct MemStats {		// Memory-usage snapshot of a queue; only compiled when MCDBGQ_TRACKMEM is defined
+			size_t allocatedBlocks;
+			size_t usedBlocks;
+			size_t freeBlocks;
+			size_t ownedBlocksExplicit;
+			size_t ownedBlocksImplicit;
+			size_t implicitProducers;
+			size_t explicitProducers;
+			size_t elementsEnqueued;		// taken from size_approx()
+			size_t blockClassBytes;		// sizeof(Block) * allocatedBlocks
+			size_t queueClassBytes;		// sizeof the queue object plus sizeof every producer object
+			size_t implicitBlockIndexBytes;
+			size_t explicitBlockIndexBytes;
+			
+			friend class ConcurrentQueue;
+			
+		private:
+			static MemStats getFor(ConcurrentQueue* q)
+			{
+				MemStats stats = { 0 };
+				
+				stats.elementsEnqueued = q->size_approx();
+				// Every block on the free list counts as both allocated and free
+				auto block = q->freeList.head_unsafe();
+				while (block != nullptr) {
+					++stats.allocatedBlocks;
+					++stats.freeBlocks;
+					block = block->freeListNext.load(std::memory_order_relaxed);
+				}
+				// Walk every producer and tally what it owns
+				for (auto ptr = q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) {
+					bool implicit = dynamic_cast<ImplicitProducer*>(ptr) != nullptr;
+					stats.implicitProducers += implicit ? 1 : 0;
+					stats.explicitProducers += implicit ? 0 : 1;
+					
+					if (implicit) {
+						auto prod = static_cast<ImplicitProducer*>(ptr);
+						stats.queueClassBytes += sizeof(ImplicitProducer);
+						auto head = prod->headIndex.load(std::memory_order_relaxed);
+						auto tail = prod->tailIndex.load(std::memory_order_relaxed);
+						auto hash = prod->blockIndex.load(std::memory_order_relaxed);
+						if (hash != nullptr) {
+							for (size_t i = 0; i != hash->capacity; ++i) {		// a slot with a valid key and a non-null block is an owned block
+								if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) {
+									++stats.allocatedBlocks;
+									++stats.ownedBlocksImplicit;
+								}
+							}
+							stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry);
+							for (; hash != nullptr; hash = hash->prev) {		// headers and pointer arrays of this and all earlier indexes
+								stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*);
+							}
+						}
+						for (; details::circular_less_than<index_t>(head, tail); head += BLOCK_SIZE) {		// one used block per BLOCK_SIZE step between head and tail
+							//auto block = prod->get_block_index_entry_for_index(head);
+							++stats.usedBlocks;
+						}
+					}
+					else {
+						auto prod = static_cast<ExplicitProducer*>(ptr);
+						stats.queueClassBytes += sizeof(ExplicitProducer);
+						auto tailBlock = prod->tailBlock;
+						bool wasNonEmpty = false;
+						if (tailBlock != nullptr) {
+							auto block = tailBlock;
+							do {		// the blocks form a ring through `next`; go once around, starting at the tail block
+								++stats.allocatedBlocks;
+								if (!block->ConcurrentQueue::Block::template is_empty<explicit_context>() || wasNonEmpty) {
+									++stats.usedBlocks;
+									wasNonEmpty = wasNonEmpty || block != tailBlock;
+								}
+								++stats.ownedBlocksExplicit;
+								block = block->next;
+							} while (block != tailBlock);
+						}
+						auto index = prod->blockIndex.load(std::memory_order_relaxed);
+						while (index != nullptr) {
+							stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry);
+							index = static_cast<typename ExplicitProducer::BlockIndexHeader*>(index->prev);
+						}
+					}
+				}
+				// Blocks of the initial pool that were never handed out count as allocated and free as well
+				auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed);
+				stats.allocatedBlocks += freeOnInitialPool;
+				stats.freeBlocks += freeOnInitialPool;
+				
+				stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks;
+				stats.queueClassBytes += sizeof(ConcurrentQueue);
+				
+				return stats;
+			}
+		};
+		
+		// For debugging only. Not thread-safe. Returns the statistics gathered by MemStats::getFor for this queue.
+		MemStats getMemStats()
+		{
+			return MemStats::getFor(this);
+		}
+	private:
+		friend struct MemStats;
+#endif
+	
+	
+	//////////////////////////////////
+	// Producer list manipulation
+	//////////////////////////////////	
+	
+	ProducerBase* recycle_or_create_producer(bool isExplicit)
+	{
+		bool ignoredRecycledFlag;		// this overload discards the information whether the producer was reused
+		return recycle_or_create_producer(isExplicit, ignoredRecycledFlag);
+	}
+	
+	ProducerBase* recycle_or_create_producer(bool isExplicit, bool& recycled)
+	{
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		// First scan the producer list for an inactive producer of the requested kind
+		for (auto candidate = producerListTail.load(std::memory_order_acquire); candidate != nullptr; candidate = candidate->next_prod()) {
+			if (!candidate->inactive.load(std::memory_order_relaxed) || candidate->isExplicit != isExplicit) {
+				continue;
+			}
+			bool wasInactive = true;
+			if (candidate->inactive.compare_exchange_strong(wasInactive, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) {
+				recycled = true;		// we flipped it to active ourselves, so the caller can have it
+				return candidate;
+			}
+		}
+		
+		recycled = false;		// nothing reusable was found: create a new producer and link it into the list
+		return add_producer(isExplicit ? static_cast<ProducerBase*>(create<ExplicitProducer>(this)) : create<ImplicitProducer>(this));
+	}
+	
+	ProducerBase* add_producer(ProducerBase* producer)		// Links `producer` into the producer list and returns it (nullptr is passed through)
+	{
+		// Handle failed memory allocation
+		if (producer == nullptr) {
+			return nullptr;
+		}
+		
+		producerCount.fetch_add(1, std::memory_order_relaxed);
+		
+		// Add it to the lock-free list
+		auto prevTail = producerListTail.load(std::memory_order_relaxed);
+		do {		// on a failed exchange prevTail holds the current tail, so the link is simply rewritten and retried
+			producer->next = prevTail;
+		} while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed));
+		
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+		if (producer->isExplicit) {		// debug builds additionally keep one list per producer kind, built the same way
+			auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed);
+			do {
+				static_cast<ExplicitProducer*>(producer)->nextExplicitProducer = prevTailExplicit;
+			} while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast<ExplicitProducer*>(producer), std::memory_order_release, std::memory_order_relaxed));
+		}
+		else {
+			auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed);
+			do {
+				static_cast<ImplicitProducer*>(producer)->nextImplicitProducer = prevTailImplicit;
+			} while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast<ImplicitProducer*>(producer), std::memory_order_release, std::memory_order_relaxed));
+		}
+#endif
+		
+		return producer;
+	}
+	
+	void reown_producers()
+	{
+		// Producers taken over through a move or swap still reference their old queue; repoint each one at this queue
+		auto producer = producerListTail.load(std::memory_order_relaxed);
+		while (producer != nullptr) {
+			producer->parent = this;
+			producer = producer->next_prod();
+		}
+	}
+	
+	
+	//////////////////////////////////
+	// Implicit producer hash
+	//////////////////////////////////
+	
+	struct ImplicitProducerKVP
+	{
+		std::atomic<details::thread_id_t> key;
+		ImplicitProducer* value;		// Plain pointer: the thread that stores it is the only one that reads it back
+		
+		ImplicitProducerKVP() : value(nullptr) { }
+		
+		// Moving copies the key with relaxed ordering and copies the value pointer
+		ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+			: value(other.value)
+		{
+			key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed);
+		}
+		
+		inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT
+		{
+			swap(other);		// move-assignment is implemented as an exchange of contents
+			return *this;
+		}
+		
+		inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT
+		{
+			if (this == &other) return;		// swapping with itself changes nothing
+			details::swap_relaxed(key, other.key);
+			std::swap(value, other.value);
+		}
+	};
+	
+	template<typename XT, typename XTraits>
+	friend void moodycamel::swap(typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&, typename ConcurrentQueue<XT, XTraits>::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT;
+	
+	struct ImplicitProducerHash		// One table of the implicit producer hash, keyed by thread id
+	{
+		size_t capacity;		// number of slots in `entries`; (capacity - 1) is used as the probing mask
+		ImplicitProducerKVP* entries;		// the slots themselves
+		ImplicitProducerHash* prev;		// the table this one superseded (nullptr for the initial table); lookups fall back to it
+	};
+	
+	inline void populate_initial_implicit_producer_hash()
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return;		// zero-sized table: nothing to set up
+		
+		// Mark every slot of the embedded table as empty, then install it as the current (and only) table
+		for (auto& slot : initialImplicitProducerHashEntries) {
+			slot.key.store(details::invalid_thread_id, std::memory_order_relaxed);
+		}
+		initialImplicitProducerHash.capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
+		initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0];
+		initialImplicitProducerHash.prev = nullptr;
+		implicitProducerHashCount.store(0, std::memory_order_relaxed);
+		implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed);
+	}
+	
+	void swap_implicit_producer_hashes(ConcurrentQueue& other)		// Exchanges the implicit producer hash state of this queue and `other`
+	{
+		if (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return;
+		
+		// Swap (assumes our implicit producer hash is initialized)
+		initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries);
+		initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0];		// each embedded table must keep pointing at its own queue's array
+		other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0];
+		
+		details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount);
+		
+		details::swap_relaxed(implicitProducerHash, other.implicitProducerHash);
+		if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) {		// we received a pointer to the other queue's embedded table: use ours instead
+			implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed);
+		}
+		else {
+			ImplicitProducerHash* hash;		// otherwise find the table whose prev is the other queue's embedded table and relink it to ours
+			for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) {
+				continue;
+			}
+			hash->prev = &initialImplicitProducerHash;
+		}
+		if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) {		// same fix-up, mirrored for `other`
+			other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed);
+		}
+		else {
+			ImplicitProducerHash* hash;
+			for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) {
+				continue;
+			}
+			hash->prev = &other.initialImplicitProducerHash;
+		}
+	}
+	
+	// Returns the calling thread's implicit producer, adding one on first use. Only fails (returns nullptr) if memory allocation fails
+	ImplicitProducer* get_or_add_implicit_producer()
+	{
+		// Note that since the data is essentially thread-local (key is thread ID),
+		// there's a reduced need for fences (memory ordering is already consistent
+		// for any individual thread), except for the current table itself.
+		
+		// Start by looking for the thread ID in the current and all previous hash tables.
+		// If it's not found, it must not be in there yet, since this same thread would
+		// have added it previously to one of the tables that we traversed.
+		
+		// Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table
+		
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		
+		auto id = details::thread_id();
+		auto hashedId = details::hash_thread_id(id);
+		// mainHash is the current table; the tables it superseded are reachable through prev
+		auto mainHash = implicitProducerHash.load(std::memory_order_acquire);
+		for (auto hash = mainHash; hash != nullptr; hash = hash->prev) {
+			// Look for the id in this hash
+			auto index = hashedId;
+			while (true) {		// Not an infinite loop because at least one slot is free in the hash table
+				index &= hash->capacity - 1;
+				
+				auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed);
+				if (probedKey == id) {
+					// Found it! If we had to search several hashes deep, though, we should lazily add it
+					// to the current main hash table to avoid the extended search next time.
+					// Note there's guaranteed to be room in the current hash table since every subsequent
+					// table implicitly reserves space for all previous tables (there's only one
+					// implicitProducerHashCount).
+					auto value = hash->entries[index].value;
+					if (hash != mainHash) {
+						index = hashedId;
+						while (true) {		// linear probing for an empty (or reusable) slot in the current table
+							index &= mainHash->capacity - 1;
+							probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed);
+							auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+							auto reusable = details::invalid_thread_id2;
+							if ((probedKey == empty    && mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_relaxed, std::memory_order_relaxed)) ||
+								(probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) {
+#else
+							if ((probedKey == empty    && mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_relaxed, std::memory_order_relaxed))) {
+#endif
+								mainHash->entries[index].value = value;
+								break;
+							}
+							++index;
+						}
+					}
+					
+					return value;
+				}
+				if (probedKey == details::invalid_thread_id) {
+					break;		// Not in this hash table
+				}
+				++index;
+			}
+		}
+		
+		// Insert!
+		auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed);		// count our slot before it is actually filled
+		while (true) {
+			// NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+			if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) {
+				// We've acquired the resize lock, try to allocate a bigger hash table.
+				// Note the acquire fence synchronizes with the release fence at the end of this block, and hence when
+				// we reload implicitProducerHash it must be the most recent version (it only gets changed within this
+				// locked block).
+				mainHash = implicitProducerHash.load(std::memory_order_acquire);
+				if (newCount >= (mainHash->capacity >> 1)) {
+					auto newCapacity = mainHash->capacity << 1;
+					while (newCount >= (newCapacity >> 1)) {
+						newCapacity <<= 1;
+					}
+					auto raw = static_cast<char*>((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of<ImplicitProducerKVP>::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity));
+					if (raw == nullptr) {
+						// Allocation failed
+						implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
+						implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed);
+						return nullptr;
+					}
+					// Build the new table inside the fresh allocation, with every slot marked empty
+					auto newHash = new (raw) ImplicitProducerHash;
+					newHash->capacity = newCapacity;
+					newHash->entries = reinterpret_cast<ImplicitProducerKVP*>(details::align_for<ImplicitProducerKVP>(raw + sizeof(ImplicitProducerHash)));
+					for (size_t i = 0; i != newCapacity; ++i) {
+						new (newHash->entries + i) ImplicitProducerKVP;
+						newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed);
+					}
+					newHash->prev = mainHash;
+					implicitProducerHash.store(newHash, std::memory_order_release);
+					implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+					mainHash = newHash;
+				}
+				else {
+					implicitProducerHashResizeInProgress.clear(std::memory_order_release);
+				}
+			}
+			
+			// If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table
+			// to finish being allocated by another thread (and if we just finished allocating above, the condition will
+			// always be true)
+			if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) {
+				bool recycled;
+				auto producer = static_cast<ImplicitProducer*>(recycle_or_create_producer(false, recycled));
+				if (producer == nullptr) {
+					implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
+					return nullptr;
+				}
+				if (recycled) {		// NOTE(review): presumably a recycled producer's slot was already counted when it was first added - confirm
+					implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed);
+				}
+				
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+				producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback;
+				producer->threadExitListener.userData = producer;
+				details::ThreadExitNotifier::subscribe(&producer->threadExitListener);
+#endif
+				// Claim a slot for this thread in mainHash, probing linearly from the hashed id
+				auto index = hashedId;
+				while (true) {
+					index &= mainHash->capacity - 1;
+					auto probedKey = mainHash->entries[index].key.load(std::memory_order_relaxed);
+					
+					auto empty = details::invalid_thread_id;
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+					auto reusable = details::invalid_thread_id2;
+					if ((probedKey == empty    && mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_relaxed, std::memory_order_relaxed)) ||
+						(probedKey == reusable && mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_acquire, std::memory_order_acquire))) {
+#else
+					if ((probedKey == empty    && mainHash->entries[index].key.compare_exchange_strong(empty,    id, std::memory_order_relaxed, std::memory_order_relaxed))) {
+#endif
+						mainHash->entries[index].value = producer;
+						break;
+					}
+					++index;
+				}
+				return producer;
+			}
+			
+			// Hmm, the old hash is quite full and somebody else is busy allocating a new one.
+			// We need to wait for the allocating thread to finish (if it succeeds, we add, if not,
+			// we try to allocate ourselves).
+			mainHash = implicitProducerHash.load(std::memory_order_acquire);
+		}
+	}
+	
+#ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED
+	void implicit_producer_thread_exited(ImplicitProducer* producer)		// Releases `producer` for reuse once its thread has exited
+	{
+		// Remove from thread exit listeners
+		details::ThreadExitNotifier::unsubscribe(&producer->threadExitListener);
+		
+		// Remove from hash
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+		debug::DebugLock lock(implicitProdMutex);
+#endif
+		auto hash = implicitProducerHash.load(std::memory_order_acquire);
+		assert(hash != nullptr);		// The thread exit listener is only registered if we were added to a hash in the first place
+		auto id = details::thread_id();
+		auto hashedId = details::hash_thread_id(id);
+		details::thread_id_t probedKey;
+		
+		// We need to traverse all the hashes just in case other threads aren't on the current one yet and are
+		// trying to add an entry thinking there's a free slot (because they reused a producer)
+		for (; hash != nullptr; hash = hash->prev) {
+			auto index = hashedId;
+			do {
+				index &= hash->capacity - 1;
+				probedKey = hash->entries[index].key.load(std::memory_order_relaxed);
+				if (probedKey == id) {
+					hash->entries[index].key.store(details::invalid_thread_id2, std::memory_order_release);		// invalid_thread_id2 is the "reusable" key that inserts look for
+					break;
+				}
+				++index;
+			} while (probedKey != details::invalid_thread_id);		// Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place
+		}
+		
+		// Mark the producer as being recyclable
+		producer->inactive.store(true, std::memory_order_release);
+	}
+	
+	static void implicit_producer_thread_exited_callback(void* userData)
+	{
+		// userData is the ImplicitProducer that was registered together with this callback; forward to its parent queue
+		auto exiting = static_cast<ImplicitProducer*>(userData);
+		exiting->parent->implicit_producer_thread_exited(exiting);
+	}
+#endif
+	
+	//////////////////////////////////
+	// Utility functions
+	//////////////////////////////////
+	
+	template<typename U>
+	static inline U* create_array(size_t count)
+	{
+		// Obtains raw storage for `count` objects through Traits::malloc and
+		// value-initializes each one in place; yields nullptr if the allocation fails.
+		assert(count > 0);
+		auto storage = static_cast<U*>((Traits::malloc)(sizeof(U) * count));
+		if (storage != nullptr) {
+			for (U* cursor = storage; cursor != storage + count; ++cursor) {
+				new (cursor) U();
+			}
+		}
+		return storage;
+	}
+	
+	template<typename U>
+	static inline void destroy_array(U* p, size_t count)
+	{
+		if (p == nullptr) return;		// a null array needs no cleanup
+		
+		assert(count > 0);
+		for (U* cursor = p + count; cursor != p; ) {		// destroy from the last element down to the first
+			(--cursor)->~U();
+		}
+		(Traits::free)(p);
+	}
+	
+	template<typename U>
+	static inline U* create()
+	{
+		auto raw = (Traits::malloc)(sizeof(U));		// a failed allocation is reported to the caller as nullptr
+		return raw == nullptr ? nullptr : new (raw) U;
+	}
+	
+	template<typename U, typename A1>
+	static inline U* create(A1&& a1)
+	{
+		auto raw = (Traits::malloc)(sizeof(U));		// a failed allocation is reported to the caller as nullptr
+		return raw == nullptr ? nullptr : new (raw) U(std::forward<A1>(a1));
+	}
+	
+	template<typename U>
+	static inline void destroy(U* p)
+	{
+		// Runs the destructor unless p is null, then passes p to Traits::free
+		// (which is called for a null p as well).
+		if (p != nullptr) p->~U();
+		(Traits::free)(p);
+	}
+
+private:
+	std::atomic<ProducerBase*> producerListTail;
+	std::atomic<std::uint32_t> producerCount;
+	
+	std::atomic<size_t> initialBlockPoolIndex;
+	Block* initialBlockPool;
+	size_t initialBlockPoolSize;
+	
+#if !MCDBGQ_USEDEBUGFREELIST
+	FreeList<Block> freeList;
+#else
+	debug::DebugFreeList<Block> freeList;
+#endif
+	
+	std::atomic<ImplicitProducerHash*> implicitProducerHash;
+	std::atomic<size_t> implicitProducerHashCount;		// Number of slots logically used
+	ImplicitProducerHash initialImplicitProducerHash;
+	std::array<ImplicitProducerKVP, INITIAL_IMPLICIT_PRODUCER_HASH_SIZE> initialImplicitProducerHashEntries;
+	std::atomic_flag implicitProducerHashResizeInProgress;
+	
+	std::atomic<std::uint32_t> nextExplicitConsumerId;
+	std::atomic<std::uint32_t> globalExplicitConsumerOffset;
+	
+#ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH
+	debug::DebugMutex implicitProdMutex;
+#endif
+	
+#ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG
+	std::atomic<ExplicitProducer*> explicitProducers;
+	std::atomic<ImplicitProducer*> implicitProducers;
+#endif
+};
+
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(ConcurrentQueue<T, Traits>& queue)
+	: producer(queue.recycle_or_create_producer(true))
+{
+	// producer is null when no explicit producer could be obtained; nothing to link in that case
+	if (producer == nullptr) return;
+	producer->token = this;
+}
+
+template<typename T, typename Traits>
+ProducerToken::ProducerToken(BlockingConcurrentQueue<T, Traits>& queue)
+	: producer(reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->recycle_or_create_producer(true))
+{
+	// producer is null when no explicit producer could be obtained; nothing to link in that case
+	if (producer == nullptr) return;
+	producer->token = this;
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(ConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	lastKnownGlobalOffset = -1;		// NOTE(review): presumably a sentinel for "no global offset seen yet" - confirm
+	initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release);		// take the next consumer id from the queue
+}
+
+template<typename T, typename Traits>
+ConsumerToken::ConsumerToken(BlockingConcurrentQueue<T, Traits>& queue)
+	: itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr)
+{
+	lastKnownGlobalOffset = -1;		// NOTE(review): presumably a sentinel for "no global offset seen yet" - confirm
+	initialOffset = reinterpret_cast<ConcurrentQueue<T, Traits>*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release);		// take the next consumer id from the queue
+}
+
+template<typename T, typename Traits>
+inline void swap(ConcurrentQueue<T, Traits>& a, ConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);		// free-function form; forwards to the member swap
+}
+
+inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);		// free-function form; forwards to the member swap
+}
+
+inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);		// free-function form; forwards to the member swap
+}
+
+template<typename T, typename Traits>
+inline void swap(typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& a, typename ConcurrentQueue<T, Traits>::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT
+{
+	a.swap(b);		// free-function form; forwards to ImplicitProducerKVP::swap
+}
+
+}
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
diff --git a/src/ctx.cpp b/src/ctx.cpp
index fdf0c163ac..edcc1aae31 100644
--- a/src/ctx.cpp
+++ b/src/ctx.cpp
@@ -47,6 +47,7 @@
 #include "err.hpp"
 #include "msg.hpp"
 #include "random.hpp"
+#include "allocator.hpp"
 
 #ifdef ZMQ_HAVE_VMCI
 #include <vmci_sockets.h>
@@ -279,6 +280,18 @@ int zmq::ctx_t::set (int option_, const void *optval_, size_t optvallen_)
             }
             break;
 
+            /*case ZMQ_MSG_ALLOCATOR: {
+            if (optvallen_ == sizeof (zmq::allocator_t)) {
+                const zmq::allocator_t *all =
+                  static_cast<const zmq::allocator_t *> (optval_);
+                if (all->check_tag ()) {
+                    _allocator = const_cast<zmq::allocator_t *> (all);
+                    return 0;
+                }
+            }
+            break;
+    }*/
+
         default: {
             return thread_ctx_t::set (option_, optval_, optvallen_);
         }
@@ -349,6 +362,9 @@ int zmq::ctx_t::get (int option_, void *optval_, size_t *optvallen_)
                 return 0;
             }
             break;
+            /*
+        case ZMQ_MSG_ALLOCATOR: {
+        } break;*/
 
         default: {
             return thread_ctx_t::get (option_, optval_, optvallen_);
diff --git a/src/ctx.hpp b/src/ctx.hpp
index 9aef843485..e8975d16eb 100644
--- a/src/ctx.hpp
+++ b/src/ctx.hpp
@@ -35,6 +35,7 @@
 #include <string>
 #include <stdarg.h>
 
+//#include "allocator.hpp"
 #include "mailbox.hpp"
 #include "array.hpp"
 #include "config.hpp"
@@ -220,6 +221,9 @@ class ctx_t : public thread_ctx_t
     //  Synchronisation of access to the list of inproc endpoints.
     mutex_t _endpoints_sync;
 
+    // Allocator for messages
+    //allocator_t *_allocator;
+
     //  Maximum socket ID.
     static atomic_counter_t max_socket_id;
 
diff --git a/src/msg.cpp b/src/msg.cpp
index dc1081c4c2..0a6b7a4570 100644
--- a/src/msg.cpp
+++ b/src/msg.cpp
@@ -39,6 +39,7 @@
 #include "likely.hpp"
 #include "metadata.hpp"
 #include "err.hpp"
+#include "allocator.hpp"
 
 //  Check whether the sizes of public representation of the message (zmq_msg_t)
 //  and private representation of the message (zmq::msg_t) match.
@@ -47,8 +48,6 @@ typedef char
   zmq_msg_size_check[2 * ((sizeof (zmq::msg_t) == sizeof (zmq_msg_t)) != 0)
                      - 1];
 
-#define ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER (1)
-
 
 bool zmq::msg_t::check () const
 {
@@ -100,6 +99,7 @@ int zmq::msg_t::init_size (size_t size_)
         _u.lmsg.metadata = NULL;
         _u.lmsg.type = type_lmsg;
         _u.lmsg.flags = 0;
+        _u.lmsg.allocator_was_used = 0;
         _u.lmsg.group[0] = '\0';
         _u.lmsg.routing_id = 0;
         _u.lmsg.content = NULL;
@@ -167,28 +167,18 @@ int zmq::msg_t::init_data (void *data_,
         _u.lmsg.metadata = NULL;
         _u.lmsg.type = type_lmsg;
         _u.lmsg.flags = 0;
+        _u.lmsg.allocator_was_used = 0;
         _u.lmsg.group[0] = '\0';
         _u.lmsg.routing_id = 0;
-#if ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER
-        zmq_assert (size_ > sizeof (content_t));
-        _u.lmsg.content = reinterpret_cast<content_t *> (data_);
-#else
         _u.lmsg.content =
           static_cast<content_t *> (malloc (sizeof (content_t)));
-#endif
         if (!_u.lmsg.content) {
             errno = ENOMEM;
             return -1;
         }
 
-#if ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER
-        uint8_t *data_bytes = (uint8_t *) data_;
-        _u.lmsg.content->data = data_bytes + sizeof (content_t);
-        _u.lmsg.content->size = size_ - sizeof (content_t);
-#else
         _u.lmsg.content->data = data_;
         _u.lmsg.content->size = size_;
-#endif
         _u.lmsg.content->ffn = ffn_;
         _u.lmsg.content->hint = hint_;
         new (&_u.lmsg.content->refcnt) zmq::atomic_counter_t ();
@@ -196,6 +186,33 @@ int zmq::msg_t::init_data (void *data_,
     return 0;
 }
 
+int zmq::msg_t::init_from_allocator (size_t size_, zmq::allocator_t *alloc_)
+{
+    zmq_assert (alloc_ != NULL && size_ != 0);
+
+    _u.lmsg.metadata = NULL;
+    _u.lmsg.type = type_lmsg;
+    _u.lmsg.flags = 0;
+    _u.lmsg.allocator_was_used = 1;
+    _u.lmsg.group[0] = '\0';
+    _u.lmsg.routing_id = 0;
+    _u.lmsg.content = reinterpret_cast<content_t *> (
+      alloc_->allocate (size_ + sizeof (content_t)));
+
+    if (!_u.lmsg.content) {
+        errno = ENOMEM;
+        return -1;
+    }
+
+    _u.lmsg.content->data = _u.lmsg.content + 1;
+    _u.lmsg.content->size = size_;
+    _u.lmsg.content->ffn = (zmq_free_fn *) alloc_->deallocate_msg;
+    _u.lmsg.content->hint = alloc_;
+    new (&_u.lmsg.content->refcnt) zmq::atomic_counter_t ();
+
+    return 0;
+}
+
 int zmq::msg_t::init_delimiter ()
 {
     _u.delimiter.metadata = NULL;
@@ -242,23 +259,25 @@ int zmq::msg_t::close ()
             //  We used "placement new" operator to initialize the reference
             //  counter so we call the destructor explicitly now.
             _u.lmsg.content->refcnt.~atomic_counter_t ();
-#if ALLOCATE_CONTENT_T_INSIDE_PROVIDED_BUFFER
-            // take a local copy since we are going to remove (through the user-provided deallocator)
-            // the whole malloc'ed buffer, including the content_t block itself!
-            // NOTE: this copy should not be strictly needed but it's here just to help debugging:
-            content_t content;
-            content.data = _u.lmsg.content->data;
-            content.size = _u.lmsg.content->size;
-            content.ffn = _u.lmsg.content->ffn;
-            content.hint = _u.lmsg.content->hint;
-            if (content.ffn)
-                content.ffn (content.data, content.hint);
-#else
-            if (_u.lmsg.content->ffn)
-                _u.lmsg.content->ffn (_u.lmsg.content->data,
-                                      _u.lmsg.content->hint);
-            free (_u.lmsg.content);
-#endif
+
+            if (_u.lmsg.allocator_was_used) {
+                //  init_from_allocator () obtained header and payload as one
+                //  block starting at this content_t, so the header address
+                //  (not ->data) is what must go back to the allocator. Read
+                //  the callback and hint first: once it has run, the header
+                //  no longer belongs to this message.
+                msg_free_fn *const release = _u.lmsg.content->ffn;
+                void *const owner = _u.lmsg.content->hint;
+                if (release)
+                    release (_u.lmsg.content, owner);
+            } else {
+                //  No allocator involved: let the free function (if any)
+                //  dispose of the payload, then free the header itself.
+                if (_u.lmsg.content->ffn)
+                    _u.lmsg.content->ffn (_u.lmsg.content->data,
+                                          _u.lmsg.content->hint);
+                free (_u.lmsg.content);
+            }
         }
     }
 
diff --git a/src/msg.hpp b/src/msg.hpp
index c4407c286c..bcbac866d4 100644
--- a/src/msg.hpp
+++ b/src/msg.hpp
@@ -51,6 +51,8 @@ typedef void(msg_free_fn) (void *data_, void *hint_);
 
 namespace zmq
 {
+class allocator_t;
+
 //  Note that this structure needs to be explicitly constructed
 //  (init functions) and destructed (close function).
 
@@ -105,6 +107,7 @@ class msg_t
                                size_t size_,
                                msg_free_fn *ffn_,
                                void *hint_);
+    int init_from_allocator (size_t size_, zmq::allocator_t *alloc_);
     int init_delimiter ();
     int init_join ();
     int init_leave ();
@@ -236,9 +239,10 @@ class msg_t
         {
             metadata_t *metadata;
             content_t *content;
+            unsigned char allocator_was_used; // boolean flag
             unsigned char unused[msg_t_size
                                  - (sizeof (metadata_t *) + sizeof (content_t *)
-                                    + 2 + 16 + sizeof (uint32_t))];
+                                    + 3 + 16 + sizeof (uint32_t))];
             unsigned char type;
             unsigned char flags;
             char group[16];
diff --git a/src/zmq.cpp b/src/zmq.cpp
index 0931e61f62..c8b1dc4041 100644
--- a/src/zmq.cpp
+++ b/src/zmq.cpp
@@ -95,6 +95,7 @@ struct iovec
 #include "timers.hpp"
 #include "ip.hpp"
 #include "address.hpp"
+#include "allocator.hpp"
 
 #if defined ZMQ_HAVE_OPENPGM
 #define __PGM_WININT_H__
@@ -215,6 +216,36 @@ int zmq_ctx_get_ext (void *ctx_, int option_, void *optval_, size_t *optvallen_)
 }
 
 
+// New allocator API
+
+void *zmq_msg_allocator_new (int type_)
+{
+    //  nothrow new: exhaustion is reported via errno instead of an exception.
+    zmq::allocator_t *const allocator = new (std::nothrow) zmq::allocator_t;
+    if (allocator != NULL) {
+        allocator->init (type_);
+        return allocator;
+    }
+    errno = ENOMEM;
+    return NULL;
+}
+
+int zmq_msg_allocator_destroy (void **allocator_)
+{
+    //  Resolve the handle; a NULL slot pointer simply yields a NULL object.
+    zmq::allocator_t *const victim =
+      allocator_ ? static_cast<zmq::allocator_t *> (*allocator_) : NULL;
+    if (!victim || !victim->check_tag ()) {
+        errno = EFAULT;
+        return -1;
+    }
+    delete victim;
+    //  Reset the caller's handle so it cannot be used after destruction.
+    *allocator_ = NULL;
+    return 0;
+}
+
+
 //  Stable/legacy context API
 
 void *zmq_init (int io_threads_)
@@ -600,6 +631,13 @@ int zmq_msg_init_size (zmq_msg_t *msg_, size_t size_)
     return (reinterpret_cast<zmq::msg_t *> (msg_))->init_size (size_);
 }
 
+int zmq_msg_init_allocator (zmq_msg_t *msg_, size_t size_, void *allocator_)
+{
+    //  Both handles are opaque to the caller; recover the internal types.
+    return reinterpret_cast<zmq::msg_t *> (msg_)->init_from_allocator (
+      size_, static_cast<zmq::allocator_t *> (allocator_));
+}
+
 int zmq_msg_init_data (
   zmq_msg_t *msg_, void *data_, size_t size_, zmq_free_fn *ffn_, void *hint_)
 {

From 18c52c4648116590003ba0aa5f1bcf3c9db7bee5 Mon Sep 17 00:00:00 2001
From: Francesco Montorsi <francesco.montorsi@gmail.com>
Date: Thu, 29 Aug 2019 00:42:02 +0200
Subject: [PATCH 07/11] Remove changes related to graph generation

---
 perf/generate_csv.sh    | 33 +--------------------------
 perf/generate_graphs.py | 49 +++++++++++++----------------------------
 2 files changed, 16 insertions(+), 66 deletions(-)

diff --git a/perf/generate_csv.sh b/perf/generate_csv.sh
index da8ff0a4cd..d307f29e49 100755
--- a/perf/generate_csv.sh
+++ b/perf/generate_csv.sh
@@ -10,7 +10,6 @@
 #    export LOCAL_TEST_ENDPOINT="tcp://192.168.1.1:1234"
 #    export REMOTE_TEST_ENDPOINT="tcp://192.168.1.2:1234"
 #    export REMOTE_LIBZMQ_PATH="/home/fmontorsi/libzmq/perf"
-#    export MESSAGE_SIZE_LIST="8 16 32 64 128 210"
 #    ./generate_csv.sh
 #
 
@@ -23,7 +22,7 @@ LOCAL_TEST_ENDPOINT=${LOCAL_TEST_ENDPOINT:-tcp://192.168.1.1:1234}
 REMOTE_TEST_ENDPOINT=${REMOTE_TEST_ENDPOINT:-tcp://192.168.1.2:1234}
 
 # constant values:
-MESSAGE_SIZE_LIST="${MESSAGE_SIZE_LIST:-8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072}"
+MESSAGE_SIZE_LIST="8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072"
 OUTPUT_DIR="results"
 OUTPUT_FILE_PREFIX="results.txt"
 OUTPUT_FILE_CSV_PREFIX="results.csv"
@@ -48,35 +47,6 @@ function verify_ssh()
     echo "SSH connection to the remote $REMOTE_IP_SSH is working fine."
 }
 
-function set_reproducible_tcp_kernel_buff_size()
-{
-    sysctl -w net.core.rmem_max=8388608 && \
-        sysctl -w net.core.wmem_max=8388608 && \
-        sysctl -w net.core.rmem_default=65536 && \
-        sysctl -w net.core.wmem_default=65536 && \
-        sysctl -w net.ipv4.tcp_rmem='4096 87380 8388608' && \
-        sysctl -w net.ipv4.tcp_wmem='4096 65536 8388608' && \
-        sysctl -w net.ipv4.tcp_mem='8388608 8388608 8388608' && \
-        sysctl -w net.ipv4.route.flush=1
-    if [ $? -ne 0 ]; then
-        echo "Failed setting kernel socket buffer sizes LOCALLY"
-        exit 2
-    fi
-
-    ssh $REMOTE_IP_SSH "sysctl -w net.core.rmem_max=8388608 && \
-        sysctl -w net.core.wmem_max=8388608 && \
-        sysctl -w net.core.rmem_default=65536 && \
-        sysctl -w net.core.wmem_default=65536 && \
-        sysctl -w net.ipv4.tcp_rmem='4096 87380 8388608' && \
-        sysctl -w net.ipv4.tcp_wmem='4096 65536 8388608' && \
-        sysctl -w net.ipv4.tcp_mem='8388608 8388608 8388608' && \
-        sysctl -w net.ipv4.route.flush=1"
-    if [ $? -ne 0 ]; then
-        echo "Failed setting kernel socket buffer sizes on the REMOTE system $REMOTE_IP_SSH"
-        exit 2
-    fi
-}
-
 function run_remote_perf_util()
 {
     local MESSAGE_SIZE_BYTES="$1"
@@ -141,7 +111,6 @@ function generate_output_file()
 # main:
 
 verify_ssh
-set_reproducible_tcp_kernel_buff_size
 
 THROUGHPUT_CSV_HEADER_LINE="# message_size,message_count,PPS[msg/s],throughput[Mb/s]"
 
diff --git a/perf/generate_graphs.py b/perf/generate_graphs.py
index c323e4cce6..20651b7160 100755
--- a/perf/generate_graphs.py
+++ b/perf/generate_graphs.py
@@ -1,14 +1,20 @@
 #!/usr/bin/python3
 
 #
-# This script assumes that the set of CSV files produced by "generate_csv.sh" is provided as input.
-#
-# Usage example:
-#   export RESULT_DIRECTORY="./results"
-#   export TCP_LINK_SPEED_GBPS="10"     # or 1 or 100 as you like
-#   ./generate_graphs.py
+# This script assumes that the set of CSV files produced by "generate_csv.sh" is provided as input
+# and that locally there is the "results" folder.
 #
 
+# results for TCP:
+INPUT_FILE_PUSHPULL_TCP_THROUGHPUT="results/pushpull_tcp_thr_results.csv"
+INPUT_FILE_REQREP_TCP_LATENCY="results/reqrep_tcp_lat_results.csv"
+TCP_LINK_GPBS=100
+
+# results for INPROC:
+INPUT_FILE_PUSHPULL_INPROC_THROUGHPUT="results/pushpull_inproc_thr_results.csv"
+INPUT_FILE_PUBSUBPROXY_INPROC_THROUGHPUT="results/pubsubproxy_inproc_thr_results.csv"
+
+
 # dependencies
 #
 # pip3 install matplotlib
@@ -16,15 +22,13 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
-import os
 
 
 # functions
 
-def plot_throughput(csv_filename, title, is_tcp=False, tcp_link_speed_gbps=10):
+def plot_throughput(csv_filename, title, is_tcp=False):
     message_size_bytes, message_count, pps, mbps = np.loadtxt(csv_filename, delimiter=',', unpack=True)
 
-    print("Generating PNG image file [%s] from CSV results '%s'" % (title, csv_filename))
     fig, ax1 = plt.subplots()
 
     # PPS axis
@@ -40,7 +44,7 @@ def plot_throughput(csv_filename, title, is_tcp=False, tcp_link_speed_gbps=10):
     ax2.set_ylabel('Throughput [Gb/s]', color=color)
     ax2.semilogx(message_size_bytes, mbps / 1e3, label='Throughput [Gb/s]', marker='o')
     if is_tcp:
-        ax2.set_yticks(np.arange(0, tcp_link_speed_gbps + 1, tcp_link_speed_gbps/10)) 
+        ax2.set_yticks(np.arange(0, TCP_LINK_GPBS + 1, TCP_LINK_GPBS/10)) 
     ax2.tick_params(axis='y', labelcolor=color)
     ax2.grid(True)
     
@@ -51,8 +55,6 @@ def plot_throughput(csv_filename, title, is_tcp=False, tcp_link_speed_gbps=10):
 
 def plot_latency(csv_filename, title):
     message_size_bytes, message_count, lat = np.loadtxt(csv_filename, delimiter=',', unpack=True)
-
-    print("Generating PNG image file [%s] from CSV results '%s'" % (title, csv_filename))
     plt.semilogx(message_size_bytes, lat, label='Latency [us]', marker='o')
     
     plt.xlabel('Message size [B]')
@@ -65,28 +67,7 @@ def plot_latency(csv_filename, title):
 
 # main
 
-try:
-    result_dir = os.environ['RESULT_DIRECTORY']
-except:
-    result_dir = "results" # default value
-
-try:
-    tcp_link_speed_gbps = int(os.environ['TCP_LINK_SPEED_GBPS'])
-except:
-    tcp_link_speed_gbps = 10 # default value
-    
-    
-
-# result files for TCP:
-INPUT_FILE_PUSHPULL_TCP_THROUGHPUT = result_dir + "/pushpull_tcp_thr_results.csv"
-INPUT_FILE_REQREP_TCP_LATENCY = result_dir + "/reqrep_tcp_lat_results.csv"
-
-# results for INPROC:
-INPUT_FILE_PUSHPULL_INPROC_THROUGHPUT = result_dir + "/pushpull_inproc_thr_results.csv"
-INPUT_FILE_PUBSUBPROXY_INPROC_THROUGHPUT = result_dir + "/pubsubproxy_inproc_thr_results.csv"
-
-# generate plots
-plot_throughput(INPUT_FILE_PUSHPULL_TCP_THROUGHPUT, 'ZeroMQ PUSH/PULL socket throughput, TCP transport', is_tcp=True, tcp_link_speed_gbps=tcp_link_speed_gbps)
+plot_throughput(INPUT_FILE_PUSHPULL_TCP_THROUGHPUT, 'ZeroMQ PUSH/PULL socket throughput, TCP transport', is_tcp=True)
 plot_throughput(INPUT_FILE_PUSHPULL_INPROC_THROUGHPUT, 'ZeroMQ PUSH/PULL socket throughput, INPROC transport')
 plot_throughput(INPUT_FILE_PUBSUBPROXY_INPROC_THROUGHPUT, 'ZeroMQ PUB/SUB PROXY socket throughput, INPROC transport')
 plot_latency(INPUT_FILE_REQREP_TCP_LATENCY, 'ZeroMQ REQ/REP socket latency, TCP transport')

From a720a311d6712248ba556a4b33f84c5761365982 Mon Sep 17 00:00:00 2001
From: Francesco Montorsi <francesco.montorsi@gmail.com>
Date: Thu, 29 Aug 2019 00:49:47 +0200
Subject: [PATCH 08/11] allow testing up to 8k msg sizes

---
 src/allocator.hpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/allocator.hpp b/src/allocator.hpp
index 8cac7e8584..f9e7001786 100644
--- a/src/allocator.hpp
+++ b/src/allocator.hpp
@@ -35,7 +35,7 @@
 #include "concurrentqueue.h"
 
 // FIXME: we need to grow dynamically the mempool
-#define MAX_ACTIVE_MESSAGES (8192)
+#define MAX_ACTIVE_MESSAGES (16384)
 
 namespace zmq
 {
@@ -85,7 +85,16 @@ class global_memory_pool_t
             return MsgBlock_SizeClass_256;
         else if (n < 512)
             return MsgBlock_SizeClass_512;
-
+        else if (n < 1024)
+            return MsgBlock_SizeClass_1024;
+        else if (n < 2048)
+            return MsgBlock_SizeClass_2048;
+        else if (n < 4096)
+            return MsgBlock_SizeClass_4096;
+        else if (n < 8192)
+            return MsgBlock_SizeClass_8192;
+
+        // size too big
         return MsgBlock_NumSizeClasses;
     }
 

From b9e1f016e42ca67413c03699e24f025b6d981f5f Mon Sep 17 00:00:00 2001
From: Francesco Montorsi <francesco.montorsi@gmail.com>
Date: Fri, 30 Aug 2019 23:51:42 +0200
Subject: [PATCH 09/11] correctly deallocate memory pool blocks

---
 src/allocator.hpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/allocator.hpp b/src/allocator.hpp
index f9e7001786..e81c4db17b 100644
--- a/src/allocator.hpp
+++ b/src/allocator.hpp
@@ -116,7 +116,14 @@ class global_memory_pool_t
             }
         }
     }
-    ~global_memory_pool_t () {}
+    ~global_memory_pool_t ()
+    {
+        //  Free the raw buffer of every size class and clear its pointer.
+        for (int i = 0; i < MsgBlock_NumSizeClasses; i++) {
+            free (m_storage[i].raw_data);
+            m_storage[i].raw_data = NULL;
+        }
+    }
 
     void *allocate_msg (size_t len) // consumer thread: user app thread
     {

From 1649701137fef97ddcd7e76f61635fdd9a23d4dc Mon Sep 17 00:00:00 2001
From: Francesco Montorsi <francesco.montorsi@gmail.com>
Date: Sat, 31 Aug 2019 00:13:16 +0200
Subject: [PATCH 10/11] fix build with no draft API

---
 src/allocator.hpp |  4 ++--
 src/zmq_draft.h   | 11 +++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/allocator.hpp b/src/allocator.hpp
index e81c4db17b..3e1352e9e9 100644
--- a/src/allocator.hpp
+++ b/src/allocator.hpp
@@ -177,10 +177,10 @@ class allocator_t
 
     void init (int type_) { _type = type_; }
 
-    // allocate() gets called by the consumer thread: the user app thread
+    // allocate() typically gets called by the consumer thread: the user app thread(s)
     void *allocate (size_t len);
 
-    // deallocate_msg() gets called by the producer thread: the ZMQ background IO thread
+    // deallocate_msg() typically gets called by the producer thread: the ZMQ background IO thread(s)
     static void deallocate_msg (void *data_, void *hint_);
 
     size_t size () const;
diff --git a/src/zmq_draft.h b/src/zmq_draft.h
index e558958c5d..46909dc383 100644
--- a/src/zmq_draft.h
+++ b/src/zmq_draft.h
@@ -71,6 +71,17 @@ int zmq_ctx_get_ext (void *context_,
                      void *optval_,
                      size_t *optvallen_);
 
+/* ZMQ-provided message-pool implementations.                                 */
+// default allocator using malloc/free
+#define ZMQ_MSG_ALLOCATOR_DEFAULT 0
+// using internally an SPSC queue (possibly unusable with inproc?) or perhaps an MPMC queue instead
+#define ZMQ_MSG_ALLOCATOR_PER_THREAD_POOL 1
+// using internally a MPMC queue
+#define ZMQ_MSG_ALLOCATOR_GLOBAL_POOL 2
+
+void *zmq_msg_allocator_new (int type_);
+int zmq_msg_allocator_destroy (void **allocator_);
+
 /*  DRAFT Socket methods.                                                     */
 int zmq_join (void *s_, const char *group_);
 int zmq_leave (void *s_, const char *group_);

From 0baafa49fb80b850a172efb52b95693ae69f80fb Mon Sep 17 00:00:00 2001
From: Francesco Montorsi <francesco.montorsi@gmail.com>
Date: Sat, 31 Aug 2019 17:00:33 +0200
Subject: [PATCH 11/11] never use allocator for VSM

---
 src/msg.cpp | 47 +++++++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 18 deletions(-)

diff --git a/src/msg.cpp b/src/msg.cpp
index 0a6b7a4570..867de62966 100644
--- a/src/msg.cpp
+++ b/src/msg.cpp
@@ -190,25 +190,36 @@ int zmq::msg_t::init_from_allocator (size_t size_, zmq::allocator_t *alloc_)
 {
     zmq_assert (alloc_ != NULL && size_ != 0);
 
-    _u.lmsg.metadata = NULL;
-    _u.lmsg.type = type_lmsg;
-    _u.lmsg.flags = 0;
-    _u.lmsg.allocator_was_used = 1;
-    _u.lmsg.group[0] = '\0';
-    _u.lmsg.routing_id = 0;
-    _u.lmsg.content = reinterpret_cast<content_t *> (
-      alloc_->allocate (size_ + sizeof (content_t)));
-
-    if (!_u.lmsg.content) {
-        errno = ENOMEM;
-        return -1;
-    }
+    if (size_ <= max_vsm_size) {
+        //  Payload fits inside msg_t itself, which always beats the allocator.
+        _u.vsm.metadata = NULL;
+        _u.vsm.type = type_vsm;
+        _u.vsm.flags = 0;
+        _u.vsm.size = static_cast<unsigned char> (size_);
+        _u.vsm.group[0] = '\0';
+        _u.vsm.routing_id = 0;
+    } else {
+        _u.lmsg.metadata = NULL;
+        _u.lmsg.type = type_lmsg;
+        _u.lmsg.flags = 0;
+        _u.lmsg.allocator_was_used = 1;
+        _u.lmsg.group[0] = '\0';
+        _u.lmsg.routing_id = 0;
+        //  Header + payload share one block; if the sum wraps, fail as ENOMEM.
+        const bool fits = sizeof (content_t) + size_ > size_;
+        _u.lmsg.content = static_cast<content_t *> (
+          fits ? alloc_->allocate (sizeof (content_t) + size_) : NULL);
+        if (!_u.lmsg.content) {
+            errno = ENOMEM;
+            return -1;
+        }
 
-    _u.lmsg.content->data = _u.lmsg.content + 1;
-    _u.lmsg.content->size = size_;
-    _u.lmsg.content->ffn = (zmq_free_fn *) alloc_->deallocate_msg;
-    _u.lmsg.content->hint = alloc_;
-    new (&_u.lmsg.content->refcnt) zmq::atomic_counter_t ();
+        _u.lmsg.content->data = _u.lmsg.content + 1;
+        _u.lmsg.content->size = size_;
+        _u.lmsg.content->ffn = (zmq_free_fn *) alloc_->deallocate_msg;
+        _u.lmsg.content->hint = alloc_;
+        new (&_u.lmsg.content->refcnt) zmq::atomic_counter_t ();
+    }
 
     return 0;
 }