Skip to content

Commit d756a03

Browse files
committed
MDEV-21423 - lock-free trx_sys gets performance regression caused by lf_find and ut_delay
Under high concurrency, MVCC snapshot creation may spend a significant amount of time in lf_hash_iterate()/lfind() while collecting active read-write transaction identifiers. This overhead is particularly visible in sysbench oltp_read_write with transaction-isolation=READ-COMMITTED. Iteration cost becomes high due to significant TLB thrashing and poor memory locality in this hot code path because snapshot creation touches many rw_trx_hash nodes distributed across memory, including dummy nodes that are irrelevant for snapshot construction. In addition, traversing LF_HASH requires issuing heavyweight memory barriers. This is a performance regression after 53cc9aa, which changed MVCC snapshot creation to scan LF_HASH instead of maintaining a global sorted vector protected by the global mutex. Add trx_sys.rw_trx_ids, a compact traversal-friendly vector of active read-write transaction identifiers and serialization numbers optimized for MVCC snapshot creation, while rw_trx_hash remains responsible for transaction lookup. The vector may contain empty slots corresponding to idle or read-only transactions that currently do not own a read-write transaction identifier. Such slots are skipped by snapshot creation. This reduces traversal overhead during MVCC snapshot creation by improving memory locality, reducing TLB pressure, and avoiding repeated memory barriers required for rw_trx_hash traversal.
1 parent c8bfb4d commit d756a03

9 files changed

Lines changed: 172 additions & 81 deletions

File tree

storage/innobase/include/trx0purge.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,11 @@ Created 3/26/1996 Heikki Tuuri
3737
Remove the undo log segment from the rseg slot if it is too big for reuse.
3838
@param[in] trx transaction
3939
@param[in,out] undo undo log
40-
@param[in,out] mtr mini-transaction */
40+
@param[in,out] mtr mini-transaction
41+
@param[in] end transaction serialisation number */
4142
void
42-
trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr);
43+
trx_purge_add_undo_to_history(const trx_t* trx, trx_undo_t*& undo, mtr_t* mtr,
44+
trx_id_t end);
4345

4446
/**
4547
Remove unnecessary history data from rollback segments. NOTE that when this

storage/innobase/include/trx0sys.h

Lines changed: 148 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -340,14 +340,6 @@ struct rw_trx_hash_element_t
340340

341341

342342
trx_id_t id; /* lf_hash_init() relies on this to be first in the struct */
343-
344-
/**
345-
Transaction serialization number.
346-
347-
Assigned shortly before the transaction is moved to COMMITTED_IN_MEMORY
348-
state. Initially set to TRX_ID_MAX.
349-
*/
350-
Atomic_counter<trx_id_t> no;
351343
trx_t *trx;
352344
srw_mutex mutex;
353345
};
@@ -443,7 +435,6 @@ class rw_trx_hash_t
443435
ut_ad(element->trx == 0);
444436
element->trx= trx;
445437
element->id= trx->id;
446-
element->no= TRX_ID_MAX;
447438
trx->rw_trx_hash_element= element;
448439
}
449440

@@ -512,7 +503,6 @@ class rw_trx_hash_t
512503
if (element->trx)
513504
validate_element(element->trx);
514505
element->mutex.wr_unlock();
515-
ut_ad(element->id < element->no);
516506
return arg->action(element, arg->argument);
517507
}
518508
#endif
@@ -849,6 +839,119 @@ class thread_safe_trx_ilist_t
849839
alignas(CPU_LEVEL1_DCACHE_LINESIZE) ilist<trx_t> trx_list;
850840
};
851841

842+
/**
843+
Active read-write transaction identifiers and serialisation numbers container.
844+
845+
Unlike rw_trx_hash_t, which is optimized for direct lookup, this
846+
structure is optimized for compact storage and traversal of active
847+
transactions by MVCC read view construction.
848+
849+
The vector may contain empty slots corresponding to idle or read-only
850+
transactions that currently do not own an active read-write trx_id.
851+
Such slots are skipped during traversal.
852+
*/
853+
class rw_trx_vector
854+
{
855+
struct rw_trx_id
856+
{
857+
Atomic_relaxed<trx_id_t> id{TRX_ID_MAX};
858+
Atomic_relaxed<trx_id_t> no{TRX_ID_MAX};
859+
trx_t *trx;
860+
rw_trx_id(trx_t *t): trx(t) {}
861+
};
862+
alignas(CPU_LEVEL1_DCACHE_LINESIZE)
863+
std::vector<rw_trx_id, ut_allocator<rw_trx_id>>
864+
ids{ut_allocator<rw_trx_id>(mem_key_trx_sys_t_rw_trx_ids)};
865+
alignas(CPU_LEVEL1_DCACHE_LINESIZE) mutable srw_spin_lock_low latch;
866+
867+
public:
868+
void assign_new_trx_no(const trx_t *trx, trx_id_t no) noexcept
869+
{
870+
latch.rd_lock();
871+
ut_ad(trx->rw_trx_ids_slot < ids.size());
872+
ut_ad(ids[trx->rw_trx_ids_slot].trx == trx);
873+
ut_ad(ids[trx->rw_trx_ids_slot].id == trx->id);
874+
ut_ad(ids[trx->rw_trx_ids_slot].no == TRX_ID_MAX);
875+
ids[trx->rw_trx_ids_slot].no= no;
876+
latch.rd_unlock();
877+
}
878+
trx_id_t snapshot_ids(trx_ids_t &view_ids,
879+
const trx_id_t max_trx_id) const noexcept
880+
{
881+
trx_id_t min_trx_no{max_trx_id};
882+
view_ids.clear();
883+
latch.rd_lock();
884+
view_ids.reserve(ids.size());
885+
for (const auto &it : ids)
886+
{
887+
trx_id_t id{it.id};
888+
if (id < max_trx_id)
889+
{
890+
view_ids.push_back(id);
891+
const trx_id_t no{it.no};
892+
if (no < min_trx_no)
893+
min_trx_no= no;
894+
}
895+
}
896+
latch.rd_unlock();
897+
return min_trx_no;
898+
}
899+
void register_rw(const trx_t *trx) noexcept
900+
{
901+
latch.rd_lock();
902+
ut_ad(trx->rw_trx_ids_slot < ids.size());
903+
ut_ad(ids[trx->rw_trx_ids_slot].trx == trx);
904+
ut_ad(ids[trx->rw_trx_ids_slot].id == TRX_ID_MAX);
905+
ut_ad(ids[trx->rw_trx_ids_slot].no == TRX_ID_MAX);
906+
ids[trx->rw_trx_ids_slot].id= trx->id;
907+
latch.rd_unlock();
908+
}
909+
void deregister_rw(const trx_t *trx) noexcept
910+
{
911+
latch.rd_lock();
912+
ut_ad(trx->rw_trx_ids_slot < ids.size());
913+
rw_trx_id &slot= ids[trx->rw_trx_ids_slot];
914+
ut_ad(slot.trx == trx);
915+
ut_ad(slot.id == trx->id);
916+
slot.id= TRX_ID_MAX;
917+
slot.no= TRX_ID_MAX;
918+
latch.rd_unlock();
919+
}
920+
void register_trx(trx_t *trx) noexcept
921+
{
922+
ut_ad(trx->rw_trx_ids_slot == std::numeric_limits<uint32_t>::max());
923+
latch.wr_lock();
924+
trx->rw_trx_ids_slot= static_cast<uint32_t>(ids.size());
925+
ids.emplace_back(trx);
926+
latch.wr_unlock();
927+
}
928+
void deregister_trx(trx_t *trx) noexcept
929+
{
930+
latch.wr_lock();
931+
ut_ad(trx->rw_trx_ids_slot < ids.size());
932+
ut_ad(ids[trx->rw_trx_ids_slot].trx == trx);
933+
if (trx->rw_trx_ids_slot + 1 < ids.size())
934+
{
935+
trx_t *move_trx= ids.back().trx;
936+
ids[trx->rw_trx_ids_slot]= std::move(ids.back());
937+
move_trx->rw_trx_ids_slot= trx->rw_trx_ids_slot;
938+
}
939+
ids.pop_back();
940+
latch.wr_unlock();
941+
trx->rw_trx_ids_slot= std::numeric_limits<uint32_t>::max();
942+
}
943+
void create() noexcept
944+
{
945+
ut_ad(ids.size() == 0);
946+
latch.init();
947+
}
948+
void destroy() noexcept
949+
{
950+
ut_ad(ids.size() == 0);
951+
latch.destroy();
952+
}
953+
};
954+
852955
/** The transaction system central memory data structure. */
853956
class trx_sys_t
854957
{
@@ -876,6 +979,15 @@ class trx_sys_t
876979
/** False if there is no undo log to purge or rollback */
877980
bool undo_log_nonempty;
878981
public:
982+
/**
983+
Collection of active read-write transaction identifiers and serialization
984+
numbers used for MVCC snapshot creation.
985+
986+
This complements rw_trx_hash with a traversal-friendly representation
987+
optimized for collecting active transaction ids.
988+
*/
989+
rw_trx_vector rw_trx_ids;
990+
879991
/** List of all transactions. */
880992
thread_safe_trx_ilist_t trx_list;
881993

@@ -1014,7 +1126,7 @@ class trx_sys_t
10141126
next call to trx_sys.get_new_trx_id()
10151127
*/
10161128

1017-
trx_id_t get_max_trx_id()
1129+
trx_id_t get_max_trx_id() const noexcept
10181130
{
10191131
return m_max_trx_id;
10201132
}
@@ -1037,7 +1149,7 @@ class trx_sys_t
10371149
Allocates and assigns new transaction serialisation number.
10381150
10391151
There's a gap between m_max_trx_id increment and transaction serialisation
1040-
number becoming visible through rw_trx_hash. While we're in this gap
1152+
number becoming visible through rw_trx_ids. While we're in this gap, a
10411153
concurrent thread may come and take an MVCC snapshot without seeing the allocated
10421154
but not yet assigned serialisation number. Then at some point purge thread
10431155
may clone this view. As a result it won't see newly allocated serialisation
@@ -1047,58 +1159,44 @@ class trx_sys_t
10471159
m_rw_trx_hash_version is intended to solve this problem. MVCC snapshot has
10481160
to wait until m_max_trx_id == m_rw_trx_hash_version, which effectively
10491161
means that all transaction serialisation numbers up to m_max_trx_id are
1050-
available through rw_trx_hash.
1162+
available through rw_trx_ids.
10511163
10521164
We rely on refresh_rw_trx_hash_version() to issue RELEASE memory barrier so
1053-
that m_rw_trx_hash_version increment happens after
1054-
trx->rw_trx_hash_element->no becomes visible through rw_trx_hash.
1165+
that m_rw_trx_hash_version increment happens after transaction serialisation
1166+
number becomes visible through rw_trx_ids.
10551167
10561168
@param trx transaction
10571169
*/
1058-
void assign_new_trx_no(trx_t *trx)
1170+
trx_id_t assign_new_trx_no(trx_t *trx)
10591171
{
1060-
trx->rw_trx_hash_element->no= get_new_trx_id_no_refresh();
1172+
trx_id_t no= get_new_trx_id_no_refresh();
1173+
rw_trx_ids.assign_new_trx_no(trx, no);
10611174
refresh_rw_trx_hash_version();
1175+
return no;
10621176
}
10631177

10641178

10651179
/**
10661180
Takes MVCC snapshot.
10671181
1068-
To reduce malloc probability we reserve rw_trx_hash.size() + 32 elements
1069-
in ids.
1070-
10711182
For details about get_rw_trx_hash_version() != get_max_trx_id() spin
10721183
@sa register_rw() and @sa assign_new_trx_no().
10731184
10741185
We rely on get_rw_trx_hash_version() to issue ACQUIRE memory barrier so
1075-
that loading of m_rw_trx_hash_version happens before accessing rw_trx_hash.
1076-
1077-
To optimise snapshot creation rw_trx_hash.iterate() is being used instead
1078-
of rw_trx_hash.iterate_no_dups(). It means that some transaction
1079-
identifiers may appear multiple times in ids.
1186+
that loading of m_rw_trx_hash_version happens before accessing rw_trx_ids.
10801187
1081-
@param[in,out] caller_trx used to get access to rw_trx_hash_pins
10821188
@param[out] ids array to store registered transaction identifiers
10831189
@param[out] max_trx_id variable to store m_max_trx_id value
1084-
@param[out] mix_trx_no variable to store min(no) value
1190+
1191+
@return min(no)
10851192
*/
10861193

1087-
void snapshot_ids(trx_t *caller_trx, trx_ids_t *ids, trx_id_t *max_trx_id,
1088-
trx_id_t *min_trx_no)
1194+
trx_id_t snapshot_ids(trx_ids_t &ids, trx_id_t &max_trx_id) const noexcept
10891195
{
1090-
snapshot_ids_arg arg(ids);
1091-
1092-
while ((arg.m_id= get_rw_trx_hash_version()) != get_max_trx_id())
1196+
while ((max_trx_id= get_rw_trx_hash_version()) != get_max_trx_id())
10931197
ut_delay(1);
1094-
arg.m_no= arg.m_id;
1095-
1096-
ids->clear();
1097-
ids->reserve(rw_trx_hash.size() + 32);
1098-
rw_trx_hash.iterate(caller_trx, copy_one_id, &arg);
10991198

1100-
*max_trx_id= arg.m_id;
1101-
*min_trx_no= arg.m_no;
1199+
return rw_trx_ids.snapshot_ids(ids, max_trx_id);
11021200
}
11031201

11041202

@@ -1149,7 +1247,7 @@ class trx_sys_t
11491247
Transaction becomes visible to MVCC.
11501248
11511249
There's a gap between m_max_trx_id increment and transaction becoming
1152-
visible through rw_trx_hash. While we're in this gap concurrent thread may
1250+
visible through rw_trx_ids. While we're in this gap, a concurrent thread may
11531251
come and take an MVCC snapshot. As a result, a concurrent read view will be able to
11541252
observe records owned by this transaction even before it was committed.
11551253
@@ -1166,20 +1264,23 @@ class trx_sys_t
11661264
void register_rw(trx_t *trx)
11671265
{
11681266
trx->id= get_new_trx_id_no_refresh();
1169-
rw_trx_hash.insert(trx);
1267+
rw_trx_ids.register_rw(trx);
11701268
refresh_rw_trx_hash_version();
1269+
rw_trx_hash.insert(trx);
11711270
}
11721271

11731272

11741273
/**
11751274
Deregisters read-write transaction.
11761275
1177-
Transaction is removed from rw_trx_hash, which releases all implicit locks.
1178-
MVCC snapshot won't see this transaction anymore.
1276+
After this call the transaction is no longer visible as active to MVCC read
1277+
views created subsequently, and all implicit locks held by the transaction
1278+
have been released.
11791279
*/
11801280

1181-
void deregister_rw(trx_t *trx)
1281+
void deregister_rw(trx_t *trx) noexcept
11821282
{
1283+
rw_trx_ids.deregister_rw(trx);
11831284
rw_trx_hash.erase(trx);
11841285
}
11851286

@@ -1204,6 +1305,7 @@ class trx_sys_t
12041305
void register_trx(trx_t *trx)
12051306
{
12061307
trx_list.push_front(*trx);
1308+
rw_trx_ids.register_trx(trx);
12071309
}
12081310

12091311

@@ -1214,6 +1316,7 @@ class trx_sys_t
12141316
*/
12151317
void deregister_trx(trx_t *trx)
12161318
{
1319+
rw_trx_ids.deregister_trx(trx);
12171320
trx_list.remove(*trx);
12181321
}
12191322

@@ -1266,33 +1369,8 @@ class trx_sys_t
12661369
private:
12671370
static my_bool find_same_or_older_callback(void *el, void *i) noexcept;
12681371

1269-
1270-
struct snapshot_ids_arg
1271-
{
1272-
snapshot_ids_arg(trx_ids_t *ids): m_ids(ids) {}
1273-
trx_ids_t *m_ids;
1274-
trx_id_t m_id;
1275-
trx_id_t m_no;
1276-
};
1277-
1278-
1279-
static my_bool copy_one_id(void* el, void *a)
1280-
{
1281-
auto element= static_cast<const rw_trx_hash_element_t *>(el);
1282-
auto arg= static_cast<snapshot_ids_arg*>(a);
1283-
if (element->id < arg->m_id)
1284-
{
1285-
trx_id_t no= element->no;
1286-
arg->m_ids->push_back(element->id);
1287-
if (no < arg->m_no)
1288-
arg->m_no= no;
1289-
}
1290-
return 0;
1291-
}
1292-
1293-
12941372
/** Getter for m_rw_trx_hash_version, must issue ACQUIRE memory barrier. */
1295-
trx_id_t get_rw_trx_hash_version()
1373+
trx_id_t get_rw_trx_hash_version() const noexcept
12961374
{
12971375
return m_rw_trx_hash_version.load(std::memory_order_acquire);
12981376
}

storage/innobase/include/trx0trx.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,8 @@ struct trx_t : ilist_node<>
629629

630630

631631
public:
632+
/** trx_sys.rw_trx_ids index, protected by trx_sys.rw_trx_ids.latch */
633+
uint32_t rw_trx_ids_slot;
632634
/** Transaction identifier (0 if no locks were acquired).
633635
Set by trx_sys_t::register_rw() or trx_resurrect() before
634636
the transaction is added to trx_sys.rw_trx_hash.

storage/innobase/include/ut0new.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ extern PSI_memory_key mem_key_other;
174174
extern PSI_memory_key mem_key_row_log_buf;
175175
extern PSI_memory_key mem_key_row_merge_sort;
176176
extern PSI_memory_key mem_key_std;
177+
extern PSI_memory_key mem_key_trx_sys_t_rw_trx_ids;
177178

178179
/** Setup the internal objects needed for UT_NEW() to operate.
179180
This must be called before the first call to UT_NEW(). */

storage/innobase/read/read0read.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ For details see: row_undo_mod_sec_is_unsafe() and row_purge_poss_sec()
172172
*/
173173
inline void ReadViewBase::snapshot(trx_t *trx)
174174
{
175-
trx_sys.snapshot_ids(trx, &m_ids, &m_low_limit_id, &m_low_limit_no);
175+
m_low_limit_no= trx_sys.snapshot_ids(m_ids, m_low_limit_id);
176176
if (m_ids.empty())
177177
{
178178
m_up_limit_id= m_low_limit_id;

0 commit comments

Comments
 (0)