Critical sections are slow, especially for dynamically loaded extensions, because accessing the thread state is slow and there are multiple function calls even on the fast path where the critical section is acquired without contention.
Dump of assembler code for function _ssl_RAND_status:
0x00007ffff7b97090 <+0>: push %rbx
0x00007ffff7b97091 <+1>: sub $0x10,%rsp
0x00007ffff7b97095 <+5>: lea 0xa(%rdi),%rbx
0x00007ffff7b97099 <+9>: mov $0x1,%cl
0x00007ffff7b9709b <+11>: xor %eax,%eax
0x00007ffff7b9709d <+13>: lock cmpxchg %cl,0xa(%rdi)
0x00007ffff7b970a2 <+18>: jne 0x7ffff7b970c5 <_ssl_RAND_status+53>
0x00007ffff7b970a4 <+20>: call 0x7ffff7b950e0 <_PyThreadState_GetCurrent@plt>
0x00007ffff7b970a9 <+25>: mov %rbx,0x8(%rsp)
0x00007ffff7b970ae <+30>: mov 0xb0(%rax),%rcx
0x00007ffff7b970b5 <+37>: mov %rcx,(%rsp)
0x00007ffff7b970b9 <+41>: mov %rsp,%rcx
0x00007ffff7b970bc <+44>: mov %rcx,0xb0(%rax)
0x00007ffff7b970c3 <+51>: jmp 0x7ffff7b970d0 <_ssl_RAND_status+64>
0x00007ffff7b970c5 <+53>: mov %rsp,%rdi
0x00007ffff7b970c8 <+56>: mov %rbx,%rsi
0x00007ffff7b970cb <+59>: call 0x7ffff7b95cc0 <_PyCriticalSection_BeginSlow@plt>
0x00007ffff7b970d0 <+64>: call 0x7ffff7b95c30 <RAND_status@plt>
0x00007ffff7b970d5 <+69>: movslq %eax,%rdi
0x00007ffff7b970d8 <+72>: call 0x7ffff7b96560 <PyBool_FromLong@plt>
0x00007ffff7b970dd <+77>: mov %rax,%rbx
0x00007ffff7b970e0 <+80>: mov 0x8(%rsp),%rdi
0x00007ffff7b970e5 <+85>: test %rdi,%rdi
0x00007ffff7b970e8 <+88>: je 0x7ffff7b97116 <_ssl_RAND_status+134>
0x00007ffff7b970ea <+90>: xor %ecx,%ecx
0x00007ffff7b970ec <+92>: mov $0x1,%al
0x00007ffff7b970ee <+94>: lock cmpxchg %cl,(%rdi)
0x00007ffff7b970f2 <+98>: je 0x7ffff7b970f9 <_ssl_RAND_status+105>
0x00007ffff7b970f4 <+100>: call 0x7ffff7b954a0 <PyMutex_Unlock@plt>
0x00007ffff7b970f9 <+105>: call 0x7ffff7b950e0 <_PyThreadState_GetCurrent@plt>
0x00007ffff7b970fe <+110>: mov (%rsp),%rcx
0x00007ffff7b97102 <+114>: mov %rcx,0xb0(%rax)
0x00007ffff7b97109 <+121>: test $0x1,%cl
0x00007ffff7b9710c <+124>: je 0x7ffff7b97116 <_ssl_RAND_status+134>
0x00007ffff7b9710e <+126>: mov %rax,%rdi
0x00007ffff7b97111 <+129>: call 0x7ffff7b95f90 <_PyCriticalSection_Resume@plt>
0x00007ffff7b97116 <+134>: mov %rbx,%rax
0x00007ffff7b97119 <+137>: add $0x10,%rsp
0x00007ffff7b9711d <+141>: pop %rbx
0x00007ffff7b9711e <+142>: ret
End of assembler dump.
Dump of assembler code for function _ssl_RAND_status:
0x00007ffff773bd00 <+0>: push %rax
0x00007ffff773bd01 <+1>: call 0x7ffff773ac20 <RAND_status@plt>
0x00007ffff773bd06 <+6>: movslq %eax,%rdi
0x00007ffff773bd09 <+9>: pop %rax
0x00007ffff773bd0a <+10>: jmp 0x7ffff773b520 <PyBool_FromLong@plt>
End of assembler dump.
On free-threading there is a large (~20%) performance regression on the
`asyncio_tcp_ssl` benchmark. A large part of the slowdown is from #124993, which added critical sections and locks for thread safety; however, 20% is a large slowdown for the important single-threaded use case. Critical sections are slow, especially for dynamically loaded extensions, because accessing the thread state is slow and there are multiple function calls even on the fast path where the critical section is acquired without contention.
Comparing the assembly of
`_ssl_RAND_status` in the free-threading build vs. the normal build:

Linked PRs