Skip to content

Commit fd8889e

Browse files
Use recursive mutex to deal with GIL <-> internal lock deadlocks (#462)
Related to #435 and #456 #435 tries to use the same connection across multiple threads, which is not supported. Specifically, in this case the result is overwritten and destroyed concurrently, causing a segfault. This PR fixes the segfault by synchronising access to the result with a recursive mutex. But note that connections cannot be used across threads, no matter what. #456: we sometimes need to destruct python-managed objects _after_ releasing the GIL. This PR makes sure that we hold on to the GIL during the destruction of PyRelation and Connection::Close(). Note: I rely on CI to check whether the fix actually works here...
2 parents 7b77328 + 74fecdb commit fd8889e

4 files changed

Lines changed: 370 additions & 13 deletions

File tree

src/duckdb_py/include/duckdb_python/pyconnection/pyconnection.hpp

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,35 @@ struct DuckDBPyConnection : public enable_shared_from_this<DuckDBPyConnection> {
163163
};
164164

165165
public:
166+
// RAII guard for the connection mutex (see py_connection_lock below). Constructing
167+
// one releases the GIL while waiting for the mutex and reacquires it before
168+
// returning, so callers always come out of the constructor with the GIL held
169+
// and the mutex locked. The mutex is released when the guard goes out of scope.
170+
// Holding the GIL while blocked on this mutex would deadlock against a thread
171+
// that holds the mutex and is mid-way through a GIL-releasing native call —
172+
// see duckdb-python#435.
173+
class ConnectionLockGuard {
174+
public:
175+
explicit ConnectionLockGuard(DuckDBPyConnection &conn) : lock_(conn.py_connection_lock, std::defer_lock) {
176+
D_ASSERT(py::gil_check());
177+
py::gil_scoped_release release;
178+
lock_.lock();
179+
}
180+
181+
private:
182+
std::unique_lock<std::recursive_mutex> lock_;
183+
};
184+
166185
ConnectionGuard con;
167186
Cursors cursors;
168-
std::mutex py_connection_lock;
187+
// Recursive so that the outer lock taken at the top of execute/fetch
188+
// methods (while still holding the GIL) does not deadlock against the
189+
// inner lock taken by PrepareQuery / ExecuteInternal /
190+
// PrepareAndExecuteInternal (after releasing the GIL). Serialises every
191+
// path that touches `con.result` so concurrent calls on a single
192+
// DuckDBPyConnection cannot dereference an already-freed result — see
193+
// duckdb-python#435.
194+
std::recursive_mutex py_connection_lock;
169195
//! MemoryFileSystem used to temporarily store file-like objects for reading
170196
shared_ptr<ModifiedMemoryFileSystem> internal_object_filesystem;
171197
case_insensitive_map_t<unique_ptr<ExternalDependency>> registered_functions;

src/duckdb_py/pyconnection.cpp

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,18 @@ std::string DuckDBPyConnection::formatted_python_version = "";
7575

7676
DuckDBPyConnection::~DuckDBPyConnection() {
7777
try {
78-
py::gil_scoped_release gil;
79-
// Release any structures that do not need to hold the GIL here
80-
con.SetDatabase(nullptr);
81-
con.SetConnection(nullptr);
78+
// The native Connection / DuckDB teardown is pure C++ work — release
79+
// the GIL for it so other Python threads can run. The implicit member
80+
// destructors that fire after this scope (notably
81+
// `registered_functions`, a `case_insensitive_map_t<unique_ptr<ExternalDependency>>`
82+
// whose entries transitively own pybind-managed Python references)
83+
// run with the GIL reacquired because `gil` is destroyed at the end
84+
// of the inner block.
85+
{
86+
py::gil_scoped_release gil;
87+
con.SetDatabase(nullptr);
88+
con.SetConnection(nullptr);
89+
}
8290
} catch (...) { // NOLINT
8391
}
8492
}
@@ -492,6 +500,7 @@ void DuckDBPyConnection::Initialize(py::handle &m) {
492500

493501
shared_ptr<DuckDBPyConnection> DuckDBPyConnection::ExecuteMany(const py::object &query, py::object params_p) {
494502
py::gil_scoped_acquire gil;
503+
ConnectionLockGuard conn_lock(*this);
495504
con.SetResult(nullptr);
496505
if (params_p.is_none()) {
497506
params_p = py::list();
@@ -623,7 +632,7 @@ unique_ptr<PreparedStatement> DuckDBPyConnection::PrepareQuery(unique_ptr<SQLSta
623632
{
624633
D_ASSERT(py::gil_check());
625634
py::gil_scoped_release release;
626-
unique_lock<mutex> lock(py_connection_lock);
635+
unique_lock<std::recursive_mutex> lock(py_connection_lock);
627636

628637
prep = connection.Prepare(std::move(statement));
629638
if (prep->HasError()) {
@@ -644,7 +653,7 @@ unique_ptr<QueryResult> DuckDBPyConnection::ExecuteInternal(PreparedStatement &p
644653
{
645654
D_ASSERT(py::gil_check());
646655
py::gil_scoped_release release;
647-
unique_lock<std::mutex> lock(py_connection_lock);
656+
unique_lock<std::recursive_mutex> lock(py_connection_lock);
648657

649658
auto pending_query = prep.PendingQuery(named_values);
650659
if (pending_query->HasError()) {
@@ -671,7 +680,7 @@ unique_ptr<QueryResult> DuckDBPyConnection::PrepareAndExecuteInternal(unique_ptr
671680
{
672681
D_ASSERT(py::gil_check());
673682
py::gil_scoped_release release;
674-
unique_lock<std::mutex> lock(py_connection_lock);
683+
unique_lock<std::recursive_mutex> lock(py_connection_lock);
675684

676685
auto pending_query = con.GetConnection().PendingQuery(std::move(statement), named_values, true);
677686

@@ -710,6 +719,7 @@ shared_ptr<DuckDBPyConnection> DuckDBPyConnection::ExecuteFromString(const strin
710719

711720
shared_ptr<DuckDBPyConnection> DuckDBPyConnection::Execute(const py::object &query, py::object params) {
712721
py::gil_scoped_acquire gil;
722+
ConnectionLockGuard conn_lock(*this);
713723
con.SetResult(nullptr);
714724

715725
auto statements = GetStatements(query);
@@ -1879,6 +1889,7 @@ shared_ptr<DuckDBPyConnection> DuckDBPyConnection::Checkpoint() {
18791889
}
18801890

18811891
Optional<py::list> DuckDBPyConnection::GetDescription() {
1892+
ConnectionLockGuard conn_lock(*this);
18821893
if (!con.HasResult()) {
18831894
return py::none();
18841895
}
@@ -1891,11 +1902,22 @@ int DuckDBPyConnection::GetRowcount() {
18911902
}
18921903

18931904
void DuckDBPyConnection::Close() {
1905+
ConnectionLockGuard conn_lock(*this);
18941906
con.SetResult(nullptr);
18951907
D_ASSERT(py::gil_check());
1896-
py::gil_scoped_release release;
1897-
con.SetConnection(nullptr);
1898-
con.SetDatabase(nullptr);
1908+
// Release the GIL only for the native Connection / DuckDB teardown, which
1909+
// is pure C++ work and can take noticeable time. Hold the GIL back for
1910+
// `registered_functions.clear()` because the
1911+
// `case_insensitive_map_t<unique_ptr<ExternalDependency>>` it destroys
1912+
// transitively owns pybind-managed Python references (Python UDF
1913+
// callables, registered Python objects, …). Decrementing those
1914+
// references with the GIL released is undefined behaviour — see
1915+
// duckdb-python#456.
1916+
{
1917+
py::gil_scoped_release release;
1918+
con.SetConnection(nullptr);
1919+
con.SetDatabase(nullptr);
1920+
}
18991921
// https://peps.python.org/pep-0249/#Connection.close
19001922
cursors.ClearCursors();
19011923
registered_functions.clear();
@@ -2025,7 +2047,13 @@ shared_ptr<DuckDBPyConnection> DuckDBPyConnection::Cursor() {
20252047
}
20262048

20272049
// these should be functions on the result but well
2050+
//
2051+
// All of the connection-level fetch methods below take `py_connection_lock`
2052+
// before touching `con.GetResult()`, so that another thread cannot replace
2053+
// or destroy the connection's current result while we are mid-fetch — see
2054+
// duckdb-python#435.
20282055
Optional<py::tuple> DuckDBPyConnection::FetchOne() {
2056+
ConnectionLockGuard conn_lock(*this);
20292057
if (!con.HasResult()) {
20302058
throw InvalidInputException("No open result set");
20312059
}
@@ -2034,6 +2062,7 @@ Optional<py::tuple> DuckDBPyConnection::FetchOne() {
20342062
}
20352063

20362064
py::list DuckDBPyConnection::FetchMany(idx_t size) {
2065+
ConnectionLockGuard conn_lock(*this);
20372066
if (!con.HasResult()) {
20382067
throw InvalidInputException("No open result set");
20392068
}
@@ -2042,6 +2071,7 @@ py::list DuckDBPyConnection::FetchMany(idx_t size) {
20422071
}
20432072

20442073
py::list DuckDBPyConnection::FetchAll() {
2074+
ConnectionLockGuard conn_lock(*this);
20452075
if (!con.HasResult()) {
20462076
throw InvalidInputException("No open result set");
20472077
}
@@ -2050,6 +2080,7 @@ py::list DuckDBPyConnection::FetchAll() {
20502080
}
20512081

20522082
py::dict DuckDBPyConnection::FetchNumpy() {
2083+
ConnectionLockGuard conn_lock(*this);
20532084
if (!con.HasResult()) {
20542085
throw InvalidInputException("No open result set");
20552086
}
@@ -2058,6 +2089,7 @@ py::dict DuckDBPyConnection::FetchNumpy() {
20582089
}
20592090

20602091
PandasDataFrame DuckDBPyConnection::FetchDF(bool date_as_object) {
2092+
ConnectionLockGuard conn_lock(*this);
20612093
if (!con.HasResult()) {
20622094
throw InvalidInputException("No open result set");
20632095
}
@@ -2066,6 +2098,7 @@ PandasDataFrame DuckDBPyConnection::FetchDF(bool date_as_object) {
20662098
}
20672099

20682100
PandasDataFrame DuckDBPyConnection::FetchDFChunk(const idx_t vectors_per_chunk, bool date_as_object) {
2101+
ConnectionLockGuard conn_lock(*this);
20692102
if (!con.HasResult()) {
20702103
throw InvalidInputException("No open result set");
20712104
}
@@ -2074,6 +2107,7 @@ PandasDataFrame DuckDBPyConnection::FetchDFChunk(const idx_t vectors_per_chunk,
20742107
}
20752108

20762109
duckdb::pyarrow::Table DuckDBPyConnection::FetchArrow(idx_t rows_per_batch) {
2110+
ConnectionLockGuard conn_lock(*this);
20772111
if (!con.HasResult()) {
20782112
throw InvalidInputException("No open result set");
20792113
}
@@ -2082,6 +2116,7 @@ duckdb::pyarrow::Table DuckDBPyConnection::FetchArrow(idx_t rows_per_batch) {
20822116
}
20832117

20842118
py::dict DuckDBPyConnection::FetchPyTorch() {
2119+
ConnectionLockGuard conn_lock(*this);
20852120
if (!con.HasResult()) {
20862121
throw InvalidInputException("No open result set");
20872122
}
@@ -2090,6 +2125,7 @@ py::dict DuckDBPyConnection::FetchPyTorch() {
20902125
}
20912126

20922127
py::dict DuckDBPyConnection::FetchTF() {
2128+
ConnectionLockGuard conn_lock(*this);
20932129
if (!con.HasResult()) {
20942130
throw InvalidInputException("No open result set");
20952131
}
@@ -2098,6 +2134,7 @@ py::dict DuckDBPyConnection::FetchTF() {
20982134
}
20992135

21002136
PolarsDataFrame DuckDBPyConnection::FetchPolars(idx_t rows_per_batch, bool lazy) {
2137+
ConnectionLockGuard conn_lock(*this);
21012138
if (!con.HasResult()) {
21022139
throw InvalidInputException("No open result set");
21032140
}
@@ -2106,6 +2143,7 @@ PolarsDataFrame DuckDBPyConnection::FetchPolars(idx_t rows_per_batch, bool lazy)
21062143
}
21072144

21082145
duckdb::pyarrow::RecordBatchReader DuckDBPyConnection::FetchRecordBatchReader(const idx_t rows_per_batch) {
2146+
ConnectionLockGuard conn_lock(*this);
21092147
if (!con.HasResult()) {
21102148
throw InvalidInputException("No open result set");
21112149
}
@@ -2185,7 +2223,7 @@ static shared_ptr<DuckDBPyConnection> FetchOrCreateInstance(const string &databa
21852223
{
21862224
D_ASSERT(py::gil_check());
21872225
py::gil_scoped_release release;
2188-
unique_lock<mutex> lock(res->py_connection_lock);
2226+
unique_lock<std::recursive_mutex> lock(res->py_connection_lock);
21892227
auto database =
21902228
instance_cache.GetOrCreateInstance(database_path, config, cache_instance, InstantiateNewInstance);
21912229
res->con.SetDatabase(std::move(database));

src/duckdb_py/pyresult.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,14 @@ DuckDBPyResult::DuckDBPyResult(unique_ptr<QueryResult> result_p) : result(std::m
3434
}
3535

3636
DuckDBPyResult::~DuckDBPyResult() {
37+
// The destructor must run with the GIL held: `result` and `current_chunk`
38+
// can transitively own pybind-managed Python references (registered
39+
// objects, arrow release callbacks, PYTHON_OBJECT vector values, etc.),
40+
// whose teardown calls into the Python C API. Releasing the GIL here
41+
// (as the previous implementation did) causes Py_DECREF / PyObject_Free
42+
// to run without a valid PyThreadState — see duckdb-python#456.
3743
try {
3844
D_ASSERT(py::gil_check());
39-
py::gil_scoped_release gil;
4045
result.reset();
4146
current_chunk.reset();
4247
} catch (...) { // NOLINT

0 commit comments

Comments
 (0)