77#include " duckdb/common/types/data_chunk.hpp"
88#include " duckdb/common/types/vector.hpp"
99
10+ #include < condition_variable>
11+ #include < mutex>
12+ #include < thread>
13+
1014namespace duckdb {
1115
1216// Maps the engine's textual log level (stored as VARCHAR in the LOG_ENTRIES chunk) to the
@@ -30,6 +34,129 @@ static int LevelStringToPython(const string &level_str) {
3034 return 30 ;
3135}
3236
37+ // ===--------------------------------------------------------------------===//
38+ // Asynchronous forwarder
39+ //
40+ // The engine invokes FlushChunk while holding LogManager::lock — a non-recursive mutex that is
41+ // also taken by LogManager::CreateLogger / WriteLogEntry / Flush. Acquiring the GIL from inside
42+ // that lock deadlocks: a worker thread holding the lock blocks on the GIL, while another thread
43+ // holding the GIL blocks on the lock (e.g. via CreateLogger at the start of a concurrent query).
44+ // We observed exactly this with two threads each running execute() on one database.
45+ //
46+ // So forwarding is decoupled. FlushChunk only copies plain (level, message) data into this
47+ // process-global queue (no GIL, no Python). A single background thread drains the queue and
48+ // forwards to logging.getLogger("duckdb") with the GIL held but NO engine lock held — breaking
49+ // the lock-ordering cycle. One global thread (not one per DatabaseInstance) avoids spawning a
50+ // thread per connection. The queue holds owned copies, so it is independent of any storage's
51+ // lifetime.
52+ // ===--------------------------------------------------------------------===//
53+ namespace {
54+
55+ struct PendingLogEntry {
56+ int level;
57+ string message;
58+ };
59+
60+ struct LogForwarder {
61+ std::mutex mutex; // guards the fields below; NEVER held while acquiring the GIL
62+ std::condition_variable cv; // forwarder waits here for work
63+ std::condition_variable idle_cv; // drainers wait here for the queue to empty
64+ vector<PendingLogEntry> queue;
65+ bool stop = false ;
66+ bool started = false ;
67+ bool busy = false ; // a batch has been dequeued but not yet forwarded
68+ std::thread thread;
69+ };
70+
71+ LogForwarder &GetForwarder () {
72+ static LogForwarder forwarder;
73+ return forwarder;
74+ }
75+
76+ void ForwarderLoop () {
77+ auto &fwd = GetForwarder ();
78+ while (true ) {
79+ vector<PendingLogEntry> batch;
80+ {
81+ std::unique_lock<std::mutex> lck (fwd.mutex );
82+ fwd.cv .wait (lck, [&fwd] { return fwd.stop || !fwd.queue .empty (); });
83+ if (fwd.stop && fwd.queue .empty ()) {
84+ return ;
85+ }
86+ batch.swap (fwd.queue );
87+ fwd.busy = true ; // queue is empty again, but this batch isn't delivered yet
88+ }
89+ // No engine lock and no forwarder lock held here, so acquiring the GIL cannot deadlock.
90+ if (Py_IsInitialized ()) { // else interpreter is finalizing — acquiring the GIL would crash
91+ try {
92+ py::gil_scoped_acquire gil;
93+ auto logging = py::module::import (" logging" );
94+ auto logger = logging.attr (" getLogger" )(" duckdb" );
95+ for (auto &entry : batch) {
96+ logger.attr (" log" )(entry.level , entry.message );
97+ }
98+ } catch (...) {
99+ // Logging must never disrupt anything.
100+ }
101+ }
102+ {
103+ std::unique_lock<std::mutex> lck (fwd.mutex );
104+ fwd.busy = false ;
105+ fwd.idle_cv .notify_all (); // wake any DrainForwarder() waiters
106+ }
107+ }
108+ }
109+
110+ // atexit callback: stop and join the forwarder while the interpreter is still alive. Runs on the
111+ // main thread with the GIL held; the GIL is released around join() because the forwarder may be
112+ // parked in take_gil and could not otherwise wake to observe `stop`.
113+ void StopForwarder () {
114+ auto &fwd = GetForwarder ();
115+ {
116+ std::unique_lock<std::mutex> lck (fwd.mutex );
117+ if (!fwd.started ) {
118+ return ;
119+ }
120+ fwd.stop = true ;
121+ }
122+ fwd.cv .notify_all ();
123+ if (fwd.thread .joinable ()) {
124+ py::gil_scoped_release release;
125+ fwd.thread .join ();
126+ }
127+ }
128+
129+ } // namespace
130+
131+ void PythonLogStorage::EnsureForwarderStarted () {
132+ // Called from Connect() with the GIL held and no engine lock held.
133+ auto &fwd = GetForwarder ();
134+ {
135+ std::unique_lock<std::mutex> lck (fwd.mutex );
136+ if (fwd.started ) {
137+ return ;
138+ }
139+ fwd.started = true ;
140+ fwd.thread = std::thread (ForwarderLoop);
141+ }
142+ // Stop+join before interpreter finalization. Joining a GIL-blocked thread after Py_Finalize
143+ // would crash, so we hook atexit (which runs while the interpreter is still valid).
144+ try {
145+ auto atexit = py::module::import (" atexit" );
146+ atexit.attr (" register" )(py::cpp_function ([]() { StopForwarder (); }));
147+ } catch (...) {
148+ }
149+ }
150+
151+ void PythonLogStorage::DrainForwarder () {
152+ auto &fwd = GetForwarder ();
153+ // Release the GIL while waiting: the forwarder thread needs it to finish its in-flight batch
154+ // and signal idle. Holding it here would deadlock the very thread we're waiting on.
155+ py::gil_scoped_release release;
156+ std::unique_lock<std::mutex> lck (fwd.mutex );
157+ fwd.idle_cv .wait (lck, [&fwd] { return fwd.queue .empty () && !fwd.busy ; });
158+ }
159+
33160PythonLogStorage::PythonLogStorage (DatabaseInstance &db) : BufferingLogStorage(db, 1 , true ) {
34161 log_storage_buffers[LoggingTargetTable::LOG_ENTRIES ] =
35162 make_uniq<ColumnDataCollection>(Allocator::DefaultAllocator (), GetSchema (LoggingTargetTable::LOG_ENTRIES ));
@@ -48,43 +175,38 @@ ColumnDataCollection &PythonLogStorage::GetBuffer(LoggingTargetTable table) cons
48175 return *res->second ;
49176}
50177
51- void PythonLogStorage::ForwardEntriesToPython (DataChunk &chunk) {
52- // This fires from engine worker threads with the GIL released, and from within both the
53- // LogManager lock and this storage's lock. It runs arbitrary user Python (logging
54- // handlers) and MUST NOT let an exception escape: the engine calls the write path with no
55- // try/catch, directly from query binding/execution, so a raising handler would otherwise
56- // fail the user's query. Hence we swallow everything here .
178+ void PythonLogStorage::EnqueueEntriesForPython (DataChunk &chunk) {
179+ // Runs under LogManager::lock (and our scan lock). It MUST NOT touch the GIL or call Python:
180+ // doing so here would deadlock against any thread that holds the GIL and then enters a
181+ // LogManager method that needs the same lock (CreateLogger / WriteLogEntry / Flush). So we
182+ // only copy plain data into the global queue; the forwarder thread does the Python work
183+ // lock-free. The strings are deep-copied (GetString), so they outlive this chunk .
57184 //
58- // Caveat: because a lock is held across this call, a handler that re-enters DuckDB on the
59- // same thread and emits another log entry can self-deadlock on the non-recursive lock.
60- // That is outside our control (and matches the engine's own contract for log storages).
61- if (!Py_IsInitialized ()) {
62- return ; // interpreter is finalizing — acquiring the GIL would crash
63- }
64- try {
65- py::gil_scoped_acquire gil;
66- auto logging = py::module::import (" logging" );
67- auto logger = logging.attr (" getLogger" )(" duckdb" );
68- // LOG_ENTRIES schema: context_id, timestamp, type, log_level (idx 3), message (idx 4).
69- // log_level and message are both VARCHAR; the buffer chunk is flat.
70- auto level_data = FlatVector::GetData<string_t >(chunk.data [3 ]);
71- auto message_data = FlatVector::GetData<string_t >(chunk.data [4 ]);
185+ // A side benefit of decoupling: a user logging handler that raises now runs on the forwarder
186+ // thread, where the exception is swallowed — it can never reach the engine's query path.
187+ //
188+ // LOG_ENTRIES schema: context_id, timestamp, type, log_level (idx 3), message (idx 4).
189+ // log_level and message are both VARCHAR; the buffer chunk is flat.
190+ auto level_data = FlatVector::GetData<string_t >(chunk.data [3 ]);
191+ auto message_data = FlatVector::GetData<string_t >(chunk.data [4 ]);
192+ auto &fwd = GetForwarder ();
193+ {
194+ std::unique_lock<std::mutex> lck (fwd.mutex );
72195 for (idx_t i = 0 ; i < chunk.size (); i++) {
73- logger. attr ( " log " )( LevelStringToPython (level_data[i].GetString ()), message_data[i].GetString ());
196+ fwd. queue . push_back ({ LevelStringToPython (level_data[i].GetString ()), message_data[i].GetString ()} );
74197 }
75- } catch (...) {
76- // Logging must never disrupt query execution.
77198 }
199+ fwd.cv .notify_one ();
78200}
79201
80202void PythonLogStorage::FlushChunk (LoggingTargetTable table, DataChunk &chunk) {
81203 D_ASSERT (table == LoggingTargetTable::LOG_ENTRIES || table == LoggingTargetTable::LOG_CONTEXTS );
82204 // Retain the entry for duckdb_logs FIRST, so a misbehaving Python handler can never cost
83205 // us a stored row.
84206 log_storage_buffers[table]->Append (chunk);
85- // Forward only real log entries (not context metadata) to Python's logging module .
207+ // Queue only real log entries (not context metadata) for async forwarding to logging .
86208 if (table == LoggingTargetTable::LOG_ENTRIES ) {
87- ForwardEntriesToPython (chunk);
209+ EnqueueEntriesForPython (chunk);
88210 }
89211}
90212
0 commit comments