Get the arrow schema in a separate transaction for materialized data only

evertlammerts · evertlammerts · commit c8c35ebd7988 · 2026-06-16T11:20:39.000+02:00
diff --git a/src/duckdb_py/arrow/arrow_export_utils.cpp b/src/duckdb_py/arrow/arrow_export_utils.cpp
@@ -17,7 +17,7 @@ namespace duckdb {
 
 namespace pyarrow {
 
-py::object ToPyArrowSchema(ArrowSchema &schema) {
+py::object ToPyArrowSchema(const ArrowSchema &schema) {
 	py::gil_scoped_acquire acquire;
 
 	auto pyarrow_lib_module = py::module::import("pyarrow").attr("lib");
diff --git a/src/duckdb_py/include/duckdb_python/arrow/arrow_export_utils.hpp b/src/duckdb_py/include/duckdb_python/arrow/arrow_export_utils.hpp
@@ -6,7 +6,7 @@ namespace duckdb {
 
 namespace pyarrow {
 
-py::object ToPyArrowSchema(ArrowSchema &schema);
+py::object ToPyArrowSchema(const ArrowSchema &schema);
 
 py::object ToArrowTable(const vector<LogicalType> &types, const vector<string> &names, const py::list &batches,
                         ClientProperties &options);
diff --git a/src/duckdb_py/include/duckdb_python/pyresult.hpp b/src/duckdb_py/include/duckdb_python/pyresult.hpp
@@ -36,15 +36,13 @@ struct DuckDBPyResult {
 
 	PandasDataFrame FetchDF(bool date_as_object);
 
-	duckdb::pyarrow::Table FetchArrowTable(idx_t rows_per_batch, bool to_polars);
-
 	PandasDataFrame FetchDFChunk(const idx_t vectors_per_chunk = 1, bool date_as_object = false);
 
 	py::dict FetchPyTorch();
 
 	py::dict FetchTF();
 
-	ArrowArrayStream FetchArrowArrayStream(idx_t rows_per_batch = 1000000);
+	duckdb::pyarrow::Table FetchArrowTable(idx_t rows_per_batch, bool to_polars);
 	duckdb::pyarrow::RecordBatchReader FetchRecordBatchReader(idx_t rows_per_batch = 1000000);
 	py::object FetchArrowCapsule(idx_t rows_per_batch = 1000000);
 
@@ -78,7 +76,11 @@ struct DuckDBPyResult {
 	//! the context (so it survives `del conn`). Never call these on a StreamQueryResult:
 	//! a lazy result already has a live context and is converted/wrapped directly.
 	void PromoteMaterializedToArrow(idx_t batch_size);
-	void PromoteMaterializedToStream();
+
+	template <typename T>
+	T RunWithArrowSchema(const std::function<T(const ArrowSchema &)> &fun, bool dedup_col_names);
+	duckdb::pyarrow::Table MaterializedResultToArrowTable(const ArrowSchema &arrow_schema, idx_t rows_per_batch);
+	ArrowArrayStream FetchArrowArrayStream(idx_t rows_per_batch);
 
 private:
 	idx_t chunk_offset = 0;
diff --git a/src/duckdb_py/pyrelation.cpp b/src/duckdb_py/pyrelation.cpp
@@ -23,59 +23,6 @@
 #include "duckdb/common/arrow/physical_arrow_collector.hpp"
 #include "duckdb_python/arrow/arrow_export_utils.hpp"
 
-namespace {
-
-// A helper for arrow conversion. We want to be able to fetch a result's schema in the same transaction that
-// creates the result, so we have to wrap both calls in the same transaction. This helper always reverts the
-// transaction if we haven't committed it explicitly. Note that this is not the same as RunFunctionInTransaction:
-// we run _queries_ in a transaction (where each query acquires the context lock) while RFIT runs a function
-// while holding the context lock for that duration.
-// Note: this is a workaround that is intended to be temporary. We should really just cache the schema in the
-// ArrowQueryResult.
-
-void RunOrThrow(duckdb::ClientContext &context, const char *sql) {
-	auto result = context.Query(sql, duckdb::QueryParameters(false));
-	if (result->HasError()) {
-		result->ThrowError();
-	}
-}
-
-class ArrowConversionTransaction {
-public:
-	explicit ArrowConversionTransaction(duckdb::ClientContext &context_p) : context(context_p), owns(false) {
-		auto &txn = context.transaction;
-		if (txn.IsAutoCommit() && !txn.HasActiveTransaction()) {
-			RunOrThrow(context, "BEGIN TRANSACTION");
-			owns = true;
-		}
-	}
-
-	~ArrowConversionTransaction() {
-		if (owns) {
-			try {
-				RunOrThrow(context, "ROLLBACK");
-			} catch (...) { // NOLINT
-			}
-		}
-	}
-
-	void Commit() {
-		if (owns) {
-			RunOrThrow(context, "COMMIT");
-			owns = false;
-		}
-	}
-
-	ArrowConversionTransaction(const ArrowConversionTransaction &) = delete;
-	ArrowConversionTransaction &operator=(const ArrowConversionTransaction &) = delete;
-
-private:
-	duckdb::ClientContext &context;
-	bool owns;
-};
-
-} // namespace
-
 namespace duckdb {
 
 DuckDBPyRelation::DuckDBPyRelation(shared_ptr<Relation> rel_p) : rel(std::move(rel_p)) {
@@ -1013,22 +960,10 @@ PandasDataFrame DuckDBPyRelation::FetchDFChunk(idx_t vectors_per_chunk, bool dat
 	return result->FetchDFChunk(vectors_per_chunk, date_as_object);
 }
 
-duckdb::pyarrow::Table DuckDBPyRelation::ToArrowTableInternal(idx_t batch_size, bool to_polars) {
+pyarrow::Table DuckDBPyRelation::ToArrowTableInternal(idx_t batch_size, bool to_polars) {
 	if (!result && !rel) {
 		return py::none();
 	}
-	// Make sure we have a valid client context
-	shared_ptr<ClientContext> context;
-	if (rel) {
-		context = rel->context->GetContext();
-	} else if (auto cc = result->GetClientProperties().client_context) {
-		context = cc->shared_from_this();
-	} else {
-		throw ConnectionException("Cannot fetch an arrow table without a valid connection");
-	}
-	// Start (or piggyback on) a transaction for the conversion
-	ArrowConversionTransaction conversion_txn(*context);
-
 	if (!result) {
 		auto &config = ClientConfig::GetConfig(*rel->context->GetContext());
 		ScopedConfigSetting scoped_setting(
@@ -1044,8 +979,6 @@ duckdb::pyarrow::Table DuckDBPyRelation::ToArrowTableInternal(idx_t batch_size,
 	AssertResultOpen();
 	auto res = result->FetchArrowTable(batch_size, to_polars);
 	result = nullptr;
-	// We must commit the transaction before returning
-	conversion_txn.Commit();
 	return res;
 }
 
diff --git a/src/duckdb_py/pyresult.cpp b/src/duckdb_py/pyresult.cpp
@@ -483,104 +483,112 @@ void DuckDBPyResult::PromoteMaterializedToArrow(idx_t batch_size) {
 	result = std::move(new_result);
 }
 
-// Re-feed a materialized result as a lazy stream on the user's own context. The
-// StreamQueryResult co-owns the context, so conversion survives `del conn` and runs under a
-// live transaction (geometry/extension correctness, #492).
-void DuckDBPyResult::PromoteMaterializedToStream() {
-	D_ASSERT(result->type == QueryResultType::MATERIALIZED_RESULT);
-	auto client_context = result->client_properties.client_context;
-	if (!client_context) {
-		throw InternalException("Cannot promote result to an Arrow stream: the originating client context is gone");
+template <typename T>
+T DuckDBPyResult::RunWithArrowSchema(const std::function<T(const ArrowSchema &)> &fun, bool dedup_col_names) {
+	D_ASSERT(result);
+	if (!result->client_properties.client_context) {
+		throw ConnectionException("Cannot fetch arrow schema without a valid connection");
 	}
-	auto context = client_context->shared_from_this();
-	auto &materialized = result->Cast<MaterializedQueryResult>();
-	auto names = result->names;
-	auto select = MakeColumnDataScanStatement(materialized.TakeCollection(), names);
+	auto ctx = result->client_properties.client_context->shared_from_this();
 
-	unique_ptr<QueryResult> new_result;
-	{
-		D_ASSERT(py::gil_check());
-		py::gil_scoped_release release;
-		auto pending_query = context->PendingQuery(std::move(select), QueryParameters(true));
-		new_result = DuckDBPyConnection::CompletePendingQuery(*pending_query);
-	}
-	if (new_result->HasError()) {
-		new_result->ThrowError();
+	auto names = result->names;
+	if (dedup_col_names) {
+		QueryResult::DeduplicateColumns(names);
 	}
-	new_result->names = std::move(names);
-	result = std::move(new_result);
+
+	ArrowSchema arrow_schema;
+	ctx->RunFunctionInTransaction(
+	    [&] { ArrowConverter::ToArrowSchema(&arrow_schema, result->types, names, result->client_properties); });
+
+	return fun(arrow_schema);
 }
 
-duckdb::pyarrow::Table DuckDBPyResult::FetchArrowTable(idx_t rows_per_batch, bool to_polars) {
-	if (!result) {
-		throw InvalidInputException("There is no query result");
-	}
-	// ARROW_RESULT: fresh collector output. MATERIALIZED: re-feed for parallel conversion.
-	// STREAM: a live result, converted directly below (never materialized to re-feed).
+duckdb::pyarrow::Table DuckDBPyResult::MaterializedResultToArrowTable(const ArrowSchema &arrow_schema,
+                                                                      const idx_t rows_per_batch) {
+	D_ASSERT(result);
+	D_ASSERT(result->type == QueryResultType::MATERIALIZED_RESULT || result->type == QueryResultType::ARROW_RESULT);
+
+	auto pyarrow_schema = pyarrow::ToPyArrowSchema(arrow_schema);
 	if (result->type == QueryResultType::MATERIALIZED_RESULT) {
 		PromoteMaterializedToArrow(rows_per_batch);
 	}
-
-	auto names = result->names;
-	if (to_polars) {
-		QueryResult::DeduplicateColumns(names);
+	py::list batches;
+	auto &arrow_result = result->Cast<ArrowQueryResult>();
+	auto arrays = arrow_result.ConsumeArrays();
+	for (auto &array : arrays) {
+		ArrowArray data = array->arrow_array;
+		array->arrow_array.release = nullptr;
+		TransformDuckToArrowChunk(pyarrow_schema, data, batches);
 	}
+	return pyarrow::ToArrowTable(std::move(batches), pyarrow_schema);
+}
 
-	// Fetch the schema once
-	ArrowSchema arrow_schema;
-	ArrowConverter::ToArrowSchema(&arrow_schema, result->types, names, result->client_properties);
-	auto pyarrow_schema = pyarrow::ToPyArrowSchema(arrow_schema);
-
-	py::list batches;
-	if (result->type == QueryResultType::ARROW_RESULT) {
-		auto &arrow_result = result->Cast<ArrowQueryResult>();
-		auto arrays = arrow_result.ConsumeArrays();
-		for (auto &array : arrays) {
-			ArrowArray data = array->arrow_array;
-			array->arrow_array.release = nullptr;
-			TransformDuckToArrowChunk(pyarrow_schema, data, batches);
-		}
-	} else {
-		// STREAM_RESULT: pull the live stream directly into Arrow batches.
-		QueryResultChunkScanState scan_state(*result);
-		while (true) {
-			ArrowArray data;
-			idx_t count;
-			{
-				D_ASSERT(py::gil_check());
-				py::gil_scoped_release release;
-				count = ArrowUtil::FetchChunk(scan_state, result->client_properties, rows_per_batch, &data,
-				                              ArrowTypeExtensionData::GetExtensionTypes(
-				                                  *result->client_properties.client_context, result->types));
-			}
-			if (count == 0) {
-				break;
-			}
-			TransformDuckToArrowChunk(pyarrow_schema, data, batches);
-		}
+duckdb::pyarrow::Table DuckDBPyResult::FetchArrowTable(const idx_t rows_per_batch, const bool to_polars) {
+	if (!result) {
+		throw InvalidInputException("There is no query result");
 	}
 
-	return pyarrow::ToArrowTable(std::move(batches), pyarrow_schema);
+	return RunWithArrowSchema<duckdb::pyarrow::Table>(
+	    [&](const ArrowSchema &schema) -> duckdb::pyarrow::Table {
+		    if (result->type == QueryResultType::MATERIALIZED_RESULT || result->type == QueryResultType::ARROW_RESULT) {
+			    return MaterializedResultToArrowTable(schema, rows_per_batch);
+		    }
+		    if (result->type != QueryResultType::STREAM_RESULT) {
+			    throw InternalException("FetchArrowTable called with unsupported query result: %d", result->type);
+		    }
+		    auto pyarrow_schema = pyarrow::ToPyArrowSchema(schema);
+		    py::list batches;
+		    QueryResultChunkScanState scan_state(*result);
+		    while (true) {
+			    ArrowArray data;
+			    idx_t count;
+			    {
+				    D_ASSERT(py::gil_check());
+				    py::gil_scoped_release release;
+				    count = ArrowUtil::FetchChunk(scan_state, result->client_properties, rows_per_batch, &data,
+				                                  ArrowTypeExtensionData::GetExtensionTypes(
+				                                      *result->client_properties.client_context, result->types));
+			    }
+			    if (count == 0) {
+				    break;
+			    }
+			    TransformDuckToArrowChunk(pyarrow_schema, data, batches);
+		    }
+		    return pyarrow::ToArrowTable(std::move(batches), pyarrow_schema);
+	    },
+	    to_polars);
 }
 
 ArrowArrayStream DuckDBPyResult::FetchArrowArrayStream(idx_t rows_per_batch) {
 	if (!result) {
 		throw InvalidInputException("There is no query result");
 	}
-	// Re-feed a materialized result to get a context-owning stream; a StreamQueryResult is
-	// wrapped directly (already has a live context).
-	if (result->type == QueryResultType::MATERIALIZED_RESULT) {
-		PromoteMaterializedToStream();
+	if (result->type != QueryResultType::STREAM_RESULT) {
+		throw InternalException("FetchArrowArrayStream called with unsupported query result: %d", result->type);
 	}
 	// The wrapper is owned by the ArrowArrayStream's private_data (released with the stream).
-	ResultArrowArrayStreamWrapper *result_stream = new ResultArrowArrayStreamWrapper(std::move(result), rows_per_batch);
+	const auto result_stream = new ResultArrowArrayStreamWrapper(std::move(result), rows_per_batch);
 	return result_stream->stream;
 }
 
 duckdb::pyarrow::RecordBatchReader DuckDBPyResult::FetchRecordBatchReader(idx_t rows_per_batch) {
 	if (!result) {
 		throw InvalidInputException("There is no query result");
 	}
+
+	if (result->type == QueryResultType::MATERIALIZED_RESULT || result->type == QueryResultType::ARROW_RESULT) {
+		constexpr bool dedup_column_names = false;
+		return RunWithArrowSchema<duckdb::pyarrow::RecordBatchReader>(
+		    [&](const ArrowSchema &schema) -> duckdb::pyarrow::RecordBatchReader {
+			    const auto table = MaterializedResultToArrowTable(schema, rows_per_batch);
+			    return py::cast<duckdb::pyarrow::RecordBatchReader>(
+			        table.attr("to_reader")(py::arg("max_chunksize") = rows_per_batch));
+		    },
+		    dedup_column_names);
+	}
+	if (result->type != QueryResultType::STREAM_RESULT) {
+		throw InternalException("FetchRecordBatchReader called with unsupported query result: %d", result->type);
+	}
 	py::gil_scoped_acquire acquire;
 	auto pyarrow_lib_module = py::module::import("pyarrow").attr("lib");
 	auto record_batch_reader_func = pyarrow_lib_module.attr("RecordBatchReader").attr("_import_from_c");
@@ -601,11 +609,23 @@ static void ArrowArrayStreamPyCapsuleDestructor(PyObject *object) {
 	delete stream;
 }
 
-py::object DuckDBPyResult::FetchArrowCapsule(idx_t rows_per_batch) {
+py::object DuckDBPyResult::FetchArrowCapsule(const idx_t rows_per_batch) {
 	if (!result) {
 		throw InvalidInputException("There is no query result");
 	}
-	// Lazy streaming capsule backed by a context-owning stream (see FetchArrowArrayStream).
+
+	constexpr bool dedup_column_names = false;
+	if (result->type == QueryResultType::MATERIALIZED_RESULT || result->type == QueryResultType::ARROW_RESULT) {
+		return RunWithArrowSchema<py::object>(
+		    [&](const ArrowSchema &schema) -> py::object {
+			    const auto table = MaterializedResultToArrowTable(schema, rows_per_batch);
+			    return table.attr("__arrow_c_stream__")();
+		    },
+		    dedup_column_names);
+	}
+	if (result->type != QueryResultType::STREAM_RESULT) {
+		throw InternalException("FetchArrowCapsule called with unsupported query result: %d", result->type);
+	}
 	auto inner_stream = FetchArrowArrayStream(rows_per_batch);
 	auto stream = new ArrowArrayStream();
 	*stream = inner_stream;