@@ -66,6 +66,44 @@ unique_ptr<ArrowArrayStreamWrapper> PythonTableArrowArrayStreamFactory::Produce(
6666 py::handle arrow_obj_handle (factory->arrow_object );
6767 auto arrow_object_type = DuckDBPyConnection::GetArrowType (arrow_obj_handle);
6868
69+ if (arrow_object_type == PyArrowObjectType::PyCapsuleInterface) {
70+ py::object capsule_obj = arrow_obj_handle.attr (" __arrow_c_stream__" )();
71+ auto capsule = py::reinterpret_borrow<py::capsule>(capsule_obj);
72+ auto stream = capsule.get_pointer <struct ArrowArrayStream >();
73+ if (!stream->release ) {
74+ throw InvalidInputException (
75+ " The __arrow_c_stream__() method returned a released stream. "
76+ " If this object is single-use, implement __arrow_c_schema__() or expose a .schema attribute "
77+ " with _export_to_c() so that DuckDB can extract the schema without consuming the stream." );
78+ }
79+
80+ if (ModuleIsLoaded<PyarrowDatasetCacheItem>()) {
81+ // Tier A: full pushdown via pyarrow.dataset
82+ // Import as RecordBatchReader, feed through Scanner.from_batches for projection/filter pushdown.
83+ auto pyarrow_lib_module = py::module::import (" pyarrow" ).attr (" lib" );
84+ auto import_func = pyarrow_lib_module.attr (" RecordBatchReader" ).attr (" _import_from_c" );
85+ py::object reader = import_func (reinterpret_cast <uint64_t >(stream));
86+ // _import_from_c takes ownership of the stream; null out to prevent capsule double-free
87+ stream->release = nullptr ;
88+ auto &import_cache = *DuckDBPyConnection::ImportCache ();
89+ py::object arrow_batch_scanner = import_cache.pyarrow .dataset .Scanner ().attr (" from_batches" );
90+ py::handle reader_handle = reader;
91+ auto scanner = ProduceScanner (arrow_batch_scanner, reader_handle, parameters, factory->client_properties );
92+ auto record_batches = scanner.attr (" to_reader" )();
93+ auto res = make_uniq<ArrowArrayStreamWrapper>();
94+ auto export_to_c = record_batches.attr (" _export_to_c" );
95+ export_to_c (reinterpret_cast <uint64_t >(&res->arrow_array_stream ));
96+ return res;
97+ } else {
98+ // Tier B: no pyarrow.dataset, return raw stream (no pushdown)
99+ // DuckDB applies projection/filter post-scan via arrow_scan_dumb
100+ auto res = make_uniq<ArrowArrayStreamWrapper>();
101+ res->arrow_array_stream = *stream;
102+ stream->release = nullptr ;
103+ return res;
104+ }
105+ }
106+
69107 if (arrow_object_type == PyArrowObjectType::PyCapsule) {
70108 auto res = make_uniq<ArrowArrayStreamWrapper>();
71109 auto capsule = py::reinterpret_borrow<py::capsule>(arrow_obj_handle);
@@ -78,21 +116,12 @@ unique_ptr<ArrowArrayStreamWrapper> PythonTableArrowArrayStreamFactory::Produce(
78116 return res;
79117 }
80118
119+ // Scanner and Dataset: require pyarrow.dataset for pushdown
120+ VerifyArrowDatasetLoaded ();
81121 auto &import_cache = *DuckDBPyConnection::ImportCache ();
82122 py::object scanner;
83123 py::object arrow_batch_scanner = import_cache.pyarrow .dataset .Scanner ().attr (" from_batches" );
84124 switch (arrow_object_type) {
85- case PyArrowObjectType::Table: {
86- auto arrow_dataset = import_cache.pyarrow .dataset ().attr (" dataset" );
87- auto dataset = arrow_dataset (arrow_obj_handle);
88- py::object arrow_scanner = dataset.attr (" __class__" ).attr (" scanner" );
89- scanner = ProduceScanner (arrow_scanner, dataset, parameters, factory->client_properties );
90- break ;
91- }
92- case PyArrowObjectType::RecordBatchReader: {
93- scanner = ProduceScanner (arrow_batch_scanner, arrow_obj_handle, parameters, factory->client_properties );
94- break ;
95- }
96125 case PyArrowObjectType::Scanner: {
97126 // If it's a scanner we have to turn it to a record batch reader, and then a scanner again since we can't stack
98127 // scanners on arrow Otherwise pushed-down projections and filters will disappear like tears in the rain
@@ -119,37 +148,29 @@ unique_ptr<ArrowArrayStreamWrapper> PythonTableArrowArrayStreamFactory::Produce(
119148}
120149
121150void PythonTableArrowArrayStreamFactory::GetSchemaInternal (py::handle arrow_obj_handle, ArrowSchemaWrapper &schema) {
151+ // PyCapsule (from bare capsule Produce path)
122152 if (py::isinstance<py::capsule>(arrow_obj_handle)) {
123153 auto capsule = py::reinterpret_borrow<py::capsule>(arrow_obj_handle);
124154 auto stream = capsule.get_pointer <struct ArrowArrayStream >();
125155 if (!stream->release ) {
126156 throw InternalException (" ArrowArrayStream was released by another thread/library" );
127157 }
128- stream->get_schema (stream, &schema.arrow_schema );
129- return ;
130- }
131-
132- auto table_class = py::module::import (" pyarrow" ).attr (" Table" );
133- if (py::isinstance (arrow_obj_handle, table_class)) {
134- auto obj_schema = arrow_obj_handle.attr (" schema" );
135- auto export_to_c = obj_schema.attr (" _export_to_c" );
136- export_to_c (reinterpret_cast <uint64_t >(&schema.arrow_schema ));
158+ if (stream->get_schema (stream, &schema.arrow_schema )) {
159+ throw InvalidInputException (" Failed to get Arrow schema from stream: %s" ,
160+ stream->get_last_error ? stream->get_last_error (stream) : " unknown error" );
161+ }
137162 return ;
138163 }
139164
165+ // Scanner: use projected_schema; everything else (RecordBatchReader, Dataset): use .schema
140166 VerifyArrowDatasetLoaded ();
141-
142167 auto &import_cache = *DuckDBPyConnection::ImportCache ();
143- auto scanner_class = import_cache.pyarrow .dataset .Scanner ();
144-
145- if (py::isinstance (arrow_obj_handle, scanner_class)) {
168+ if (py::isinstance (arrow_obj_handle, import_cache.pyarrow .dataset .Scanner ())) {
146169 auto obj_schema = arrow_obj_handle.attr (" projected_schema" );
147- auto export_to_c = obj_schema.attr (" _export_to_c" );
148- export_to_c (reinterpret_cast <uint64_t >(&schema));
170+ obj_schema.attr (" _export_to_c" )(reinterpret_cast <uint64_t >(&schema.arrow_schema ));
149171 } else {
150172 auto obj_schema = arrow_obj_handle.attr (" schema" );
151- auto export_to_c = obj_schema.attr (" _export_to_c" );
152- export_to_c (reinterpret_cast <uint64_t >(&schema));
173+ obj_schema.attr (" _export_to_c" )(reinterpret_cast <uint64_t >(&schema.arrow_schema ));
153174 }
154175}
155176
@@ -158,6 +179,36 @@ void PythonTableArrowArrayStreamFactory::GetSchema(uintptr_t factory_ptr, ArrowS
158179 auto factory = static_cast <PythonTableArrowArrayStreamFactory *>(reinterpret_cast <void *>(factory_ptr)); // NOLINT
159180 D_ASSERT (factory->arrow_object );
160181 py::handle arrow_obj_handle (factory->arrow_object );
182+
183+ auto type = DuckDBPyConnection::GetArrowType (arrow_obj_handle);
184+ if (type == PyArrowObjectType::PyCapsuleInterface) {
185+ // Get __arrow_c_schema__ if it exists
186+ if (py::hasattr (arrow_obj_handle, " __arrow_c_schema__" )) {
187+ auto schema_capsule = arrow_obj_handle.attr (" __arrow_c_schema__" )();
188+ auto capsule = py::reinterpret_borrow<py::capsule>(schema_capsule);
189+ auto arrow_schema = capsule.get_pointer <struct ArrowSchema >();
190+ schema.arrow_schema = *arrow_schema;
191+ arrow_schema->release = nullptr ; // take ownership
192+ return ;
193+ }
194+ // Otherwise try to use .schema with _export_to_c
195+ if (py::hasattr (arrow_obj_handle, " schema" )) {
196+ auto obj_schema = arrow_obj_handle.attr (" schema" );
197+ if (py::hasattr (obj_schema, " _export_to_c" )) {
198+ obj_schema.attr (" _export_to_c" )(reinterpret_cast <uint64_t >(&schema.arrow_schema ));
199+ return ;
200+ }
201+ }
202+ // Fallback: create a temporary stream just for the schema (consumes single-use streams!)
203+ auto stream_capsule = arrow_obj_handle.attr (" __arrow_c_stream__" )();
204+ auto capsule = py::reinterpret_borrow<py::capsule>(stream_capsule);
205+ auto stream = capsule.get_pointer <struct ArrowArrayStream >();
206+ if (stream->get_schema (stream, &schema.arrow_schema )) {
207+ throw InvalidInputException (" Failed to get Arrow schema from stream: %s" ,
208+ stream->get_last_error ? stream->get_last_error (stream) : " unknown error" );
209+ }
210+ return ; // stream_capsule goes out of scope, stream released by capsule destructor
211+ }
161212 GetSchemaInternal (arrow_obj_handle, schema);
162213}
163214
0 commit comments