@@ -17,12 +17,12 @@ use vortex::array::arrays::ChunkedArray;
1717use vortex:: array:: arrays:: ExtensionArray ;
1818use vortex:: array:: arrow:: FromArrowArray ;
1919use vortex:: dtype:: DType ;
20+ use vortex:: dtype:: arrow:: ARROW_EXT_NAME_VARIANT ;
2021use vortex:: dtype:: arrow:: FromArrowType ;
2122use vortex:: dtype:: extension:: ExtId ;
2223use vortex:: dtype:: session:: DTypeSessionExt ;
2324use vortex:: error:: VortexError ;
2425use vortex:: error:: VortexResult ;
25- use vortex:: error:: vortex_err;
2626use vortex:: session:: VortexSession ;
2727
2828use crate :: SESSION ;
@@ -37,8 +37,7 @@ use crate::error::PyVortexResult;
3737
3838/// Convert a Python `pyarrow` array (including `pa.ExtensionArray`) into a Vortex array.
3939///
40- /// Arrow's C ABI strips extension identity from the array layer — it lives on `Field`
41- /// metadata, and a leaf `pa.ExtensionArray` has no enclosing field. We recover it from the
40+ /// The Arrow C ABI strips extension identity from leaf arrays; we recover it from the
4241/// Python object via `extension_name` and `__arrow_ext_serialize__`.
4342pub trait FromPyArrowArray : Sized {
4443 /// Convert a Python `pyarrow` array to a Vortex array.
@@ -78,22 +77,27 @@ impl FromPyArrowArray for ArrayRef {
7877 }
7978}
8079
81- /// `__arrow_ext_serialize__` returns raw bytes; we pass them straight to the plugin.
82- /// Base64 is only the encoding used in the Arrow Field-metadata string channel — going
83- /// directly Python → registry skips that hop .
80+ /// Raw bytes from `__arrow_ext_serialize__` — no base64 (that's only for the
81+ /// Arrow Field-metadata string channel). Variant short-circuits to `None` so it surfaces
82+ /// as `DType::Variant` via the storage path, mirroring `dtype/arrow.rs::dtype_from_field` .
8483fn extract_extension_info ( py_array : & Bound < ' _ , PyAny > ) -> PyResult < Option < ( String , Vec < u8 > ) > > {
8584 let py = py_array. py ( ) ;
8685 let py_type = py_array. getattr ( intern ! ( py, "type" ) ) ?;
8786 if !py_type. is_instance ( extension_type_class ( py) ?) ? {
8887 return Ok ( None ) ;
8988 }
9089 let ext_name: String = py_type. getattr ( intern ! ( py, "extension_name" ) ) ?. extract ( ) ?;
90+ if ext_name == ARROW_EXT_NAME_VARIANT {
91+ return Ok ( None ) ;
92+ }
9193 let ext_meta_bytes: Vec < u8 > = py_type
9294 . call_method0 ( intern ! ( py, "__arrow_ext_serialize__" ) ) ?
9395 . extract ( ) ?;
9496 Ok ( Some ( ( ext_name, ext_meta_bytes) ) )
9597}
9698
99+ /// Soft fallback to storage on registry miss or malformed metadata, mirroring
100+ /// `dtype/arrow.rs::resolve_extension_dtype`.
97101fn wrap_with_extension (
98102 storage : ArrayRef ,
99103 ext_name : & str ,
@@ -102,11 +106,20 @@ fn wrap_with_extension(
102106) -> VortexResult < ArrayRef > {
103107 let ext_id = ExtId :: new ( ext_name) ;
104108 let dtypes = session. dtypes ( ) ;
105- let plugin = dtypes
106- . registry ( )
107- . find ( & ext_id)
108- . ok_or_else ( || vortex_err ! ( "extension `{ext_name}` is not registered on the session" ) ) ?;
109- let ext_dtype = plugin. deserialize ( ext_meta_bytes, storage. dtype ( ) . clone ( ) ) ?;
109+ let Some ( plugin) = dtypes. registry ( ) . find ( & ext_id) else {
110+ log:: warn!( "pyarrow extension {ext_name:?} not registered on session; using storage dtype" ) ;
111+ return Ok ( storage) ;
112+ } ;
113+ let ext_dtype = match plugin. deserialize ( ext_meta_bytes, storage. dtype ( ) . clone ( ) ) {
114+ Ok ( dt) => dt,
115+ Err ( e) => {
116+ log:: warn!(
117+ "pyarrow extension {ext_name:?} failed to deserialize metadata ({e}); \
118+ using storage dtype",
119+ ) ;
120+ return Ok ( storage) ;
121+ }
122+ } ;
110123 Ok ( ExtensionArray :: try_new ( ext_dtype, storage) ?. into_array ( ) )
111124}
112125
@@ -135,28 +148,49 @@ pub(super) fn from_arrow(obj: &Borrowed<'_, '_, PyAny>) -> PyVortexResult<PyArra
135148 Ok ( PyArrayRef :: from ( enc_array) )
136149 } else if obj. is_instance ( chunked_array) ? {
137150 let chunks: Vec < Bound < PyAny > > = obj. getattr ( intern ! ( py, "chunks" ) ) ?. extract ( ) ?;
151+ // ChunkedArray has a uniform type — peek extension identity once and reuse.
152+ let bound = obj. to_owned ( ) ;
153+ let ext_info = extract_extension_info ( & bound) ?;
138154 let encoded_chunks = chunks
139155 . iter ( )
140- . map ( |a| {
141- let arrow_array = ArrowArrayData :: from_pyarrow ( & a. as_borrowed ( ) ) . map ( make_array) ?;
142- ArrayRef :: from_arrow ( arrow_array. as_ref ( ) , false ) . map_err ( PyVortexError :: from)
156+ . map ( |chunk| {
157+ let arrow_array =
158+ ArrowArrayData :: from_pyarrow ( & chunk. as_borrowed ( ) ) . map ( make_array) ?;
159+ let storage = ArrayRef :: from_arrow_with_session (
160+ arrow_array. as_ref ( ) ,
161+ arrow_array. is_nullable ( ) ,
162+ & SESSION ,
163+ )
164+ . map_err ( PyVortexError :: from) ?;
165+ match & ext_info {
166+ None => Ok ( storage) ,
167+ Some ( ( name, meta) ) => wrap_with_extension ( storage, name, meta, & SESSION )
168+ . map_err ( |e| PyVortexError :: from ( e) . into ( ) ) ,
169+ }
143170 } )
144- . collect :: < PyVortexResult < Vec < _ > > > ( ) ?;
145- let dtype: DType = obj
146- . getattr ( intern ! ( py, "type" ) )
147- . and_then ( |v| DataType :: from_pyarrow ( & v. as_borrowed ( ) ) )
148- . map ( |dt| DType :: from_arrow ( & Field :: new ( "_" , dt, false ) ) ) ?;
171+ . collect :: < PyResult < Vec < _ > > > ( ) ?;
172+ let dtype: DType = if let Some ( first) = encoded_chunks. first ( ) {
173+ first. dtype ( ) . clone ( )
174+ } else {
175+ // Empty array: `obj.type` over the C ABI loses extension metadata, so we
176+ // recover only the storage dtype.
177+ obj. getattr ( intern ! ( py, "type" ) )
178+ . and_then ( |v| DataType :: from_pyarrow ( & v. as_borrowed ( ) ) )
179+ . map ( |dt| DType :: from_arrow_with_session ( & Field :: new ( "_" , dt, false ) , & SESSION ) ) ?
180+ } ;
149181 Ok ( PyArrayRef :: from (
150182 ChunkedArray :: try_new ( encoded_chunks, dtype) ?. into_array ( ) ,
151183 ) )
152184 } else if obj. is_instance ( table) ? {
185+ // The C ABI Stream carries Field metadata on the schema — session-aware
186+ // conversion recovers extensions directly, no Python peek needed.
153187 let array_stream = ArrowArrayStreamReader :: from_pyarrow ( & obj. as_borrowed ( ) ) ?;
154- let dtype = DType :: from_arrow ( array_stream. schema ( ) ) ;
188+ let dtype = DType :: from_arrow_with_session ( array_stream. schema ( ) , & SESSION ) ;
155189 let chunks = array_stream
156190 . into_iter ( )
157191 . map ( |b| {
158192 b. map_err ( VortexError :: from)
159- . and_then ( |b| ArrayRef :: from_arrow ( b, false ) )
193+ . and_then ( |b| ArrayRef :: from_arrow_with_session ( b, false , & SESSION ) )
160194 } )
161195 . collect :: < VortexResult < Vec < _ > > > ( ) ?;
162196 Ok ( PyArrayRef :: from (
0 commit comments