Skip to content

Commit b2aa0ed

Browse files
committed
simplify comments
Signed-off-by: Baris Palaska <barispalaska@gmail.com>
1 parent 5ff6116 commit b2aa0ed

9 files changed

Lines changed: 47 additions & 57 deletions

File tree

vortex-array/src/arrow/convert.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -743,8 +743,8 @@ impl FromArrowArray<&RecordBatch> for ArrayRef {
743743
}
744744
}
745745

746-
/// Rewrap `storage` as `ExtensionArray` if `field` carries `ARROW:extension:name`
747-
/// for a registered extension. Inverse of `field_from_dtype` in `dtype/arrow.rs`.
746+
/// Rewrap `storage` as an `ExtensionArray` when `field` carries extension metadata
747+
/// for an extension registered on the session.
748748
fn wrap_extension_if_field_has_metadata(
749749
storage: ArrayRef,
750750
field: &Field,
@@ -1968,8 +1968,8 @@ mod tests {
19681968
ArrayRef::from_arrow(null_struct_array_with_non_nullable_field.as_ref(), true).unwrap();
19691969
}
19701970

1971-
/// Dictionary value with a struct field carrying registered extension metadata recovers
1972-
/// the extension dtype when converted via the session-aware path.
1971+
/// Extension metadata on a struct field nested inside a Dictionary's values is
1972+
/// recovered when the session has the extension registered.
19731973
#[test]
19741974
fn dictionary_struct_value_recovers_extension_through_session() {
19751975
let session = VortexSession::empty().with::<DTypeSession>();

vortex-array/src/arrow/executor/mod.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,8 @@ impl ArrowArrayExecutor for ArrayRef {
9999
None => preferred_arrow_type(&self)?,
100100
};
101101

102-
// Temporal extensions keep their wrapper so `to_arrow_temporal` can read the
103-
// metadata. Other extensions unwrap to storage; their identity rides on Field
104-
// metadata.
102+
// Temporal extensions stay wrapped — `to_arrow_temporal` reads their metadata.
103+
// Other extensions unwrap to storage; their identity lives on the Field.
105104
if let DType::Extension(ext) = self.dtype()
106105
&& ext.metadata_opt::<AnyTemporal>().is_none()
107106
{
@@ -279,8 +278,8 @@ mod tests {
279278
assert_eq!(primitives.values(), &[0, 1, 2, 3, 4, 5]);
280279
}
281280

282-
/// Temporal extensions have native Arrow mappings. The executor must keep the extension
283-
/// array intact so `to_arrow_temporal` can read `TemporalMetadata` from its dtype.
281+
/// Temporal extensions map to native Arrow types — the executor must not unwrap them,
282+
/// otherwise `to_arrow_temporal` can't read the time unit / timezone.
284283
#[test]
285284
fn execute_arrow_keeps_temporal_extension_for_native_arrow_type() {
286285
use crate::dtype::DType;

vortex-array/src/arrow/executor/struct_.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,8 @@ pub(super) fn to_arrow_struct(
8686

8787
// Otherwise, we fall back to executing to a StructArray.
8888
let array = if let Some(fields) = target_fields {
89-
// Thread the ctx session so any extension Field metadata in `fields` resolves to
90-
// `DType::Extension` in the cast target — keeps the cast path consistent with the
91-
// short-circuit Struct/`Pack` paths above.
89+
// Use the ctx session so extension fields resolve to `DType::Extension` in the
90+
// cast target, matching the short-circuit paths above.
9291
let vx_fields = StructFields::from_arrow_with_session(fields, ctx.session());
9392
// We apply a cast to ensure we push down casting where possible into the struct fields.
9493
array.cast(DType::Struct(

vortex-array/src/arrow/mod.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,8 @@ use crate::VortexSessionExecute;
2828
pub trait FromArrowArray<A>: Sized {
2929
fn from_arrow(array: A, nullable: bool) -> VortexResult<Self>;
3030

31-
/// Same conversion, with session for resolving `ARROW:extension:name` field metadata to
32-
/// registered extension dtypes. The default ignores the session — override on impls that
33-
/// see Arrow `Field`s (RecordBatch, Struct, List, FSL).
31+
/// Like [`Self::from_arrow`], but resolves `ARROW:extension:name` field metadata
32+
/// against extensions registered on `session`. The default ignores the session.
3433
fn from_arrow_with_session(
3534
array: A,
3635
nullable: bool,

vortex-array/src/arrow/record_batch.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,7 @@ impl StructArray {
2222
self.into_record_batch_with_schema_with_session(schema, &LEGACY_SESSION)
2323
}
2424

25-
/// Same as [`Self::into_record_batch_with_schema`], but routes execution through `session`
26-
/// so canonical Arrow extension aliases declared on registered vtables apply uniformly to
27-
/// both schema construction and array conversion.
25+
/// Like [`Self::into_record_batch_with_schema`], but runs the conversion under `session`.
2826
pub fn into_record_batch_with_schema_with_session(
2927
self,
3028
schema: impl AsRef<Schema>,

vortex-array/src/dtype/arrow.rs

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -53,18 +53,17 @@ use crate::extension::datetime::Time;
5353
use crate::extension::datetime::TimeUnit;
5454
use crate::extension::datetime::Timestamp;
5555

56-
/// Canonical Arrow extension name for Parquet Variant — handled as `DType::Variant` rather
57-
/// than going through the extension registry.
56+
/// Arrow's Parquet Variant extension name. We map it to `DType::Variant` directly,
57+
/// not through the extension registry.
5858
pub const ARROW_EXT_NAME_VARIANT: &str = "arrow.parquet.variant";
5959

6060
/// Trait for converting Arrow types to Vortex types.
6161
pub trait FromArrowType<T>: Sized {
6262
/// Convert the Arrow type to a Vortex type.
6363
fn from_arrow(value: T) -> Self;
6464

65-
/// Convert the Arrow type to a Vortex type, consulting `session` for extension lookup.
66-
///
67-
/// Unregistered or malformed extension metadata falls back to the storage dtype.
65+
/// Convert an Arrow type to a Vortex type, looking up extensions in `session`.
66+
/// Unregistered or malformed extensions fall back to the storage dtype.
6867
fn from_arrow_with_session(value: T, session: &VortexSession) -> Self {
6968
let _ = session;
7069
Self::from_arrow(value)
@@ -269,8 +268,9 @@ fn dtype_from_field(field: &Field, dtypes: &DTypeSession) -> DType {
269268
}
270269
}
271270

272-
/// Resolve the [`ExtDTypeRef`] for a Field whose `ARROW:extension:name` names a registered
273-
/// Vortex extension. Returns `None` for missing/unregistered/malformed metadata.
271+
/// Look up the extension dtype for a Field's `ARROW:extension:name` against the session.
272+
/// Returns `None` (after logging a warning) if the extension is missing, unregistered,
273+
/// or has malformed metadata.
274274
pub(crate) fn resolve_extension_dtype(
275275
field: &Field,
276276
dtypes: &DTypeSession,
@@ -315,8 +315,7 @@ pub(crate) fn resolve_extension_dtype(
315315
}
316316
}
317317

318-
/// Extensions base64-encode arbitrary binary metadata to survive Arrow's String-typed
319-
/// metadata channel.
318+
/// Arrow's metadata channel is a `String`, so we base64-encode the raw extension bytes.
320319
fn decode_extension_metadata(field: &Field) -> VortexResult<Vec<u8>> {
321320
match field.extension_type_metadata() {
322321
None | Some("") => Ok(Vec::new()),
@@ -326,8 +325,7 @@ fn decode_extension_metadata(field: &Field) -> VortexResult<Vec<u8>> {
326325
}
327326
}
328327

329-
/// Build the storage [`DType`] for `field`, recursing through nested children so each level
330-
/// runs the extension lookup against `dtypes`.
328+
/// Build the storage [`DType`] for `field`, running the extension lookup at every nested level.
331329
fn storage_dtype_from_field(field: &Field, dtypes: &DTypeSession) -> DType {
332330
let nullability: Nullability = field.is_nullable().into();
333331
match field.data_type() {
@@ -374,10 +372,10 @@ impl DType {
374372
Ok(builder.finish())
375373
}
376374

377-
/// Returns the Arrow [`DataType`] that best corresponds to this Vortex [`DType`].
375+
/// Returns the Arrow [`DataType`] for this Vortex [`DType`].
378376
///
379-
/// Extensions without a native Arrow mapping degrade to their storage `DataType`;
380-
/// identity only survives via `Field` metadata (see [`Self::to_arrow_schema`]).
377+
/// Extensions without a native Arrow mapping return only their storage `DataType`.
378+
/// Use [`Self::to_arrow_schema`] to keep the extension identity on the Field.
381379
pub fn to_arrow_dtype(&self) -> VortexResult<DataType> {
382380
arrow_dtype_from_dtype(self)
383381
}
@@ -450,8 +448,7 @@ fn arrow_dtype_from_dtype(dtype: &DType) -> VortexResult<DataType> {
450448
if let Some(native) = native_arrow_dtype_for_extension(ext_dtype) {
451449
return Ok(native);
452450
}
453-
// Extension identity lives on the Field (see `field_from_dtype`), not on
454-
// DataType, so here we only encode the storage type.
451+
// Extension identity lives on the Field, not the DataType — emit only storage here.
455452
arrow_dtype_from_dtype(ext_dtype.storage_dtype())?
456453
}
457454
})
@@ -473,8 +470,8 @@ fn field_from_dtype(name: &str, dtype: &DType) -> VortexResult<Field> {
473470
}
474471

475472
if let DType::Extension(ext) = dtype {
476-
// Native Arrow mapping carries the semantics in DataType; emitting extension metadata
477-
// on top would break consumers that only understand native Arrow types.
473+
// Temporal extensions map to native Arrow types — don't add extension metadata,
474+
// it would confuse Arrow-only consumers.
478475
if let Some(native) = native_arrow_dtype_for_extension(ext) {
479476
return Ok(Field::new(name, native, dtype.is_nullable()));
480477
}
@@ -501,8 +498,8 @@ fn field_from_dtype(name: &str, dtype: &DType) -> VortexResult<Field> {
501498
))
502499
}
503500

504-
/// Returns the native Arrow [`DataType`] for extensions Arrow models directly (e.g. temporal).
505-
/// `None` means the extension should round-trip via storage + Field metadata.
501+
/// The native Arrow [`DataType`] for extensions Arrow models directly (currently temporal),
502+
/// or `None` if the extension should round-trip via storage + Field metadata.
506503
fn native_arrow_dtype_for_extension(ext_dtype: &ExtDTypeRef) -> Option<DataType> {
507504
let temporal = ext_dtype.metadata_opt::<AnyTemporal>()?;
508505
Some(match temporal {

vortex-python/src/arrays/from_arrow.rs

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ use crate::error::PyVortexResult;
3737

3838
/// Convert a Python `pyarrow` array (including `pa.ExtensionArray`) into a Vortex array.
3939
///
40-
/// The Arrow C ABI strips extension identity from leaf arrays; we recover it from the
41-
/// Python object via `extension_name` and `__arrow_ext_serialize__`.
40+
/// The Arrow C ABI doesn't carry extension identity on leaf arrays, so we read it from
41+
/// the Python object via `extension_name` and `__arrow_ext_serialize__`.
4242
pub trait FromPyArrowArray: Sized {
4343
/// Convert a Python `pyarrow` array to a Vortex array.
4444
fn from_pyarrow(py_array: &Bound<'_, PyAny>, nullable: bool) -> PyResult<Self>;
@@ -77,9 +77,9 @@ impl FromPyArrowArray for ArrayRef {
7777
}
7878
}
7979

80-
/// Raw bytes from `__arrow_ext_serialize__` — no base64 (that's only for the
81-
/// Arrow Field-metadata string channel). Variant short-circuits to `None` so it surfaces
82-
/// as `DType::Variant` via the storage path, mirroring `dtype/arrow.rs::dtype_from_field`.
80+
/// `__arrow_ext_serialize__` returns raw bytes — no base64 (that's only for the Field
81+
/// metadata channel). Variant returns `None` so it surfaces as `DType::Variant` instead
82+
/// of going through the extension registry.
8383
fn extract_extension_info(py_array: &Bound<'_, PyAny>) -> PyResult<Option<(String, Vec<u8>)>> {
8484
let py = py_array.py();
8585
let py_type = py_array.getattr(intern!(py, "type"))?;
@@ -96,8 +96,8 @@ fn extract_extension_info(py_array: &Bound<'_, PyAny>) -> PyResult<Option<(Strin
9696
Ok(Some((ext_name, ext_meta_bytes)))
9797
}
9898

99-
/// Soft fallback to storage on registry miss or malformed metadata, mirroring
100-
/// `dtype/arrow.rs::resolve_extension_dtype`.
99+
/// Wrap `storage` as an extension array if the extension is registered. On a registry
100+
/// miss or malformed metadata, log a warning and return `storage` as-is.
101101
fn wrap_with_extension(
102102
storage: ArrayRef,
103103
ext_name: &str,
@@ -148,7 +148,7 @@ pub(super) fn from_arrow(obj: &Borrowed<'_, '_, PyAny>) -> PyVortexResult<PyArra
148148
Ok(PyArrayRef::from(enc_array))
149149
} else if obj.is_instance(chunked_array)? {
150150
let chunks: Vec<Bound<PyAny>> = obj.getattr(intern!(py, "chunks"))?.extract()?;
151-
// ChunkedArray has a uniform type — peek extension identity once and reuse.
151+
// All chunks share the same type, so read the extension info once.
152152
let bound = obj.to_owned();
153153
let ext_info = extract_extension_info(&bound)?;
154154
let encoded_chunks = chunks
@@ -172,8 +172,8 @@ pub(super) fn from_arrow(obj: &Borrowed<'_, '_, PyAny>) -> PyVortexResult<PyArra
172172
let dtype: DType = if let Some(first) = encoded_chunks.first() {
173173
first.dtype().clone()
174174
} else {
175-
// Empty array: `obj.type` over the C ABI loses extension metadata, so we
176-
// recover only the storage dtype.
175+
// Empty: there's no chunk to read the dtype from, and `obj.type` over the C ABI
176+
// doesn't carry extension metadata. Fall back to the storage dtype.
177177
obj.getattr(intern!(py, "type"))
178178
.and_then(|v| DataType::from_pyarrow(&v.as_borrowed()))
179179
.map(|dt| DType::from_arrow_with_session(&Field::new("_", dt, false), &SESSION))?
@@ -182,8 +182,7 @@ pub(super) fn from_arrow(obj: &Borrowed<'_, '_, PyAny>) -> PyVortexResult<PyArra
182182
ChunkedArray::try_new(encoded_chunks, dtype)?.into_array(),
183183
))
184184
} else if obj.is_instance(table)? {
185-
// The C ABI Stream carries Field metadata on the schema — session-aware
186-
// conversion recovers extensions directly, no Python peek needed.
185+
// The C ABI Stream carries Field metadata, so we don't need a Python peek here.
187186
let array_stream = ArrowArrayStreamReader::from_pyarrow(&obj.as_borrowed())?;
188187
let dtype = DType::from_arrow_with_session(array_stream.schema(), &SESSION);
189188
let chunks = array_stream

vortex-python/test/test_arrow_extension.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright the Vortex contributors
33

4-
"""Round-trip tests for `vx.array` against pyarrow inputs carrying Vortex
5-
extension identity over the Arrow C ABI."""
4+
"""Tests that `vx.array` recovers Vortex extension identity from pyarrow inputs."""
65

76
from __future__ import annotations
87

@@ -12,16 +11,16 @@
1211

1312
import vortex as vx
1413

15-
# vortex.timestamp wire format: u8 unit_tag + u16 LE tz_len (us=1, no tz).
16-
# See vortex-array/src/extension/datetime/timestamp.rs::serialize_metadata.
14+
# Wire format: u8 unit_tag + u16 LE tz_len. Microseconds = 1, no timezone.
15+
# See vortex-array/src/extension/datetime/timestamp.rs.
1716
_TIMESTAMP_US_METADATA = bytes([1, 0, 0])
1817

1918

2019
class VortexTimestampType(pa.ExtensionType):
21-
"""A pyarrow `ExtensionType` matching Vortex's `vortex.timestamp` extension."""
20+
"""pyarrow `ExtensionType` matching Vortex's `vortex.timestamp`."""
2221

2322
def __init__(self, unit: str = "us"):
24-
# pyarrow calls `__arrow_ext_serialize__` in __init__, so set `_unit` first.
23+
# pyarrow calls `__arrow_ext_serialize__` from __init__, so `_unit` must be set first.
2524
self._unit = unit
2625
pa.ExtensionType.__init__(self, pa.int64(), "vortex.timestamp")
2726

vortex-tensor/src/tests/arrow_roundtrip.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ fn vector_forward_carries_extension_name() {
5454
.map(String::as_str),
5555
Some(Vector.id().as_str()),
5656
);
57-
// EmptyMetadata no metadata key.
57+
// Vector uses EmptyMetadata, so no metadata key is written.
5858
assert!(field.metadata().get(EXTENSION_TYPE_METADATA_KEY).is_none());
5959

6060
let DataType::FixedSizeList(element, size) = field.data_type() else {

0 commit comments

Comments
 (0)