@@ -1458,12 +1458,18 @@ def _init_columns(
14581458 dparams = col_storage .get ("dparams" ),
14591459 )
14601460 continue
1461+ # Recompute chunks/blocks using the actual dtype so that wide
1462+ # string columns (e.g. U183642) don't produce multi-GB chunks.
1463+ chunks = col_storage ["chunks" ]
1464+ blocks = col_storage ["blocks" ]
1465+ if col .config .chunks is None and col .config .blocks is None :
1466+ chunks , blocks = compute_chunks_blocks ((expected_size ,), dtype = col .dtype )
14611467 self ._cols [col .name ] = storage .create_column (
14621468 col .name ,
14631469 dtype = col .dtype ,
14641470 shape = (expected_size ,),
1465- chunks = col_storage [ " chunks" ] ,
1466- blocks = col_storage [ " blocks" ] ,
1471+ chunks = chunks ,
1472+ blocks = blocks ,
14671473 cparams = col_storage .get ("cparams" ),
14681474 dparams = col_storage .get ("dparams" ),
14691475 )
@@ -3495,8 +3501,10 @@ def sort_by(
34953501 If a column used as a sort key does not support ordering
34963502 (e.g. complex numbers).
34973503 """
3498- if self .base is not None :
3499- raise ValueError ("Cannot sort a view. Materialise it first with .to_table() or sort the parent." )
3504+ if self .base is not None and inplace :
3505+ raise ValueError (
3506+ "Cannot sort a view inplace (would modify shared column data). Use sort_by(inplace=False) to get a sorted copy."
3507+ )
35003508 if inplace and self ._read_only :
35013509 raise ValueError ("Table is read-only (opened with mode='r')." )
35023510
@@ -3523,8 +3531,15 @@ def sort_by(
35233531 sorted_pos = live_pos [order ]
35243532
35253533 if inplace :
3526- for _col_name , arr in self ._cols .items ():
3527- arr [:n ] = arr [sorted_pos ]
3534+ for col in self ._schema .columns :
3535+ arr = self ._cols [col .name ]
3536+ if self ._is_list_column (col ):
3537+ new_arr = ListArray (spec = col .spec )
3538+ new_arr .extend ((arr [int (pos )] for pos in sorted_pos ), validate = False )
3539+ new_arr .flush ()
3540+ self ._cols [col .name ] = new_arr
3541+ else :
3542+ arr [:n ] = arr [sorted_pos ]
35283543 self ._valid_rows [:n ] = True
35293544 self ._valid_rows [n :] = False
35303545 self ._n_rows = n
@@ -3538,7 +3553,7 @@ def sort_by(
35383553 col_name = col .name
35393554 arr = self ._cols [col_name ]
35403555 if self ._is_list_column (col ):
3541- result ._cols [col_name ].extend (arr [int (pos )] for pos in sorted_pos )
3556+ result ._cols [col_name ].extend (( arr [int (pos )] for pos in sorted_pos ), validate = False )
35423557 result ._cols [col_name ].flush ()
35433558 else :
35443559 result ._cols [col_name ][:n ] = arr [sorted_pos ]
@@ -3548,11 +3563,66 @@ def sort_by(
35483563 result ._last_pos = n
35493564 return result
35503565
3551- def _empty_copy (self ) -> CTable :
def copy(self, compact: bool = True) -> CTable:
    """Return a new standalone in-memory copy of this table.

    Parameters
    ----------
    compact:
        If ``True`` (default), only live (non-deleted) rows are copied.
        The result is a dense table with no tombstones and no parent
        dependency, which makes it suitable for materialising a filtered
        view.
        If ``False``, all physical slots up to the high watermark are
        copied, including deleted gaps, so the validity mask of the copy
        matches the source slot for slot.

    Returns
    -------
    CTable
        A table created through ``_empty_copy`` and populated column by
        column from this one.
    """
    valid_np = self._valid_rows[:]
    live_pos = np.where(valid_np)[0]
    n_live = len(live_pos)

    if compact:
        n = n_live
    else:
        # High watermark: number of slots ever written.
        # List columns are written sequentially with no gaps, so the length
        # of the first one is taken as the exact high watermark.  When there
        # is no list column (or it is empty) fall back to the last live
        # position + 1.
        n = 0
        for col in self._schema.columns:
            if self._is_list_column(col):
                n = len(self._cols[col.name])
                break
        if n == 0:
            n = int(live_pos[-1]) + 1 if n_live > 0 else 0

    result = self._empty_copy(capacity=n)

    for col in self._schema.columns:
        col_name = col.name
        arr = self._cols[col_name]
        if self._is_list_column(col):
            # List columns are rebuilt element by element; the values come
            # from an existing column, so re-validation is skipped.
            if compact:
                src = (arr[int(pos)] for pos in live_pos)
            else:
                src = (arr[i] for i in range(n))
            result._cols[col_name].extend(src, validate=False)
            result._cols[col_name].flush()
        else:
            result._cols[col_name][:n] = arr[live_pos] if compact else arr[:n]

    if compact:
        result._valid_rows[:n] = True
        result._n_rows = n
        # Keep the write cursor consistent with sort_by(), which sets
        # ``_last_pos = n`` after densely filling ``n`` rows of an
        # ``_empty_copy`` result.  Using ``n - 1`` here would disagree with
        # that convention and presumably make the next append land on the
        # last copied row.
        result._last_pos = n
    else:
        result._valid_rows[:n] = valid_np[:n]
        result._n_rows = n_live
        result._last_pos = None  # recomputed lazily on next append

    return result
3620+
3621+ def _empty_copy (self , capacity : int | None = None ) -> CTable :
35523622 """Return a new empty in-memory CTable with the same schema and capacity."""
35533623 from blosc2 import compute_chunks_blocks
35543624
3555- capacity = max (self ._n_rows , 1 )
3625+ capacity = max (capacity if capacity is not None else self ._n_rows , 1 )
35563626 default_chunks , default_blocks = compute_chunks_blocks ((capacity ,))
35573627 mem_storage = InMemoryTableStorage ()
35583628
@@ -4386,10 +4456,18 @@ def _try_index_where(self, expr_result: blosc2.LazyExpr) -> np.ndarray | None:
43864456 primary_col_name , primary_col_arr , _ = indexed_columns [0 ]
43874457
43884458 # Inject every usable table-owned descriptor so plan_query can combine them.
4459+ # In .b2z read mode all columns share the same urlpath, so _array_key()
4460+ # returns the same key for every column — causing _SIDECAR_HANDLE_CACHE
4461+ # collisions across queries. Clear stale handles before each injection so
4462+ # the upcoming query always loads the correct sidecar for this column.
4463+ from blosc2 .indexing import _clear_cached_data
4464+
43894465 for _col_name , col_arr , descriptor in indexed_columns :
43904466 arr_key = _array_key (col_arr )
43914467 if _is_persistent_array (col_arr ):
43924468 store = _PERSISTENT_INDEXES .get (arr_key ) or _default_index_store ()
4469+ if store ["indexes" ].get (descriptor ["token" ]) is not descriptor :
4470+ _clear_cached_data (col_arr , descriptor ["token" ])
43934471 store ["indexes" ][descriptor ["token" ]] = descriptor
43944472 _PERSISTENT_INDEXES [arr_key ] = store
43954473 else :
@@ -4604,7 +4682,11 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) ->
46044682 raw_columns [name ] = data ._cols [name ][: data ._n_rows ]
46054683 provided_names .add (name )
46064684 else :
4607- if isinstance (data , np .ndarray ) and data .dtype .names is not None :
4685+ if isinstance (data , dict ):
4686+ provided_names = set (data ) & set (current_col_names )
4687+ new_nrows = len (next (iter (data .values ())))
4688+ raw_columns = {name : data [name ] for name in provided_names }
4689+ elif isinstance (data , np .ndarray ) and data .dtype .names is not None :
46084690 new_nrows = len (data )
46094691 raw_columns = {name : data [name ] for name in data .dtype .names if name in current_col_names }
46104692 provided_names = set (raw_columns )
@@ -4635,7 +4717,7 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) ->
46354717 list_processed_cols [name ] = list (raw_columns [name ])
46364718 else :
46374719 target_dtype = self ._cols [name ].dtype
4638- scalar_processed_cols [name ] = blosc2 . asarray (raw_columns [name ], dtype = target_dtype )
4720+ scalar_processed_cols [name ] = np . ascontiguousarray (raw_columns [name ], dtype = target_dtype )
46394721
46404722 end_pos = start_pos + new_nrows
46414723
@@ -4650,7 +4732,7 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) ->
46504732 for name in current_col_names :
46514733 col_meta = self ._schema .columns_by_name [name ]
46524734 if self ._is_list_column (col_meta ):
4653- self ._cols [name ].extend (list_processed_cols [name ])
4735+ self ._cols [name ].extend (list_processed_cols [name ], validate = do_validate )
46544736 else :
46554737 self ._cols [name ][start_pos :end_pos ] = scalar_processed_cols [name ][:]
46564738
0 commit comments