
Commit c9e4036

Add CTable null policy and nullable schema support

- Add context-scoped NullPolicy for inferred null sentinels
- Support nullable=True in scalar CTable schema specs
- Add per-column column_null_values overrides
- Validate policy-derived null sentinels against column specs
- Simplify Arrow import API to CTable.from_arrow(schema, batches)
- Flush imported list columns batch-wise by default
- Update docs, examples, and tests for nullable CTable columns
1 parent af68a33

14 files changed

Lines changed: 516 additions & 125 deletions
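
Taken together, the headline changes compose as follows. This is a minimal sketch distilled from the diffs below; the sample table and column name are illustrative, not part of the commit:

import pyarrow as pa

import blosc2

# Illustrative Arrow data containing a null; any schema/batches pair works.
at = pa.table({"user_id": pa.array([1, None, 3], type=pa.int64())})

# Scope an inferred-sentinel policy, then use the simplified import API:
# CTable.from_arrow(schema, batches) replaces from_arrow(table).
policy = blosc2.NullPolicy(column_null_values={"user_id": -1})
with blosc2.null_policy(policy):
    t = blosc2.CTable.from_arrow(at.schema, at.to_batches())
print(t["user_id"].null_value)  # -1, taken from column_null_values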

doc/reference/ctable.rst

Lines changed: 48 additions & 0 deletions
@@ -42,15 +42,63 @@ Construction
     CTable.open
     CTable.load
     CTable.from_arrow
+    CTable.from_parquet
     CTable.from_csv
 
 .. automethod:: CTable.__init__
 .. automethod:: CTable.open
 .. automethod:: CTable.load
 .. automethod:: CTable.from_arrow
+.. automethod:: CTable.from_parquet
 .. automethod:: CTable.from_csv
 
 
+Null policy
+-----------
+
+Nullable scalar CTable columns are represented with per-column sentinel values,
+not native validity bitmaps. When CTable has to infer those sentinels, the
+selection can be customized with :class:`NullPolicy` and scoped with
+:func:`null_policy`::
+
+    policy = blosc2.NullPolicy(
+        signed_int_strategy="max",
+        string_value="<NULL>",
+        column_null_values={"user_id": -1, "country": "NA"},
+    )
+
+    with blosc2.null_policy(policy):
+        table = blosc2.CTable.from_parquet("data.parquet")
+
+The same policy is used by explicit nullable schema specs when no
+``null_value`` is supplied::
+
+    from dataclasses import dataclass
+
+    @dataclass
+    class Row:
+        user_id: int = blosc2.field(blosc2.int64(nullable=True))
+        country: str = blosc2.field(blosc2.string(nullable=True))
+
+    with blosc2.null_policy(policy):
+        table = blosc2.CTable(Row)
+
+Sentinels are resolved in this order: explicit ``null_value`` in the schema,
+``NullPolicy.column_null_values`` for a matching column, then the type-wide
+``NullPolicy`` default. Columns without ``nullable=True`` or an explicit
+``null_value`` are not nullable.
+
+.. autosummary::
+
+    NullPolicy
+    null_policy
+    get_null_policy
+
+.. autoclass:: NullPolicy
+.. autofunction:: null_policy
+.. autofunction:: get_null_policy
+
+
 Attributes
 ----------
 
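
The resolution order documented above can be exercised end to end. A short sketch of the three tiers; the Place schema and sentinel values are illustrative:

from dataclasses import dataclass

import blosc2

policy = blosc2.NullPolicy(string_value="<NULL>", column_null_values={"country": "NA"})


@dataclass
class Place:
    # 1) an explicit null_value in the schema wins outright
    city: str = blosc2.field(blosc2.string(null_value="?"))
    # 2) a column_null_values entry matches this column name -> "NA"
    country: str = blosc2.field(blosc2.string(nullable=True))
    # 3) no match -> type-wide policy default -> "<NULL>"
    region: str = blosc2.field(blosc2.string(nullable=True))


with blosc2.null_policy(policy):
    t = blosc2.CTable(Place)

print(t["city"].null_value, t["country"].null_value, t["region"].null_value)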

examples/ctable/arrow_interop.py

Lines changed: 4 additions & 3 deletions
@@ -40,15 +40,15 @@ class Stock:
 at = t.to_arrow()
 print(f"Arrow table: {len(at)} rows, schema={at.schema}\n")
 
-# -- from_arrow(): schema is inferred from Arrow types ---------------------
+# -- from_arrow(): import an Arrow schema and record batches ---------------
 at2 = pa.table(
     {
         "x": pa.array([1.0, 2.0, 3.0], type=pa.float32()),
         "y": pa.array([10, 20, 30], type=pa.int32()),
         "label": pa.array(["a", "bb", "ccc"], type=pa.string()),
     }
 )
-t2 = blosc2.CTable.from_arrow(at2)
+t2 = blosc2.CTable.from_arrow(at2.schema, at2.to_batches())
 print("CTable from Arrow (inferred schema):")
 print(t2)
 print(f"  label dtype: {t2['label'].dtype} (max_length inferred from data)")
@@ -69,7 +69,8 @@ class Stock:
 print(df_original)
 
 # pandas → Arrow → CTable
-t_from_pd = blosc2.CTable.from_arrow(pa.Table.from_pandas(df_original, preserve_index=False))
+at_pd = pa.Table.from_pandas(df_original, preserve_index=False)
+t_from_pd = blosc2.CTable.from_arrow(at_pd.schema, at_pd.to_batches())
 print("\nCTable from pandas:")
 print(t_from_pd)
 
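
Since from_arrow() now accepts a schema plus any iterable of record batches (the off/parquet-to-blosc2.py hunk further down passes it a generator), imports can stream without materializing one big Arrow table first. A sketch assuming a multi-file Parquet dataset on disk:

import pyarrow.dataset as ds

import blosc2

# Assumed: a directory of Parquet files. Scanner.to_batches() yields
# RecordBatches lazily, so nothing forces a full in-memory Table.
dataset = ds.dataset("events/", format="parquet")
t = blosc2.CTable.from_arrow(
    dataset.schema,
    dataset.scanner(batch_size=64_000).to_batches(),
)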

examples/ctable/nullable.py

Lines changed: 51 additions & 17 deletions
@@ -5,17 +5,18 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #######################################################################
 
-# Nullable columns: null_value sentinels, null-aware aggregates,
-# is_null / notnull, sort nulls-last, Arrow null masking, CSV empty cells.
+# Nullable columns: null_value sentinels, nullable=True, NullPolicy,
+# null-aware aggregates, is_null / notnull, sort nulls-last, Arrow null masking,
+# and CSV empty cells.
 #
 # CTable does not have a built-in "missing" bit per row like pandas does.
-# Instead it uses a *sentinel value* approach: you choose a specific value
-# that represents "null" for a column, and the library treats it
-# transparently in aggregates, sorting, unique(), value_counts(), and
-# Arrow export.
+# Instead it uses a *sentinel value* approach: each nullable column stores a
+# specific value that represents "null". The library treats that value
+# transparently in aggregates, sorting, unique(), value_counts(), and Arrow
+# export.
 #
-# This is especially useful for integer and string columns that have no
-# natural null (unlike float, which can use NaN).
+# You can either choose sentinels explicitly with null_value=, or ask CTable to
+# choose them from the active NullPolicy with nullable=True.
 
 import os
 import tempfile
@@ -24,24 +25,57 @@
 import blosc2
 
 # ---------------------------------------------------------------------------
-# Schema with nullable columns
+# Schema with explicit null_value sentinels
 # ---------------------------------------------------------------------------
-# Use null_value= on any spec to declare the sentinel.
-# The sentinel bypasses validation constraints (ge/le etc.) so you can
-# store it even when it would otherwise violate them.
+# Use null_value= on any spec to declare the sentinel. The sentinel bypasses
+# validation constraints (ge/le etc.) so you can store it even when it would
+# otherwise violate them.
 
 
 @dataclass
 class Reading:
     sensor_id: int = blosc2.field(blosc2.int32(ge=0))
     # -999 is "no reading" for temperature (normally ge=-50, le=60)
-    temperature: float = blosc2.field(blosc2.float64(ge=-50.0, le=60.0, null_value=-999.0), default=-999.0)
+    temperature: float = blosc2.field(blosc2.float64(ge=-50.0, le=60.0, null_value=-999.0))
     # "" is "unknown" for location (string)
-    location: str = blosc2.field(blosc2.string(max_length=16, null_value=""), default="")
+    location: str = blosc2.field(blosc2.string(max_length=16, null_value=""))
     # -1 is "not measured" for signal strength (normally ge=0, le=100)
-    signal: int = blosc2.field(blosc2.int8(ge=0, le=100, null_value=-1), default=-1)
+    signal: int = blosc2.field(blosc2.int8(ge=0, le=100, null_value=-1))
 
 
+# ---------------------------------------------------------------------------
+# Schema using nullable=True and NullPolicy
+# ---------------------------------------------------------------------------
+# nullable=True means "make this column nullable and choose the sentinel from
+# the current NullPolicy". column_null_values overrides the type-wide policy for
+# specific columns.
+
+
+@dataclass
+class AutoReading:
+    sensor_id: int = blosc2.field(blosc2.int32(ge=0))
+    temperature: float = blosc2.field(blosc2.float64(ge=-50.0, le=60.0, nullable=True))
+    location: str = blosc2.field(blosc2.string(max_length=16, nullable=True))
+    signal: int = blosc2.field(blosc2.int8(ge=0, le=100, nullable=True))
+
+
+policy = blosc2.NullPolicy(
+    float_value=-999.0,
+    string_value="",
+    column_null_values={"signal": -1},
+)
+with blosc2.null_policy(policy):
+    auto = blosc2.CTable(AutoReading)
+
+print("NullPolicy + nullable=True selected these sentinels:")
+print(f"temperature: {auto['temperature'].null_value!r}")
+print(f"location   : {auto['location'].null_value!r}")
+print(f"signal     : {auto['signal'].null_value!r}")
+
+# ---------------------------------------------------------------------------
+# Work with nullable columns
+# ---------------------------------------------------------------------------
+
 data = [
     (0, 22.3, "roof", 87),
     (1, -999.0, "cellar", 41),  # temperature unknown
@@ -52,7 +86,7 @@ class Reading:
 ]
 
 t = blosc2.CTable(Reading, new_data=data)
-print("Table with nullable columns:")
+print("\nTable with nullable columns:")
 print(t)
 
 # ---------------------------------------------------------------------------
@@ -74,7 +108,7 @@ class Reading:
 # Null-aware aggregates
 # ---------------------------------------------------------------------------
 print("\n--- Aggregates skip null sentinels ---")
-print(f"temperature.mean() = {t['temperature'].mean():.2f} (only 3 non-null readings)")
+print(f"temperature.mean() = {t['temperature'].mean():.2f} (only 4 non-null readings)")
 print(f"temperature.min()  = {t['temperature'].min():.2f}")
 print(f"temperature.max()  = {t['temperature'].max():.2f}")
 print(f"signal.sum() = {t['signal'].sum()} (non-null: 87+41+62+95 = 285)")

examples/ctable/real_world.py

Lines changed: 2 additions & 1 deletion
@@ -98,7 +98,8 @@ class WeatherReading:
 path = f"{tmpdir}/station3"
 try:
     # Views cannot be sorted or saved directly — materialise via Arrow first
-    s3_copy = blosc2.CTable.from_arrow(station3.to_arrow())
+    arrow = station3.to_arrow()
+    s3_copy = blosc2.CTable.from_arrow(arrow.schema, arrow.to_batches())
     s3_copy.sort_by("day_of_year", inplace=True)
     sorted_s3 = s3_copy
     sorted_s3.save(path, overwrite=True)

off/parquet-to-blosc2.py

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@
 
 """Import/export Parquet datasets through a CTable store.
 
-Default mode imports parquet -> .b2z/.b2d using CTable.from_arrow_batches().
+Default mode imports parquet -> .b2z/.b2d using CTable.from_arrow().
 The output extension selects the storage layout: .b2z is compact/zip-backed,
 .b2d is sparse directory-backed. Additional modes:
 
@@ -546,7 +546,7 @@ def import_once(args, input_path: Path, output_path: Path, force_list_strings: s
 
     t0 = time.perf_counter()
    maybe_memory_report(args, "before CTable import", pa)
-    ct = blosc2.CTable.from_arrow_batches(
+    ct = blosc2.CTable.from_arrow(
         arrow_schema,
         progress_batches(pa, pf, args, selected_cols, list_wrap_cols),
         urlpath=str(output_path),
urlpath=str(output_path),

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -129,6 +129,7 @@ ignore = [
     "RUF015",
     "RUF059",
     "SIM108",
+    "SIM117",
 ]
 
 [tool.ruff.lint.extend-per-file-ignores]

src/blosc2/__init__.py

Lines changed: 5 additions & 1 deletion
@@ -633,7 +633,7 @@ def _raise(exc):
 # Delayed imports for avoiding overwriting of python builtins.
 # Note: bool, bytes, string shadow builtins in the blosc2 namespace by design —
 # they are schema spec constructors (b2.bool(), b2.bytes(), etc.).
-from .ctable import Column, CTable
+from .ctable import DEFAULT_NULL_POLICY, Column, CTable, NullPolicy, get_null_policy, null_policy
 from .ndarray import (
     abs,
     acos,
@@ -769,6 +769,7 @@ def _raise(exc):
     "DEFAULT_FLOAT",
     "DEFAULT_INDEX",
     "DEFAULT_INT",
+    "DEFAULT_NULL_POLICY",
     # Mathematical constants
     "e",
     "pi",
@@ -812,6 +813,7 @@ def _raise(exc):
     "LazyExpr",
     "LazyUDF",
     "ListArray",
+    "NullPolicy",
     "NDArray",
     "NDField",
     "Operand",
@@ -1029,4 +1031,6 @@ def _raise(exc):
     "where",
     "zeros",
     "zeros_like",
+    "get_null_policy",
+    "null_policy",
 ]
