
Commit c9e4036

Add CTable null policy and nullable schema support

- Add context-scoped NullPolicy for inferred null sentinels
- Support nullable=True in scalar CTable schema specs
- Add per-column column_null_values overrides
- Validate policy-derived null sentinels against column specs
- Simplify Arrow import API to CTable.from_arrow(schema, batches)
- Flush imported list columns batch-wise by default
- Update docs, examples, and tests for nullable CTable columns
1 parent af68a33

14 files changed

Lines changed: 516 additions & 125 deletions
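
Taken together, the headline changes compose as follows. This is a minimal sketch distilled from the diffs below; the sample table and column name are illustrative, not part of the commit:

import pyarrow as pa

import blosc2

# Illustrative Arrow data containing a null; any schema/batches pair works.
at = pa.table({"user_id": pa.array([1, None, 3], type=pa.int64())})

# Scope an inferred-sentinel policy, then use the simplified import API:
# CTable.from_arrow(schema, batches) replaces from_arrow(table).
policy = blosc2.NullPolicy(column_null_values={"user_id": -1})
with blosc2.null_policy(policy):
    t = blosc2.CTable.from_arrow(at.schema, at.to_batches())
print(t["user_id"].null_value)  # -1, taken from column_null_values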

doc/reference/ctable.rst

Lines changed: 48 additions & 0 deletions
@@ -42,15 +42,63 @@ Construction
     CTable.open
     CTable.load
     CTable.from_arrow
+    CTable.from_parquet
     CTable.from_csv
 
 .. automethod:: CTable.__init__
 .. automethod:: CTable.open
 .. automethod:: CTable.load
 .. automethod:: CTable.from_arrow
+.. automethod:: CTable.from_parquet
 .. automethod:: CTable.from_csv
 
 
+Null policy
+-----------
+
+Nullable scalar CTable columns are represented with per-column sentinel values,
+not native validity bitmaps. When CTable has to infer those sentinels, the
+selection can be customized with :class:`NullPolicy` and scoped with
+:func:`null_policy`::
+
+    policy = blosc2.NullPolicy(
+        signed_int_strategy="max",
+        string_value="<NULL>",
+        column_null_values={"user_id": -1, "country": "NA"},
+    )
+
+    with blosc2.null_policy(policy):
+        table = blosc2.CTable.from_parquet("data.parquet")
+
+The same policy is used by explicit nullable schema specs when no
+``null_value`` is supplied::
+
+    from dataclasses import dataclass
+
+    @dataclass
+    class Row:
+        user_id: int = blosc2.field(blosc2.int64(nullable=True))
+        country: str = blosc2.field(blosc2.string(nullable=True))
+
+    with blosc2.null_policy(policy):
+        table = blosc2.CTable(Row)
+
+Sentinels are resolved in this order: explicit ``null_value`` in the schema,
+``NullPolicy.column_null_values`` for a matching column, then the type-wide
+``NullPolicy`` default. Columns without ``nullable=True`` or an explicit
+``null_value`` are not nullable.
+
+.. autosummary::
+
+    NullPolicy
+    null_policy
+    get_null_policy
+
+.. autoclass:: NullPolicy
+.. autofunction:: null_policy
+.. autofunction:: get_null_policy
+
+
 Attributes
 ----------
 
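
The resolution order documented above can be exercised end to end. A short sketch of the three tiers; the Place schema and sentinel values are illustrative:

from dataclasses import dataclass

import blosc2

policy = blosc2.NullPolicy(string_value="<NULL>", column_null_values={"country": "NA"})


@dataclass
class Place:
    # 1) an explicit null_value in the schema wins outright
    city: str = blosc2.field(blosc2.string(null_value="?"))
    # 2) a column_null_values entry matches this column name -> "NA"
    country: str = blosc2.field(blosc2.string(nullable=True))
    # 3) no match -> type-wide policy default -> "<NULL>"
    region: str = blosc2.field(blosc2.string(nullable=True))


with blosc2.null_policy(policy):
    t = blosc2.CTable(Place)

print(t["city"].null_value, t["country"].null_value, t["region"].null_value)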

examples/ctable/arrow_interop.py

Lines changed: 4 additions & 3 deletions
@@ -40,15 +40,15 @@ class Stock:
 at = t.to_arrow()
 print(f"Arrow table: {len(at)} rows, schema={at.schema}\n")
 
-# -- from_arrow(): schema is inferred from Arrow types ---------------------
+# -- from_arrow(): import an Arrow schema and record batches ---------------
 at2 = pa.table(
     {
         "x": pa.array([1.0, 2.0, 3.0], type=pa.float32()),
         "y": pa.array([10, 20, 30], type=pa.int32()),
         "label": pa.array(["a", "bb", "ccc"], type=pa.string()),
     }
 )
-t2 = blosc2.CTable.from_arrow(at2)
+t2 = blosc2.CTable.from_arrow(at2.schema, at2.to_batches())
 print("CTable from Arrow (inferred schema):")
 print(t2)
 print(f"  label dtype: {t2['label'].dtype} (max_length inferred from data)")
@@ -69,7 +69,8 @@ class Stock:
 print(df_original)
 
 # pandas → Arrow → CTable
-t_from_pd = blosc2.CTable.from_arrow(pa.Table.from_pandas(df_original, preserve_index=False))
+at_pd = pa.Table.from_pandas(df_original, preserve_index=False)
+t_from_pd = blosc2.CTable.from_arrow(at_pd.schema, at_pd.to_batches())
 print("\nCTable from pandas:")
 print(t_from_pd)
 
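
Since from_arrow() now accepts a schema plus any iterable of record batches (the off/parquet-to-blosc2.py hunk further down passes it a generator), imports can stream without materializing one big Arrow table first. A sketch assuming a multi-file Parquet dataset on disk:

import pyarrow.dataset as ds

import blosc2

# Assumed: a directory of Parquet files. Scanner.to_batches() yields
# RecordBatches lazily, so nothing forces a full in-memory Table.
dataset = ds.dataset("events/", format="parquet")
t = blosc2.CTable.from_arrow(
    dataset.schema,
    dataset.scanner(batch_size=64_000).to_batches(),
)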

examples/ctable/nullable.py

Lines changed: 51 additions & 17 deletions
@@ -5,17 +5,18 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #######################################################################
 
-# Nullable columns: null_value sentinels, null-aware aggregates,
-# is_null / notnull, sort nulls-last, Arrow null masking, CSV empty cells.
+# Nullable columns: null_value sentinels, nullable=True, NullPolicy,
+# null-aware aggregates, is_null / notnull, sort nulls-last, Arrow null masking,
+# and CSV empty cells.
 #
 # CTable does not have a built-in "missing" bit per row like pandas does.
-# Instead it uses a *sentinel value* approach: you choose a specific value
-# that represents "null" for a column, and the library treats it
-# transparently in aggregates, sorting, unique(), value_counts(), and
-# Arrow export.
+# Instead it uses a *sentinel value* approach: each nullable column stores a
+# specific value that represents "null". The library treats that value
+# transparently in aggregates, sorting, unique(), value_counts(), and Arrow
+# export.
 #
-# This is especially useful for integer and string columns that have no
-# natural null (unlike float, which can use NaN).
+# You can either choose sentinels explicitly with null_value=, or ask CTable to
+# choose them from the active NullPolicy with nullable=True.
 
 import os
 import tempfile
@@ -24,24 +25,57 @@
 import blosc2
 
 # ---------------------------------------------------------------------------
-# Schema with nullable columns
+# Schema with explicit null_value sentinels
 # ---------------------------------------------------------------------------
-# Use null_value= on any spec to declare the sentinel.
-# The sentinel bypasses validation constraints (ge/le etc.) so you can
-# store it even when it would otherwise violate them.
+# Use null_value= on any spec to declare the sentinel. The sentinel bypasses
+# validation constraints (ge/le etc.) so you can store it even when it would
+# otherwise violate them.
 
 
 @dataclass
 class Reading:
     sensor_id: int = blosc2.field(blosc2.int32(ge=0))
     # -999 is "no reading" for temperature (normally ge=-50, le=60)
-    temperature: float = blosc2.field(blosc2.float64(ge=-50.0, le=60.0, null_value=-999.0), default=-999.0)
+    temperature: float = blosc2.field(blosc2.float64(ge=-50.0, le=60.0, null_value=-999.0))
     # "" is "unknown" for location (string)
-    location: str = blosc2.field(blosc2.string(max_length=16, null_value=""), default="")
+    location: str = blosc2.field(blosc2.string(max_length=16, null_value=""))
     # -1 is "not measured" for signal strength (normally ge=0, le=100)
-    signal: int = blosc2.field(blosc2.int8(ge=0, le=100, null_value=-1), default=-1)
+    signal: int = blosc2.field(blosc2.int8(ge=0, le=100, null_value=-1))
 
 
+# ---------------------------------------------------------------------------
+# Schema using nullable=True and NullPolicy
+# ---------------------------------------------------------------------------
+# nullable=True means "make this column nullable and choose the sentinel from
+# the current NullPolicy". column_null_values overrides the type-wide policy for
+# specific columns.
+
+
+@dataclass
+class AutoReading:
+    sensor_id: int = blosc2.field(blosc2.int32(ge=0))
+    temperature: float = blosc2.field(blosc2.float64(ge=-50.0, le=60.0, nullable=True))
+    location: str = blosc2.field(blosc2.string(max_length=16, nullable=True))
+    signal: int = blosc2.field(blosc2.int8(ge=0, le=100, nullable=True))
+
+
+policy = blosc2.NullPolicy(
+    float_value=-999.0,
+    string_value="",
+    column_null_values={"signal": -1},
+)
+with blosc2.null_policy(policy):
+    auto = blosc2.CTable(AutoReading)
+
+print("NullPolicy + nullable=True selected these sentinels:")
+print(f"temperature: {auto['temperature'].null_value!r}")
+print(f"location   : {auto['location'].null_value!r}")
+print(f"signal     : {auto['signal'].null_value!r}")
+
+# ---------------------------------------------------------------------------
+# Work with nullable columns
+# ---------------------------------------------------------------------------
+
 data = [
     (0, 22.3, "roof", 87),
     (1, -999.0, "cellar", 41),  # temperature unknown
@@ -52,7 +86,7 @@ class Reading:
 ]
 
 t = blosc2.CTable(Reading, new_data=data)
-print("Table with nullable columns:")
+print("\nTable with nullable columns:")
 print(t)
 
 # ---------------------------------------------------------------------------
@@ -74,7 +108,7 @@ class Reading:
 # Null-aware aggregates
 # ---------------------------------------------------------------------------
 print("\n--- Aggregates skip null sentinels ---")
-print(f"temperature.mean() = {t['temperature'].mean():.2f} (only 3 non-null readings)")
+print(f"temperature.mean() = {t['temperature'].mean():.2f} (only 4 non-null readings)")
 print(f"temperature.min()  = {t['temperature'].min():.2f}")
 print(f"temperature.max()  = {t['temperature'].max():.2f}")
 print(f"signal.sum() = {t['signal'].sum()} (non-null: 87+41+62+95 = 285)")

examples/ctable/real_world.py

Lines changed: 2 additions & 1 deletion
@@ -98,7 +98,8 @@ class WeatherReading:
 path = f"{tmpdir}/station3"
 try:
     # Views cannot be sorted or saved directly — materialise via Arrow first
-    s3_copy = blosc2.CTable.from_arrow(station3.to_arrow())
+    arrow = station3.to_arrow()
+    s3_copy = blosc2.CTable.from_arrow(arrow.schema, arrow.to_batches())
     s3_copy.sort_by("day_of_year", inplace=True)
     sorted_s3 = s3_copy
     sorted_s3.save(path, overwrite=True)

off/parquet-to-blosc2.py

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@
 
 """Import/export Parquet datasets through a CTable store.
 
-Default mode imports parquet -> .b2z/.b2d using CTable.from_arrow_batches().
+Default mode imports parquet -> .b2z/.b2d using CTable.from_arrow().
 The output extension selects the storage layout: .b2z is compact/zip-backed,
 .b2d is sparse directory-backed. Additional modes:
 
@@ -546,7 +546,7 @@ def import_once(args, input_path: Path, output_path: Path, force_list_strings: s
 
     t0 = time.perf_counter()
    maybe_memory_report(args, "before CTable import", pa)
-    ct = blosc2.CTable.from_arrow_batches(
+    ct = blosc2.CTable.from_arrow(
         arrow_schema,
         progress_batches(pa, pf, args, selected_cols, list_wrap_cols),
         urlpath=str(output_path),
urlpath=str(output_path),

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -129,6 +129,7 @@ ignore = [
     "RUF015",
     "RUF059",
     "SIM108",
+    "SIM117",
 ]
 
 [tool.ruff.lint.extend-per-file-ignores]

src/blosc2/__init__.py

Lines changed: 5 additions & 1 deletion
@@ -633,7 +633,7 @@ def _raise(exc):
 # Delayed imports for avoiding overwriting of python builtins.
 # Note: bool, bytes, string shadow builtins in the blosc2 namespace by design —
 # they are schema spec constructors (b2.bool(), b2.bytes(), etc.).
-from .ctable import Column, CTable
+from .ctable import DEFAULT_NULL_POLICY, Column, CTable, NullPolicy, get_null_policy, null_policy
 from .ndarray import (
     abs,
     acos,
@@ -769,6 +769,7 @@ def _raise(exc):
     "DEFAULT_FLOAT",
     "DEFAULT_INDEX",
     "DEFAULT_INT",
+    "DEFAULT_NULL_POLICY",
     # Mathematical constants
     "e",
     "pi",
@@ -812,6 +813,7 @@ def _raise(exc):
     "LazyExpr",
     "LazyUDF",
     "ListArray",
+    "NullPolicy",
     "NDArray",
     "NDField",
     "Operand",
@@ -1029,4 +1031,6 @@ def _raise(exc):
     "where",
     "zeros",
     "zeros_like",
+    "get_null_policy",
+    "null_policy",
 ]
