55# SPDX-License-Identifier: BSD-3-Clause
66#######################################################################
77
8- # Nullable columns: null_value sentinels, null-aware aggregates,
9- # is_null / notnull, sort nulls-last, Arrow null masking, CSV empty cells.
8+ # Nullable columns: null_value sentinels, nullable=True, NullPolicy,
9+ # null-aware aggregates, is_null / notnull, sort nulls-last, Arrow null masking,
10+ # and CSV empty cells.
1011#
1112# CTable does not have a built-in "missing" bit per row like pandas does.
12- # Instead it uses a *sentinel value* approach: you choose a specific value
13- # that represents "null" for a column, and the library treats it
14- # transparently in aggregates, sorting, unique(), value_counts(), and
15- # Arrow export.
13+ # Instead it uses a *sentinel value* approach: each nullable column stores a
14+ # specific value that represents "null". The library treats that value
15+ # transparently in aggregates, sorting, unique(), value_counts(), and Arrow
16+ # export.
1617#
17- # This is especially useful for integer and string columns that have no
18- # natural null (unlike float, which can use NaN) .
18+ # You can either choose sentinels explicitly with null_value=, or ask CTable to
19+ # choose them from the active NullPolicy with nullable=True.
1920
2021import os
2122import tempfile
2425import blosc2
2526
2627# ---------------------------------------------------------------------------
27- # Schema with nullable columns
28+ # Schema with explicit null_value sentinels
2829# ---------------------------------------------------------------------------
29- # Use null_value= on any spec to declare the sentinel.
30- # The sentinel bypasses validation constraints (ge/le etc.) so you can
31- # store it even when it would otherwise violate them.
30+ # Use null_value= on any spec to declare the sentinel. The sentinel bypasses
31+ # validation constraints (ge/le etc.) so you can store it even when it would
32+ # otherwise violate them.
3233
3334
3435@dataclass
3536class Reading :
3637 sensor_id : int = blosc2 .field (blosc2 .int32 (ge = 0 ))
3738 # -999 is "no reading" for temperature (normally ge=-50, le=60)
38- temperature : float = blosc2 .field (blosc2 .float64 (ge = - 50.0 , le = 60.0 , null_value = - 999.0 ), default = - 999.0 )
39+ temperature : float = blosc2 .field (blosc2 .float64 (ge = - 50.0 , le = 60.0 , null_value = - 999.0 ))
3940 # "" is "unknown" for location (string)
40- location : str = blosc2 .field (blosc2 .string (max_length = 16 , null_value = "" ), default = "" )
41+ location : str = blosc2 .field (blosc2 .string (max_length = 16 , null_value = "" ))
4142 # -1 is "not measured" for signal strength (normally ge=0, le=100)
42- signal : int = blosc2 .field (blosc2 .int8 (ge = 0 , le = 100 , null_value = - 1 ), default = - 1 )
43+ signal : int = blosc2 .field (blosc2 .int8 (ge = 0 , le = 100 , null_value = - 1 ))
4344
4445
46+ # ---------------------------------------------------------------------------
47+ # Schema using nullable=True and NullPolicy
48+ # ---------------------------------------------------------------------------
49+ # nullable=True means "make this column nullable and choose the sentinel from
50+ # the current NullPolicy". column_null_values overrides the type-wide policy for
51+ # specific columns.
52+
53+
54+ @dataclass
55+ class AutoReading :
56+ sensor_id : int = blosc2 .field (blosc2 .int32 (ge = 0 ))
57+ temperature : float = blosc2 .field (blosc2 .float64 (ge = - 50.0 , le = 60.0 , nullable = True ))
58+ location : str = blosc2 .field (blosc2 .string (max_length = 16 , nullable = True ))
59+ signal : int = blosc2 .field (blosc2 .int8 (ge = 0 , le = 100 , nullable = True ))
60+
61+
62+ policy = blosc2 .NullPolicy (
63+ float_value = - 999.0 ,
64+ string_value = "" ,
65+ column_null_values = {"signal" : - 1 },
66+ )
67+ with blosc2 .null_policy (policy ):
68+ auto = blosc2 .CTable (AutoReading )
69+
70+ print ("NullPolicy + nullable=True selected these sentinels:" )
71+ print (f"temperature: { auto ['temperature' ].null_value !r} " )
72+ print (f"location : { auto ['location' ].null_value !r} " )
73+ print (f"signal : { auto ['signal' ].null_value !r} " )
74+
75+ # ---------------------------------------------------------------------------
76+ # Work with nullable columns
77+ # ---------------------------------------------------------------------------
78+
4579data = [
4680 (0 , 22.3 , "roof" , 87 ),
4781 (1 , - 999.0 , "cellar" , 41 ), # temperature unknown
@@ -52,7 +86,7 @@ class Reading:
5286]
5387
5488t = blosc2 .CTable (Reading , new_data = data )
55- print ("Table with nullable columns:" )
89+ print ("\n Table with nullable columns:" )
5690print (t )
5791
5892# ---------------------------------------------------------------------------
@@ -74,7 +108,7 @@ class Reading:
74108# Null-aware aggregates
75109# ---------------------------------------------------------------------------
76110print ("\n --- Aggregates skip null sentinels ---" )
77- print (f"temperature.mean() = { t ['temperature' ].mean ():.2f} (only 3 non-null readings)" )
111+ print (f"temperature.mean() = { t ['temperature' ].mean ():.2f} (only 4 non-null readings)" )
78112print (f"temperature.min() = { t ['temperature' ].min ():.2f} " )
79113print (f"temperature.max() = { t ['temperature' ].max ():.2f} " )
80114print (f"signal.sum() = { t ['signal' ].sum ()} (non-null: 87+41+62+95 = 285)" )
0 commit comments