Skip to content

Commit e2352d1

Browse files
authored
python(feat): add HDF5 client-side detect_config (#536)
1 parent a50948e commit e2352d1

6 files changed

Lines changed: 294 additions & 12 deletions

File tree

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
from __future__ import annotations
2+
3+
from pathlib import Path
4+
5+
import h5py
6+
import numpy as np
7+
8+
from sift_client.sift_types.channel import ChannelDataType
9+
from sift_client.sift_types.data_import import Hdf5DataColumn, Hdf5ImportConfig, TimeFormat
10+
11+
# Attribute keys probed, in listed order, when reading channel metadata from
# an HDF5 dataset's attributes.
_NAME_ATTRS = ["Name", "name", "Title", "title", "Sensor", "sensor", "Channel", "channel"]
_UNIT_ATTRS = ["Unit", "unit", "Units", "units"]
_DESCRIPTION_ATTRS = ["Description", "description"]

# Lookup table keyed by numpy scalar type (``dtype.type``) giving the Sift
# channel data type used for it. Types missing from this table are rejected
# by ``_numpy_to_sift_type``.
_NUMPY_TO_SIFT: dict[type, ChannelDataType] = {
    np.bool_: ChannelDataType.BOOL,
    # Signed integers: everything up to 32 bits is widened to INT_32.
    np.int8: ChannelDataType.INT_32,
    np.int16: ChannelDataType.INT_32,
    np.int32: ChannelDataType.INT_32,
    np.int64: ChannelDataType.INT_64,
    # Unsigned integers: everything up to 32 bits is widened to UINT_32.
    np.uint8: ChannelDataType.UINT_32,
    np.uint16: ChannelDataType.UINT_32,
    np.uint32: ChannelDataType.UINT_32,
    np.uint64: ChannelDataType.UINT_64,
    # Floating point.
    np.float32: ChannelDataType.FLOAT,
    np.float64: ChannelDataType.DOUBLE,
    # Timestamps are carried as 64-bit integers.
    np.datetime64: ChannelDataType.INT_64,
    # NOTE(review): complex values are mapped onto real-valued channel types;
    # confirm how the imaginary component is expected to be handled.
    np.complex64: ChannelDataType.FLOAT,
    np.complex128: ChannelDataType.DOUBLE,
    np.str_: ChannelDataType.STRING,
    # Fixed-length strings in HDF5/TDMS arrive as np.bytes_, so they are
    # treated as STRING; BYTES is reserved for np.void (opaque binary data).
    np.bytes_: ChannelDataType.STRING,
    # Variable-length strings surface as numpy object dtype; TDMS/HDF5 files
    # cannot produce object arrays holding anything other than strings.
    np.object_: ChannelDataType.STRING,
    np.void: ChannelDataType.BYTES,
}
40+
41+
42+
def _detect_attr(dataset: h5py.Dataset, candidates: list[str], default: str = "") -> str:
    """Return the first usable HDF5 attribute among *candidates* as a string.

    Args:
        dataset: Object exposing an ``attrs`` mapping (an HDF5 dataset).
        candidates: Attribute names to try, in priority order.
        default: Value returned when no candidate yields a usable value.

    Returns:
        The first non-empty attribute value converted to ``str``, or
        *default* when every candidate is missing, empty, or not a scalar.
    """
    for attr in candidates:
        # Single lookup per candidate; ``None`` means the attribute is absent.
        value = dataset.attrs.get(attr)
        if value is None:
            continue

        if isinstance(value, np.ndarray):
            # ``bool()`` on a multi-element array raises, and an empty or
            # multi-valued array cannot serve as a scalar label anyway.
            if value.size != 1:
                continue
            value = value.item()

        # Fixed-length HDF5 strings come back as bytes; decode them so the
        # result is "V" rather than the repr "b'V'".
        if isinstance(value, bytes):
            value = value.decode("utf-8", errors="replace")

        if not value:
            continue
        return str(value)

    return default
46+
47+
48+
def _numpy_to_sift_type(dtype: np.dtype) -> ChannelDataType:
    """Translate a numpy dtype into the corresponding Sift ``ChannelDataType``.

    The lookup is keyed on the dtype's scalar type (``dtype.type``).

    Raises:
        ValueError: If the scalar type has no entry in ``_NUMPY_TO_SIFT``.
    """
    scalar_type = dtype.type
    if scalar_type in _NUMPY_TO_SIFT:
        return _NUMPY_TO_SIFT[scalar_type]
    raise ValueError(f"Unsupported numpy dtype: {dtype}")
54+
55+
56+
def detect_hdf5_config(file_path: str | Path) -> Hdf5ImportConfig:
    """Detect an HDF5 import config by inspecting the file's datasets.

    Every dataset in the file is visited and turned into one or more
    ``Hdf5DataColumn`` entries:

    * Compound datasets with more than one field: the first field is taken as
      time and each remaining field becomes a value channel.
    * All other datasets: a single value channel whose time source is the
      root-level ``time`` dataset when one exists, otherwise ``""``.

    The channel name, units and description are read from the dataset's
    attributes; the name falls back to the dataset name reported by
    ``visititems``. A name that was already emitted is suffixed with
    ``.{dataset_name}`` (and ``.{field_index}`` for compound fields).

    Args:
        file_path: Path to the HDF5 file to inspect.

    Returns:
        An ``Hdf5ImportConfig`` with an empty ``asset_name``, a time format of
        ``TimeFormat.ABSOLUTE_UNIX_NANOSECONDS`` and the detected columns.

    Raises:
        ValueError: If a dataset or compound field has a dtype without a Sift
            equivalent.
    """
    path = Path(file_path)
    columns: list[Hdf5DataColumn] = []
    seen_names: set[str] = set()

    with h5py.File(path, "r") as h5file:
        has_root_time = "time" in h5file

        def _visit(dataset_name: str, obj: object) -> None:
            if not isinstance(obj, h5py.Dataset):
                return

            # Skip root "time" dataset — it's used as the time source, not a value channel.
            if dataset_name == "time" and obj.parent == h5file:
                return

            # Attribute metadata is per dataset, so read it once instead of
            # once per compound field.
            base_name = _detect_attr(obj, _NAME_ATTRS, dataset_name)
            units = _detect_attr(obj, _UNIT_ATTRS)
            description = _detect_attr(obj, _DESCRIPTION_ATTRS)

            field_names = obj.dtype.names or ()

            if len(field_names) > 1:
                # Compound type: first field is time, remaining are value channels.
                time_field = field_names[0]
                for value_index, value_field in enumerate(field_names[1:], start=1):
                    channel_name = base_name
                    if channel_name in seen_names:
                        channel_name = f"{base_name}.{dataset_name}.{value_index}"

                    columns.append(
                        Hdf5DataColumn(
                            name=channel_name,
                            data_type=_numpy_to_sift_type(obj.dtype[value_index]),
                            units=units,
                            description=description,
                            time_dataset=dataset_name,
                            value_dataset=dataset_name,
                            time_index=0,
                            value_index=0,
                            time_field=time_field,
                            value_field=value_field,
                        )
                    )
                    seen_names.add(channel_name)
                return

            # Zero or one field: single column. Use root "time" as the time
            # dataset if available.
            channel_name = base_name
            if channel_name in seen_names:
                channel_name = f"{base_name}.{dataset_name}"

            columns.append(
                Hdf5DataColumn(
                    name=channel_name,
                    data_type=_numpy_to_sift_type(obj.dtype),
                    units=units,
                    description=description,
                    time_dataset="time" if has_root_time else "",
                    value_dataset=dataset_name,
                    time_index=0,
                    value_index=0,
                )
            )
            seen_names.add(channel_name)

        h5file.visititems(_visit)

    return Hdf5ImportConfig(
        asset_name="",
        time_format=TimeFormat.ABSOLUTE_UNIX_NANOSECONDS,
        data=columns,
    )
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
"""Tests for detect_hdf5_config."""
2+
3+
import h5py
4+
import numpy as np
5+
import pytest
6+
7+
from sift_client._internal.util.hdf5 import detect_hdf5_config
8+
from sift_client.sift_types.channel import ChannelDataType
9+
10+
11+
@pytest.fixture
def create_hdf5_file(tmp_path):
    """Provide a factory that writes an HDF5 file under ``tmp_path``.

    The factory accepts a ``populate(hdf5_file)`` callback, calls it with the
    file opened for writing, and returns the path of the written file.
    """
    target = tmp_path / "test.h5"

    def _write(populate):
        # Opening in "w" mode truncates any file left by an earlier call.
        with h5py.File(target, "w") as handle:
            populate(handle)
        return target

    return _write
22+
23+
24+
class TestDetectHdf5Config:
    """Behavioral tests for ``detect_hdf5_config`` using small on-disk HDF5 files."""

    def test_compound_dataset(self, create_hdf5_file):
        """Compound type: first field is time, remaining fields become value channels."""
        compound_dtype = np.dtype([("timestamp_ns", "<i8"), ("voltage", "<f8"), ("current", "<f4")])

        def populate(hdf5_file):
            hdf5_file.create_dataset("sensors", shape=(10,), dtype=compound_dtype)

        config = detect_hdf5_config(create_hdf5_file(populate))

        assert len(config.data) == 2
        assert config.data[0].time_field == "timestamp_ns"
        assert config.data[0].value_field == "voltage"
        assert config.data[0].data_type == ChannelDataType.DOUBLE
        assert config.data[0].time_dataset == "sensors"
        assert config.data[0].value_dataset == "sensors"

        assert config.data[1].time_field == "timestamp_ns"
        assert config.data[1].value_field == "current"
        assert config.data[1].data_type == ChannelDataType.FLOAT

        # Fields of one compound dataset share the same attributes, so the
        # de-duplication suffix is what keeps their channel names distinct.
        assert len({col.name for col in config.data}) == 2

    def test_single_column_with_root_time(self, create_hdf5_file):
        """Single-column datasets use root 'time' as time source when present."""

        def populate(hdf5_file):
            hdf5_file.create_dataset("time", data=np.arange(100, dtype="<i8"))
            hdf5_file.create_dataset("voltage", data=np.linspace(0.0, 1.0, 100, dtype="<f8"))
            hdf5_file.create_dataset("current", data=np.linspace(0.0, 1.0, 100, dtype="<f4"))

        config = detect_hdf5_config(create_hdf5_file(populate))

        assert len(config.data) == 2
        for col in config.data:
            assert col.time_dataset == "time"
            assert col.time_field is None
            assert col.value_field is None

    def test_single_column_without_root_time(self, create_hdf5_file):
        """Without root 'time', time_dataset is empty string."""

        def populate(hdf5_file):
            hdf5_file.create_dataset("voltage", data=np.linspace(0.0, 1.0, 10, dtype="<f8"))

        config = detect_hdf5_config(create_hdf5_file(populate))

        assert len(config.data) == 1
        assert config.data[0].time_dataset == ""
        assert config.data[0].name == "voltage"

    def test_root_time_skipped_as_value_channel(self, create_hdf5_file):
        """The root 'time' dataset must not appear as a value channel."""

        def populate(hdf5_file):
            hdf5_file.create_dataset("time", data=np.arange(10, dtype="<i8"))
            hdf5_file.create_dataset("voltage", data=np.linspace(0.0, 1.0, 10, dtype="<f8"))

        config = detect_hdf5_config(create_hdf5_file(populate))

        channel_names = [col.name for col in config.data]
        assert "time" not in channel_names
        assert "voltage" in channel_names

    def test_duplicate_name_deduplication(self, create_hdf5_file):
        """Duplicate channel names get a .{dataset_name} suffix."""

        def populate(hdf5_file):
            hdf5_file.create_dataset("time", data=np.arange(10, dtype="<i8"))
            sensor_1 = hdf5_file.create_dataset(
                "group1/sensor", data=np.linspace(0.0, 1.0, 10, dtype="<f8")
            )
            sensor_1.attrs["Name"] = "pressure"
            sensor_2 = hdf5_file.create_dataset(
                "group2/sensor", data=np.linspace(0.0, 1.0, 10, dtype="<f8")
            )
            sensor_2.attrs["Name"] = "pressure"

        config = detect_hdf5_config(create_hdf5_file(populate))

        channel_names = [col.name for col in config.data]
        assert len(channel_names) == 2
        assert len(set(channel_names)) == 2  # all unique
        assert channel_names.count("pressure") == 1

        # Whichever dataset is visited second carries the suffix; accept
        # either so the test does not depend on traversal order.
        (suffixed,) = [name for name in channel_names if name != "pressure"]
        assert suffixed in {"pressure.group1/sensor", "pressure.group2/sensor"}

    def test_attribute_detection(self, create_hdf5_file):
        """Channel name, units, and description are read from HDF5 attributes."""

        def populate(hdf5_file):
            hdf5_file.create_dataset("time", data=np.arange(5, dtype="<i8"))
            dataset = hdf5_file.create_dataset(
                "raw_voltage", data=np.linspace(0.0, 1.0, 5, dtype="<f8")
            )
            dataset.attrs["Name"] = "voltage"
            dataset.attrs["Units"] = "V"
            dataset.attrs["Description"] = "Supply voltage"

        config = detect_hdf5_config(create_hdf5_file(populate))

        assert len(config.data) == 1
        assert config.data[0].name == "voltage"
        assert config.data[0].units == "V"
        assert config.data[0].description == "Supply voltage"

    def test_unsupported_dtype_raises(self, create_hdf5_file):
        """Unsupported numpy dtypes raise ValueError rather than silently dropping data."""

        def populate(hdf5_file):
            hdf5_file.create_dataset("time", data=np.arange(5, dtype="<i8"))
            hdf5_file.create_dataset("data", data=np.zeros(5, dtype=np.float16))

        with pytest.raises(ValueError, match="Unsupported numpy dtype"):
            detect_hdf5_config(create_hdf5_file(populate))

python/lib/sift_client/resources/data_imports.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from sift_client._internal.low_level_wrappers.data_imports import DataImportsLowLevelClient
77
from sift_client._internal.util.executor import run_sync_function
88
from sift_client._internal.util.file import extract_parquet_footer, upload_file
9+
from sift_client._internal.util.hdf5 import detect_hdf5_config
910
from sift_client.resources._base import ResourceBase
1011
from sift_client.sift_types.asset import Asset
1112
from sift_client.sift_types.channel import ChannelDataType
@@ -61,8 +62,8 @@ async def import_from_path(
6162
completion before proceeding.
6263
6364
When ``config`` is omitted the file format is auto-detected via
64-
``detect_config`` (CSV and Parquet only). For other formats
65-
(TDMS and HDF5), ``config`` must be provided.
65+
``detect_config`` (CSV, Parquet, and HDF5). For other formats
66+
(TDMS), ``config`` must be provided.
6667
When ``asset`` is provided it overrides the config value;
6768
otherwise the config's ``asset_name`` is used.
6869
If neither ``run`` nor ``run_name`` is provided (and none is
@@ -198,9 +199,9 @@ async def detect_config(
198199
is inferred from the file extension when ``data_type`` is not
199200
provided.
200201
201-
Only CSV and Parquet files are currently supported for auto-detection.
202-
For other formats (TDMS, HDF5), create the config manually
203-
using ``TdmsImportConfig`` or ``Hdf5ImportConfig``.
202+
CSV, Parquet, and HDF5 files are supported for auto-detection.
203+
For other formats (TDMS), create the config manually
204+
using ``TdmsImportConfig``.
204205
205206
For CSV files, the server scans the first two rows for an optional
206207
JSON metadata row. Row 1 is checked first; row 2 is checked only
@@ -243,6 +244,9 @@ async def detect_config(
243244

244245
data_type_key = _resolve_data_type_key(path.suffix.lower(), data_type)
245246

247+
if data_type_key == DataTypeKey.HDF5:
248+
return await run_sync_function(lambda: detect_hdf5_config(path))
249+
246250
is_parquet = data_type_key in (
247251
DataTypeKey.PARQUET_FLATDATASET,
248252
DataTypeKey.PARQUET_SINGLE_CHANNEL_PER_ROW,

python/lib/sift_client/resources/sync_stubs/__init__.pyi

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -653,9 +653,9 @@ class DataImportAPI:
653653
is inferred from the file extension when ``data_type`` is not
654654
provided.
655655
656-
Only CSV and Parquet files are currently supported for auto-detection.
657-
For other formats (TDMS, HDF5), create the config manually
658-
using ``TdmsImportConfig`` or ``Hdf5ImportConfig``.
656+
CSV, Parquet, and HDF5 files are supported for auto-detection.
657+
For other formats (TDMS), create the config manually
658+
using ``TdmsImportConfig``.
659659
660660
For CSV files, the server scans the first two rows for an optional
661661
JSON metadata row. Row 1 is checked first; row 2 is checked only
@@ -733,8 +733,8 @@ class DataImportAPI:
733733
completion before proceeding.
734734
735735
When ``config`` is omitted the file format is auto-detected via
736-
``detect_config`` (CSV and Parquet only). For other formats
737-
(TDMS and HDF5), ``config`` must be provided.
736+
``detect_config`` (CSV, Parquet, and HDF5). For other formats
737+
(TDMS), ``config`` must be provided.
738738
When ``asset`` is provided it overrides the config value;
739739
otherwise the config's ``asset_name`` is used.
740740
If neither ``run`` nor ``run_name`` is provided (and none is

python/lib/sift_client/sift_types/data_import.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,18 @@ class Hdf5ImportConfig(ImportConfigBase):
593593
time_format: TimeFormat
594594
relative_start_time: datetime | None = None
595595

596+
def __getitem__(self, name: str) -> Hdf5DataColumn:
    """Return the data column whose channel name equals *name*.

    Example::

        config["temperature"].data_type = ChannelDataType.FLOAT

    Raises:
        KeyError: If no column in ``self.data`` has that name.
    """
    # The first match wins, mirroring a front-to-back scan of ``self.data``.
    match = next((column for column in self.data if column.name == name), None)
    if match is None:
        raise KeyError(f"No data column named '{name}'")
    return match
607+
596608
@model_validator(mode="after")
597609
def _check_relative_start_time(self) -> Hdf5ImportConfig:
598610
if self.time_format.name.startswith("RELATIVE_") and self.relative_start_time is None:

python/pyproject.toml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ file-imports = [
168168
]
169169
hdf5 = [
170170
'h5py~=3.11',
171-
'polars~=1.8',
171+
'polars~=1.8', # only used by sift_py; remove once sift_py is fully deprecated
172172
]
173173
openssl = [
174174
'cffi~=1.14',
@@ -219,7 +219,7 @@ openssl = ["pyOpenSSL<24.0.0", "types-pyOpenSSL<24.0.0", "cffi~=1.14"]
219219
tdms = ["npTDMS~=1.9"]
220220
rosbags = ["rosbags~=0.0"]
221221
sift-stream = ["sift-stream-bindings==0.2.2"]
222-
hdf5 = ["h5py~=3.11", "polars~=1.8"]
222+
hdf5 = ["h5py~=3.11", "polars~=1.8"] # polars is only used by sift_py; remove once sift_py is fully deprecated
223223
data-review = ["pyarrow>=17.0.0"]
224224

225225
[tool.sift.extras.combine]
@@ -282,6 +282,10 @@ exclude = [
282282

283283
# No official typing stubs for Python gRPC libraries yet.
284284
# https://github.com/grpc/grpc/issues/29041
285+
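# h5py (installed via the `hdf5` extra); the gRPC stub note above refers to the grpc overrides below.
[[tool.mypy.overrides]]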
286+
module = "h5py"
287+
ignore_missing_imports = true
288+
285289
[[tool.mypy.overrides]]
286290
module = "grpc_testing"
287291
ignore_missing_imports = true

0 commit comments

Comments
 (0)