diff --git a/rust/crates/sift_cli/src/cli/mod.rs b/rust/crates/sift_cli/src/cli/mod.rs index ef00512e4..5653f07c9 100644 --- a/rust/crates/sift_cli/src/cli/mod.rs +++ b/rust/crates/sift_cli/src/cli/mod.rs @@ -450,11 +450,26 @@ pub struct ImportHdf5Args { pub relative_start_time: Option, /// (two-d / compound) Index of the time column or field. Defaults to 0. - /// Mutually exclusive with --time-field. - #[arg(long, conflicts_with = "time_field")] + /// Mutually exclusive with --time-field and --time-name. + #[arg( + long, + conflicts_with_all = ["time_field", "time_name"], + help_heading = "Two-d schema options", + )] pub time_index: Option, - /// (compound) Name of the time field. Mutually exclusive with --time-index. - #[arg(long)] + /// (compound) Name of the time field. Mutually exclusive with --time-index + /// and --time-name. + #[arg( + long, + conflicts_with = "time_name", + help_heading = "Compound schema options" + )] pub time_field: Option, + + /// (one-d) Leaf name of the time dataset when it doesn't match the default + /// auto-detected names (time, timestamp, timestamps, ts). Mutually exclusive + /// with --time-index and --time-field. + #[arg(long, help_heading = "One-d schema options")] + pub time_name: Option, } diff --git a/rust/crates/sift_cli/src/cmd/import/hdf5/detect_hdf5_schema.rs b/rust/crates/sift_cli/src/cmd/import/hdf5/detect_hdf5_schema.rs index 1be19ee73..3b94351c1 100644 --- a/rust/crates/sift_cli/src/cmd/import/hdf5/detect_hdf5_schema.rs +++ b/rust/crates/sift_cli/src/cmd/import/hdf5/detect_hdf5_schema.rs @@ -1,20 +1,55 @@ +use std::collections::HashMap; use std::path::Path; use anyhow::{Context as AnyhowContext, Result, anyhow}; use hdf5::types::{FloatSize, IntSize, TypeDescriptor, VarLenAscii, VarLenUnicode}; -use hdf5::{Dataset, File}; +use hdf5::{Dataset, File, Group}; use sift_rs::{ - common::r#type::v1::{ChannelConfig, ChannelDataType}, + common::r#type::v1::{ChannelConfig, ChannelDataType, ChannelEnumType}, data_imports::v2::Hdf5DataConfig, }; use crate::cli::hdf5::Hdf5Schema; +use crate::cmd::import::utils::group_path_to_channel_name; +use crate::util::tty::Output; +const ROOT_PATH: &str = "/"; const TIME_NAMES: &[&str] = &["time", "timestamp", "timestamps", "ts"]; +const VALUE_NAMES: &[&str] = &["value", "values"]; -pub(super) fn is_time_dataset_name(name: &str) -> bool { - let trimmed = name.trim_start_matches('/').to_ascii_lowercase(); - TIME_NAMES.iter().any(|n| *n == trimmed) +pub fn basename(path: &str) -> &str { + path.rsplit('/').next().unwrap_or(path) +} + +pub fn parent_path(path: &str) -> &str { + match path.rfind('/') { + Some(0) => ROOT_PATH, + Some(idx) => &path[..idx], + None => ROOT_PATH, + } +} + +pub fn is_time_dataset_name(name: &str) -> bool { + let leaf = basename(name).to_ascii_lowercase(); + TIME_NAMES.iter().any(|n| *n == leaf) +} + +fn is_value_leaf(name: &str) -> bool { + let leaf = basename(name).to_ascii_lowercase(); + VALUE_NAMES.iter().any(|n| *n == leaf) +} + +fn collect_datasets_recursive(group: &Group) -> Result> { + let mut datasets = group + .datasets() + .with_context(|| format!("failed to enumerate datasets in {}", group.name()))?; + let subgroups = group + .groups() + .with_context(|| format!("failed to enumerate groups in {}", group.name()))?; + for sub in &subgroups { + datasets.extend(collect_datasets_recursive(sub)?); + } + Ok(datasets) } fn get_string_attr(ds: &Dataset, name: &str) -> Option { @@ -28,12 +63,10 @@ fn get_string_attr(ds: &Dataset, name: &str) -> Option { None } -/// Supported HDF5 channel types. Anything outside this set is rejected with a -/// client-side error so users get clear feedback before upload. -pub(super) const SUPPORTED_TYPES_BLURB: &str = - "bool, int8/16/32/64, uint8/16/32/64, float32, float64"; +pub const SUPPORTED_TYPES_BLURB: &str = + "bool, int8/16/32/64, uint8/16/32/64, float32, float64, string, enum"; -pub(super) fn hdf5_to_sift_data_type(ty: &TypeDescriptor) -> Option { +pub fn hdf5_to_sift_data_type(ty: &TypeDescriptor) -> Option { match ty { TypeDescriptor::Boolean => Some(ChannelDataType::Bool), TypeDescriptor::Integer(IntSize::U1) @@ -46,31 +79,57 @@ pub(super) fn hdf5_to_sift_data_type(ty: &TypeDescriptor) -> Option Some(ChannelDataType::Uint64), TypeDescriptor::Float(FloatSize::U4) => Some(ChannelDataType::Float), TypeDescriptor::Float(FloatSize::U8) => Some(ChannelDataType::Double), + TypeDescriptor::VarLenUnicode + | TypeDescriptor::VarLenAscii + | TypeDescriptor::FixedAscii(_) + | TypeDescriptor::FixedUnicode(_) => Some(ChannelDataType::String), + TypeDescriptor::Enum(_) => Some(ChannelDataType::Enum), _ => None, } } -pub(super) fn detect_config( +pub fn enum_types_for(ty: &TypeDescriptor) -> Result> { + let TypeDescriptor::Enum(enum_type) = ty else { + return Ok(Vec::new()); + }; + enum_type + .members + .iter() + .map(|member| { + Ok(ChannelEnumType { + name: member.name.clone(), + key: u32::try_from(member.value).with_context(|| { + format!( + "enum member '{}' value {} doesn't fit in u32", + member.name, member.value + ) + })?, + is_signed: enum_type.signed, + }) + }) + .collect() +} + +pub fn detect_config( path: &Path, schema: Hdf5Schema, time_index: u64, time_field: Option<&str>, + time_name: Option<&str>, ) -> Result<(Vec, Vec)> { let file = File::open(path).map_err(|e| anyhow!("failed to open hdf5 file: {e}"))?; - let datasets = file - .datasets() - .map_err(|e| anyhow!("failed to enumerate datasets: {e}"))?; + let datasets = collect_datasets_recursive(&file)?; let result = match schema { - Hdf5Schema::OneD => detect_one_d(&datasets), + Hdf5Schema::OneD => detect_one_d(&datasets, time_name), Hdf5Schema::TwoD => detect_two_d(&datasets, time_index), Hdf5Schema::Compound => detect_compound(&datasets, time_index, time_field), }; match result { - Ok((data, _)) if data.is_empty() => { - Err(no_match_error(&datasets, schema, time_index, time_field)) - } + Ok((data, _)) if data.is_empty() => Err(no_match_error( + &datasets, schema, time_index, time_field, time_name, + )), Ok(other) => Ok(other), Err(e) => Err(e), } @@ -81,6 +140,7 @@ fn no_match_error( selected: Hdf5Schema, time_index: u64, time_field: Option<&str>, + time_name: Option<&str>, ) -> anyhow::Error { let alternatives: &[(Hdf5Schema, &str)] = &[ (Hdf5Schema::OneD, "one-d"), @@ -93,7 +153,7 @@ fn no_match_error( .filter(|(s, _)| *s != selected) .filter_map(|(s, name)| { let probe = match s { - Hdf5Schema::OneD => detect_one_d(datasets), + Hdf5Schema::OneD => detect_one_d(datasets, time_name), Hdf5Schema::TwoD => detect_two_d(datasets, time_index), Hdf5Schema::Compound => detect_compound(datasets, time_index, time_field), }; @@ -124,36 +184,71 @@ fn no_match_error( } } -fn detect_one_d(datasets: &[Dataset]) -> Result<(Vec, Vec)> { - let time_dataset = datasets - .iter() - .find(|d| is_time_dataset_name(&d.name())) - .map(|d| d.name()) - .ok_or_else(|| { - anyhow!("no time dataset found — expected one of {TIME_NAMES:?} (case-insensitive)") - })?; +fn detect_one_d( + datasets: &[Dataset], + time_name: Option<&str>, +) -> Result<(Vec, Vec)> { + let mut group_time: HashMap = HashMap::new(); + for ds in datasets { + let name = ds.name(); + let matches = match time_name { + Some(want) => basename(&name) == want, + None => is_time_dataset_name(&name), + }; + if !matches || ds.ndim() != 1 { + continue; + } + group_time + .entry(parent_path(&name).to_owned()) + .or_insert(name); + } + + if group_time.is_empty() { + return Err(match time_name { + Some(want) => anyhow!( + "no time dataset found with name '{want}'. \ + Verify --time-name matches a leaf dataset name in the file." + ), + None => anyhow!( + "no time dataset found — expected one of {TIME_NAMES:?} (case-insensitive) \ + at the root or within any group. \ + If your file uses a custom name, pass it via --time-name." + ), + }); + } let mut data_configs = Vec::new(); let mut channel_configs = Vec::new(); for ds in datasets { let name = ds.name(); - if name == time_dataset { + if is_time_dataset_name(&name) || ds.ndim() != 1 { continue; } - if ds.ndim() != 1 { + let Some(time_dataset) = nearest_time_dataset(&group_time, &name) else { continue; - } - let dtype = ds - .dtype() - .map_err(|e| anyhow!("failed to read dtype for {name}: {e}"))? - .to_descriptor() - .map_err(|e| anyhow!("failed to describe dtype for {name}: {e}"))?; + }; + + let dtype = match ds.dtype().and_then(|t| t.to_descriptor()) { + Ok(d) => d, + Err(e) => { + Output::new() + .line(format!( + "skipping {name}: cannot describe HDF5 dtype ({e}). \ + Supported types: {SUPPORTED_TYPES_BLURB}." + )) + .eprint(); + continue; + } + }; let Some(channel_type) = hdf5_to_sift_data_type(&dtype) else { - return Err(anyhow!( - "unsupported HDF5 type for dataset {name}: {dtype:?}. \ - Supported types: {SUPPORTED_TYPES_BLURB}." - )); + Output::new() + .line(format!( + "skipping {name}: unsupported HDF5 type {dtype:?}. \ + Supported types: {SUPPORTED_TYPES_BLURB}." + )) + .eprint(); + continue; }; let units = get_string_attr(ds, "units").unwrap_or_default(); @@ -161,16 +256,19 @@ fn detect_one_d(datasets: &[Dataset]) -> Result<(Vec, Vec Result<(Vec, Vec, value_path: &str) -> Option { + let mut current = parent_path(value_path); + while current != ROOT_PATH { + if let Some(t) = group_time.get(current) { + return Some(t.clone()); + } + current = parent_path(current); + } + group_time.get(ROOT_PATH).cloned() +} + +fn one_d_channel_name(value_path: &str) -> String { + if is_value_leaf(value_path) { + let parent = parent_path(value_path); + if parent != ROOT_PATH { + return group_path_to_channel_name(parent); + } + } + group_path_to_channel_name(value_path) +} + fn detect_two_d( datasets: &[Dataset], time_index: u64, @@ -220,10 +339,11 @@ fn detect_two_d( if col == time_index { continue; } - let channel_name = format!("{}.{col}", name.trim_start_matches('/')); + let channel_name = format!("{}.{col}", group_path_to_channel_name(&name)); let channel_config = ChannelConfig { name: channel_name, data_type: channel_type as i32, + enum_types: enum_types_for(&dtype)?, ..Default::default() }; @@ -295,10 +415,11 @@ fn detect_compound( field.ty )); }; - let channel_name = format!("{}.{}", name.trim_start_matches('/'), field.name); + let channel_name = format!("{}.{}", group_path_to_channel_name(&name), field.name); let channel_config = ChannelConfig { name: channel_name, data_type: channel_type as i32, + enum_types: enum_types_for(&field.ty)?, ..Default::default() }; diff --git a/rust/crates/sift_cli/src/cmd/import/hdf5/import.rs b/rust/crates/sift_cli/src/cmd/import/hdf5/import.rs index c6e3674ab..73983f006 100644 --- a/rust/crates/sift_cli/src/cmd/import/hdf5/import.rs +++ b/rust/crates/sift_cli/src/cmd/import/hdf5/import.rs @@ -44,6 +44,7 @@ pub async fn run(ctx: Context, args: ImportHdf5Args) -> Result { args.schema, args.time_index.unwrap_or(0), args.time_field.as_deref(), + args.time_name.as_deref(), ) { Ok((_, channel_configs)) => { let refs: Vec<&ChannelConfig> = channel_configs.iter().collect(); @@ -65,6 +66,7 @@ pub async fn run(ctx: Context, args: ImportHdf5Args) -> Result { args.schema, args.time_index.unwrap_or(0), args.time_field.as_deref(), + args.time_name.as_deref(), ) .context("failed to parse hdf5 file")?; hdf5_config.data = data_configs; diff --git a/rust/crates/sift_cli/src/cmd/import/hdf5/mod.rs b/rust/crates/sift_cli/src/cmd/import/hdf5/mod.rs index e38bea69f..881173e75 100644 --- a/rust/crates/sift_cli/src/cmd/import/hdf5/mod.rs +++ b/rust/crates/sift_cli/src/cmd/import/hdf5/mod.rs @@ -1,4 +1,4 @@ -pub mod detect_hdf5_schema; +mod detect_hdf5_schema; pub mod import; #[cfg(test)] diff --git a/rust/crates/sift_cli/src/cmd/import/hdf5/tests.rs b/rust/crates/sift_cli/src/cmd/import/hdf5/tests.rs index 9e36b6332..cc62da325 100644 --- a/rust/crates/sift_cli/src/cmd/import/hdf5/tests.rs +++ b/rust/crates/sift_cli/src/cmd/import/hdf5/tests.rs @@ -1,15 +1,18 @@ use std::path::PathBuf; use chrono::DateTime; -use hdf5::types::{FloatSize, IntSize, TypeDescriptor}; +use hdf5::types::{EnumMember, EnumType, FloatSize, IntSize, TypeDescriptor}; use sift_rs::common::r#type::v1::ChannelDataType; use sift_rs::data_imports::v2::TimeFormat as ProtoTimeFormat; use crate::cli::hdf5::Hdf5Schema; use crate::cli::time::TimeFormat; use crate::cli::{CommonImportArgs, ImportHdf5Args}; -use crate::cmd::import::hdf5::detect_hdf5_schema::{hdf5_to_sift_data_type, is_time_dataset_name}; +use crate::cmd::import::hdf5::detect_hdf5_schema::{ + basename, enum_types_for, hdf5_to_sift_data_type, is_time_dataset_name, parent_path, +}; use crate::cmd::import::hdf5::import::build_hdf5_config; +use crate::cmd::import::utils::group_path_to_channel_name; fn make_args() -> ImportHdf5Args { ImportHdf5Args { @@ -26,6 +29,7 @@ fn make_args() -> ImportHdf5Args { relative_start_time: None, time_index: None, time_field: None, + time_name: None, } } @@ -137,6 +141,31 @@ fn is_time_dataset_name_rejects_unrelated_names() { assert!(!is_time_dataset_name("")); } +#[test] +fn is_time_dataset_name_recognizes_nested_paths() { + assert!(is_time_dataset_name("/group1/time")); + assert!(is_time_dataset_name("/a/b/c/Timestamp")); + assert!(is_time_dataset_name("nested/ts")); + assert!(!is_time_dataset_name("/group1/time_series")); + assert!(!is_time_dataset_name("/time/sensor")); +} + +#[test] +fn basename_returns_leaf() { + assert_eq!(basename("/group/sub/leaf"), "leaf"); + assert_eq!(basename("/leaf"), "leaf"); + assert_eq!(basename("leaf"), "leaf"); + assert_eq!(basename("/"), ""); +} + +#[test] +fn parent_path_walks_up() { + assert_eq!(parent_path("/a/b/c"), "/a/b"); + assert_eq!(parent_path("/a"), "/"); + assert_eq!(parent_path("/"), "/"); + assert_eq!(parent_path("leaf"), "/"); +} + #[test] fn hdf5_to_sift_data_type_maps_boolean() { assert_eq!( @@ -218,15 +247,129 @@ fn hdf5_to_sift_data_type_maps_float_u8() { } #[test] -fn hdf5_to_sift_data_type_rejects_strings() { - assert_eq!(hdf5_to_sift_data_type(&TypeDescriptor::VarLenUnicode), None); - assert_eq!(hdf5_to_sift_data_type(&TypeDescriptor::VarLenAscii), None); +fn hdf5_to_sift_data_type_maps_strings() { + assert_eq!( + hdf5_to_sift_data_type(&TypeDescriptor::VarLenUnicode), + Some(ChannelDataType::String) + ); + assert_eq!( + hdf5_to_sift_data_type(&TypeDescriptor::VarLenAscii), + Some(ChannelDataType::String) + ); assert_eq!( hdf5_to_sift_data_type(&TypeDescriptor::FixedAscii(16)), - None + Some(ChannelDataType::String) ); assert_eq!( hdf5_to_sift_data_type(&TypeDescriptor::FixedUnicode(16)), - None + Some(ChannelDataType::String) + ); +} + +#[test] +fn hdf5_to_sift_data_type_maps_enum() { + let ty = TypeDescriptor::Enum(EnumType { + size: IntSize::U4, + signed: false, + members: vec![EnumMember { + name: "RED".into(), + value: 0, + }], + }); + assert_eq!(hdf5_to_sift_data_type(&ty), Some(ChannelDataType::Enum)); +} + +#[test] +fn enum_types_for_extracts_members() { + let ty = TypeDescriptor::Enum(EnumType { + size: IntSize::U4, + signed: true, + members: vec![ + EnumMember { + name: "OFF".into(), + value: 0, + }, + EnumMember { + name: "ON".into(), + value: 1, + }, + ], + }); + let mapped = enum_types_for(&ty).unwrap(); + assert_eq!(mapped.len(), 2); + assert_eq!(mapped[0].name, "OFF"); + assert_eq!(mapped[0].key, 0); + assert!(mapped[0].is_signed); + assert_eq!(mapped[1].name, "ON"); + assert_eq!(mapped[1].key, 1); +} + +#[test] +fn enum_types_for_unsigned_enum() { + let ty = TypeDescriptor::Enum(EnumType { + size: IntSize::U4, + signed: false, + members: vec![ + EnumMember { + name: "IDLE".into(), + value: 0, + }, + EnumMember { + name: "RUNNING".into(), + value: 1, + }, + EnumMember { + name: "ERROR".into(), + value: 99, + }, + ], + }); + let mapped = enum_types_for(&ty).unwrap(); + assert_eq!(mapped.len(), 3); + assert!(!mapped[0].is_signed); + assert!(!mapped[1].is_signed); + assert!(!mapped[2].is_signed); + assert_eq!(mapped[0].name, "IDLE"); + assert_eq!(mapped[0].key, 0); + assert_eq!(mapped[2].name, "ERROR"); + assert_eq!(mapped[2].key, 99); +} + +#[test] +fn enum_types_for_returns_empty_for_non_enum() { + assert!(enum_types_for(&TypeDescriptor::Boolean).unwrap().is_empty()); + assert!( + enum_types_for(&TypeDescriptor::Integer(IntSize::U4)) + .unwrap() + .is_empty() + ); +} + +#[test] +fn group_path_to_channel_name_root_dataset() { + assert_eq!(group_path_to_channel_name("/cpu_usage"), "cpu_usage"); +} + +#[test] +fn group_path_to_channel_name_single_nested_group() { + assert_eq!( + group_path_to_channel_name("/group1/current"), + "group1.current" + ); +} + +#[test] +fn group_path_to_channel_name_deeply_nested() { + assert_eq!( + group_path_to_channel_name("/group2/group3/group4/cell_voltage"), + "group2.group3.group4.cell_voltage" + ); +} + +#[test] +fn group_path_to_channel_name_no_leading_slash() { + assert_eq!( + group_path_to_channel_name("group1/current"), + "group1.current" ); } diff --git a/rust/crates/sift_cli/src/cmd/import/utils.rs b/rust/crates/sift_cli/src/cmd/import/utils.rs index 8fbc73830..23398f1b3 100644 --- a/rust/crates/sift_cli/src/cmd/import/utils.rs +++ b/rust/crates/sift_cli/src/cmd/import/utils.rs @@ -75,6 +75,13 @@ pub fn validate_time_format( } } +/// Convert a slash-delimited group path (HDF5 dataset path, TDMS group/channel, +/// etc.) into a Sift channel name. Strips a leading `/` and rewrites remaining +/// `/` separators as `.`. +pub fn group_path_to_channel_name(path: &str) -> String { + path.trim_start_matches('/').replace('/', ".") +} + pub fn try_parse_enum_config(val: &str) -> Result> { let values = val.split("|").collect::>();