|
1 | 1 | """Unit tests for data import config models and helpers.""" |
2 | 2 |
|
| 3 | +from __future__ import annotations |
| 4 | + |
3 | 5 | from datetime import datetime, timezone |
| 6 | +from typing import TYPE_CHECKING, cast |
4 | 7 |
|
5 | 8 | import pytest |
| 9 | +from sift.common.type.v1.channel_config_pb2 import ChannelConfig as ChannelConfigProto |
| 10 | +from sift.data_imports.v2.data_imports_pb2 import ( |
| 11 | + ParquetColumn, |
| 12 | + ParquetConfig, |
| 13 | + ParquetFlatDatasetConfig, |
| 14 | + ParquetSingleChannelPerRowConfig, |
| 15 | +) |
| 16 | +from sift.data_imports.v2.data_imports_pb2 import ( |
| 17 | + ParquetDataColumn as ParquetDataColumnProto, |
| 18 | +) |
| 19 | +from sift.data_imports.v2.data_imports_pb2 import ( |
| 20 | + ParquetTimeColumn as ParquetTimeColumnProto, |
| 21 | +) |
6 | 22 |
|
7 | 23 | from sift_client.resources import DataImportAPI, DataImportAPIAsync |
8 | | -from sift_client.resources.data_imports import _resolve_data_type_key |
| 24 | +from sift_client.resources.data_imports import ( |
| 25 | + _infer_time_column, |
| 26 | + _parse_parquet_detect_response, |
| 27 | + _resolve_data_type_key, |
| 28 | +) |
9 | 29 | from sift_client.sift_types.channel import ChannelDataType |
10 | 30 | from sift_client.sift_types.data_import import ( |
11 | 31 | CsvDataColumn, |
|
22 | 42 | TimeFormat, |
23 | 43 | ) |
24 | 44 |
|
| 45 | +if TYPE_CHECKING: |
| 46 | + from sift.common.type.v1.channel_data_type_pb2 import ( |
| 47 | + ChannelDataType as ChannelDataTypeProto, |
| 48 | + ) |
| 49 | + |
25 | 50 |
|
26 | 51 | @pytest.mark.integration |
27 | 52 | def test_client_binding(sift_client): |
@@ -362,3 +387,130 @@ def test_explicit_data_type_overrides_extension(self): |
def test_unknown_extension_raises(self):
    """Resolving ``.xyz`` with no explicit data type raises ``ValueError``."""
    unsupported_extension = ".xyz"
    expected_failure = pytest.raises(ValueError, match="Unsupported file extension")
    with expected_failure:
        _resolve_data_type_key(unsupported_extension, None)
| 390 | + |
| 391 | + |
class TestInferTimeColumn:
    """Checks which candidate, if any, ``_infer_time_column`` returns."""

    @staticmethod
    def _column(label, data_type):
        """Build a candidate triple whose first and last entries are both ``label``."""
        return (label, data_type, label)

    def test_picks_canonical_skips_other_columns(self):
        """``timestamp`` is returned even though two other columns precede it."""
        candidates = [
            self._column("delta_time", ChannelDataType.INT_64),
            self._column("voltage", ChannelDataType.DOUBLE),
            self._column("timestamp", ChannelDataType.INT_64),
        ]
        inferred = _infer_time_column(candidates)
        assert inferred == "timestamp"

    def test_accepts_uint64(self):
        """A ``time`` column typed UINT_64 is returned."""
        candidates = [self._column("time", ChannelDataType.UINT_64)]
        inferred = _infer_time_column(candidates)
        assert inferred == "time"

    def test_case_insensitive(self):
        """A mixed-case ``TimeStamp`` column is returned with its casing intact."""
        candidates = [self._column("TimeStamp", ChannelDataType.INT_64)]
        inferred = _infer_time_column(candidates)
        assert inferred == "TimeStamp"

    def test_multiple_candidates_sorted_alphabetically(self):
        """Given ``timestamp``, ``time`` and ``ts`` (all INT_64), ``time`` is returned."""
        candidates = [
            self._column("timestamp", ChannelDataType.INT_64),
            self._column("time", ChannelDataType.INT_64),
            self._column("ts", ChannelDataType.INT_64),
        ]
        inferred = _infer_time_column(candidates)
        assert inferred == "time"

    def test_returns_none_when_no_canonical_int_column(self):
        """A DOUBLE ``timestamp`` plus an INT_64 ``event_time`` yields ``None``."""
        candidates = [
            self._column("timestamp", ChannelDataType.DOUBLE),
            self._column("event_time", ChannelDataType.INT_64),
        ]
        inferred = _infer_time_column(candidates)
        assert inferred is None
| 429 | + |
| 430 | + |
def _make_flat_dataset_response(
    time_path: str, data_columns: list[tuple[str, int]]
) -> ParquetConfig:
    """Build a ``ParquetConfig`` proto whose ``flat_dataset`` field is populated.

    Args:
        time_path: Value stored as the time column's ``path``.
        data_columns: ``(path, data_type)`` pairs; each becomes one data column
            whose channel config ``name`` is the same string as its ``path``.

    Returns:
        The assembled ``ParquetConfig`` message.
    """
    time_column = ParquetTimeColumnProto(path=time_path)

    column_protos = []
    for column_path, raw_type in data_columns:
        channel = ChannelConfigProto(
            name=column_path,
            # The cast only informs the type checker; the int is passed through as-is.
            data_type=cast("ChannelDataTypeProto.ValueType", raw_type),
        )
        column_protos.append(ParquetDataColumnProto(path=column_path, channel_config=channel))

    flat_dataset = ParquetFlatDatasetConfig(
        time_column=time_column,
        data_columns=column_protos,
    )
    return ParquetConfig(flat_dataset=flat_dataset)
| 449 | + |
| 450 | + |
def _make_scpr_response(time_path: str, columns: list[tuple[str, int]]) -> ParquetConfig:
    """Build a ``ParquetConfig`` proto whose ``single_channel_per_row`` field is populated.

    Args:
        time_path: Value stored as the time column's ``path``.
        columns: ``(path, data_type)`` pairs; each becomes one ``ParquetColumn``
            whose column config ``name`` is the same string as its ``path``.

    Returns:
        The assembled ``ParquetConfig`` message.
    """
    time_column = ParquetTimeColumnProto(path=time_path)

    column_protos = []
    for column_path, raw_type in columns:
        column_config = ChannelConfigProto(
            name=column_path,
            # The cast only informs the type checker; the int is passed through as-is.
            data_type=cast("ChannelDataTypeProto.ValueType", raw_type),
        )
        column_protos.append(ParquetColumn(path=column_path, column_config=column_config))

    single_channel_per_row = ParquetSingleChannelPerRowConfig(
        time_column=time_column,
        columns=column_protos,
    )
    return ParquetConfig(single_channel_per_row=single_channel_per_row)
| 467 | + |
| 468 | + |
class TestParseParquetDetectResponseTimeFallback:
    """Checks the time column that ``_parse_parquet_detect_response`` reports.

    Each test builds a detect-response proto with the module's ``_make_*``
    helpers and inspects the parsed config. Every test asserts the concrete
    config class before reading attributes, so a wrong config kind fails with
    a clear assertion instead of an ``AttributeError``.
    """

    def test_flat_dataset_infers_int64_time_column(self):
        """An empty time path becomes ``timestamp``, which leaves the data columns."""
        detected = _make_flat_dataset_response(
            time_path="",
            data_columns=[
                ("voltage", ChannelDataType.DOUBLE.value),
                ("timestamp", ChannelDataType.INT_64.value),
                ("status", ChannelDataType.INT_32.value),
            ],
        )
        parsed = _parse_parquet_detect_response(detected, "file.parquet", 0, 0)
        assert isinstance(parsed, ParquetFlatDatasetImportConfig)
        assert parsed.time_column.path == "timestamp"
        assert [column.path for column in parsed.data_columns] == ["voltage", "status"]

    def test_flat_dataset_keeps_server_time_column_when_set(self):
        """A non-empty time path is kept and ``timestamp`` stays a data column."""
        detected = _make_flat_dataset_response(
            time_path="server_ts",
            data_columns=[
                ("server_ts", ChannelDataType.INT_64.value),
                ("timestamp", ChannelDataType.INT_64.value),
                ("voltage", ChannelDataType.DOUBLE.value),
            ],
        )
        parsed = _parse_parquet_detect_response(detected, "file.parquet", 0, 0)
        # Same class check as the inference test; also narrows the type for checkers.
        assert isinstance(parsed, ParquetFlatDatasetImportConfig)
        assert parsed.time_column.path == "server_ts"
        assert [column.path for column in parsed.data_columns] == ["timestamp", "voltage"]

    def test_flat_dataset_no_int64_match_leaves_time_empty(self):
        """With only a DOUBLE ``voltage`` column, the time path stays empty."""
        detected = _make_flat_dataset_response(
            time_path="",
            data_columns=[("voltage", ChannelDataType.DOUBLE.value)],
        )
        parsed = _parse_parquet_detect_response(detected, "file.parquet", 0, 0)
        # Same class check as the inference test; also narrows the type for checkers.
        assert isinstance(parsed, ParquetFlatDatasetImportConfig)
        assert parsed.time_column.path == ""
        assert [column.path for column in parsed.data_columns] == ["voltage"]

    def test_scpr_infers_int64_time_column(self):
        """For a single-channel-per-row response, an empty time path becomes ``timestamp``."""
        detected = _make_scpr_response(
            time_path="",
            columns=[
                ("voltage", ChannelDataType.DOUBLE.value),
                ("timestamp", ChannelDataType.INT_64.value),
            ],
        )
        parsed = _parse_parquet_detect_response(detected, "file.parquet", 0, 0)
        assert isinstance(parsed, ParquetSingleChannelPerRowImportConfig)
        assert parsed.time_column.path == "timestamp"
0 commit comments