|
1 | 1 | from typing import Sized, Type |
2 | 2 | import capymoa.datasets as capymoa_datasets |
| 3 | +from capymoa.stream import Stream |
3 | 4 | from capymoa.datasets import ElectricityTiny |
4 | 5 | from tempfile import TemporaryDirectory |
5 | 6 | import pytest |
@@ -59,13 +60,58 @@ def test_electricity_tiny_schema(): |
@pytest.mark.parametrize("dataset_type", _ALL_DOWNLOADABLE_DATASET)
def test_all_datasets(dataset_type: Type[_DownloadableDataset]):
    """Download each dataset and check the ARFF and CSV variants agree.

    For every downloadable dataset this test:

    * streams through the ARFF variant, checking ``Stream``/``Sized`` behavior,
    * if a CSV variant exists, streams both variants in lockstep and checks
      that feature vectors and targets match instance-by-instance,
    * verifies both variants are exhaustible, restartable, printable, and
      report the same length.
    """
    with TemporaryDirectory() as tmp_dir:
        dataset_arff = dataset_type(directory=tmp_dir)
        assert isinstance(dataset_arff, Stream)

        # Exhaust the ARFF stream, counting instances as we go.
        i = 0
        while dataset_arff.has_more_instances():
            dataset_arff.next_instance()
            i += 1

        # An empty dataset would make the rest of the test vacuously pass.
        assert i > 0, "Dataset must contain at least one instance"
        assert str(dataset_arff)
        assert isinstance(dataset_arff, Sized), "Dataset must be an instance of Sized"
        assert len(dataset_arff) == i, "Dataset length must be correct"
        dataset_arff.restart()

        # Not every dataset ships a CSV variant. Keep the try body minimal
        # (only the constructor can raise ValueError) and surface the outcome
        # as SKIPPED rather than silently passing with a bare return.
        try:
            dataset_csv = dataset_type(directory=tmp_dir, file_type="csv")
        except ValueError:
            pytest.skip(f"{dataset_type.__name__} does not provide a CSV variant")
        assert isinstance(dataset_csv, Stream)

        # Both variants should expose a schema object.
        assert dataset_arff.get_schema() is not None
        schema = dataset_csv.get_schema()  # hoisted: invariant across the loop
        assert schema is not None

        # Stream both variants in lockstep and compare each instance.
        i = 0
        while dataset_arff.has_more_instances() and dataset_csv.has_more_instances():
            instance_arff = dataset_arff.next_instance()
            instance_csv = dataset_csv.next_instance()

            assert instance_arff.x == pytest.approx(instance_csv.x)
            if schema.is_classification():
                assert instance_arff.y_index == pytest.approx(instance_csv.y_index)
            elif schema.is_regression():
                assert instance_arff.y_value == pytest.approx(instance_csv.y_value)

            i += 1

        # Both datasets should be exhausted by now; if one variant were longer
        # the lockstep loop above would have stopped early and this catches it.
        assert not dataset_arff.has_more_instances()
        assert not dataset_csv.has_more_instances()

        # The datasets should be restartable.
        dataset_arff.restart()
        dataset_csv.restart()

        # After restarting, the datasets should have more instances.
        assert dataset_arff.has_more_instances()
        assert dataset_csv.has_more_instances()

        # The string representation of the datasets should not throw an error.
        assert str(dataset_arff)
        assert str(dataset_csv)
        # The datasets should be the same length, and should have a size.
        assert isinstance(dataset_arff, Sized)
        assert isinstance(dataset_csv, Sized)
        assert len(dataset_arff) == len(dataset_csv) == i
0 commit comments