tests: data transformers logic in BenchmarkExecutor

iraedeus · iraedeus · commit a8ee9f792a7f · 2026-04-16T16:44:32.000+03:00
diff --git a/tests/mocks/core/data_transformers/data_transformer.py b/tests/mocks/core/data_transformers/data_transformer.py
@@ -0,0 +1,89 @@
+# -*- coding: ascii -*-
+
+"""
+Mock data transformer implementations for testing.
+
+This module provides mock implementations of IDataTransformer used for testing
+the transformation pipeline in benchmark execution and algorithm evaluation.
+"""
+
+__author__ = "Danil Totmyanin"
+__copyright__ = "Copyright (c) 2026 PySATL project"
+__license__ = "SPDX-License-Identifier: MIT"
+
+from typing import Any
+
+from pysatl_cpd.analysis.labeled_data import LabeledData
+from pysatl_cpd.core.data_providers.idata_provider import DataProvider
+from pysatl_cpd.core.data_transformers.idata_transformer import IDataTransformer
+
+
+class MockDataTransformer(IDataTransformer[float, float]):
+    """
+    Mock data transformer for testing benchmark execution.
+
+    This transformer adds a specified constant value to every observation
+    in the dataset and counts each `transform` call in its `call_count`
+    attribute. The shifted values are returned as a new `LabeledData` instance.
+
+    Parameters
+    ----------
+    name : str, default="MockTransform"
+        The string identifier for the transformer.
+    add_value : float, default=1.0
+        The numeric value to add to each observation.
+    """
+
+    def __init__(self, name: str = "MockTransform", add_value: float = 1.0) -> None:
+        self._name = name  # exposed read-only through the `name` property
+        self.add_value = add_value
+        self.call_count = 0  # incremented at the start of every `transform` call
+
+    @property
+    def name(self) -> str:
+        """
+        Return the name of the mock transformer.
+
+        Returns
+        -------
+        str
+            The identifier passed to the constructor.
+        """
+        return self._name
+
+    def __hash__(self) -> int:
+        """
+        Return a hash of the ``(name, add_value)`` pair.
+
+        Used to identify the pipeline configuration in the cache.
+
+        Returns
+        -------
+        int
+            Hash of the ``(name, add_value)`` tuple; `call_count` does not take part.
+        """
+        return hash((self._name, self.add_value))  # NOTE: str hashing is salted per process (PYTHONHASHSEED)
+
+    def transform(self, provider: DataProvider[float]) -> DataProvider[float]:
+        """
+        Shift every observation by `add_value` and record the call.
+
+        Parameters
+        ----------
+        provider : DataProvider[float]
+            Source of observations; it is iterated once and its `name` is read.
+
+        Returns
+        -------
+        DataProvider[float]
+            A new `LabeledData` named ``<provider.name>_<self.name>`` holding the shifted values.
+        """
+        self.call_count += 1
+
+        # Coerce each observation to float before applying the offset
+        new_data: list[float] = [float(x) + self.add_value for x in provider]
+
+        # Reuse `change_points` (or `change_point`) from the provider; fall back to [] if neither exists
+        change_points: Any = getattr(provider, "change_points", getattr(provider, "change_point", []))
+
+        return LabeledData(raw_data=new_data, change_points=change_points, name=f"{provider.name}_{self.name}")
diff --git a/tests/unit/benchmark/core/test_benchmark_executor.py b/tests/unit/benchmark/core/test_benchmark_executor.py
@@ -25,6 +25,7 @@
 from pysatl_cpd.core.online.online_cpd_solver import OnlineCpdSolver
 from pysatl_cpd.core.online.online_detection_trace import OnlineDetectionTrace
 from tests.mocks.algorithms.online.simple import MockOnlineAlgorithm
+from tests.mocks.core.data_transformers.data_transformer import MockDataTransformer
 
 
 def _make_provider(
@@ -519,3 +520,139 @@ def test_multiple_thresholds_create_separate_pickle_files(self, tmp_path: Path)
         with open(registry_path, encoding="utf-8") as f:
             rows: list[dict[str, str]] = list(csv.DictReader(f))
         assert len(rows) == 3
+
+
+# ---------------------------------------------------------------------------
+# 6. Data Transformers
+# ---------------------------------------------------------------------------
+class TestBenchmarkExecutorTransformers:
+    """Tests for the DataTransformer integration in BenchmarkExecutor."""
+
+    def test_transformer_modifies_data_passed_to_algorithm(self) -> None:
+        """Executor should pass transformed data, not raw data, to the solver."""
+        algo = MockOnlineAlgorithm[float](name="A", return_sequence=[0.0])
+        transformer = MockDataTransformer(name="T1", add_value=5.0)
+        entry = AlgorithmEntry(algorithm=algo, thresholds=[1.0], transformer=transformer)
+
+        # Original provider with zeros
+        provider: LabeledData[float] = LabeledData(raw_data=[0.0, 0.0, 0.0], change_points=[], name="data")
+        solver: OnlineCpdSolver = OnlineCpdSolver()
+
+        executor: BenchmarkExecutor[float] = BenchmarkExecutor(
+            entries=[entry],
+            providers=[provider],
+            solver=solver,
+        )
+        executor.execute()
+
+        # The algorithm should have received [5.0, 5.0, 5.0]
+        history: list[float] = algo.get_call_history()
+        assert history == [5.0, 5.0, 5.0]
+
+    def test_record_metadata_uses_transformer_name_and_hash(self) -> None:
+        """Benchmark record should inherit the full name and hash from the Entry."""
+        algo = MockOnlineAlgorithm[float](name="BaseAlgo", return_sequence=[0.0])
+        transformer = MockDataTransformer(name="MyTF", add_value=1.0)
+        entry = AlgorithmEntry(algorithm=algo, thresholds=[1.0], transformer=transformer)
+
+        provider: LabeledData[float] = _make_provider(3, name="d1")
+        solver: OnlineCpdSolver = OnlineCpdSolver()
+
+        executor: BenchmarkExecutor[float] = BenchmarkExecutor(
+            entries=[entry],
+            providers=[provider],
+            solver=solver,
+        )
+        results = executor.execute()
+        record: BenchmarkRecord = results[0][0]
+
+        # Name should be combined
+        assert record.algorithm == "BaseAlgo_MyTF"
+        assert record.algorithm == entry.full_name
+
+        # Hash should match the entry's composite hash
+        assert record.configuration_hash == entry.full_hash
+
+    def test_caching_separates_different_transformers(self, tmp_path: Path) -> None:
+        """Using the same algorithm but different transformers should create separate cache records."""
+        algo = MockOnlineAlgorithm[float](name="A", return_sequence=[0.0])
+
+        entry_clean = AlgorithmEntry(algorithm=algo, thresholds=[1.0], transformer=None)
+        entry_transformed = AlgorithmEntry(
+            algorithm=algo, thresholds=[1.0], transformer=MockDataTransformer(name="T1", add_value=2.0)
+        )
+
+        provider: LabeledData[float] = _make_provider(3, name="data")
+        solver: OnlineCpdSolver = OnlineCpdSolver()
+
+        executor: BenchmarkExecutor[float] = BenchmarkExecutor(
+            entries=[entry_clean, entry_transformed],
+            providers=[provider],
+            solver=solver,
+            dump_dir=tmp_path,
+        )
+        executor.execute()
+
+        # Should produce two distinct pickle files
+        pkl_files: list[Path] = list(tmp_path.glob("*.pkl"))
+        assert len(pkl_files) == 2
+
+        # Check per file: "A_" is a substring of "A_T1_", so a joined-string test cannot tell them apart
+        file_names: list[str] = [f.name for f in pkl_files]
+        assert all("A_" in file_name for file_name in file_names)
+        assert sum("A_T1_" in file_name for file_name in file_names) == 1
+
+    def test_transformer_is_called_even_on_cache_hit(self, tmp_path: Path) -> None:
+        """Transformer should be applied before checking cache, incrementing its call count."""
+        transformer = MockDataTransformer(name="T1", add_value=1.0)
+        provider: LabeledData[float] = _make_provider(3, name="data")
+        solver: OnlineCpdSolver = OnlineCpdSolver()
+
+        # First run to populate cache
+        algo1 = MockOnlineAlgorithm[float](name="A", return_sequence=[0.0])
+        entry1 = AlgorithmEntry(algorithm=algo1, thresholds=[1.0], transformer=transformer)
+        exec1: BenchmarkExecutor[float] = BenchmarkExecutor([entry1], [provider], solver, tmp_path)
+        exec1.execute()
+
+        assert transformer.call_count == 1
+        assert len(algo1.get_call_history()) == 3
+
+        # Second run with cache hit (using a fresh algorithm instance to verify it doesn't run)
+        algo2 = MockOnlineAlgorithm[float](name="A", return_sequence=[0.0])
+        entry2 = AlgorithmEntry(algorithm=algo2, thresholds=[1.0], transformer=transformer)
+        exec2: BenchmarkExecutor[float] = BenchmarkExecutor([entry2], [provider], solver, tmp_path)
+        exec2.execute()
+
+        # Transformer is still called during iteration
+        assert transformer.call_count == 2
+
+        # But the solver/algorithm was skipped due to cache hit
+        assert len(algo2.get_call_history()) == 0
+
+    def test_multiple_entries_mixed_transformers(self) -> None:
+        """Executor should properly route data when processing mixed transformer configurations."""
+        algo1 = MockOnlineAlgorithm[float](name="A", return_sequence=[0.0])
+        algo2 = MockOnlineAlgorithm[float](name="A", return_sequence=[0.0])
+        algo3 = MockOnlineAlgorithm[float](name="A", return_sequence=[0.0])
+
+        entry_none = AlgorithmEntry(algorithm=algo1, thresholds=[1.0])
+        entry_t1 = AlgorithmEntry(algorithm=algo2, thresholds=[1.0], transformer=MockDataTransformer("T1", 10.0))
+        entry_t2 = AlgorithmEntry(algorithm=algo3, thresholds=[1.0], transformer=MockDataTransformer("T2", 20.0))
+
+        # Provider yields [1.0, 1.0]
+        provider: LabeledData[float] = _make_provider(2, name="data")
+        solver: OnlineCpdSolver = OnlineCpdSolver()
+
+        executor: BenchmarkExecutor[float] = BenchmarkExecutor(
+            entries=[entry_none, entry_t1, entry_t2],
+            providers=[provider],
+            solver=solver,
+        )
+        results = executor.execute()
+
+        assert len(results) == 3
+
+        # Verify specific algorithm histories to ensure they received correct streams
+        assert algo1.get_call_history() == [1.0, 1.0]  # No transformation
+        assert algo2.get_call_history() == [11.0, 11.0]  # 1.0 + 10.0
+        assert algo3.get_call_history() == [21.0, 21.0]  # 1.0 + 20.0