test: add unittests for DistributionEstimatorBase

TomeHirata · TomeHirata · commit 101fc4c86676 · 2024-07-21T20:14:23.000+09:00
diff --git a/dte_adj/__init__.py b/dte_adj/__init__.py
@@ -2,12 +2,13 @@
 from typing import Tuple
 from scipy.stats import norm
 from copy import deepcopy
-from .util import compute_confidence_intervals, find_le
+from abc import ABC
+from .util import compute_confidence_intervals
 
 __all__ = ["SimpleDistributionEstimator", "AdjustedDistributionEstimator"]
 
 
-class DistributionEstimatorBase(object):
+class DistributionEstimatorBase(ABC):
     """A mixin including several convenience functions to compute and display distribution functions."""
 
     def __init__(self):
@@ -310,6 +311,33 @@ def find_quantile(quantile, arm):
             )
 
         return result
+    
+    def fit(
+        self, confoundings: np.ndarray, treatment_arms: np.ndarray, outcomes: np.ndarray
+    ) -> "DistributionEstimatorBase":
+        """Store the training data on the estimator after checking row counts.
+
+        Args:
+            confoundings (np.ndarray): Pre-treatment covariates.
+            treatment_arms (np.ndarray): The index of the treatment arm.
+            outcomes (np.ndarray): Scalar-valued observed outcome.
+
+        Returns:
+            DistributionEstimatorBase: The same instance, now holding the data.
+
+        Raises:
+            ValueError: If the first dimensions of the inputs do not match.
+        """
+        n_rows = confoundings.shape[0]
+        if n_rows != treatment_arms.shape[0]:
+            raise ValueError(
+                "The shape of confounding and treatment_arm should be same"
+            )
+        if n_rows != outcomes.shape[0]:
+            raise ValueError("The shape of confounding and outcome should be same")
+        self.confoundings, self.treatment_arms = confoundings, treatment_arms
+        self.outcomes = outcomes
+        return self
 
     def predict(self, treatment_arms: np.ndarray, locations: np.ndarray) -> np.ndarray:
         """Compute cumulative distribution values.
@@ -366,33 +394,6 @@ def __init__(self):
         """
         super().__init__()
 
-    def fit(
-        self, confoundings: np.ndarray, treatment_arms: np.ndarray, outcomes: np.ndarray
-    ) -> "SimpleDistributionEstimator":
-        """Train the SimpleDistributionEstimator.
-
-        Args:
-            confoundings (np.ndarray): Pre-treatment covariates.
-            treatment_arms (np.ndarray): The index of the treatment arm.
-            outcomes (np.ndarray): Scalar-valued observed outcome.
-
-        Returns:
-            SimpleDistributionEstimator: The fitted estimator.
-        """
-        if confoundings.shape[0] != treatment_arms.shape[0]:
-            raise ValueError(
-                "The shape of confounding and treatment_arm should be same"
-            )
-
-        if confoundings.shape[0] != outcomes.shape[0]:
-            raise ValueError("The shape of confounding and outcome should be same")
-
-        self.confoundings = confoundings
-        self.treatment_arms = treatment_arms
-        self.outcomes = outcomes
-
-        return self
-
     def _compute_cumulative_distribution(
         self,
         target_treatment_arms: np.ndarray,
@@ -427,7 +428,7 @@ def _compute_cumulative_distribution(
         cumulative_distribution = np.zeros(locations.shape)
         for i, (outcome, arm) in enumerate(zip(locations, target_treatment_arms)):
             cumulative_distribution[i] = (
-                find_le(d_outcome[arm], outcome) + 1
+                np.searchsorted(d_outcome[arm], outcome, side="right")
             ) / d_outcome[arm].shape[0]
         return cumulative_distribution, np.zeros((n_obs, n_loc))
 
@@ -445,60 +446,12 @@ def __init__(self, base_model, folds=3):
         Returns:
             AdjustedDistributionEstimator: An instance of the estimator.
         """
+        if (not hasattr(base_model, 'predict')) and (not hasattr(base_model, 'predict_proba')):
+            raise ValueError('base_model should implement either predict_proba or predict')
         self.base_model = base_model
         self.folds = folds
         super().__init__()
 
-    def fit(
-        self, confoundings: np.ndarray, treatment_arms: np.ndarray, outcomes: np.ndarray
-    ) -> "AdjustedDistributionEstimator":
-        """Train the AdjustedDistributionEstimator.
-
-        Args:
-            confoundings (np.ndarray): Pre-treatment covariates.
-            treatment_arms (np.ndarray): The index of the treatment arm.
-            outcomes (np.ndarray): Scalar-valued observed outcome.
-
-        Returns:
-            AdjustedDistributionEstimator: The fitted estimator.
-        """
-        if confoundings.shape[0] != treatment_arms.shape[0]:
-            raise ValueError(
-                "The shape of confounding and treatment_arm should be same"
-            )
-
-        if confoundings.shape[0] != outcomes.shape[0]:
-            raise ValueError("The shape of confounding and outcome should be same")
-
-        self.confoundings = confoundings
-        self.treatment_arms = treatment_arms
-        self.outcomes = outcomes
-
-        return self
-
-    def predict(self, treatment_arms: np.ndarray, locations: np.ndarray) -> np.ndarray:
-        """Compute cumulative distribution values.
-
-        Args:
-            treatment_arms (np.ndarray): The index of the treatment arm.
-            locations (np.ndarray): Scalar values to be used for computing the cumulative distribution.
-
-        Returns:
-            np.ndarray: Estimated cumulative distribution values for the input.
-        """
-        if self.outcomes is None:
-            raise ValueError(
-                "This estimator has not been trained yet. Please call fit first"
-            )
-
-        return self._compute_cumulative_distribution(
-            treatment_arms,
-            locations,
-            self.confoundings,
-            self.treatment_arms,
-            self.outcomes,
-        )[0]
-
     def _compute_cumulative_distribution(
         self,
         target_treatment_arms: np.ndarray,
diff --git a/dte_adj/util.py b/dte_adj/util.py
@@ -1,5 +1,6 @@
 import numpy as np
 from scipy.stats import norm
+from typing import Tuple
 
 
 def compute_confidence_intervals(
@@ -16,7 +17,7 @@ def compute_confidence_intervals(
     alpha: 0.05,
     variance_type="moment",
     n_bootstrap=500,
-):
+) -> Tuple[np.ndarray, np.ndarray]:
     """Computes the confidence intervals of distribution parameters.
 
     Args:
@@ -106,25 +107,3 @@ def compute_confidence_intervals(
         return vec_dte_lower_simple, vec_dte_upper_simple
     else:
         raise ValueError(f"Invalid variance type was speficied: {variance_type}")
-
-
-def find_le(array: np.ndarray, threshold):
-    """Find the rightmost value less than or equal to threshold in a sorted array
-
-    Args:
-        array (np.ndarray): The sorted array to search in.
-        threshold (float): The threshold value.
-
-    Returns:
-        int: The index where the value first exceeds the threshold.
-    """
-    low, high = 0, array.shape[0] - 1
-    result = -1
-    while low <= high:
-        mid = (low + high) // 2
-        if array[mid] <= threshold:
-            result = mid
-            low = mid + 1
-        else:
-            high = mid - 1
-    return result
diff --git a/tests/test_distribution_estimator_base.py b/tests/test_distribution_estimator_base.py
@@ -0,0 +1,190 @@
+import unittest
+import numpy as np
+from unittest.mock import patch, MagicMock
+from dte_adj import DistributionEstimatorBase
+
+
+def compute_cumulative_distribution(
+    target_treatment_arms: np.ndarray,
+    locations: np.ndarray,
+    confoundings: np.ndarray,
+    treatment_arms: np.ndarray,
+    outcomes: np.ndarray,
+) -> tuple:
+    """Mock CDF: a 0..0.9 ramp over locations, shifted by 0.1 per arm index."""
+    cdf = np.linspace(0, 0.9, locations.shape[0]) + target_treatment_arms * 0.1
+    # Second element is an all-zero (n_outcomes, n_locations) matrix.
+    return cdf, np.zeros((outcomes.shape[0], locations.shape[0]))
+
+
+class MockDistributionEstimator(DistributionEstimatorBase):
+    """Mock class to implement _compute_cumulative_distribution for testing."""
+
+    def __init__(
+        self, mock_compute_cumulative_distribution=compute_cumulative_distribution
+    ):
+        super().__init__()
+        # Route calls through a MagicMock so tests can assert on invocations.
+        self.compute_cumulative_distribution = MagicMock(
+            side_effect=mock_compute_cumulative_distribution
+        )
+
+    def _compute_cumulative_distribution(
+        self,
+        target_treatment_arms: np.ndarray,
+        locations: np.ndarray,
+        confoundings: np.ndarray,
+        treatment_arms: np.ndarray,
+        outcomes: np.ndarray,
+    ) -> tuple:
+        return self.compute_cumulative_distribution(
+            target_treatment_arms, locations, confoundings, treatment_arms, outcomes
+        )
+
+
+def compute_confidence_intervals(*args, **kwargs):
+    """Mock for compute_confidence_intervals: constant (0.1, 0.9) bounds."""
+    # One bound per entry of the vec_loc keyword argument (KeyError if absent).
+    n_locations = len(kwargs["vec_loc"])
+    bounds = (np.full(n_locations, 0.1), np.full(n_locations, 0.9))
+    return bounds
+
+
+class TestDistributionEstimatorBase(unittest.TestCase):
+    """Exercise the shared DistributionEstimatorBase API through a mock subclass."""
+
+    def setUp(self):
+        self.estimator = MockDistributionEstimator()
+        self.confoundings = np.zeros((20, 5))
+        self.treatment_arms = np.hstack([np.zeros(10), np.ones(10)])
+        self.outcomes = np.arange(20)
+        self.estimator.fit(self.confoundings, self.treatment_arms, self.outcomes)
+
+    def test_initialization(self):
+        # Arrange
+        base_estimator = MockDistributionEstimator()
+
+        # Assert
+        self.assertIsNone(base_estimator.confoundings)
+        self.assertIsNone(base_estimator.treatment_arms)
+        self.assertIsNone(base_estimator.outcomes)
+
+    @patch(
+        "dte_adj.compute_confidence_intervals", side_effect=compute_confidence_intervals
+    )
+    def test_predict_dte(self, mock_compute_confidence_intervals):
+        # Arrange
+        target_treatment_arm = 1
+        control_treatment_arm = 0
+        locations = np.arange(20)
+
+        # Act
+        dte, lower_bound, upper_bound = self.estimator.predict_dte(
+            target_treatment_arm, control_treatment_arm, locations
+        )
+
+        # Assert
+        np.testing.assert_array_almost_equal(dte, np.full(locations.shape, 0.1))
+        np.testing.assert_array_almost_equal(lower_bound, np.full(locations.shape, 0.1))
+        np.testing.assert_array_almost_equal(upper_bound, np.full(locations.shape, 0.9))
+        self.estimator.compute_cumulative_distribution.assert_called()
+
+    @patch(
+        "dte_adj.compute_confidence_intervals", side_effect=compute_confidence_intervals
+    )
+    def test_predict_pte(self, mock_compute_confidence_intervals):
+        # Arrange
+        target_treatment_arm = 1
+        control_treatment_arm = 0
+        locations = np.arange(20)
+        width = 0.1
+
+        # Act
+        pte, lower_bound, upper_bound = self.estimator.predict_pte(
+            target_treatment_arm, control_treatment_arm, width, locations
+        )
+
+        # Assert
+        np.testing.assert_array_almost_equal(pte, np.full(locations.shape, 0))
+        np.testing.assert_array_almost_equal(lower_bound, np.full(locations.shape, 0.1))
+        np.testing.assert_array_almost_equal(upper_bound, np.full(locations.shape, 0.9))
+        self.estimator.compute_cumulative_distribution.assert_called()
+
+    def test_predict_qte(self):
+        # Arrange
+        target_treatment_arm = 1
+        control_treatment_arm = 0
+        quantiles = np.array([0.1 * i for i in range(1, 10)])
+        expected_qte = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
+
+        # Act
+        qte, lower_bound, upper_bound = self.estimator.predict_qte(
+            target_treatment_arm, control_treatment_arm, quantiles, n_bootstrap=5
+        )
+
+        # Assert
+        np.testing.assert_array_almost_equal(qte, expected_qte)
+        self.assertEqual(lower_bound.shape, quantiles.shape)
+        self.assertEqual(upper_bound.shape, quantiles.shape)
+        self.estimator.compute_cumulative_distribution.assert_called()
+
+    def test_fit_success(self):
+        # Assert: setUp already called fit; the arrays must be stored unchanged.
+        np.testing.assert_array_equal(self.estimator.confoundings, self.confoundings)
+        np.testing.assert_array_equal(
+            self.estimator.treatment_arms, self.treatment_arms
+        )
+        np.testing.assert_array_equal(self.estimator.outcomes, self.outcomes)
+
+    def test_fit_invalid_shapes(self):
+        # Arrange: each case swaps exactly one input for a 2-row array.
+        invalid_cases = [
+            (np.array([[1, 2], [3, 4]]), self.treatment_arms, self.outcomes),
+            (self.confoundings, np.array([0, 1]), self.outcomes),
+            (self.confoundings, self.treatment_arms, np.array([0.5, 0.7])),
+        ]
+
+        # Act, Assert
+        for args in invalid_cases:
+            with self.subTest(shapes=[a.shape for a in args]):
+                with self.assertRaises(ValueError):
+                    self.estimator.fit(*args)
+
+    def test_predict_success(self):
+        # Arrange
+        treatment_arms_test = np.array([0, 1])
+        locations_test = np.array([3, 6])
+
+        # Act
+        output = self.estimator.predict(treatment_arms_test, locations_test)
+
+        # Assert: the mock CDF is linspace(0, 0.9, 2) + 0.1 * arm = [0.0, 1.0].
+        np.testing.assert_array_almost_equal(output, np.array([0.0, 1.0]))
+        self.estimator.compute_cumulative_distribution.assert_called_once()
+
+    def test_predict_fail_before_fit(self):
+        # Arrange
+        treatment_arms_test = np.array([0, 1])
+        locations_test = np.array([3, 6])
+        subject = MockDistributionEstimator()
+
+        # Act, Assert
+        with self.assertRaises(ValueError) as cm:
+            subject.predict(treatment_arms_test, locations_test)
+        self.assertEqual(
+            str(cm.exception),
+            "This estimator has not been trained yet. Please call fit first",
+        )
+
+    def test_predict_fail_invalid_arm(self):
+        # Arrange
+        treatment_arms_invalid = np.array([2])
+        locations_test = np.array([3, 6])
+
+        # Act, Assert
+        with self.assertRaises(ValueError) as cm:
+            self.estimator.predict(treatment_arms_invalid, locations_test)
+        self.assertEqual(
+            str(cm.exception),
+            "This treatment_arms argument contains arms not included in the training data: {2}",
+        )
diff --git a/tests/test_simple_estimator.py b/tests/test_simple_estimator.py