CyberAgentAILab
diff --git a/dte_adj/__init__.py b/dte_adj/__init__.py
Lines changed: 59 additions & 101 deletions
diff --git a/dte_adj/util.py b/dte_adj/util.py
Lines changed: 2 additions & 23 deletions
diff --git a/tests/test_adjusted_estimator.py b/tests/test_adjusted_estimator.py
Lines changed: 47 additions & 4 deletions
@@ -2,12 +2,13 @@
 from typing import Tuple
 from scipy.stats import norm
 from copy import deepcopy
-from .util import compute_confidence_intervals, find_le
+from abc import ABC
+from .util import compute_confidence_intervals
 
 __all__ = ["SimpleDistributionEstimator", "AdjustedDistributionEstimator"]
 
 
-class DistributionFunctionMixin(object):
+class DistributionEstimatorBase(ABC):
     """A mixin including several convenience functions to compute and display distribution functions."""
 
     def __init__(self):
@@ -311,55 +312,18 @@ def find_quantile(quantile, arm):
 
         return result
 
-    def predict(self, treatment_arms: np.ndarray, outcomes: np.ndarray) -> np.ndarray:
-        """Compute cumulative distribution values.
-
-        Args:
-            treatment_arms (np.ndarray): The index of the treatment arm.
-            outcomes (np.ndarray): Scalar values to be used for computing the cumulative distribution.
-
-        Returns:
-            np.ndarray: Estimated cumulative distribution values for the input.
-        """
-        raise NotImplementedError()
-
-    def _compute_cumulative_distribution(
-        self,
-        target_treatment_arms: np.ndarray,
-        locations: np.ndarray,
-        confoundings: np.ndarray,
-        treatment_arms: np.ndarray,
-        outcomes: np.array,
-    ) -> np.ndarray:
-        """Compute the cumulative distribution values."""
-        raise NotImplementedError()
-
-
-class SimpleDistributionEstimator(DistributionFunctionMixin):
-    """A class for computing the empirical distribution function and the distributional parameters
-    based on the distribution function.
-    """
-
-    def __init__(self):
-        """Initializes the SimpleDistributionEstimator.
-
-        Returns:
-            SimpleDistributionEstimator: An instance of the estimator.
-        """
-        super().__init__()
-
     def fit(
         self, confoundings: np.ndarray, treatment_arms: np.ndarray, outcomes: np.ndarray
-    ) -> "SimpleDistributionEstimator":
-        """Train the SimpleDistributionEstimator.
+    ) -> "DistributionEstimatorBase":
+        """Train the DistributionEstimatorBase.
 
         Args:
             confoundings (np.ndarray): Pre-treatment covariates.
             treatment_arms (np.ndarray): The index of the treatment arm.
             outcomes (np.ndarray): Scalar-valued observed outcome.
 
         Returns:
-            SimpleDistributionEstimator: The fitted estimator.
+            DistributionEstimatorBase: The fitted estimator.
         """
         if confoundings.shape[0] != treatment_arms.shape[0]:
             raise ValueError(
@@ -380,7 +344,7 @@ def predict(self, treatment_arms: np.ndarray, locations: np.ndarray) -> np.ndarr
 
         Args:
             treatment_arms (np.ndarray): The index of the treatment arm.
-            locations (np.ndarray): Scalar values to be used for computing the cumulative distribution.
+            locations (np.ndarray): Scalar values at which the cumulative distribution is evaluated.
 
         Returns:
             np.ndarray: Estimated cumulative distribution values for the input.
@@ -390,6 +354,13 @@ def predict(self, treatment_arms: np.ndarray, locations: np.ndarray) -> np.ndarr
                 "This estimator has not been trained yet. Please call fit first"
             )
 
+        unincluded_arms = set(treatment_arms) - set(self.treatment_arms)
+
+        if len(unincluded_arms) > 0:
+            raise ValueError(
+                f"This treatment_arms argument contains arms not included in the training data: {unincluded_arms}"
+            )
+
         return self._compute_cumulative_distribution(
             treatment_arms,
             locations,
@@ -398,6 +369,31 @@ def predict(self, treatment_arms: np.ndarray, locations: np.ndarray) -> np.ndarr
             self.outcomes,
         )[0]
 
+    def _compute_cumulative_distribution(
+        self,
+        target_treatment_arms: np.ndarray,
+        locations: np.ndarray,
+        confoundings: np.ndarray,
+        treatment_arms: np.ndarray,
+        outcomes: np.array,
+    ) -> np.ndarray:
+        """Compute the cumulative distribution values."""
+        raise NotImplementedError()
+
+
+class SimpleDistributionEstimator(DistributionEstimatorBase):
+    """A class for computing the empirical distribution function and the distributional parameters
+    based on the distribution function.
+    """
+
+    def __init__(self):
+        """Initializes the SimpleDistributionEstimator.
+
+        Returns:
+            SimpleDistributionEstimator: An instance of the estimator.
+        """
+        super().__init__()
+
     def _compute_cumulative_distribution(
         self,
         target_treatment_arms: np.ndarray,
@@ -432,12 +428,12 @@ def _compute_cumulative_distribution(
         cumulative_distribution = np.zeros(locations.shape)
         for i, (outcome, arm) in enumerate(zip(locations, target_treatment_arms)):
             cumulative_distribution[i] = (
-                find_le(d_outcome[arm], outcome) + 1
+                np.searchsorted(d_outcome[arm], outcome, side="right")
             ) / d_outcome[arm].shape[0]
         return cumulative_distribution, np.zeros((n_obs, n_loc))
 
 
-class AdjustedDistributionEstimator(DistributionFunctionMixin):
+class AdjustedDistributionEstimator(DistributionEstimatorBase):
     """A class is for estimating the adjusted distribution function and computing the Distributional parameters based on the trained conditional estimator."""
 
     def __init__(self, base_model, folds=3):
@@ -450,60 +446,16 @@ def __init__(self, base_model, folds=3):
         Returns:
             AdjustedDistributionEstimator: An instance of the estimator.
         """
+        if (not hasattr(base_model, "predict")) and (
+            not hasattr(base_model, "predict_proba")
+        ):
+            raise ValueError(
+                "Base model should implement either predict_proba or predict"
+            )
         self.base_model = base_model
         self.folds = folds
         super().__init__()
 
-    def fit(
-        self, confoundings: np.ndarray, treatment_arms: np.ndarray, outcomes: np.ndarray
-    ) -> "AdjustedDistributionEstimator":
-        """Train the AdjustedDistributionEstimator.
-
-        Args:
-            confoundings (np.ndarray): Pre-treatment covariates.
-            treatment_arms (np.ndarray): The index of the treatment arm.
-            outcomes (np.ndarray): Scalar-valued observed outcome.
-
-        Returns:
-            AdjustedDistributionEstimator: The fitted estimator.
-        """
-        if confoundings.shape[0] != treatment_arms.shape[0]:
-            raise ValueError(
-                "The shape of confounding and treatment_arm should be same"
-            )
-
-        if confoundings.shape[0] != outcomes.shape[0]:
-            raise ValueError("The shape of confounding and outcome should be same")
-
-        self.confoundings = confoundings
-        self.treatment_arms = treatment_arms
-        self.outcomes = outcomes
-
-        return self
-
-    def predict(self, treatment_arms: np.ndarray, locations: np.ndarray) -> np.ndarray:
-        """Compute cumulative distribution values.
-
-        Args:
-            treatment_arms (np.ndarray): The index of the treatment arm.
-            locations (np.ndarray): Scalar values to be used for computing the cumulative distribution.
-
-        Returns:
-            np.ndarray: Estimated cumulative distribution values for the input.
-        """
-        if self.outcomes is None:
-            raise ValueError(
-                "This estimator has not been trained yet. Please call fit first"
-            )
-
-        return self._compute_cumulative_distribution(
-            treatment_arms,
-            locations,
-            self.confoundings,
-            self.treatment_arms,
-            self.outcomes,
-        )[0]
-
     def _compute_cumulative_distribution(
         self,
         target_treatment_arms: np.ndarray,
@@ -548,13 +500,19 @@ def _compute_cumulative_distribution(
                     continue
                 model = deepcopy(self.base_model)
                 model.fit(confounding_train, binominal_train)
-                subset_prediction[subset_mask] = model.predict_proba(confounding_fit)[
-                    :, 1
-                ]
-                superset_prediction[superset_mask, i] = model.predict_proba(
-                    confoundings[superset_mask]
-                )[:, 1]
+                subset_prediction[subset_mask] = self._compute_model_prediction(
+                    model, confounding_fit
+                )
+                superset_prediction[superset_mask, i] = self._compute_model_prediction(
+                    model, confoundings[superset_mask]
+                )
             cumulative_distribution[i] = (
                 cdf - subset_prediction.mean() + superset_prediction[:, i].mean()
             )
         return cumulative_distribution, superset_prediction
+
+    def _compute_model_prediction(self, model, confoundings: np.ndarray) -> np.ndarray:
+        if hasattr(model, "predict_proba"):
+            return model.predict_proba(confoundings)[:, 1]
+        else:
+            return model.predict(confoundings)
@@ -1,5 +1,6 @@
 import numpy as np
 from scipy.stats import norm
+from typing import Tuple
 
 
 def compute_confidence_intervals(
@@ -16,7 +17,7 @@ def compute_confidence_intervals(
     alpha: 0.05,
     variance_type="moment",
     n_bootstrap=500,
-):
+) -> Tuple[np.ndarray, np.ndarray]:
     """Computes the confidence intervals of distribution parameters.
 
     Args:
@@ -106,25 +107,3 @@ def compute_confidence_intervals(
         return vec_dte_lower_simple, vec_dte_upper_simple
     else:
         raise ValueError(f"Invalid variance type was speficied: {variance_type}")
-
-
-def find_le(array: np.ndarray, threshold):
-    """Find the rightmost value less than or equal to threshold in a sorted array
-
-    Args:
-        array (np.ndarray): The sorted array to search in.
-        threshold (float): The threshold value.
-
-    Returns:
-        int: The index where the value first exceeds the threshold.
-    """
-    low, high = 0, array.shape[0] - 1
-    result = -1
-    while low <= high:
-        mid = (low + high) // 2
-        if array[mid] <= threshold:
-            result = mid
-            low = mid + 1
-        else:
-            high = mid - 1
-    return result
@@ -5,11 +5,25 @@
 
 
 class TestAdjustedEstimator(unittest.TestCase):
-    def test_prediction_success(self):
-        # TODO!
-        return
+    def setUp(self):
+        base_model = MagicMock()  # stand-in for a classifier-style base model
+        base_model.predict_proba.side_effect = lambda x: x  # predict_proba(X) receives one argument
+        self.estimator = AdjustedDistributionEstimator(base_model, folds=1)
+        self.confoundings = np.zeros((20, 5))
+        self.treatment_arms = np.hstack([np.zeros(10), np.ones(10)])  # 10 rows in arm 0, 10 in arm 1
+        self.outcomes = np.arange(20)
+        self.estimator.fit(self.confoundings, self.treatment_arms, self.outcomes)
+
+    def test_init_fail_incorrect_base_model(self):
+        # Act, Assert
+        with self.assertRaises(ValueError) as cm:
+            AdjustedDistributionEstimator("dummy")
+        self.assertEqual(
+            str(cm.exception),
+            "Base model should implement either predict_proba or predict",
+        )
 
-    def test_prediction_fail_before_fit(self):
+    def test_predict_fail_before_fit(self):
         # Arrange
         D = np.zeros(20)
         D[:10] = 1
@@ -41,3 +55,32 @@ def test_fit_fail_invalid_input(self):
             str(cm.exception),
             "The shape of confounding and treatment_arm should be same",
         )
+
+    def test_compute_cumulative_distribution(self):
+        # Arrange
+        mock_model = self.estimator.base_model
+        mock_model.predict_proba.side_effect = lambda x: np.ones((x.shape[0], 2)) * 0.5
+        target_treatment_arms = np.zeros(10)
+        locations = np.arange(10)
+
+        # Act
+        cumulative_distribution, superset_prediction = (
+            self.estimator._compute_cumulative_distribution(
+                target_treatment_arms,
+                locations,
+                self.confoundings,
+                self.treatment_arms,
+                self.outcomes,
+            )
+        )
+
+        # Assert
+        self.assertEqual(cumulative_distribution.shape, (10,))
+        self.assertEqual(superset_prediction.shape, (20, 10))
+
+        for i in range(10):
+            self.assertAlmostEqual(cumulative_distribution[i], (i + 1) / 10, places=2)
+
+        for i in range(20):
+            for j in range(10):
+                self.assertAlmostEqual(superset_prediction[i, j], 0.5, places=2)