Merge pull request #365 from CITCOM-project/jmafoster1/run-test-adequacy

jmafoster1 · web-flow · commit 31aa186c9af8 · 2025-11-19T10:56:43.000Z
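Jmafoster1/run test adequacy: add -a/--adequacy and -b/--adequacy-bootstrap-size CLI options and compute DataAdequacy for each test result in run_tests and run_tests_in_batches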
diff --git a/causal_testing/__main__.py b/causal_testing/__main__.py
@@ -31,7 +31,7 @@ def main() -> None:
             effect_type=args.effect_type,
             estimate_type=args.estimate_type,
             estimator=args.estimator,
-            skip=True,
+            skip=False,
         )
         logging.info("Causal test generation completed successfully")
         return
@@ -58,7 +58,14 @@ def main() -> None:
         logging.info(f"Running tests in batches of size {args.batch_size}")
         with tempfile.TemporaryDirectory() as tmpdir:
             output_files = []
-            for i, results in enumerate(framework.run_tests_in_batches(batch_size=args.batch_size, silent=args.silent)):
+            for i, results in enumerate(
+                framework.run_tests_in_batches(
+                    batch_size=args.batch_size,
+                    silent=args.silent,
+                    adequacy=args.adequacy,
+                    bootstrap_size=args.bootstrap_size,
+                )
+            ):
                 temp_file_path = os.path.join(tmpdir, f"output_{i}.json")
                 framework.save_results(results, temp_file_path)
                 output_files.append(temp_file_path)
@@ -77,7 +84,7 @@ def main() -> None:
                 json.dump(all_results, f, indent=4)
     else:
         logging.info("Running tests in regular mode")
-        results = framework.run_tests(silent=args.silent)
+        results = framework.run_tests(silent=args.silent, adequacy=args.adequacy, bootstrap_size=args.bootstrap_size)
         framework.save_results(results)
 
     logging.info("Causal testing completed successfully.")
diff --git a/causal_testing/main.py b/causal_testing/main.py
@@ -22,6 +22,7 @@
 from causal_testing.testing.causal_effect import Negative, NoEffect, Positive, SomeEffect
 from causal_testing.testing.causal_test_case import CausalTestCase
 from causal_testing.testing.causal_test_result import CausalTestResult
+from causal_testing.testing.causal_test_adequacy import DataAdequacy
 
 logger = logging.getLogger(__name__)
 
@@ -335,12 +336,17 @@ def create_causal_test(self, test: dict, base_test: BaseTestCase) -> CausalTestC
             estimator=estimator,
         )
 
-    def run_tests_in_batches(self, batch_size: int = 100, silent: bool = False) -> List[CausalTestResult]:
+    def run_tests_in_batches(
+        self, batch_size: int = 100, silent: bool = False, adequacy: bool = False, bootstrap_size: int = 100
+    ) -> List[CausalTestResult]:
         """
         Run tests in batches to reduce memory usage.
 
         :param batch_size: Number of tests to run in each batch
         :param silent: Whether to suppress errors
+        :param adequacy: Whether to calculate causal test adequacy (defaults to False)
+        :param bootstrap_size: The number of bootstrap samples to use when calculating causal test adequacy
+        (defaults to 100)
         :return: List of all test results
         :raises: ValueError if no tests are loaded
         """
@@ -368,7 +374,12 @@ def run_tests_in_batches(self, batch_size: int = 100, silent: bool = False) -> L
                 batch_results = []
                 for test_case in current_batch:
                     try:
-                        batch_results.append(test_case.execute_test())
+                        result = test_case.execute_test()
+                        if adequacy:
+                            result.adequacy = DataAdequacy(test_case=test_case, bootstrap_size=bootstrap_size)
+                            result.adequacy.measure_adequacy()
+
+                        batch_results.append(result)
                     # pylint: disable=broad-exception-caught
                     except Exception as e:
                         if not silent:
@@ -383,10 +394,17 @@ def run_tests_in_batches(self, batch_size: int = 100, silent: bool = False) -> L
                 yield batch_results
         logger.info(f"Completed processing in {num_batches} batches")
 
-    def run_tests(self, silent=False) -> List[CausalTestResult]:
+    def run_tests(
+        self, silent: bool = False, adequacy: bool = False, bootstrap_size: int = 100
+    ) -> List[CausalTestResult]:
         """
         Run all test cases and return their results.
 
+        :param silent: Whether to suppress errors
+        :param adequacy: Whether to calculate causal test adequacy (defaults to False)
+        :param bootstrap_size: The number of bootstrap samples to use when calculating causal test adequacy
+        (defaults to 100)
+
         :return: List of CausalTestResult objects
         :raises: ValueError if no tests are loaded
         :raises: Exception if test execution fails
@@ -400,6 +418,9 @@ def run_tests(self, silent=False) -> List[CausalTestResult]:
         for test_case in tqdm(self.test_cases):
             try:
                 result = test_case.execute_test()
+                if adequacy:
+                    result.adequacy = DataAdequacy(test_case=test_case, bootstrap_size=bootstrap_size)
+                    result.adequacy.measure_adequacy()
                 results.append(result)
             # pylint: disable=broad-exception-caught
             except Exception as e:
@@ -450,6 +471,7 @@ def save_results(self, results: List[CausalTestResult], output_path: str = None)
                         "adjustment_set": list(result.adjustment_set) if result.adjustment_set else [],
                     }
                     | result.effect_estimate.to_dict()
+                    | (result.adequacy.to_dict() if result.adequacy else {})
                     if result.effect_estimate
                     else {"error": result.error_message}
                 ),
@@ -522,6 +544,17 @@ def parse_args(args: Optional[Sequence[str]] = None) -> argparse.Namespace:
     parser_test.add_argument("-t", "--test_config", help="Path to test configuration file (.json)", required=True)
     parser_test.add_argument("-v", "--verbose", help="Enable verbose logging", action="store_true", default=False)
     parser_test.add_argument("-q", "--query", help="Query string to filter data (e.g. 'age > 18')", type=str)
+    parser_test.add_argument(
+        "-a", "--adequacy", help="Calculate causal test adequacy for each test case", action="store_true", default=False
+    )
+    parser_test.add_argument(
+        "-b",
+        "--adequacy-bootstrap-size",
+        dest="bootstrap_size",
+        help="Number of bootstrap samples for causal test adequacy. Defaults to 100",
+        type=int,
+        default=100,
+    )
     parser_test.add_argument(
         "-s",
         "--silent",
@@ -537,5 +570,10 @@ def parse_args(args: Optional[Sequence[str]] = None) -> argparse.Namespace:
     )
 
     args = main_parser.parse_args(args)
+
+    # Intended: enable adequacy when -b is given. NOTE(review): default=100 is truthy, so this fires even without -b
+    if hasattr(args, "bootstrap_size") and args.bootstrap_size:
+        args.adequacy = True
+
     args.command = Command(args.command)
     return args
diff --git a/causal_testing/testing/causal_test_adequacy.py b/causal_testing/testing/causal_test_adequacy.py
@@ -10,7 +10,6 @@
 from lifelines.exceptions import ConvergenceError
 from numpy.linalg import LinAlgError
 
-from causal_testing.estimation.abstract_estimator import Estimator
 from causal_testing.specification.causal_dag import CausalDAG
 from causal_testing.testing.causal_test_case import CausalTestCase
 
@@ -79,12 +78,10 @@ class DataAdequacy:
     def __init__(
         self,
         test_case: CausalTestCase,
-        estimator: Estimator,
         bootstrap_size: int = 100,
         group_by=None,
     ):
         self.test_case = test_case
-        self.estimator = estimator
         self.kurtosis = None
         self.outcomes = None
         self.successful = None
@@ -97,7 +94,7 @@ def measure_adequacy(self):
         """
         results = []
         for i in range(self.bootstrap_size):
-            estimator = deepcopy(self.estimator)
+            estimator = deepcopy(self.test_case.estimator)
 
             if self.group_by is not None:
                 ids = pd.Series(estimator.df[self.group_by].unique())
@@ -120,7 +117,7 @@ def measure_adequacy(self):
         results = pd.concat([c.effect_estimate.to_df() for c in results])
         results["var"] = results.index
 
-        self.kurtosis = results.groupby("var").apply(lambda x: x.kurtosis())["effect_estimate"]
+        self.kurtosis = results.groupby("var")["effect_estimate"].apply(lambda x: x.kurtosis())
         self.outcomes = sum(filter(lambda x: x is not None, outcomes))
         self.successful = sum(x is not None for x in outcomes)
 
diff --git a/tests/testing_tests/test_causal_test_adequacy.py b/tests/testing_tests/test_causal_test_adequacy.py
@@ -46,7 +46,7 @@ def test_data_adequacy_numeric(self):
             estimate_type="coefficient",
             estimator=estimator,
         )
-        adequacy_metric = DataAdequacy(causal_test_case, estimator)
+        adequacy_metric = DataAdequacy(causal_test_case)
         adequacy_metric.measure_adequacy()
         self.assertEqual(
             adequacy_metric.to_dict(),
@@ -66,7 +66,7 @@ def test_data_adequacy_categorical(self):
             estimate_type="coefficient",
             estimator=estimator,
         )
-        adequacy_metric = DataAdequacy(causal_test_case, estimator)
+        adequacy_metric = DataAdequacy(causal_test_case)
         adequacy_metric.measure_adequacy()
         self.assertEqual(
             adequacy_metric.to_dict(),
@@ -100,7 +100,7 @@ def test_data_adequacy_group_by(self):
             estimate_type="hazard_ratio",
             estimator=estimation_model,
         )
-        adequacy_metric = DataAdequacy(causal_test_case, estimation_model, group_by="id")
+        adequacy_metric = DataAdequacy(causal_test_case, group_by="id")
         adequacy_metric.measure_adequacy()
         adequacy_dict = adequacy_metric.to_dict()
         self.assertEqual(round(adequacy_dict["kurtosis"]["trtrand"], 3), -0.857)