Skip to content

Commit d081138

Browse files
authored
Speed up empirical estimation (#33)
* speed up empirical estimation * bump version
1 parent e09ffa4 commit d081138

5 files changed

Lines changed: 71 additions & 79 deletions

File tree

Pipfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ format = "ruff format"
2121
lint = "ruff check"
2222
lint-fix = "ruff check --fix"
2323
unittest = "python -m unittest"
24+
typos = "typos -w dte_adj docs tests"

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
pip install dte_adj
1010
```
1111

12-
2. **Install from Source**
12+
2. **Install from source**
1313

1414
```sh
1515
git clone https://github.com/CyberAgentAILab/python-dte-adjustment

dte_adj/__init__.py

Lines changed: 67 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -417,7 +417,7 @@ def _compute_cumulative_distribution(
417417
covariates: np.ndarray,
418418
treatment_arms: np.ndarray,
419419
outcomes: np.array,
420-
) -> np.ndarray:
420+
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
421421
"""
422422
Compute the cumulative distribution values.
423423
@@ -429,47 +429,37 @@ def _compute_cumulative_distribution(
429429
outcomes (np.ndarray): An array of outcomes in the observed data
430430
431431
Returns:
432-
np.ndarray: Estimated cumulative distribution values.
432+
Tuple of numpy arrays:
433+
- np.ndarray: Unconditional cumulative distribution values.
434+
- np.ndarray: Adjusted cumulative distribution for each observation.
435+
- np.ndarray: Conditional cumulative distribution for each observation.
433436
"""
434437
n_records = outcomes.shape[0]
435438
n_loc = locations.shape[0]
436-
superset_prediction = np.zeros((n_records, n_loc))
437439
prediction = np.zeros((n_records, n_loc))
438440
treatment_mask = treatment_arms == target_treatment_arm
439441

440442
strata = self.strata
441443
s_list = np.unique(strata)
442-
s_dict = {}
444+
w_s = {}
443445
for s in s_list:
444446
s_mask = strata == s
445-
s_dict[s] = (s_mask & treatment_mask).sum() / s_mask.sum()
447+
w_s[s] = (s_mask & treatment_mask).sum() / s_mask.sum()
446448
n_obs = outcomes.shape[0]
447449
n_loc = locations.shape[0]
448450
for i, outcome in enumerate(locations):
449451
for j in range(n_obs):
450452
s = strata[j]
451-
prediction[j, i] = (
452-
(outcomes[j] <= outcome) / s_dict[s] * treatment_mask[j]
453-
)
453+
prediction[j, i] = (outcomes[j] <= outcome) / w_s[s] * treatment_mask[j]
454454

455-
pred = {}
456-
for j in range(n_obs):
457-
s = strata[j]
458-
s_mask = s == strata
459-
if s in pred:
460-
superset_prediction[j] = pred[s]
461-
else:
462-
superset_prediction[j] = prediction[s_mask].mean(axis=0)
463-
pred[s] = superset_prediction[j]
464-
465-
for i, outcome in enumerate(locations):
466-
for j in range(n_obs):
467-
s = strata[j]
468-
prediction[j, i] = (
469-
(outcomes[j] <= outcome) - superset_prediction[j, i]
470-
) / s_dict[s] * treatment_mask[j] + superset_prediction[j, i]
455+
unconditional_pred = {s: prediction[s == strata].mean(axis=0) for s in s_list}
456+
conditional_prediction = np.array([unconditional_pred[s] for s in strata])
457+
weights = np.array([w_s[s] for s in strata])[:, np.newaxis]
458+
prediction = (
459+
(outcomes[:, np.newaxis] <= locations) - conditional_prediction
460+
) / weights * treatment_mask[:, np.newaxis] + conditional_prediction
471461

472-
return prediction.mean(axis=0), prediction, superset_prediction
462+
return prediction.mean(axis=0), prediction, conditional_prediction
473463

474464
def _compute_interval_probability(
475465
self,
@@ -478,57 +468,52 @@ def _compute_interval_probability(
478468
covariates: np.ndarray,
479469
treatment_arms: np.ndarray,
480470
outcomes: np.array,
481-
) -> np.ndarray:
482-
"""Compute the cumulative distribution values.
471+
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
472+
"""Compute the interval probabilities.
483473
484474
Args:
485475
target_treatment_arm (int): The index of the treatment arm.
486-
locations (np.ndarray): Scalar values to be used for computing the cumulative distribution.
476+
locations (np.ndarray): Scalar values to be used for computing the interval probabilities.
487477
covariates (np.ndarray): An array of covariate variables in the observed data.
488478
treatment_arms (np.ndarray): An array of treatment arms in the observed data.
489479
outcomes (np.ndarray): An array of outcomes in the observed data
490480
491481
Returns:
492-
np.ndarray: Estimated cumulative distribution values.
482+
Tuple of numpy arrays:
483+
- np.ndarray: Estimated unconditional interval probabilities.
484+
- np.ndarray: Adjusted interval probabilities for each observation.
485+
- np.ndarray: Conditional interval probabilities for each observation.
493486
"""
494487
n_records = outcomes.shape[0]
495488
n_loc = locations.shape[0]
496-
superset_prediction = np.zeros((n_records, n_loc))
497489
prediction = np.zeros((n_records, n_loc))
498490
treatment_mask = treatment_arms == target_treatment_arm
499491

500492
strata = self.strata
501493
s_list = np.unique(strata)
502-
s_dict = {}
494+
w_s = {}
503495
for s in s_list:
504496
s_mask = strata == s
505-
s_dict[s] = (s_mask & treatment_mask).sum() / s_mask.sum()
497+
w_s[s] = (s_mask & treatment_mask).sum() / s_mask.sum()
506498
n_obs = outcomes.shape[0]
507499
n_loc = locations.shape[0]
508500
for i, outcome in enumerate(locations):
509501
for j in range(n_obs):
510502
s = strata[j]
511-
prediction[j, i] = (
512-
(outcomes[j] <= outcome) / s_dict[s] * treatment_mask[j]
513-
)
503+
prediction[j, i] = (outcomes[j] <= outcome) / w_s[s] * treatment_mask[j]
514504

515-
for j in range(n_obs):
516-
s = strata[j]
517-
s_mask = s == strata
518-
superset_prediction[j] = prediction[s_mask].mean(axis=0)
505+
unconditional_pred = {s: prediction[s == strata].mean(axis=0) for s in s_list}
506+
conditional_prediction = np.array([unconditional_pred[s] for s in strata])
507+
weights = np.array([w_s[s] for s in strata])[:, np.newaxis]
508+
prediction = (
509+
(outcomes[:, np.newaxis] <= locations) - conditional_prediction
510+
) / weights * treatment_mask[:, np.newaxis] + conditional_prediction
519511

520-
for i, outcome in enumerate(locations):
521-
for j in range(n_obs):
522-
s = strata[j]
523-
prediction[j, i] = (
524-
(outcomes[j] <= outcome) - superset_prediction[j, i]
525-
) / s_dict[s] * treatment_mask[j] + superset_prediction[j, i]
526-
return prediction.mean(axis=0), superset_prediction
527512
cdf = prediction.mean(axis=0)
528513
return (
529514
cdf[1:] - cdf[:-1],
530515
prediction[:, 1:] - prediction[:, :-1],
531-
superset_prediction[:, 1:] - superset_prediction[:, :-1],
516+
conditional_prediction[:, 1:] - conditional_prediction[:, :-1],
532517
)
533518

534519

@@ -596,7 +581,7 @@ def _compute_cumulative_distribution(
596581
covariates: np.ndarray,
597582
treatment_arms: np.ndarray,
598583
outcomes: np.array,
599-
) -> np.ndarray:
584+
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
600585
"""
601586
Compute the cumulative distribution values.
602587
@@ -608,7 +593,10 @@ def _compute_cumulative_distribution(
608593
outcomes (np.ndarray): An array of outcomes in the observed data
609594
610595
Returns:
611-
Tuple[np.ndarray, np.ndarray, np.ndarray]: Estimated cumulative distribution values, prediction for each observation, and superset prediction for each observation.
596+
Tuple of numpy arrays:
597+
- np.ndarray: Unconditional cumulative distribution values.
598+
- np.ndarray: Adjusted cumulative distribution for each observation.
599+
- np.ndarray: Conditional cumulative distribution for each observation.
612600
"""
613601
n_records = outcomes.shape[0]
614602
n_loc = locations.shape[0]
@@ -619,7 +607,7 @@ def _compute_cumulative_distribution(
619607
strata = self.strata
620608
s_list = np.unique(strata)
621609
if self.is_multi_task:
622-
binominal = (outcomes.reshape(-1, 1) <= locations) * 1 # (n_records, n_loc)
610+
binomial = (outcomes.reshape(-1, 1) <= locations) * 1 # (n_records, n_loc)
623611
for fold in range(self.folds):
624612
fold_mask = (folds != fold) & treatment_mask
625613
for s in s_list:
@@ -628,51 +616,51 @@ def _compute_cumulative_distribution(
628616
superset_mask = (folds == fold) & s_mask
629617
subset_train_mask = (folds != fold) & s_mask & treatment_mask
630618
covariates_train = covariates[subset_train_mask]
631-
binominal_train = binominal[subset_train_mask]
632-
if len(np.unique(binominal_train)) > 1:
619+
binomial_train = binomial[subset_train_mask]
620+
if len(np.unique(binomial_train)) > 1:
633621
self.model = deepcopy(self.base_model)
634-
self.model.fit(covariates_train, binominal_train)
622+
self.model.fit(covariates_train, binomial_train)
635623

636624
pred = self._compute_model_prediction(
637625
self.model, covariates[superset_mask]
638626
)
639627
prediction[superset_mask] = (
640628
pred
641629
+ treatment_mask[superset_mask].reshape(-1, 1)
642-
* (binominal[superset_mask] - pred)
630+
* (binomial[superset_mask] - pred)
643631
/ weight
644632
)
645633
superset_prediction[superset_mask] = pred
646634
else:
647635
for i, location in enumerate(locations):
648-
binominal = (outcomes <= location) * 1 # (n_records)
636+
binomial = (outcomes <= location) * 1 # (n_records)
649637
for fold in range(self.folds):
650638
fold_mask = (folds != fold) & treatment_mask
651639
covariates_train = covariates[fold_mask]
652-
binominal_train = binominal[fold_mask]
640+
binomial_train = binomial[fold_mask]
653641
# Pool the records across strata and train the model
654-
if len(np.unique(binominal_train)) > 1:
642+
if len(np.unique(binomial_train)) > 1:
655643
self.model = deepcopy(self.base_model)
656-
self.model.fit(covariates_train, binominal_train)
644+
self.model.fit(covariates_train, binomial_train)
657645
for s in s_list:
658646
s_mask = strata == s
659647
weight = (s_mask & treatment_mask).sum() / s_mask.sum()
660648
superset_mask = (folds == fold) & s_mask
661649
subset_train_mask = (folds != fold) & s_mask & treatment_mask
662650
covariates_train = covariates[subset_train_mask]
663-
binominal_train = binominal[subset_train_mask]
651+
binomial_train = binomial[subset_train_mask]
664652
# TODO: revisit the logic here
665-
if len(np.unique(binominal_train)) > 1:
653+
if len(np.unique(binomial_train)) > 1:
666654
# self.model = deepcopy(self.base_model)
667-
# self.model.fit(covariates_train, binominal_train)
655+
# self.model.fit(covariates_train, binomial_train)
668656
pass
669657
else:
670-
pred = binominal_train[0]
658+
pred = binomial_train[0]
671659
superset_prediction[superset_mask, i] = pred
672660
prediction[superset_mask, i] = (
673661
pred
674662
+ treatment_mask[superset_mask]
675-
* (binominal[superset_mask] - pred)
663+
* (binomial[superset_mask] - pred)
676664
/ weight
677665
)
678666
continue
@@ -682,7 +670,7 @@ def _compute_cumulative_distribution(
682670
prediction[superset_mask, i] = (
683671
pred
684672
+ treatment_mask[superset_mask]
685-
* (binominal[superset_mask] - pred)
673+
* (binomial[superset_mask] - pred)
686674
/ weight
687675
)
688676
superset_prediction[superset_mask, i] = pred
@@ -696,9 +684,9 @@ def _compute_interval_probability(
696684
covariates: np.ndarray,
697685
treatment_arms: np.ndarray,
698686
outcomes: np.array,
699-
) -> np.ndarray:
687+
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
700688
"""
701-
Compute the cumulative distribution values.
689+
Compute the interval probabilities.
702690
703691
Args:
704692
target_treatment_arm (int): The index of the treatment arm.
@@ -708,7 +696,10 @@ def _compute_interval_probability(
708696
outcomes (np.ndarray): An array of outcomes in the observed data
709697
710698
Returns:
711-
np.ndarray: Estimated cumulative distribution values.
699+
Tuple of numpy arrays:
700+
- np.ndarray: Unconditional interval probabilities.
701+
- np.ndarray: Adjusted interval probabilities for each observation.
702+
- np.ndarray: Conditional interval probabilities for each observation.
712703
"""
713704
n_records = outcomes.shape[0]
714705
n_loc = locations.shape[0]
@@ -720,28 +711,28 @@ def _compute_interval_probability(
720711
s_list = np.unique(strata)
721712
binominals = (outcomes[:, np.newaxis] <= locations) * 1 # (n_records, n_loc)
722713
for i in range(len(locations) - 1):
723-
binominal = binominals[:, i + 1] - binominals[:, i]
714+
binomial = binominals[:, i + 1] - binominals[:, i]
724715
for fold in range(self.folds):
725716
fold_mask = (folds != fold) & treatment_mask
726717
covariates_train = covariates[fold_mask]
727-
binominal_train = binominal[fold_mask]
728-
if len(np.unique(binominal_train)) > 1:
718+
binomial_train = binomial[fold_mask]
719+
if len(np.unique(binomial_train)) > 1:
729720
self.model = deepcopy(self.base_model)
730-
self.model.fit(covariates_train, binominal_train)
721+
self.model.fit(covariates_train, binomial_train)
731722
for s in s_list:
732723
s_mask = strata == s
733724
wight = (s_mask & treatment_mask).sum() / s_mask.sum()
734725
superset_mask = (folds == fold) & s_mask
735726
subset_train_mask = (folds != fold) & s_mask & treatment_mask
736727
covariates_train = covariates[subset_train_mask]
737-
binominal_train = binominal[subset_train_mask]
738-
if len(np.unique(binominal_train)) == 1:
739-
pred = binominal_train[0]
728+
binomial_train = binomial[subset_train_mask]
729+
if len(np.unique(binomial_train)) == 1:
730+
pred = binomial_train[0]
740731
superset_prediction[superset_mask, i] = pred
741732
prediction[superset_mask, i] = (
742733
pred
743734
+ treatment_mask[superset_mask]
744-
* (binominal[superset_mask] - pred)
735+
* (binomial[superset_mask] - pred)
745736
/ wight
746737
)
747738
continue
@@ -751,7 +742,7 @@ def _compute_interval_probability(
751742
prediction[superset_mask, i] = (
752743
pred
753744
+ treatment_mask[superset_mask]
754-
* (binominal[superset_mask] - pred)
745+
* (binomial[superset_mask] - pred)
755746
/ wight
756747
)
757748
superset_prediction[superset_mask, i] = pred

dte_adj/util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,4 +106,4 @@ def compute_confidence_intervals(
106106

107107
return vec_dte_lower_simple, vec_dte_upper_simple
108108
else:
109-
raise ValueError(f"Invalid variance type was speficied: {variance_type}")
109+
raise ValueError(f"Invalid variance type was specified: {variance_type}")

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "dte_adj"
7-
version = "0.1.5"
7+
version = "0.1.6"
88
description = "This is a Python library for estimating distributional treatment effects"
99
readme = "README.md"
1010
requires-python = ">=3.10"

0 commit comments

Comments
 (0)