-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenerate_prolific_matchings.py
More file actions
128 lines (105 loc) · 6.11 KB
/
generate_prolific_matchings.py
File metadata and controls
128 lines (105 loc) · 6.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import numpy as np
import random
from tqdm import tqdm
import os
import os.path as osp
import time
import pickle as pkl
import pandas as pd
from MWBM.data_generation import Pool, create_biased_probs
from MWBM.mwbm_algs import BMWBCM, MWBCM
# ---------------------------------------------------------------------------
# Experiment configuration: output folders, RNG seeds, pool dimensions, and
# the ground-truth probability tables used to generate the matching pools.
# ---------------------------------------------------------------------------

# Root output folder. exist_ok avoids the check-then-create race of the
# `if not exists: makedirs` pattern (safe if another process creates it first).
os.makedirs('data', exist_ok=True)
# Each run writes into a unique timestamped subfolder; here makedirs is left
# strict on purpose so two runs starting in the same second fail loudly
# instead of silently sharing a folder.
timestamp = time.strftime("%Y%m%d-%H%M%S")
CONFIG_DIR = osp.join('data', timestamp)
os.makedirs(CONFIG_DIR)

# Fix both RNGs so pool generation is reproducible across runs.
random.seed(0)
np.random.seed(0)

N_POOLS = 40  # number of independent pool realizations #* Modify as needed
N = 20        # people per pool                         #* Modify as needed
M = 10        # locations (half-day time slots) per pool
capacities_types = np.array([2]) #* Modify as needed

# One key per location: weekday x {am, pm} half-day slots (len == M).
days_keys = ['Mo-am', 'Mo-pm', 'Tu-am', 'Tu-pm', 'We-am', 'We-pm', 'Th-am', 'Th-pm', 'Fr-am', 'Fr-pm']

# Per (person-type, location) parameter pairs consumed by Pool /
# create_biased_probs; exact semantics are defined in MWBM.data_generation.
prob_y_per_x = [
    [[.2,.3], [20,20], [.2,.3], [20,17], [17, 20], [20,20], [.2,.4], [19, 12], [.2,.3], [.15,.2]], #probs of x1
    [[20,20], [.2,.4], [19,12], [.2,.3], [.15,.2], [.2,.3], [20,20], [.2, .3], [20,17], [17, 20]], #probs of x2
    [[ 1,10], [ 1,10] ,[ 5,10] ,[ 5, 2], [3.1, 4], [19,12], [.2,.3], [.15,.2], [.2,.3], [20, 20]] #probs of x3
]
prob_x = np.array([.2, .45, .35])  # marginal distribution over person types
prob_y_per_x = np.array(prob_y_per_x)
x_types = len(prob_x)
# Sanity check: one (a, b) parameter pair per type per location.
assert prob_y_per_x.shape == (x_types, M, 2), prob_y_per_x.shape

# Persist the configuration alongside the generated pools for provenance.
np.save(osp.join(CONFIG_DIR, 'prob_y_per_x.npy'), prob_y_per_x)
np.save(osp.join(CONFIG_DIR, 'capacities_types.npy'), capacities_types)
np.save(osp.join(CONFIG_DIR, 'prob_x.npy'), prob_x)
# Main generation loop: for each pool realization, compute the optimal
# matching under the true probabilities, then for every split point B
# (people left to the human) compute the algorithm's matching on the
# noisy probabilities and save all artifacts to disk.
progress = tqdm(range(N_POOLS), desc='Generating pools', position=0, leave=True)
for pool_idx in progress:
    # Draw one pool realization: people, locations, true probabilities g,
    # and per-location capacities.
    pool = Pool(N, M, prob_x, prob_y_per_x, capacities_types, type_prob='quantile',
                bias_loc=None, type_bias_loc=None, tau_loc=None, prob_eps=0.005)
    # Model's noisy probabilities - average mode created probabilities as explained in the paper.
    g_alg = create_biased_probs(pool.g, mode='average', prob_y_per_x=prob_y_per_x,
                                bias_locs=None, pool_feats=pool.people_feat)
    # Optimal matching under the perfect (true) probabilities.
    matching_real_prob = MWBCM(people=pool.people_idx, locations=pool.locations_idx,
                               capacities=pool.capacities, g=pool.g, verbose=False)
    matching_real_prob = matching_real_prob.solve_LP(return_solution=True)

    # Persist the pool realization; refuse to overwrite an existing folder.
    pool_dir = osp.join(CONFIG_DIR, f'pool{pool_idx}')
    if osp.exists(pool_dir):
        raise ValueError(f'Pool {pool_idx} already exists')
    os.makedirs(pool_dir)
    np.save(osp.join(pool_dir, 'people_idx.npy'), pool.people_idx)
    np.save(osp.join(pool_dir, 'people_feat.npy'), pool.people_feat)
    np.save(osp.join(pool_dir, 'locations_idx.npy'), pool.locations_idx)
    np.save(osp.join(pool_dir, 'capacities.npy'), pool.capacities)
    np.save(osp.join(pool_dir, 'g.npy'), pool.g)
    np.save(osp.join(pool_dir, 'g_alg.npy'), g_alg)
    # Use a context manager so the file handle is closed deterministically
    # (the original left the handle to the garbage collector).
    with open(osp.join(pool_dir, 'pool.pkl'), 'wb') as fh:
        pkl.dump(pool, fh)
    np.save(osp.join(pool_dir, 'matching_opt_real_prob.npy'), matching_real_prob)

    # B ranges over how many people are left unmatched for the human;
    # the algorithm matches the remaining num_matchings - B.
    num_matchings = min(pool.N, pool.C)
    B_progress = tqdm(range(num_matchings+1), desc='Biased matchings', position=1, leave=False)
    for B in B_progress:
        # Folder for this specific split point B.
        pool_b_dir = osp.join(pool_dir, f'B{B}')
        if osp.exists(pool_b_dir):
            # Fixed: the original message wrongly said the *pool* existed.
            raise ValueError(f'B{B} for pool {pool_idx} already exists')
        os.makedirs(pool_b_dir)
        # Re-load a fresh copy of the pool so each B starts from an
        # unmatched state (set_matching_pairs mutates it).
        with open(osp.join(pool_dir, 'pool.pkl'), 'rb') as fh:
            B_pool = pkl.load(fh)
        total_LP_matchings = num_matchings - B
        if total_LP_matchings > 0:
            # Algorithm's matching computed on the *noisy* probabilities g_alg,
            # budgeted to total_LP_matchings pairs.
            alg_matching_solver = BMWBCM(people=B_pool.people_idx, locations=B_pool.locations_idx,
                                         capacities=B_pool.capacities, g=g_alg, B=total_LP_matchings, verbose=False)
            alg_matching = alg_matching_solver.solve_LP(return_solution=True)
            B_pool.set_matching_pairs(alg_matching)
            # NOTE(review): despite astype(int), the default savetxt fmt writes
            # float notation; adding fmt='%i' (as for opt_rem_match below) would
            # change the on-disk format - confirm no downstream reader depends
            # on it before changing.
            np.savetxt(osp.join(pool_b_dir, 'alg_matching.csv'), alg_matching.astype(int), delimiter=",")
        if B_pool.remaining_capacities.sum() > 0 and B_pool.remaining_people.size > 0 and B > 0:
            # Optimal matching of the human's remaining people under the
            # *true* probabilities (split but perfect information).
            opt_human_solver = MWBCM(people=B_pool.remaining_people, locations=B_pool.locations_idx,
                                     capacities=B_pool.remaining_capacities, g=B_pool.g, verbose=False)
            opt_human_rem_match = opt_human_solver.solve_LP(return_solution=True)
            # Save as int
            np.savetxt(osp.join(pool_b_dir, 'opt_rem_match.csv'), opt_human_rem_match.astype(int), delimiter=",", fmt='%i')
        remaining_g = B_pool.g[B_pool.remaining_people]
        assert B_pool.remaining_people.shape[0] == B, f"Number of remaining people is {B_pool.remaining_people.shape[0]}, but should be {B}."
        # Fixed: message now reports remaining_g's row count, not the people count.
        assert remaining_g.shape[0] == B, f"remaining_g has {remaining_g.shape[0]} rows, but should be {B}."
        np.savetxt(osp.join(pool_b_dir, 'remaining_people.csv'), B_pool.remaining_people, delimiter=",", fmt='%i')
        np.savetxt(osp.join(pool_b_dir, 'remaining_capacities.csv'), B_pool.remaining_capacities, delimiter=",", fmt='%i')
        np.savetxt(osp.join(pool_b_dir, 'remaining_g.csv'), remaining_g, delimiter=",", fmt='%1.2f')
        # Remaining capacities as a labeled table: (Day, Remaining), one row
        # per half-day slot in days_keys.
        # Fixed: the message's two values were swapped in the original.
        assert len(days_keys) == B_pool.remaining_capacities.shape[0], f"Number of remaining capacities is {B_pool.remaining_capacities.shape[0]}, but should be {len(days_keys)}."
        df = pd.DataFrame(B_pool.remaining_capacities, columns=['Remaining'], index=days_keys)
        df.index.name = 'Day'
        df.to_csv(osp.join(pool_b_dir, 'rem_capacities_TAB.csv'))
        # Remaining true probabilities as a labeled table: rows indexed by
        # the remaining people, columns by the half-day slots.
        df = pd.DataFrame(remaining_g, columns=days_keys)
        df.index = B_pool.remaining_people
        df.index.name = 'Person'
        df.to_csv(osp.join(pool_b_dir, 'rem_probs_TAB.csv'))