-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathplot_template_data.py
More file actions
129 lines (106 loc) · 4.56 KB
/
plot_template_data.py
File metadata and controls
129 lines (106 loc) · 4.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
Données parcours-sup 2021-2025
==============================
"""
import pandas
from teachpyx.tools.pandas import read_csv_cached
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
# from skrub import TableReport
def get_data():
    """Download the Parcoursup CSV exports for sessions 2021-2025 and
    return them stacked row-wise into a single DataFrame."""
    urls = {
        "2021": "https://data.enseignementsup-recherche.gouv.fr/api/explore/v2.1/catalog/datasets/fr-esr-parcoursup_2021/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B",
        "2022": "https://data.enseignementsup-recherche.gouv.fr/api/explore/v2.1/catalog/datasets/fr-esr-parcoursup_2022/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B",
        "2023": "https://data.enseignementsup-recherche.gouv.fr/api/explore/v2.1/catalog/datasets/fr-esr-parcoursup_2023/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B",
        "2024": "https://data.enseignementsup-recherche.gouv.fr/api/explore/v2.1/catalog/datasets/fr-esr-parcoursup_2024/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B",
        "2025": "https://data.enseignementsup-recherche.gouv.fr/api/explore/v2.1/catalog/datasets/fr-esr-parcoursup/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B",
    }
    frames = []
    for year, url in urls.items():
        print(f"loading {year!r}")
        # read_csv_cached avoids re-downloading a file already on disk.
        frames.append(read_csv_cached(url, sep=";"))
    return pandas.concat(frames, axis=0)
def select_variables_and_clean(df):
    """Select the predictor columns and the target, then remove ambiguous rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Concatenated Parcoursup data; must contain every column in ``keys``
        plus the target column.

    Returns
    -------
    tuple
        ``(table, cible)`` where *table* keeps only rows whose key
        combination is unique, and *cible* is the target column name.
    """
    keys = [
        "Région de l’établissement",
        "Session",
        "Statut de l’établissement de la filière de formation (public, privé…)",
        "Sélectivité",
        "Code UAI de l'établissement",
        "Établissement",
        "Filière de formation détaillée bis",
        "Filière de formation très agrégée",
        "Filière de formation.1",
        "Académie de l’établissement",
        "Code départemental de l’établissement",
        "Commune de l’établissement",
        "Concours communs et banque d'épreuves",
    ]
    cible = "Effectif total des candidats pour une formation"
    columns = set(df.columns)
    # BUG FIX: the previous message computed ``set(keys) - set(keys) & set(columns)``,
    # which Python parses as ``(set(keys) - set(keys)) & ...`` (binary ``-``
    # binds tighter than ``&``) and therefore always printed an empty set.
    missing = set(keys) - columns
    assert not missing, f"Missing columns {missing} in {sorted(df.columns)}"
    subset = df[[*keys, cible]]
    # keep=False flags *every* member of a duplicated key group, so rows with
    # identical keys (possibly conflicting targets) are all dropped.
    mask = subset.duplicated(subset=keys, keep=False)
    return subset[~mask].reset_index(drop=True), cible
def compute_oracle(table, cible):
    """Baseline error: predict each 2025 target with its 2024 value.

    Pivots the 2024/2025 rows so that each formation (identified by every
    column except *cible* and ``Session``) carries both session values,
    then returns the mean absolute difference between them.

    Parameters
    ----------
    table : pandas.DataFrame
        Cleaned data containing a ``Session`` column and the target.
    cible : str
        Name of the target column.

    Raises
    ------
    ValueError
        If no formation has a value for both 2024 and 2025.
    """
    feature_cols = [c for c in table.columns if c != cible]
    both_sessions = table[table["Session"].isin((2024, 2025))]
    piv = pandas.pivot_table(
        both_sessions,
        index=[c for c in feature_cols if c != "Session"],
        columns="Session",
        values=cible,
        # Keep only formations observed in both sessions.  The original
        # called an identical ``dropna(axis=0)`` twice; once suffices.
    ).dropna(axis=0, how="any")
    if piv.empty:
        raise ValueError(
            "Not enough overlapping data between 2024 and 2025 to compute oracle."
        )
    # Mean absolute error computed directly with pandas; equivalent to
    # sklearn.metrics.mean_absolute_error(piv[2025], piv[2024]).
    return float((piv[2025] - piv[2024]).abs().mean())
def split_train_test(table, cible):
    """Split into train (sessions before 2025) and test (session 2025).

    Identifier columns — ``Session``, the UAI code and the establishment
    name — are removed from both feature matrices.

    Returns ``(train_X, test_X, train_y, test_y)``.
    """
    features = table.drop(cible, axis=1)
    target = table[cible]
    is_train = features["Session"] < 2025
    identifiers = ["Session", "Code UAI de l'établissement", "Établissement"]
    train_X = features.loc[is_train].drop(identifiers, axis=1)
    test_X = features.loc[~is_train].drop(identifiers, axis=1)
    return train_X, test_X, target[is_train], target[~is_train]
def make_pipeline(table, cible):
    """Build the regression pipeline: scaling for numeric columns,
    one-hot encoding for categorical ones, then gradient boosting.

    Parameters
    ----------
    table : pandas.DataFrame
        Data whose columns (except *cible*) are the model features.
    cible : str
        Name of the target column (excluded from the features).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline.
    """
    feature_cols = [c for c in table.columns if c != cible]
    # BUG FIX: only scale numeric columns actually present in the table.
    # The hard-coded capacity column is not produced by
    # select_variables_and_clean, so passing it unconditionally would make
    # ColumnTransformer.fit fail on a missing column.
    num_cols = [
        c for c in ["Capacité de l’établissement par formation"] if c in feature_cols
    ]
    cat_cols = [c for c in feature_cols if c not in num_cols]
    transformers = []
    if num_cols:
        transformers.append(("num", StandardScaler(), num_cols))
    if cat_cols:
        # handle_unknown="ignore": categories unseen at fit time must not crash.
        transformers.append(("cats", OneHotEncoder(handle_unknown="ignore"), cat_cols))
    model = Pipeline(
        [
            ("preprocessing", ColumnTransformer(transformers)),
            ("regressor", HistGradientBoostingRegressor()),
        ]
    )
    return model
# Script entry point: download and clean the data.  The modelling steps are
# kept commented out; uncomment them once the data step has been validated.
data = get_data()
table, cible = select_variables_and_clean(data)
# oracle = compute_oracle(table, cible)
# print(f"oracle : {oracle}")
# train_X, test_X, train_y, test_y = split_train_test(table, cible)
# model = make_pipeline(table, cible)
# model.fit(train_X, train_y)