|
| 1 | +""" |
| 2 | +Données parcours-sup 2021-2025 |
| 3 | +============================== |
| 4 | +
|
| 5 | +""" |
| 6 | + |
| 7 | +import pandas |
| 8 | +from teachpyx.tools.pandas import read_csv_cached |
| 9 | +from sklearn.metrics import mean_absolute_error |
| 10 | +from sklearn.pipeline import Pipeline |
| 11 | +from sklearn.compose import ColumnTransformer |
| 12 | +from sklearn.preprocessing import OneHotEncoder, StandardScaler |
| 13 | +from sklearn.ensemble import HistGradientBoostingRegressor |
| 14 | + |
| 15 | +# from skrub import TableReport |
| 16 | + |
| 17 | + |
def get_data():
    """Load every yearly Parcoursup export and stack them into one dataframe.

    Each export is read with ``read_csv_cached`` using ``;`` as the field
    separator. One progress line per session is printed on stdout.

    :return: the row-wise concatenation of all the exports, in the order
        of the sessions listed below (every frame keeps its own index)
    """
    # The "2025" entry points at the dataset whose name has no year suffix.
    sources = {
        "2021": "https://data.enseignementsup-recherche.gouv.fr/api/explore/v2.1/catalog/datasets/fr-esr-parcoursup_2021/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B",
        "2022": "https://data.enseignementsup-recherche.gouv.fr/api/explore/v2.1/catalog/datasets/fr-esr-parcoursup_2022/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B",
        "2023": "https://data.enseignementsup-recherche.gouv.fr/api/explore/v2.1/catalog/datasets/fr-esr-parcoursup_2023/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B",
        "2024": "https://data.enseignementsup-recherche.gouv.fr/api/explore/v2.1/catalog/datasets/fr-esr-parcoursup_2024/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B",
        "2025": "https://data.enseignementsup-recherche.gouv.fr/api/explore/v2.1/catalog/datasets/fr-esr-parcoursup/exports/csv?lang=fr&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B",
    }

    frames = []
    for session, link in sources.items():
        print(f"loading {session!r}")
        frames.append(read_csv_cached(link, sep=";"))

    return pandas.concat(frames, axis=0)
| 33 | + |
| 34 | + |
def select_variables_and_clean(df):
    """Aggregate the raw rows by their identifying columns.

    The rows are grouped by the key columns listed in ``keys`` and the
    non-null values of the target column are counted per group. Only the
    groups with a count strictly greater than 1 are kept.

    NOTE(review): the target column of the returned table therefore holds
    a number of rows per key combination, not the original value of the
    target column. Confirm this is the intended training target.

    :param df: dataframe containing every key column and the target column
    :return: tuple ``(table, cible)`` where ``table`` has the key columns
        followed by the target column, and ``cible`` is the target name
    :raises AssertionError: if one of the key columns is missing from ``df``
    """
    keys = [
        "Région de l’établissement",
        "Session",
        "Statut de l’établissement de la filière de formation (public, privé…)",
        "Sélectivité",
        "Code UAI de l'établissement",
        "Établissement",
        "Filière de formation détaillée bis",
        "Filière de formation très agrégée",
        "Filière de formation.1",
        "Académie de l’établissement",
        "Code départemental de l’établissement",
        "Commune de l’établissement",
        "Concours communs et banque d'épreuves",
    ]
    cible = "Effectif total des candidats pour une formation"

    # Raised explicitly (rather than with ``assert``) so the check is not
    # stripped by ``python -O``; the exception type is kept for callers
    # that already expect an AssertionError.
    missing = set(keys) - set(df.columns)
    if missing:
        raise AssertionError(
            f"Missing columns {sorted(missing)} in {sorted(df.columns)}"
        )

    groups = df[[*keys, cible]].groupby(keys).count()
    filtered = groups[groups[cible] > 1].reset_index(drop=False)

    # Key combinations are already unique after the groupby, so this mask
    # is expected to remove nothing; it is kept as a safety net.
    mask = filtered.duplicated(subset=keys, keep=False)
    return filtered[~mask][[*keys, cible]], cible
| 61 | + |
| 62 | + |
def compute_oracle(table, cible):
    """Return the error made by predicting 2025 with the 2024 value.

    The rows of sessions 2024 and 2025 are pivoted so that every
    combination of the non-target, non-session columns becomes one row
    with one column per session. The result is the mean absolute error
    between both session columns.

    :param table: dataframe with a ``"Session"`` column, the target column
        and the descriptive columns used as the pivot index
    :param cible: name of the target column
    :return: mean absolute error between the 2025 and the 2024 values
    :raises KeyError: if one of the two sessions is absent from ``table``
    :raises ValueError: if no combination is present in both sessions
    """
    features = [c for c in table.columns if c not in (cible, "Session")]
    last_two = table[table["Session"].isin([2024, 2025])]
    piv = pandas.pivot_table(
        last_two,
        index=features,
        columns="Session",
        values=cible,
    ).sort_index()

    # A combination seen in only one of the two sessions gets NaN in the
    # other column, which ``mean_absolute_error`` rejects: compare only
    # the combinations observed in both sessions.
    both = piv.dropna(axis=0, subset=[2024, 2025])
    if both.empty:
        raise ValueError(
            "No formation is present in both sessions 2024 and 2025, "
            "the oracle cannot be computed."
        )
    return mean_absolute_error(both[2025], both[2024])
| 79 | + |
| 80 | + |
def split_train_test(table, cuble):
    """Split the table into a training part and a test part by session.

    Rows whose ``"Session"`` is strictly before 2025 form the training
    set; all remaining rows form the test set. The session and the
    establishment identifier/name columns are removed from the features.

    :param table: dataframe with the target column, ``"Session"``,
        ``"Code UAI de l'établissement"`` and ``"Établissement"``
    :param cuble: name of the target column (the misspelled parameter
        name is kept so existing keyword callers keep working)
    :return: tuple ``(train_X, test_X, train_y, test_y)``
    """
    # Use the argument, not a module-level variable that happens to exist.
    cible = cuble
    X, y = table.drop(cible, axis=1), table[cible]

    is_train = X["Session"] < 2025

    drop = ["Session", "Code UAI de l'établissement", "Établissement"]

    train_X = X[is_train].drop(drop, axis=1)
    train_y = y[is_train]
    # The test part is the complement of the training part.
    test_X = X[~is_train].drop(drop, axis=1)
    test_y = y[~is_train]
    return train_X, test_X, train_y, test_y
| 93 | + |
| 94 | + |
def make_pipeline(
    table,
    cible,
    ignored=("Session", "Code UAI de l'établissement", "Établissement"),
):
    """Build the preprocessing + regression pipeline for the table.

    Numerical columns are standardized, every other feature is one-hot
    encoded (unknown categories are ignored at prediction time), and the
    result feeds a ``HistGradientBoostingRegressor``.

    :param table: dataframe whose columns define the candidate features
    :param cible: name of the target column, never used as a feature
    :param ignored: columns of ``table`` that are not features; the default
        matches the columns ``split_train_test`` removes from the features
    :return: an unfitted ``Pipeline``
    """
    skipped = {cible, *ignored}
    features = [c for c in table.columns if c not in skipped]
    # Only keep the numerical columns really present in the table so the
    # column selection never points at a missing column.
    num_cols = [
        c for c in ["Capacité de l’établissement par formation"] if c in features
    ]
    cat_cols = [c for c in features if c not in num_cols]

    model = Pipeline(
        [
            (
                "preprocessing",
                ColumnTransformer(
                    [
                        ("num", StandardScaler(), num_cols),
                        ("cats", OneHotEncoder(handle_unknown="ignore"), cat_cols),
                    ],
                    # HistGradientBoostingRegressor needs dense input, so
                    # always densify the one-hot output. This can use a lot
                    # of memory with high-cardinality columns.
                    sparse_threshold=0,
                ),
            ),
            ("regressor", HistGradientBoostingRegressor()),
        ]
    )
    return model
| 115 | + |
| 116 | + |
# Load all the sessions, then build the modelling table and the target name.
data = get_data()
table, cible = select_variables_and_clean(data)

# Baseline: mean absolute error obtained by comparing the 2025 values with
# the 2024 ones.
oracle = compute_oracle(table, cible)
print(f"oracle : {oracle}")

train_X, test_X, train_y, test_y = split_train_test(table, cible)

# ``Pipeline.fit`` returns the pipeline itself, so ``model`` is the fitted
# estimator.
model = make_pipeline(table, cible).fit(train_X, train_y)
0 commit comments