|
1 | 1 | # %% [markdown] |
| 2 | +# # Feature engineering for electricity load forecasting |
2 | 3 | # |
3 | | -# Install extra dependencies for this notebook if needed (when running |
| 4 | +# ## Environment setup |
| 5 | +# |
| 6 | +# Install some extra dependencies for this notebook if needed (when running
4 | 7 | # jupyterlite). We need the development version of skrub to be able to use the |
5 | 8 | # skrub expressions. |
6 | 9 | # %% |
|
17 | 20 | import skrub |
18 | 21 | from pathlib import Path |
19 | 22 |
|
| 23 | +# %% [markdown] |
| 24 | +# ## Time range |
| 25 | +# |
| 26 | +# Let's define an hourly time range from March 23, 2021 to May 31, 2025 that
| 27 | +# will be used to join the electricity load data and the weather data. The time
| 28 | +# range is in the UTC timezone to avoid any ambiguity when joining with the weather
| 29 | +# data, which is also in UTC.
| 30 | +# |
| 31 | +# We wrap the polars dataframe in a skrub variable to benefit from the |
| 32 | +# built-in TableReport display in the notebook. Using the skrub expression |
| 33 | +# system will also be useful later. |
| 34 | + |
20 | 35 | # %% |
21 | 36 | time = skrub.var( |
22 | 37 | "time", |
|
32 | 47 | time |
33 | 48 |
|
34 | 49 | # %% [markdown] |
35 | | -# TODO: add instructions to download data manually |
| 50 | +# |
| 51 | +# To avoid network issues when running this notebook, the necessary data |
| 52 | +# files have already been downloaded and saved in the `datasets` folder. |
| 53 | +# See the README.md file for instructions to download the data manually |
| 54 | +# if you want to re-run this notebook with more recent data. |
36 | 55 |
|
37 | 56 | # %% |
38 | 57 | data_source_folder = Path("../datasets") |
39 | 58 | for data_file in sorted(data_source_folder.iterdir()): |
40 | 59 | print(data_file) |
41 | 60 |
|
42 | | -# %% |
43 | | -electricity_raw = skrub.var( |
44 | | - "electricity_raw", |
45 | | - pl.concat( |
46 | | - [ |
47 | | - pl.from_pandas(pd.read_csv(data_file, na_values=["N/A"])) |
48 | | - for data_file in sorted(data_source_folder.iterdir()) |
49 | | - if data_file.name.startswith("Total Load - Day Ahead") |
50 | | - and data_file.name.endswith(".csv") |
51 | | - ], |
52 | | - how="vertical", |
53 | | - ), |
54 | | -) |
55 | | -electricity_raw |
56 | | - |
57 | | -# %% |
58 | | -electricity = ( |
59 | | - electricity_raw.with_columns( |
60 | | - [ |
61 | | - pl.col("Time (UTC)") |
62 | | - .str.split(by=" - ") |
63 | | - .list.first() |
64 | | - .str.to_datetime("%d.%m.%Y %H:%M", time_zone="UTC") |
65 | | - .alias("time"), |
66 | | - ] |
67 | | - ) |
68 | | - .drop(["Time (UTC)", "Day-ahead Total Load Forecast [MW] - BZN|FR"]) |
69 | | - .rename({"Actual Total Load [MW] - BZN|FR": "load_mw"}) |
70 | | - .select(["time", "load_mw"]) |
71 | | -) |
72 | | -electricity |
73 | | - |
74 | | -# %% [markdown] |
75 | | -# Check that the number of rows matches our expectations based on the number of hours that separate the first and the last dates: |
76 | | - |
77 | | -# %% |
78 | | -time.join(electricity, on="time", how="left") |
79 | | - |
80 | 61 | # %% [markdown] |
81 | 62 | # |
82 | 63 | # List of 10 medium to large urban areas to approximately cover most regions in |
|
133 | 114 |
|
134 | 115 | all_city_weather |
135 | 116 |
|
136 | | - |
| 117 | +# %% [markdown] |
| 118 | +# ## Calendar and holiday features
| 119 | +# |
| 120 | +# We leverage the `holidays` package to enrich the time range with some |
| 121 | +# calendar features such as public holidays and school holidays in France. We |
| 122 | +# also add some features that are useful for time series forecasting such as |
| 123 | +# the day of the week, the day of the year, and the hour of the day. |
| 124 | +# |
| 125 | +# Note that the `holidays` package requires us to extract the date for the |
| 126 | +# French timezone. |
| 127 | +# |
| 128 | +# Similarly for the calendar features: all the time features are extracted |
| 129 | +# from the time in the French timezone. |
137 | 130 | # %% |
138 | 131 | import holidays |
139 | 132 |
|
|
152 | 145 | ) |
153 | 146 | calendar |
154 | 147 |
|
| 148 | +# %% [markdown] |
| 149 | +# ## Electricity load data |
| 150 | +# |
| 151 | +# Finally, we load the electricity load data. This data will be used both as
| 152 | +# the target variable and to craft some lagged and window-aggregated features.
| 153 | +# %% |
| 154 | +electricity_raw = skrub.var( |
| 155 | + "electricity_raw", |
| 156 | + pl.concat( |
| 157 | + [ |
| 158 | + pl.from_pandas(pd.read_csv(data_file, na_values=["N/A"])) |
| 159 | + for data_file in sorted(data_source_folder.iterdir()) |
| 160 | + if data_file.name.startswith("Total Load - Day Ahead") |
| 161 | + and data_file.name.endswith(".csv") |
| 162 | + ], |
| 163 | + how="vertical", |
| 164 | + ), |
| 165 | +) |
| 166 | +electricity_raw |
| 167 | + |
155 | 168 | # %% |
| 169 | +electricity = ( |
| 170 | + electricity_raw.with_columns( |
| 171 | + [ |
| 172 | + pl.col("Time (UTC)") |
| 173 | + .str.split(by=" - ") |
| 174 | + .list.first() |
| 175 | + .str.to_datetime("%d.%m.%Y %H:%M", time_zone="UTC") |
| 176 | + .alias("time"), |
| 177 | + ] |
| 178 | + ) |
| 179 | + .drop(["Time (UTC)", "Day-ahead Total Load Forecast [MW] - BZN|FR"]) |
| 180 | + .rename({"Actual Total Load [MW] - BZN|FR": "load_mw"}) |
| 181 | + .select(["time", "load_mw"]) |
| 182 | +) |
| 183 | +electricity |
| 184 | + |
| 185 | +# %% [markdown] |
| 186 | +# Check that the number of rows matches our expectations based on the number of hours that separate the first and the last dates: |
| 187 | + |
| 188 | +# %% |
| 189 | +time.join(electricity, on="time", how="left") |
0 commit comments