@@ -167,47 +167,83 @@ def load_weather_data(time, city_names, data_source_folder):
167167# %% [markdown]
168168# ## Calendar and holidays features
169169#
170- # We leverage the `holidays` package to enrich the time range with some
171- # calendar features such as public holidays in France. We also add some
172- # features that are useful for time series forecasting such as the day of the
173- # week, the day of the year, and the hour of the day.
170+ # We leverage the `holidays` package to enrich the time range with some calendar
171+ # features such as public holidays in France. We also add some features that are useful
172+ # for time series forecasting such as the day of the week, the day of the year, and the
173+ # hour of the day.
174174#
175- # Note that the `holidays` package requires us to extract the date for the
176- # French timezone.
175+ # We want to use the `holidays` package to enrich the time range with some calendar
176+ # features such as public holidays in France. In addition, we want to use `skrub`'s
177+ # `DatetimeEncoder` to add some features that are useful for time series forecasting
178+ # such as the calendar year, month, day, hour, the day of the week and the day of the
179+ # year.
177180#
178- # Similarly for the calendar features: all the time features are extracted from
179- # the time in the French timezone, since it is likely that electricity usage
180- # patterns are influenced by inhabitants' daily routines aligned with the local
181+ # Note that the `holidays` package requires us to extract the date for the French
181182# timezone.
182-
183+ #
184+ # Similarly for the calendar features: all the time features are extracted from the time
185+ # in the French timezone, since it is likely that electricity usage patterns are
186+ # influenced by inhabitants' daily routines aligned with the local timezone.
187+ #
188+ # ### Exercise
189+ #
190+ # Let's first create some calendar features using `skrub`'s `DatetimeEncoder`.
191+ #
192+ # 1. Create a `DatetimeEncoder` object and, by looking at the documentation, make sure
193+ # to add the weekday and the day of the year. Do not add the total seconds since the
194+ # Unix epoch. You can refer to this link:
195+ # https://skrub-data.org/stable/reference/generated/skrub.DatetimeEncoder.html
196+ # 2. As a first operation, we wish to rename the `time` column to `cal` such that
197+ # all the columns corresponding to some calendar features will be prefixed with
198+ # `cal_`. You can simply call the `rename` method (cf.
199+ # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rename.html)
200+ # because `time` can be seen as a polars dataframe.
201+ # 3. Now, we wish to apply the encoder to the `time` dataframe. Refer to the following
202+ # link for all details:
203+ # https://skrub-data.org/stable/reference/generated/skrub.DataOp.skb.apply.html
204+ # 4. Let's call the resulting skrub `DataOp` `time_encoded` and check the output
205+ # representation to verify that the preview looks like what we expect.
183206
184207# %%
185- @skrub .deferred
186- def prepare_french_calendar_data (time ):
187- fr_time = pl .col ("time" ).dt .convert_time_zone ("Europe/Paris" )
188- fr_year_min = time .select (fr_time .dt .year ().min ()).item ()
189- fr_year_max = time .select (fr_time .dt .year ().max ()).item ()
190- holidays_fr = holidays .country_holidays (
191- "FR" , years = range (fr_year_min , fr_year_max + 1 )
192- )
193- return time .with_columns (
194- [
195- fr_time .dt .hour ().alias ("cal_hour_of_day" ),
196- fr_time .dt .weekday ().alias ("cal_day_of_week" ),
197- fr_time .dt .ordinal_day ().alias ("cal_day_of_year" ),
198- fr_time .dt .year ().alias ("cal_year" ),
199- fr_time .dt .date ().is_in (holidays_fr .keys ()).alias ("cal_is_holiday" ),
200- ],
201- )
208+ from skrub import DatetimeEncoder
202209
203210
204- from skrub import DatetimeEncoder
211+ # %% [markdown]
212+ #
213+ # ### Solution
214+
205215
216+ # %%
206217datetime_encoder = DatetimeEncoder (
207218 add_weekday = True , add_day_of_year = True , add_total_seconds = False
208219)
220+ time_encoded = time .rename ({"time" : "cal" }).skb .apply (datetime_encoder )
221+ time_encoded
222+
223+ # %% [markdown]
224+ #
225+ # ### Exercise
226+ #
227+ # Now, let's create a processing function that is going to be decorated with the
228+ # `@skrub.deferred` decorator. This function should:
229+ # 1. Take the `time` dataframe as an input.
230+ # 2. Convert the "time" column to the French timezone ("Europe/Paris").
231+ # 3. Extract the French holidays by calling `holidays.country_holidays`. For this
232+ # function, you need to extract the minimum and maximum year from the "time" column.
233+ # 4. Then, you need to check whether each date is a French holiday. You can call this
234+ # column `cal_is_holiday`.
235+ # 5. Apply this function to the `time` `DataOp` and call the resulting variable
236+ # `is_french_holiday`.
237+ # 6. Finally, we wish to concatenate the `time_encoded` and `is_french_holiday` using
238+ # the `.skb.concat` method.
209239
240+ # %%
241+
242+ # %% [markdown]
243+ #
244+ # ### Solution
210245
246+ # %%
211247@skrub .deferred
212248def prepare_holidays (time ):
213249 fr_time = pl .col ("time" ).dt .convert_time_zone ("Europe/Paris" )
@@ -220,11 +256,11 @@ def prepare_holidays(time):
220256 fr_time .dt .date ().is_in (holidays_fr .keys ()).alias ("cal_is_holiday" ),
221257 )
222258
259+ is_french_holiday = prepare_holidays (time )
260+ is_french_holiday
223261
224- time_encoded = time .rename ({"time" : "cal" }).skb .apply (datetime_encoder )
225-
226- calendar = time .skb .concat ([time_encoded , prepare_holidays (time )], axis = 1 )
227- # calendar = prepare_french_calendar_data(time)
262+ # %%
263+ calendar = time .skb .concat ([time_encoded , is_french_holiday ], axis = 1 )
228264calendar
229265
230266
0 commit comments