 # %% [markdown]
 # # Exercise: implementing a `TableVectorizer` from its components
-# Replicate the behavior of a `TableVectorizer` using `ApplyToCols`, the skrub
-# selectors, and the given transformers.
+# Replicate the behavior of a `TableVectorizer` using `ApplyToCols`, the skrub
+# selectors, and the given transformers.
 
 # %%
 from skrub import Cleaner, ApplyToCols, StringEncoder, DatetimeEncoder
 import skrub.selectors as s
 
 # %% [markdown]
-# Notes on the implementation:
+# Notes on the implementation:
 #
 # - In the first step, the TableVectorizer cleans the data to parse datetimes and other
 # dtypes.
-# - Numeric features are left untouched, i.e., they use a Passthrough transformer.
-# - String and categorical feature are split into high and low cardinality features.
-# - For this exercise, set the the cardinality `threshold` to 4.
+# - Numeric features are left untouched, i.e., they use a `PassThrough` transformer.
+# - String and categorical features are split into high and low cardinality features.
+# - For this exercise, set the cardinality `threshold` to 4.
 # - High cardinality features are transformed with a `StringEncoder`. In this exercise,
-# set `n_components` to 2.
-# - Low cardinality features are transformed with a `OneHotEncoder`, and the first
-# category in binary features is dropped (hint: check the docs of the `OneHotEncoder`
-# for the `drop` parameter). Set `sparse_output=True`.
-# - Remember `cardinality_below` is one of the skrub selectors.
-# - Datetimes are transformed by a default `DatetimeEncoder`.
-# - Everything should be wrapped in a scikit-learn `Pipeline`.
+# set `n_components` to 2.
+# - Low cardinality features are transformed with a `OneHotEncoder` with
+# `sparse_output=False` and `drop="if_binary"`.
+# - Remember `cardinality_below` is one of the skrub selectors.
+# - Datetimes are transformed by a default `DatetimeEncoder`.
+# - Everything should be wrapped in a scikit-learn `Pipeline`.
+# - Remember that the order of the operations matters!
 #
-#
-# Use the following dataframe to test the result.
+# Use the following dataframe to test the result.
 
 # %%
 import pandas as pd
     "str2": ["officer", "manager", "lawyer", "chef", "teacher"],
     "bool": [True, False, True, False, True],
     "datetime-col": [
-        "2020-02-03T12:30:05",
-        "2021-03-15T00:37:15",
-        "2022-02-13T17:03:25",
-        "2023-05-22T08:45:55",
+        "2020-02-03T12:30:05",
+        "2021-03-15T00:37:15",
+        "2022-02-13T17:03:25",
+        "2023-05-22T08:45:55",
     ]
     + [None],
 }
 df = pd.DataFrame(data)
 df
 
 # %% [markdown]
-# Use the following `PassThrough` transformer where needed.
+# Use the following `PassThrough` transformer where needed.
 
 # %%
-from skrub._apply_to_cols import SingleColumnTransformer
+from skrub._single_column_transformer import SingleColumnTransformer
+
+
 class PassThrough(SingleColumnTransformer):
     def fit_transform(self, column, y=None):
         return column
@@ -78,17 +79,17 @@ def transform(self, column):
 # %%
 # Write your code here
 #
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
+#
 
 # %%
 # Solution
@@ -101,10 +102,10 @@ def transform(self, column):
     cols=s.cardinality_below(4) & s.string(),
 )
 numeric = ApplyToCols(PassThrough(), cols=s.numeric())
-datetime = ApplyToCols(DatetimeEncoder(), cols=s.any_date())
+dt = ApplyToCols(DatetimeEncoder(), cols=s.any_date())
 
 my_table_vectorizer = make_pipeline(
-    cleaner, numeric, high_cardinality, low_cardinality, datetime
+    cleaner, numeric, high_cardinality, low_cardinality, dt
 )
 
 my_table_vectorizer.fit_transform(df)