|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "markdown", |
5 | | - "id": "55868e5d", |
| 5 | + "id": "07ca67bb", |
6 | 6 | "metadata": {}, |
7 | 7 | "source": [ |
8 | 8 | "# Exercise\n", |
|
19 | 19 | { |
20 | 20 | "cell_type": "code", |
21 | 21 | "execution_count": null, |
22 | | - "id": "f7c73e92", |
| 22 | + "id": "3fbaa238", |
| 23 | + "metadata": {}, |
| 24 | + "outputs": [], |
| 25 | + "source": [ |
| 26 | + "%pip install skrub" |
| 27 | + ] |
| 28 | + }, |
| 29 | + { |
| 30 | + "cell_type": "code", |
| 31 | + "execution_count": null, |
| 32 | + "id": "065ddca9", |
23 | 33 | "metadata": {}, |
24 | 34 | "outputs": [], |
25 | 35 | "source": [ |
|
43 | 53 | { |
44 | 54 | "cell_type": "code", |
45 | 55 | "execution_count": null, |
46 | | - "id": "4e5634a2", |
| 56 | + "id": "6cde54f4", |
47 | 57 | "metadata": {}, |
48 | 58 | "outputs": [], |
49 | 59 | "source": [ |
|
67 | 77 | { |
68 | 78 | "cell_type": "code", |
69 | 79 | "execution_count": null, |
70 | | - "id": "a1116746", |
| 80 | + "id": "99270841", |
71 | 81 | "metadata": {}, |
72 | 82 | "outputs": [], |
73 | 83 | "source": [ |
|
90 | 100 | { |
91 | 101 | "cell_type": "code", |
92 | 102 | "execution_count": null, |
93 | | - "id": "fb8a8cc4", |
| 103 | + "id": "685ccab3", |
94 | 104 | "metadata": {}, |
95 | 105 | "outputs": [], |
96 | 106 | "source": [ |
|
110 | 120 | }, |
111 | 121 | { |
112 | 122 | "cell_type": "markdown", |
113 | | - "id": "837491a4", |
| 123 | + "id": "f3e17f09", |
114 | 124 | "metadata": {}, |
115 | 125 | "source": [ |
116 | 126 | "Modify the script so that the `DatetimeEncoder` adds periodic encoding with sine\n", |
|
120 | 130 | { |
121 | 131 | "cell_type": "code", |
122 | 132 | "execution_count": null, |
123 | | - "id": "2866891b", |
| 133 | + "id": "acacaaac", |
124 | 134 | "metadata": {}, |
125 | 135 | "outputs": [], |
126 | 136 | "source": [ |
|
141 | 151 | "#" |
142 | 152 | ] |
143 | 153 | }, |
144 | | - { |
145 | | - "cell_type": "markdown", |
146 | | - "id": "2816f4b9", |
147 | | - "metadata": {}, |
148 | | - "source": [ |
149 | | - "Now modify the script above to add spline features (`periodic_encoding=\"spline\"`).\n" |
150 | | - ] |
151 | | - }, |
152 | 154 | { |
153 | 155 | "cell_type": "code", |
154 | 156 | "execution_count": null, |
155 | | - "id": "0b163b46", |
| 157 | + "id": "85f036ad", |
156 | 158 | "metadata": {}, |
157 | 159 | "outputs": [], |
158 | 160 | "source": [ |
|
163 | 165 | "\n", |
164 | 166 | "datetime_encoder = ApplyToCols(\n", |
165 | 167 | " DatetimeEncoder(\n", |
166 | | - " periodic_encoding=\"spline\",\n", |
| 168 | + " periodic_encoding=\"circular\",\n", |
167 | 169 | " add_total_seconds=True,\n", |
168 | 170 | " add_weekday=True,\n", |
169 | 171 | " add_day_of_year=True,\n", |
|
175 | 177 | "encoder.fit_transform(df)" |
176 | 178 | ] |
177 | 179 | }, |
| 180 | + { |
| 181 | + "cell_type": "markdown", |
| 182 | + "id": "4bc63de9", |
| 183 | + "metadata": {}, |
| 184 | + "source": [ |
| 185 | + "# Exercise\n", |
| 186 | + "Build a custom `SingleColumnTransformer` that unpacks the combined string column\n", |
| 187 | + "in the provided dataframe into separate columns for `str_id`, `num_id`, and\n", |
| 188 | + "`datetime`. The `datetime` column should be converted to datetime dtype. Then,\n", |
| 189 | + "use this transformer in a pipeline to extract datetime features as shown in\n", |
| 190 | + "the previous exercises.\n", |
| 191 | + "\n", |
| 192 | + "The transformer should reject (by raising `RejectColumn`) columns that are not\n", |
| 193 | + "of string type or that cannot be unpacked properly.\n", |
| 194 | + "IDs are in the format `STR-NUM-DATETIME`, where `STR` is a string identifier,\n", |
| 195 | + "`NUM` is a numeric identifier, and `DATETIME` is a Unix timestamp in seconds.\n", |
| 196 | + "\n", |
| 197 | + "Hint: you can use the following snippet to extract the components from the string column:\n", |
| 198 | + "```python\n", |
| 199 | + "split_data = X.str.split(\"-\", expand=True)\n", |
| 200 | + "res = pd.DataFrame(\n", |
| 201 | + " {\n", |
| 202 | + " \"str_id\": split_data[0],\n", |
| 203 | + " \"num_id\": split_data[1].astype(\"int64\"),\n", |
| 204 | + " \"datetime\": pd.to_datetime(split_data[2].astype(\"int64\"), unit=\"s\"),\n", |
| 205 | + " }\n", |
| 206 | + ")\n", |
| 207 | + "```" |
| 208 | + ] |
| 209 | + }, |
| 210 | + { |
| 211 | + "cell_type": "code", |
| 212 | + "execution_count": null, |
| 213 | + "id": "090e3f3c", |
| 214 | + "metadata": { |
| 215 | + "lines_to_next_cell": 0 |
| 216 | + }, |
| 217 | + "outputs": [], |
| 218 | + "source": [ |
| 219 | + "from skrub.core import SingleColumnTransformer, RejectColumn\n", |
| 220 | + "import pandas as pd\n", |
| 221 | + "from skrub import ApplyToCols\n", |
| 222 | + "df_id = pd.DataFrame(\n", |
| 223 | + " {\n", |
| 224 | + " \"id\": [\n", |
| 225 | + " \"BQG-1001-1577836800\",\n", |
| 226 | + " \"TYW-1002-1577923200\",\n", |
| 227 | + " \"JAY-1003-1578009600\",\n", |
| 228 | + " ]\n", |
| 229 | + " }\n", |
| 230 | + ")" |
| 231 | + ] |
| 232 | + }, |
| 233 | + { |
| 234 | + "cell_type": "code", |
| 235 | + "execution_count": null, |
| 236 | + "id": "251f93c6", |
| 237 | + "metadata": {}, |
| 238 | + "outputs": [], |
| 239 | + "source": [ |
| 240 | + "# Write your solution here\n", |
| 241 | + "#\n", |
| 242 | + "#\n", |
| 243 | + "#\n", |
| 244 | + "#\n", |
| 245 | + "#\n", |
| 246 | + "#\n", |
| 247 | + "#" |
| 248 | + ] |
| 249 | + }, |
| 250 | + { |
| 251 | + "cell_type": "code", |
| 252 | + "execution_count": null, |
| 253 | + "id": "0595b38e", |
| 254 | + "metadata": {}, |
| 255 | + "outputs": [], |
| 256 | + "source": [ |
| 257 | + "# Solution\n", |
| 258 | + "class Unpacker(SingleColumnTransformer):\n", |
| 259 | + " \"\"\"Unpacker for pandas DataFrames.\"\"\"\n", |
| 260 | + "\n", |
| 261 | + " def fit_transform(self, X, y=None):\n", |
| 262 | + " \"\"\"Unpack combined string column into separate columns.\"\"\"\n", |
| 263 | + " if X.dtype != object:\n", |
| 264 | + " raise RejectColumn(\"Unpacker only works on string columns.\")\n", |
| 265 | + " try:\n", |
| 266 | + " split_data = X.str.split(\"-\", expand=True)\n", |
| 267 | + " res = pd.DataFrame(\n", |
| 268 | + " {\n", |
| 269 | + " \"str_id\": split_data[0],\n", |
| 270 | + " \"num_id\": split_data[1].astype(\"int64\"),\n", |
| 271 | + " \"datetime\": pd.to_datetime(split_data[2].astype(\"int64\"), unit=\"s\"),\n", |
| 272 | + " }\n", |
| 273 | + " )\n", |
| 274 | + " return res\n", |
| 275 | + " except Exception as exc:\n", |
| 276 | + " raise RejectColumn(\"Unpacker failed to unpack the column.\") from exc\n", |
| 277 | + "\n", |
| 278 | + "\n", |
| 279 | + "ApplyToCols(Unpacker(), allow_reject=True).fit_transform(df_id)" |
| 280 | + ] |
| 281 | + }, |
| 282 | + { |
| 283 | + "cell_type": "markdown", |
| 284 | + "id": "144f8f59", |
| 285 | + "metadata": {}, |
| 286 | + "source": [ |
| 287 | + "Now use this `Unpacker` in a pipeline to extract datetime features as shown in\n", |
| 288 | + "the previous exercises. You can use the default `DatetimeEncoder` settings for\n", |
| 289 | + "this part." |
| 290 | + ] |
| 291 | + }, |
| 292 | + { |
| 293 | + "cell_type": "code", |
| 294 | + "execution_count": null, |
| 295 | + "id": "42d9de34", |
| 296 | + "metadata": {}, |
| 297 | + "outputs": [], |
| 298 | + "source": [ |
| 299 | + "# Write your solution here\n", |
| 300 | + "#\n", |
| 301 | + "#\n", |
| 302 | + "#\n", |
| 303 | + "#\n", |
| 304 | + "#\n", |
| 305 | + "#\n", |
| 306 | + "#" |
| 307 | + ] |
| 308 | + }, |
| 309 | + { |
| 310 | + "cell_type": "code", |
| 311 | + "execution_count": null, |
| 312 | + "id": "6b727f5f", |
| 313 | + "metadata": { |
| 314 | + "lines_to_next_cell": 0 |
| 315 | + }, |
| 316 | + "outputs": [], |
| 317 | + "source": [ |
| 318 | + "from sklearn.pipeline import make_pipeline\n", |
| 319 | + "from skrub import DatetimeEncoder\n", |
| 320 | + "\n", |
| 321 | + "pipeline = make_pipeline(\n", |
| 322 | + " ApplyToCols(Unpacker(), allow_reject=True),\n", |
| 323 | + " ApplyToCols(DatetimeEncoder(), allow_reject=True),\n", |
| 324 | + ")\n", |
| 325 | + "pipeline.fit_transform(df_id)" |
| 326 | + ] |
| 327 | + }, |
178 | 328 | { |
179 | 329 | "cell_type": "code", |
180 | 330 | "execution_count": null, |
181 | | - "id": "d37c2d83", |
| 331 | + "id": "cdada497", |
182 | 332 | "metadata": {}, |
183 | 333 | "outputs": [], |
184 | 334 | "source": [] |
|
0 commit comments