Skip to content

Commit efd217a

Browse files
committed
WIP on data fetching and feature engineering
1 parent 2d04d19 commit efd217a

8 files changed

Lines changed: 747 additions & 8794 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,6 @@ content/notebooks
8787

8888
# jupyterlite build
8989
jupyterlite/.jupyterlite.doit.db
90+
91+
# requests cache for weather data download
92+
.cache.sqlite

content/datasets/Total Load - Day Ahead _ Actual_202001010000-202101010000.csv

Lines changed: 0 additions & 8785 deletions
This file was deleted.
548 KB
Binary file not shown.
549 KB
Binary file not shown.

content/python_files/feature_engineering.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
"time",
1717
pl.DataFrame().with_columns(
1818
pl.datetime_range(
19-
pl.datetime(2023, 1, 1, hour=0),
20-
pl.datetime(2024, 12, 31, hour=23),
19+
pl.datetime(2021, 1, 1, hour=0),
20+
pl.datetime(2025, 6, 30, hour=23),
2121
time_zone="UTC",
2222
interval="1h",
2323
).alias("time"),
@@ -75,17 +75,17 @@
7575

7676
# %%
7777
some_city_weather_raw = skrub.var(
78-
"some_city_weather_raw",
79-
pl.read_csv("../datasets/open-meteo-48.12N1.65W44m.csv", skip_rows=3),
78+
"paris_weather_raw",
79+
pl.read_parquet("../datasets/weather_paris.parquet"),
80+
).with_columns(
81+
[
82+
pl.col("time").dt.cast_time_unit("us"), # Ensure time column has the same type
83+
]
8084
)
8185
some_city_weather_raw
8286

8387
# %%
84-
some_city_weather = some_city_weather_raw.with_columns(
85-
[
86-
pl.col("time").str.to_datetime("%Y-%m-%dT%H:%M", time_zone="UTC"),
87-
]
88-
).rename(lambda x: x if x == "time" else x + " some_city")
88+
some_city_weather = some_city_weather_raw.rename(lambda x: x if x == "time" else x + " some_city")
8989
time.join(some_city_weather, on="time", how="left")
9090

9191
# %%
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# %%
2+
%pip install -q openmeteo-requests retry-requests requests-cache ipyleaflet
3+
4+
# %%
5+
from pathlib import Path
6+
from ipyleaflet import Map, Marker
7+
import openmeteo_requests
8+
9+
import pandas as pd
10+
import requests_cache
11+
from retry_requests import retry
12+
13+
# %%
14+
# List of 10 medium to large urban areas with their GPS coordinates to cover
15+
# the most populous areas in France.
16+
# Each entry is (name, latitude, longitude). Commented-out entries are kept
# for reference; only the active ones end up in `cities`.
cities = [
    {"name": name, "latitude": latitude, "longitude": longitude}
    for name, latitude, longitude in (
        ("Paris", 48.8566, 2.3522),
        ("Lyon", 45.7640, 4.8357),
        # ("Marseille", 43.2965, 5.3698),
        # ("Toulouse", 43.6047, 1.4442),
        # ("Lille", 50.6292, 3.0573),
        # ("Limoges", 45.8336, 1.2616),
        # ("Nantes", 47.2184, -1.5536),
        # ("Strasbourg", 48.5734, 7.7521),
        # ("Brest", 48.3904, -4.4861),
        # ("Bayonne", 43.4833, -1.4667),
    )
]
28+
29+
# Interactive map used to visually check the selected locations.
map_center = [46.6034, 1.8883]  # Approximate center of France
m = Map(center=map_center, zoom=6)
for city in cities:
    marker = Marker(location=(city["latitude"], city["longitude"]), title=city["name"])
    # `Map.add` replaces `Map.add_layer`, which recent ipyleaflet releases
    # deprecate (ipyleaflet is installed unpinned at the top of this script).
    m.add(marker)
m  # Last expression of the cell: displays the map widget in the notebook.
35+
36+
37+
# %%
38+
39+
def download_weather_data(city, *, start_date="2021-01-01", end_date="2025-05-31"):
    """Download hourly weather data for one city from the Open-Meteo API.

    Parameters
    ----------
    city : dict
        Mapping with at least the ``"latitude"`` and ``"longitude"`` keys.
    start_date, end_date : str, keyword-only
        Bounds of the requested period, forwarded unchanged to the API as
        the ``start_date`` and ``end_date`` query parameters (dates written
        like ``"2021-01-01"``).

    Returns
    -------
    pandas.DataFrame
        One row per time step, with a timezone-aware (UTC) ``"time"`` column
        followed by one column per requested hourly variable, in request
        order.
    """
    # Single source of truth for the requested variables: the API returns the
    # hourly series in the order they were requested, so the same tuple is
    # used both to build the request and to decode the response.
    hourly_variables = (
        "temperature_2m",
        "precipitation",
        "wind_speed_10m",
        "cloud_cover",
        "soil_moisture_1_to_3cm",
        "relative_humidity_2m",
    )

    # HTTP responses are cached locally (expire_after=3600) and failed
    # requests are retried with a backoff.
    cache_session = requests_cache.CachedSession(".cache", expire_after=3600)
    retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
    openmeteo = openmeteo_requests.Client(session=retry_session)

    url = "https://historical-forecast-api.open-meteo.com/v1/forecast"
    params = {
        "latitude": city["latitude"],
        "longitude": city["longitude"],
        "start_date": start_date,
        "end_date": end_date,
        "hourly": list(hourly_variables),
        "timezone": "GMT",  # Use GMT to ease temporal joins.
    }
    # A single location is requested, so only the first response is used.
    response = openmeteo.weather_api(url, params=params)[0]

    hourly = response.Hourly()
    hourly_data = {
        # Rebuild the timestamps from the start, end and interval reported by
        # the response; the end bound is excluded (inclusive="left").
        "time": pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left",
        )
    }
    for index, name in enumerate(hourly_variables):
        hourly_data[name] = hourly.Variables(index).ValuesAsNumpy()
    return pd.DataFrame(data=hourly_data)
89+
90+
91+
# %%
92+
# Download the weather data of each city once: a city whose parquet file is
# already present in the datasets folder is skipped.
datasets_folder = Path("../datasets")
for city in cities:
    filepath = datasets_folder / f"weather_{city['name'].lower()}.parquet"
    if not filepath.exists():
        print(f"Downloading weather data for {city['name']}...")
        df = download_weather_data(city)
        df.to_parquet(filepath, index=False)
        print(f"Weather data for {city['name']} saved to {filepath}.")
    else:
        print(f"Weather data for {city['name']} already exists at {filepath}.")
103+
104+
# %%

0 commit comments

Comments
 (0)