Skip to content

Commit df4e431

Browse files
authored
Feature: Add support for pandas 3 (MHKiT-Software#443)
## Pandas 3 Changes * Updated the pandas dependency to allow versions >=2.2.2 without an upper limit in both `environment-dev.yml` and `pyproject.toml`, enabling support for pandas 3.x. * Added a new function `replace_pandas_missing_values_with_nan` in `mhkit/wave/io/ndbc.py` to replace missing values with `NaN` in a way that is compatible with both pandas 2.x and 3.x. ## Test Updates * Increased the delta tolerance of the latitude and longitude assertions in `test_get_buoy_metadata` to allow for a small amount of real-world buoy drift. * Fixed a deprecation in `test_request_parse_workflow_multiyear` (`mhkit/tests/wave/io/test_cdip.py`) by using `"D"` instead of `"d"` in the `floor` method. Pandas deprecated `d` in 3.0+ (pandas-dev/pandas#58998). The period aliases are documented here: https://pandas.pydata.org/docs/user_guide/timeseries.html#period-aliases
1 parent 4ec2aff commit df4e431

7 files changed

Lines changed: 70 additions & 35 deletions

File tree

.github/workflows/main.yml

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ jobs:
9393
- name: Install testing dependencies
9494
shell: bash -l {0}
9595
run: |
96-
conda install -y pytest coverage coveralls
96+
conda install -y pytest coverage
9797
9898
- name: Install mhkit
9999
shell: bash -l {0}
@@ -138,7 +138,7 @@ jobs:
138138
- name: Install testing dependencies
139139
shell: bash -l {0}
140140
run: |
141-
conda install -y pytest coverage coveralls
141+
conda install -y pytest coverage
142142
143143
- name: Install mhkit
144144
shell: bash -l {0}
@@ -181,7 +181,7 @@ jobs:
181181
- name: Install testing dependencies
182182
shell: bash -l {0}
183183
run: |
184-
conda install -y pytest coverage coveralls
184+
conda install -y pytest coverage
185185
186186
- name: Install mhkit
187187
shell: bash -l {0}
@@ -229,7 +229,7 @@ jobs:
229229
- name: Install testing dependencies
230230
shell: bash -l {0}
231231
run: |
232-
conda install -y pytest coverage coveralls
232+
conda install -y pytest coverage
233233
234234
- name: Install mhkit
235235
shell: bash -l {0}
@@ -291,25 +291,12 @@ jobs:
291291
name: data
292292
path: ~/.cache/mhkit
293293

294-
- name: Install system dependencies
295-
if: runner.os == 'Linux'
296-
run: |
297-
# Update apt-get cache
298-
sudo apt-get update
299-
sudo apt-get install -y libhdf5-dev libnetcdf-dev
300-
301294
- name: Update and install packages
302295
shell: bash
303296
run: |
304297
python -m pip install --upgrade pip wheel
305298
pip install -e ".[all,dev]"
306299
307-
- name: Reinstall h5py and netCDF4 with system libraries
308-
if: runner.os == 'Linux'
309-
shell: bash
310-
run: |
311-
pip install --force-reinstall --no-binary=:all: h5py netCDF4
312-
313300
- name: Install setuptools for Python 3.12
314301
if: matrix.python-version == '3.12'
315302
shell: bash
@@ -368,7 +355,7 @@ jobs:
368355
- name: Install testing dependencies
369356
shell: bash -l {0}
370357
run: |
371-
conda install -y pytest coverage coveralls
358+
conda install -y pytest coverage
372359
373360
- name: Install mhkit
374361
shell: bash -l {0}
@@ -587,7 +574,7 @@ jobs:
587574
- name: Install notebook testing dependencies
588575
shell: bash -l {0}
589576
run: |
590-
conda install -y pytest coverage coveralls nbval jupyter utm folium
577+
conda install -y pytest coverage nbval jupyter utm folium
591578
592579
- name: Install mhkit
593580
shell: bash -l {0}

environment-dev.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,16 @@ dependencies:
66
- python>=3.10,<3.13
77
- pip
88
- numpy>=2.0.0
9-
- pandas>=2.2.2,<3.0
9+
- pandas>=2.2.2
10+
# Provides pyarrow storage for the default string dtype in pandas 3.0+
11+
- pyarrow>=16.0.0
1012
- scipy>=1.14.0
1113
- xarray>=2024.6.0
1214
- scikit-learn>=1.5.1
1315
- h5py>=3.11.0
1416
- h5pyd>=0.18.0
1517
- netCDF4>=1.6.5
16-
- hdf5>=1.14.3,<1.14.5.0a0
18+
- hdf5>=1.14.3
1719
- statsmodels>=0.14.2
1820
- requests
1921
- beautifulsoup4

mhkit/tests/conftest.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""Shared pytest configuration for the MHKiT test suite."""
2+
3+
import matplotlib
4+
5+
# Use the non-interactive Agg backend so figure/animation tests run headlessly
6+
# on every OS (Windows otherwise picks TkAgg and fails without Tcl/Tk).
7+
matplotlib.use("Agg")

mhkit/tests/wave/io/test_cdip.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import pytz
88
import os
99

10-
1110
testdir = dirname(abspath(__file__))
1211
datadir = normpath(join(testdir, "..", "..", "..", "..", "examples", "data", "wave"))
1312

@@ -130,18 +129,18 @@ def test_request_parse_workflow_multiyear(self):
130129
expected_index_final = datetime(year2, 12, 31)
131130

132131
wave1D = data["data"]["wave"]
133-
self.assertEqual(wave1D.index[0].floor("d").to_pydatetime(), expected_index0)
132+
self.assertEqual(wave1D.index[0].floor("D").to_pydatetime(), expected_index0)
134133

135134
self.assertEqual(
136-
wave1D.index[-1].floor("d").to_pydatetime(), expected_index_final
135+
wave1D.index[-1].floor("D").to_pydatetime(), expected_index_final
137136
)
138137

139138
for key, wave2D in data["data"]["wave2D"].items():
140139
self.assertEqual(
141-
wave2D.index[0].floor("d").to_pydatetime(), expected_index0
140+
wave2D.index[0].floor("D").to_pydatetime(), expected_index0
142141
)
143142
self.assertEqual(
144-
wave2D.index[-1].floor("d").to_pydatetime(), expected_index_final
143+
wave2D.index[-1].floor("D").to_pydatetime(), expected_index_final
145144
)
146145

147146
def test_plot_boxplot(self):

mhkit/tests/wave/io/test_ndbc.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -357,8 +357,13 @@ def test_get_buoy_metadata(self):
357357
metadata["provider"], "Owned and maintained by National Data Buoy Center"
358358
)
359359
self.assertEqual(metadata["type"], "3-meter foam buoy w/ seal cage")
360-
self.assertAlmostEqual(float(metadata["lat"]), 36.785)
361-
self.assertAlmostEqual(float(metadata["lon"]), 122.396)
360+
# NDBC 46042 is a deployed buoy that drifts within its watch
361+
# circle, so the NDBC output position can drift over time (observed
362+
# lat 36.785 -> 36.787, lon 122.396 -> 122.408). Use a loose tolerance
363+
# that tracks the buoy's nominal location without failing on real
364+
# movement, while still catching large parsing errors.
365+
self.assertAlmostEqual(float(metadata["lat"]), 36.785, delta=0.05)
366+
self.assertAlmostEqual(float(metadata["lon"]), 122.396, delta=0.05)
362367
self.assertEqual(metadata["Site elevation"], "sea level")
363368

364369
def test_get_buoy_metadata_invalid_station(self):

mhkit/wave/io/ndbc.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,40 @@
1919
convert_nested_dict_and_pandas,
2020
)
2121

22-
# Set pandas option to opt-in to future behavior
23-
pd.set_option("future.no_silent_downcasting", True)
22+
23+
def replace_pandas_missing_values_with_nan(data, missing_values):
24+
"""
25+
Replace missing values with NaN without silently downcasting dtypes.
26+
27+
Parameters
28+
------------
29+
data: pandas DataFrame
30+
Data in which to replace missing values
31+
32+
missing_values: list of values
33+
List of values that denote missing data
34+
35+
Returns
36+
---------
37+
data: pandas DataFrame
38+
Data with missing values replaced by NaN and object columns converted
39+
to their best-fit dtypes
40+
41+
Notes
42+
-----
43+
pandas versions above 2.x do not silently downcast in ``replace`` and
44+
dropped the ``future.no_silent_downcasting`` option. On pandas 2.x the same
45+
forward-looking behavior is opted into so the result is identical across
46+
supported pandas versions and no deprecation warning is emitted.
47+
``infer_objects`` then converts the resulting object columns explicitly.
48+
"""
49+
pandas_major = int(pd.__version__.split(".")[0])
50+
if pandas_major <= 2:
51+
with pd.option_context("future.no_silent_downcasting", True):
52+
data = data.replace(missing_values, np.nan)
53+
else:
54+
data = data.replace(missing_values, np.nan)
55+
return data.infer_objects()
2456

2557

2658
def read_file(file_name, missing_values=["MM", 9999, 999, 99], to_pandas=True):
@@ -153,8 +185,7 @@ def read_file(file_name, missing_values=["MM", 9999, 999, 99], to_pandas=True):
153185
data.columns = data.columns
154186

155187
# Replace indicated missing values with nan
156-
data = data.replace(missing_values, np.nan)
157-
data = data.infer_objects(copy=False)
188+
data = replace_pandas_missing_values_with_nan(data, missing_values)
158189

159190
if not to_pandas:
160191
data = convert_to_dataset(data)

pyproject.toml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,16 @@ dependencies = [
2525
# scenes. xarray, netcdf4, and h5 dependencies should all be synced to avoid
2626
# compatibility issues with individual modules.
2727
"xarray>=2024.6.0",
28-
"netCDF4>=1.7.1.post1",
28+
"netCDF4>=1.6.5",
2929
"h5py>=3.11.0",
3030
"h5pyd>=0.18.0",
3131
"numpy>=2.0.0",
32-
"pandas>=2.2.2,<3.0",
32+
"pandas>=2.2.2",
33+
# pandas 3.0+ uses a new string dtype by default and stores it using pyarrow
34+
# when pyarrow is installed, otherwise a more limited numpy fallback. pandas
35+
# keeps pyarrow optional, so we require it here to always get the reliable
36+
# pyarrow storage. pyarrow 16.0.0 is the first version that supports numpy 2.
37+
"pyarrow>=16.0.0",
3338
"scipy>=1.14.0",
3439
"matplotlib>=3.9.1",
3540
"pecos>=0.3.0",
@@ -91,7 +96,6 @@ dev = [
9196
"pytest-cov",
9297
"pre-commit",
9398
"coverage",
94-
"coveralls",
9599
"black",
96100
]
97101

0 commit comments

Comments
 (0)