From ffe0ebf7b4a31e1867945507db9c3504e3865403 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 15:15:04 +0200 Subject: [PATCH 01/34] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 45b79435b..6f07f7d82 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ out-* *.pyc **/*.zarr/* .DS_Store +*.parquet .vscode .env From 9ebb653cca1164009a7ad03de71cc727d3fc2c12 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 15:17:27 +0200 Subject: [PATCH 02/34] Disable zarr writing --- pyproject.toml | 1 + src/parcels/_core/particlefile.py | 27 ++++++------- tests/test_advection.py | 1 + tests/test_fieldset.py | 1 + tests/test_particlefile.py | 63 +++++++++++++++++++++++++++++++ tests/test_uxadvection.py | 1 + 6 files changed, 81 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 85aba3a67..0d4d56784 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ markers = [ # can be skipped by doing `pytest -m "not slow"` etc. 
"v4alpha: failing tests that should work for v4alpha", "v4future: failing tests that should work for a future release of v4", "v4remove: failing tests that should probably be removed later", + "uses_old_zarr: tests that need to be migrated to the new particleset format" ] filterwarnings = [ diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 788c6e572..0d4e0d53b 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -222,21 +222,22 @@ def _write_particle_data(self, *, particle_data, pclass, time_interval, time, in dims = ["trajectory", "obs"] ds[var.name] = xr.DataArray(data=data, dims=dims, attrs=attrs[var.name]) ds[var.name].encoding["chunks"] = self.chunks[0] if var.to_write == "once" else self.chunks - ds.to_zarr(store, mode="w") + # ds.to_zarr(store, mode="w") self._create_new_zarrfile = False else: - Z = zarr.group(store=store, overwrite=False) - obs = particle_data["obs_written"][indices_to_write] - for var in vars_to_write: - if self._maxids > Z[var.name].shape[0]: - self._extend_zarr_dims(Z[var.name], store, dtype=var.dtype, axis=0) - if var.to_write == "once": - if len(once_ids) > 0: - Z[var.name].vindex[ids_once] = particle_data[var.name][indices_to_write_once] - else: - if max(obs) >= Z[var.name].shape[1]: - self._extend_zarr_dims(Z[var.name], store, dtype=var.dtype, axis=1) - Z[var.name].vindex[ids, obs] = particle_data[var.name][indices_to_write] + pass + # Z = zarr.group(store=store, overwrite=False) + # obs = particle_data["obs_written"][indices_to_write] + # for var in vars_to_write: + # if self._maxids > Z[var.name].shape[0]: + # self._extend_zarr_dims(Z[var.name], store, dtype=var.dtype, axis=0) + # if var.to_write == "once": + # if len(once_ids) > 0: + # Z[var.name].vindex[ids_once] = particle_data[var.name][indices_to_write_once] + # else: + # if max(obs) >= Z[var.name].shape[1]: + # self._extend_zarr_dims(Z[var.name], store, dtype=var.dtype, axis=1) + # 
Z[var.name].vindex[ids, obs] = particle_data[var.name][indices_to_write] particle_data["obs_written"][indices_to_write] = obs + 1 diff --git a/tests/test_advection.py b/tests/test_advection.py index d8c6d2a45..06e46e231 100644 --- a/tests/test_advection.py +++ b/tests/test_advection.py @@ -60,6 +60,7 @@ def test_advection_zonal(mesh, npart=10): np.testing.assert_allclose(pset.lat, startlat, atol=1e-5) +@pytest.mark.uses_old_zarr def test_advection_zonal_with_particlefile(tmp_store): """Particles at high latitude move geographically faster due to the pole correction.""" npart = 10 diff --git a/tests/test_fieldset.py b/tests/test_fieldset.py index b2b05d33f..e51d13f38 100644 --- a/tests/test_fieldset.py +++ b/tests/test_fieldset.py @@ -95,6 +95,7 @@ def test_fieldset_gridset(fieldset): assert len(fieldset.gridset) == 2 +@pytest.mark.uses_old_zarr def test_fieldset_no_UV(tmp_zarrfile): grid = XGrid.from_dataset(ds, mesh="flat") fieldset = FieldSet([Field("P", ds["U_A_grid"], grid, interp_method=XLinear)]) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 84cb90ffa..5c5f6d566 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -4,6 +4,7 @@ from datetime import datetime, timedelta import numpy as np +import pyarrow as pa import pytest import xarray as xr from zarr.storage import MemoryStore @@ -21,6 +22,7 @@ XGrid, ) from parcels._core.particle import Particle, create_particle_data, get_default_particle +from parcels._core.particlefile import _get_schema from parcels._core.utils.time import TimeInterval, timedelta_to_float from parcels._datasets.structured.generated import peninsula_dataset from parcels._datasets.structured.generic import datasets @@ -44,6 +46,7 @@ def fieldset() -> FieldSet: # TODO v4: Move into a `conftest.py` file and remov ) +@pytest.mark.uses_old_zarr def test_metadata(fieldset, tmp_zarrfile): pset = ParticleSet(fieldset, pclass=Particle, lon=0, lat=0) @@ -54,6 +57,7 @@ def test_metadata(fieldset, 
tmp_zarrfile): assert ds.attrs["parcels_kernels"].lower() == "DoNothing".lower() +@pytest.mark.uses_old_zarr def test_pfile_array_write_zarr_memorystore(fieldset): """Check that writing to a Zarr MemoryStore works.""" npart = 10 @@ -72,6 +76,7 @@ def test_pfile_array_write_zarr_memorystore(fieldset): assert ds.sizes["trajectory"] == npart +@pytest.mark.uses_old_zarr def test_write_fieldset_without_time(tmp_zarrfile): ds = peninsula_dataset() # DataSet without time assert "time" not in ds.dims @@ -87,6 +92,7 @@ def test_write_fieldset_without_time(tmp_zarrfile): assert ds.time.values[0, 1] == np.timedelta64(1, "s") +@pytest.mark.uses_old_zarr def test_pfile_array_remove_particles(fieldset, tmp_zarrfile): npart = 10 pset = ParticleSet( @@ -108,6 +114,7 @@ def test_pfile_array_remove_particles(fieldset, tmp_zarrfile): assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) +@pytest.mark.uses_old_zarr @pytest.mark.parametrize("chunks_obs", [1, None]) def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_zarrfile): npart = 10 @@ -135,6 +142,7 @@ def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_zarrfile): assert np.all(np.isnan(ds["time"][:, 1:])) +@pytest.mark.uses_old_zarr def test_variable_write_double(fieldset, tmp_zarrfile): def Update_lon(particles, fieldset): # pragma: no cover particles.dlon += 0.1 @@ -150,6 +158,7 @@ def Update_lon(particles, fieldset): # pragma: no cover assert isinstance(lons.values[0, 0], np.float64) +@pytest.mark.uses_old_zarr def test_write_dtypes_pfile(fieldset, tmp_zarrfile): dtypes = [ np.float32, @@ -226,6 +235,7 @@ def IncrLon(particles, fieldset): # pragma: no cover assert filesize < 1024 * 65 # test that chunking leads to filesize less than 65KB +@pytest.mark.uses_old_zarr def test_file_warnings(fieldset, tmp_zarrfile): pset = ParticleSet(fieldset, lon=[0, 0], lat=[0, 0], time=[np.timedelta64(0, "s"), np.timedelta64(1, "s")]) pfile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(2, 
"s")) @@ -250,6 +260,7 @@ def test_outputdt_types(outputdt, expectation, tmp_zarrfile): assert pfile.outputdt == timedelta_to_float(outputdt) +@pytest.mark.uses_old_zarr def test_write_timebackward(fieldset, tmp_zarrfile): release_time = fieldset.time_interval.left + [np.timedelta64(i + 1, "s") for i in range(3)] pset = ParticleSet(fieldset, lat=[0, 1, 2], lon=[0, 0, 0], time=release_time) @@ -322,6 +333,7 @@ def SampleP(particles, fieldset): # pragma: no cover assert fieldset.U.grid.lat[yi] <= lat < fieldset.U.grid.lat[yi + 1] +@pytest.mark.uses_old_zarr @pytest.mark.parametrize("outputdt", [np.timedelta64(1, "s"), np.timedelta64(2, "s"), np.timedelta64(3, "s")]) def test_time_is_age(fieldset, tmp_zarrfile, outputdt): # Test that particle age is same as time - initial_time @@ -346,6 +358,7 @@ def IncreaseAge(particles, fieldset): # pragma: no cover np.testing.assert_equal(age, ds_timediff) +@pytest.mark.uses_old_zarr def test_reset_dt(fieldset, tmp_zarrfile): # Assert that p.dt gets reset when a write_time is not a multiple of dt # for p.dt=0.02 to reach outputdt=0.05 and endtime=0.1, the steps should be [0.2, 0.2, 0.1, 0.2, 0.2, 0.1], resulting in 6 kernel executions @@ -362,6 +375,7 @@ def Update_lon(particles, fieldset): # pragma: no cover assert np.allclose(pset.lon, 0.6) +@pytest.mark.uses_old_zarr def test_correct_misaligned_outputdt_dt(fieldset, tmp_zarrfile): """Testing that outputdt does not need to be a multiple of dt.""" @@ -398,6 +412,7 @@ def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwarg return ds +@pytest.mark.uses_old_zarr def test_pset_execute_outputdt_forwards(fieldset): """Testing output data dt matches outputdt in forward time.""" outputdt = timedelta(hours=1) @@ -409,6 +424,7 @@ def test_pset_execute_outputdt_forwards(fieldset): assert np.all(ds.isel(trajectory=0).time.diff(dim="obs").values == np.timedelta64(outputdt)) +@pytest.mark.uses_old_zarr def test_pset_execute_output_time_forwards(fieldset): """Testing 
output times start at initial time and end at initial time + runtime.""" outputdt = np.timedelta64(1, "h") @@ -423,6 +439,7 @@ def test_pset_execute_output_time_forwards(fieldset): ) +@pytest.mark.uses_old_zarr def test_pset_execute_outputdt_backwards(fieldset): """Testing output data dt matches outputdt in backwards time.""" outputdt = timedelta(hours=1) @@ -434,6 +451,7 @@ def test_pset_execute_outputdt_backwards(fieldset): assert np.all(file_outputdt == np.timedelta64(-outputdt)) +@pytest.mark.uses_old_zarr def test_pset_execute_outputdt_backwards_fieldset_timevarying(): """test_pset_execute_outputdt_backwards() still passed despite #1722 as it doesn't account for time-varying fields, which for some reason #1722 @@ -469,6 +487,7 @@ def test_particlefile_init_invalid(tmp_store): # TODO: Add test for read only s ParticleFile(tmp_store, outputdt=np.timedelta64(1, "s"), chunks=1) +@pytest.mark.uses_old_zarr def test_particlefile_write_particle_data(tmp_store): nparticles = 100 @@ -535,3 +554,47 @@ def Update_lon(particles, fieldset): # pragma: no cover # For pytest purposes, we need to reset to original status pset.set_variable_write_status("z", True) pset.set_variable_write_status("lat", True) + + +@pytest.mark.parametrize( + "particle", + [ + Particle, + parcels.ParticleClass( + variables=[ + Variable( + "lon", + dtype=np.float32, + attrs={"standard_name": "longitude", "units": "degrees_east", "axis": "X"}, + ), + Variable( + "lat", + dtype=np.float32, + attrs={"standard_name": "latitude", "units": "degrees_north", "axis": "Y"}, + ), + Variable( + "z", + dtype=np.float32, + attrs={"standard_name": "vertical coordinate", "units": "m", "positive": "down"}, + ), + ] + ), + ], +) +def test_particle_schema(particle): + s = _get_schema(particle, {}) + + written_variables = [v for v in particle.variables if v.to_write] + + assert len(s.names) == len(written_variables), ( + "Number of particles in the output schema should be the same as the writable variables in the 
ParticleClass object." + ) + + for variable, pyarrow_field in zip( + written_variables, + s, + strict=False, + ): + assert variable.name == pyarrow_field.name + assert variable.attrs == {k.decode(): v.decode() for k, v in pyarrow_field.metadata.items()} + assert pa.from_numpy_dtype(variable.dtype) == pyarrow_field.type diff --git a/tests/test_uxadvection.py b/tests/test_uxadvection.py index 3f27536f8..95e517140 100644 --- a/tests/test_uxadvection.py +++ b/tests/test_uxadvection.py @@ -11,6 +11,7 @@ ) +@pytest.mark.uses_old_zarr @pytest.mark.parametrize("integrator", [AdvectionEE, AdvectionRK2, AdvectionRK4]) def test_ux_constant_flow_face_centered_2D(integrator, tmp_zarrfile): ds = datasets_unstructured["ux_constant_flow_face_centered_2D"] From 0218c28f4d8f13af0ce9946c52915693322ab3d7 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 15:41:28 +0200 Subject: [PATCH 03/34] Fix parquet writing --- pyproject.toml | 2 +- src/parcels/_core/particlefile.py | 225 ++++++------------------------ src/parcels/_core/particleset.py | 9 +- src/parcels/_reprs.py | 7 +- tests/conftest.py | 5 + tests/test_advection.py | 11 +- 6 files changed, 64 insertions(+), 195 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0d4d56784..d0e6f7ba1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ markers = [ # can be skipped by doing `pytest -m "not slow"` etc. 
"v4alpha: failing tests that should work for v4alpha", "v4future: failing tests that should work for a future release of v4", "v4remove: failing tests that should probably be removed later", - "uses_old_zarr: tests that need to be migrated to the new particleset format" + "uses_old_zarr: tests that need to be migrated to the new particleset format", ] filterwarnings = [ diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 0d4e0d53b..125a925c7 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -2,16 +2,14 @@ from __future__ import annotations -import os from datetime import datetime, timedelta from pathlib import Path -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Any, Literal import cftime import numpy as np -import xarray as xr -import zarr -from zarr.storage import DirectoryStore +import pyarrow as pa +import pyarrow.parquet as pq import parcels from parcels._core.particle import ParticleClass @@ -25,20 +23,19 @@ __all__ = ["ParticleFile"] -_DATATYPES_TO_FILL_VALUES = { - np.dtype(np.float16): np.nan, - np.dtype(np.float32): np.nan, - np.dtype(np.float64): np.nan, - np.dtype(np.bool_): np.iinfo(np.int8).max, - np.dtype(np.int8): np.iinfo(np.int8).max, - np.dtype(np.int16): np.iinfo(np.int16).max, - np.dtype(np.int32): np.iinfo(np.int32).max, - np.dtype(np.int64): np.iinfo(np.int64).min, - np.dtype(np.uint8): np.iinfo(np.uint8).max, - np.dtype(np.uint16): np.iinfo(np.uint16).max, - np.dtype(np.uint32): np.iinfo(np.uint32).max, - np.dtype(np.uint64): np.iinfo(np.uint64).max, -} + +def _get_schema(particle: parcels.ParticleClass, file_metadata: dict[Any, Any]) -> pa.Schema: + return pa.schema( + [ + pa.field( + v.name, + pa.from_numpy_dtype(v.dtype), + metadata=v.attrs, + ) + for v in _get_vars_to_write(particle) + ], + metadata=file_metadata.copy(), + ) class ParticleFile: @@ -54,10 +51,6 @@ class ParticleFile: Interval which dictates the update frequency of 
file output while ParticleFile is given as an argument of ParticleSet.execute() It is either a numpy.timedelta64, a datimetime.timedelta object or a positive float (in seconds). - chunks : - Tuple (trajs, obs) to control the size of chunks in the zarr output. - create_new_zarrfile : bool - Whether to create a new file. Default is True Returns ------- @@ -65,7 +58,7 @@ class ParticleFile: ParticleFile object that can be used to write particle data to file """ - def __init__(self, store, outputdt, chunks=None, create_new_zarrfile=True): + def __init__(self, path: Path, outputdt): if not isinstance(outputdt, (np.timedelta64, timedelta, float)): raise ValueError( f"Expected outputdt to be a np.timedelta64, datetime.timedelta or float (in seconds), got {type(outputdt)}" @@ -78,21 +71,19 @@ def __init__(self, store, outputdt, chunks=None, create_new_zarrfile=True): self._outputdt = outputdt - _assert_valid_chunks_tuple(chunks) - self._chunks = chunks + self._path = Path( + path + ) # TODO v4: Consider https://arrow.apache.org/docs/python/getstarted.html#working-with-large-data - though a significant question becomes how to partition, perhaps using a particle variable "partition"? + self._writer: pq.ParquetWriter | None = None + if path.exists(): + # TODO: Add logic for recovering/appending to existing parquet file + raise ValueError(f"{path=!r} already exists. Either delete this file or use a path that doesn't exist.") + if not path.parent.exists(): + raise ValueError(f"Folder location for {path=!r} does not exist. Create the folder location first.") + self._maxids = 0 self._pids_written = {} - self.metadata = {} - self._create_new_zarrfile = create_new_zarrfile - - if not isinstance(store, zarr.storage.Store): - store = _get_store_from_pathlike(store) - - self._store = store - - # TODO v4: Enable once updating to zarr v3 - # if store.read_only: - # raise ValueError(f"Store {store} is read-only. 
Please provide a writable store.") + self.extra_metadata = {} # TODO v4: Add check that if create_new_zarrfile is False, the store already exists @@ -100,7 +91,7 @@ def __repr__(self) -> str: return particlefile_repr(self) def set_metadata(self, parcels_grid_mesh: Literal["spherical", "flat"]): - self.metadata.update( + self.extra_metadata.update( { "feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", @@ -115,31 +106,8 @@ def outputdt(self): return self._outputdt @property - def chunks(self): - return self._chunks - - @property - def store(self): - return self._store - - @property - def create_new_zarrfile(self): - return self._create_new_zarrfile - - def _extend_zarr_dims(self, Z, store, dtype, axis): # noqa: N803 - if axis == 1: - a = np.full((Z.shape[0], self.chunks[1]), _DATATYPES_TO_FILL_VALUES[dtype], dtype=dtype) - obs = zarr.group(store=store, overwrite=False)["obs"] - if len(obs) == Z.shape[1]: - obs.append(np.arange(self.chunks[1]) + obs[-1] + 1) - else: - extra_trajs = self._maxids - Z.shape[0] - if len(Z.shape) == 2: - a = np.full((extra_trajs, Z.shape[1]), _DATATYPES_TO_FILL_VALUES[dtype], dtype=dtype) - else: - a = np.full((extra_trajs,), _DATATYPES_TO_FILL_VALUES[dtype], dtype=dtype) - Z.append(a, axis=axis) - zarr.consolidate_metadata(store) + def path(self): + return self._path def write(self, pset: ParticleSet, time, indices=None): """Write all data from one time step to the zarr file, @@ -156,125 +124,35 @@ def write(self, pset: ParticleSet, time, indices=None): time_interval = pset.fieldset.time_interval particle_data = pset._data - self._write_particle_data( - particle_data=particle_data, pclass=pclass, time_interval=time_interval, time=time, indices=indices - ) + if self._writer is None: + assert not self.path.exists(), "If the file exists, the writer should already be set" + self._writer = pq.ParquetWriter(self.path, _get_schema(pclass, self.extra_metadata)) - def _write_particle_data(self, *, particle_data, pclass, time_interval, 
time, indices=None): - # if pset._data._ncount == 0: - # warnings.warn( - # f"ParticleSet is empty on writing as array at time {time:g}", - # RuntimeWarning, - # stacklevel=2, - # ) - # return if isinstance(time, (np.timedelta64, np.datetime64)): time = timedelta_to_float(time - time_interval.left) - nparticles = len(particle_data["trajectory"]) vars_to_write = _get_vars_to_write(pclass) if indices is None: indices_to_write = _to_write_particles(particle_data, time) else: indices_to_write = indices + + self._writer.write_table( + pa.table({v.name: pa.array(particle_data[v.name][indices_to_write]) for v in vars_to_write}), + ) - if len(indices_to_write) == 0: - return - - pids = particle_data["trajectory"][indices_to_write] - to_add = sorted(set(pids) - set(self._pids_written.keys())) - for i, pid in enumerate(to_add): - self._pids_written[pid] = self._maxids + i - ids = np.array([self._pids_written[p] for p in pids], dtype=int) - self._maxids = len(self._pids_written) - - once_ids = np.where(particle_data["obs_written"][indices_to_write] == 0)[0] - if len(once_ids) > 0: - ids_once = ids[once_ids] - indices_to_write_once = indices_to_write[once_ids] - - store = self.store - if self.create_new_zarrfile: - if self.chunks is None: - self._chunks = (nparticles, 1) - if (self._maxids > len(ids)) or (self._maxids > self.chunks[0]): - arrsize = (self._maxids, self.chunks[1]) - else: - arrsize = (len(ids), self.chunks[1]) - ds = xr.Dataset( - attrs=self.metadata, - coords={"trajectory": ("trajectory", pids), "obs": ("obs", np.arange(arrsize[1], dtype=np.int32))}, - ) - attrs = _create_variables_attribute_dict(pclass, time_interval) - obs = np.zeros((self._maxids), dtype=np.int32) - for var in vars_to_write: - if var.name not in ["trajectory"]: # because 'trajectory' is written as coordinate - if var.to_write == "once": - data = np.full( - (arrsize[0],), - _DATATYPES_TO_FILL_VALUES[var.dtype], - dtype=var.dtype, - ) - data[ids_once] = 
particle_data[var.name][indices_to_write_once] - dims = ["trajectory"] - else: - data = np.full(arrsize, _DATATYPES_TO_FILL_VALUES[var.dtype], dtype=var.dtype) - data[ids, 0] = particle_data[var.name][indices_to_write] - dims = ["trajectory", "obs"] - ds[var.name] = xr.DataArray(data=data, dims=dims, attrs=attrs[var.name]) - ds[var.name].encoding["chunks"] = self.chunks[0] if var.to_write == "once" else self.chunks - # ds.to_zarr(store, mode="w") - self._create_new_zarrfile = False - else: - pass - # Z = zarr.group(store=store, overwrite=False) - # obs = particle_data["obs_written"][indices_to_write] - # for var in vars_to_write: - # if self._maxids > Z[var.name].shape[0]: - # self._extend_zarr_dims(Z[var.name], store, dtype=var.dtype, axis=0) - # if var.to_write == "once": - # if len(once_ids) > 0: - # Z[var.name].vindex[ids_once] = particle_data[var.name][indices_to_write_once] - # else: - # if max(obs) >= Z[var.name].shape[1]: - # self._extend_zarr_dims(Z[var.name], store, dtype=var.dtype, axis=1) - # Z[var.name].vindex[ids, obs] = particle_data[var.name][indices_to_write] - - particle_data["obs_written"][indices_to_write] = obs + 1 - - -def _get_store_from_pathlike(path: Path | str) -> DirectoryStore: - path = str(Path(path)) # Ensure valid path, and convert to string - extension = os.path.splitext(path)[1] - if extension != ".zarr": - raise ValueError(f"ParticleFile name must end with '.zarr' extension. Got path {path!r}.") + # if len(indices_to_write) == 0: # TODO: Remove this? + # return - return DirectoryStore(path) + def close(self): + if self._writer is not None: + self._writer.close() + self._writer = None def _get_vars_to_write(particle: ParticleClass) -> list[Variable]: return [v for v in particle.variables if v.to_write is not False] -def _create_variables_attribute_dict(particle: ParticleClass, time_interval: TimeInterval) -> dict: - """Creates the dictionary with variable attributes. 
- - Notes - ----- - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - """ - attrs = {} - - vars = [var for var in particle.variables if var.to_write is not False] - for var in vars: - fill_value = {"_FillValue": _DATATYPES_TO_FILL_VALUES[var.dtype]} - - attrs[var.name] = {**var.attrs, **fill_value} - - attrs["time"].update(_get_calendar_and_units(time_interval)) - - return attrs - - def _to_write_particles(particle_data, time): """Return the Particles that need to be written at time: if particle.time is between time-dt/2 and time+dt (/2)""" return np.where( @@ -299,7 +177,7 @@ def _to_write_particles(particle_data, time): )[0] -def _get_calendar_and_units(time_interval: TimeInterval) -> dict[str, str]: +def _get_calendar_and_units(time_interval: TimeInterval) -> dict[str, str]: # TODO: Remove? calendar = None units = "seconds" if time_interval: @@ -316,16 +194,3 @@ def _get_calendar_and_units(time_interval: TimeInterval) -> dict[str, str]: attrs["calendar"] = calendar return attrs - - -def _assert_valid_chunks_tuple(chunks): - e = ValueError(f"chunks must be a tuple of integers with length 2, got {chunks=!r} instead.") - if chunks is None: - return - - if not isinstance(chunks, tuple): - raise e - if len(chunks) != 2: - raise e - if not all(isinstance(c, int) for c in chunks): - raise e diff --git a/src/parcels/_core/particleset.py b/src/parcels/_core/particleset.py index 5483ffbe4..e4ecb252b 100644 --- a/src/parcels/_core/particleset.py +++ b/src/parcels/_core/particleset.py @@ -20,7 +20,7 @@ ) from parcels._core.warnings import ParticleSetWarning from parcels._logger import logger -from parcels._reprs import _format_zarr_output_location, particleset_repr +from parcels._reprs import particleset_repr __all__ = ["ParticleSet"] @@ -395,7 +395,7 @@ def execute( if output_file is not None: output_file.set_metadata(self.fieldset.gridset[0]._mesh) - output_file.metadata["parcels_kernels"] = self._kernel.funcname + 
output_file.extra_metadata["parcels_kernels"] = self._kernel.funcname dt, sign_dt = _convert_dt_to_float(dt) self._data["dt"][:] = dt @@ -415,7 +415,7 @@ def execute( # Set up pbar if output_file: - logger.info(f"Output files are stored in {_format_zarr_output_location(output_file.store)}") + logger.info(f"Output files are stored in {output_file.path}") if verbose_progress: pbar = tqdm(total=end_time - start_time, file=sys.stdout) @@ -451,6 +451,9 @@ def execute( time = next_time + if output_file is not None: + output_file.close() + if verbose_progress: pbar.close() diff --git a/src/parcels/_reprs.py b/src/parcels/_reprs.py index ad6d0cca2..34b6814a0 100644 --- a/src/parcels/_reprs.py +++ b/src/parcels/_reprs.py @@ -128,7 +128,7 @@ def timeinterval_repr(ti: Any) -> str: def particlefile_repr(pfile: Any) -> str: """Return a pretty repr for ParticleFile""" out = f"""<{type(pfile).__name__}> - store : {_format_zarr_output_location(pfile.store)} + path : {pfile.path} outputdt : {pfile.outputdt!r} chunks : {pfile.chunks!r} create_new_zarrfile : {pfile.create_new_zarrfile!r} @@ -178,11 +178,6 @@ def _format_list_items_multiline(items: list[str] | dict, level: int = 1, with_b return "\n".join([textwrap.indent(e, indentation_str) for e in entries]) -def _format_zarr_output_location(zarr_obj): - if isinstance(zarr_obj, DirectoryStore): - return zarr_obj.path - return repr(zarr_obj) - def is_builtin_object(obj): return obj.__class__.__module__ == "builtins" diff --git a/tests/conftest.py b/tests/conftest.py index 82020c37e..3308d7e3e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,3 +11,8 @@ def tmp_zarrfile(tmp_path, request): @pytest.fixture def tmp_store(): return MemoryStore() + +@pytest.fixture +def tmp_parquet(tmp_path): + return tmp_path / 'tmp.parquet' + diff --git a/tests/test_advection.py b/tests/test_advection.py index 06e46e231..365e9e6f4 100644 --- a/tests/test_advection.py +++ b/tests/test_advection.py @@ -4,6 +4,7 @@ import parcels import 
parcels.tutorial +import pandas as pd from parcels import ( Field, FieldSet, @@ -60,8 +61,7 @@ def test_advection_zonal(mesh, npart=10): np.testing.assert_allclose(pset.lat, startlat, atol=1e-5) -@pytest.mark.uses_old_zarr -def test_advection_zonal_with_particlefile(tmp_store): +def test_advection_zonal_with_particlefile(tmp_parquet): """Particles at high latitude move geographically faster due to the pole correction.""" npart = 10 ds = simple_UV_dataset(mesh="flat") @@ -69,12 +69,13 @@ def test_advection_zonal_with_particlefile(tmp_store): fieldset = FieldSet.from_sgrid_conventions(ds, mesh="flat") pset = ParticleSet(fieldset, lon=np.zeros(npart) + 20.0, lat=np.linspace(0, 80, npart)) - pfile = ParticleFile(tmp_store, outputdt=np.timedelta64(30, "m")) + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(30, "m")) pset.execute(AdvectionRK4, runtime=np.timedelta64(2, "h"), dt=np.timedelta64(15, "m"), output_file=pfile) assert (np.diff(pset.lon) < 1.0e-4).all() - ds = xr.open_zarr(tmp_store) - np.testing.assert_allclose(ds.isel(obs=-1).lon.values, pset.lon) + df = pd.read_parquet(tmp_parquet) + final_time = df["time"].max() + np.testing.assert_allclose(df[df["time"] == final_time]["lon"].values, pset.lon, atol=1e-5) def periodicBC(particles, fieldset): From 4e7de3eab2af85c3c33e2d38fc1872606db5d601 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 19:48:53 +0200 Subject: [PATCH 04/34] Remove test_vriable_write_double From bc653f1ec5ac2691572aa22d2c802a4885224a71 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 19:08:16 +0200 Subject: [PATCH 05/34] Fix all "uses_old_zarr" tests --- tests-v3/test_advection.py | 6 +- tests-v3/test_fieldset_sampling.py | 6 +- tests-v3/test_particlesets.py | 6 +- tests/conftest.py | 2 +- tests/test_fieldset.py | 10 +-- tests/test_particlefile.py | 112 ++++++++++++++--------------- tests/test_uxadvection.py | 10 
+-- 7 files changed, 76 insertions(+), 76 deletions(-) diff --git a/tests-v3/test_advection.py b/tests-v3/test_advection.py index 3d8f06bac..77abee900 100644 --- a/tests-v3/test_advection.py +++ b/tests-v3/test_advection.py @@ -79,7 +79,7 @@ def test_analyticalAgrid(): @pytest.mark.parametrize("v", [1, -0.3, 0, -1]) @pytest.mark.parametrize("w", [None, 1, -0.3, 0, -1]) @pytest.mark.parametrize("direction", [1, -1]) -def test_uniform_analytical(u, v, w, direction, tmp_zarrfile): +def test_uniform_analytical(u, v, w, direction, tmp_parquet): lon = np.arange(0, 15, dtype=np.float32) lat = np.arange(0, 15, dtype=np.float32) if w is not None: @@ -99,14 +99,14 @@ def test_uniform_analytical(u, v, w, direction, tmp_zarrfile): x0, y0, z0 = 6.1, 6.2, 20 pset = ParticleSet(fieldset, pclass=Particle, lon=x0, lat=y0, depth=z0) - outfile = pset.ParticleFile(name=tmp_zarrfile, outputdt=1, chunks=(1, 1)) + outfile = pset.ParticleFile(name=tmp_parquet, outputdt=1, chunks=(1, 1)) pset.execute(AdvectionAnalytical, runtime=4, dt=direction, output_file=outfile) assert np.abs(pset.lon - x0 - pset.time * u) < 1e-6 assert np.abs(pset.lat - y0 - pset.time * v) < 1e-6 if w is not None: assert np.abs(pset.depth - z0 - pset.time * w) < 1e-4 - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) times = (direction * ds["time"][:]).values.astype("timedelta64[s]")[0] timeref = np.arange(1, 5).astype("timedelta64[s]") assert np.allclose(times, timeref, atol=np.timedelta64(1, "ms")) diff --git a/tests-v3/test_fieldset_sampling.py b/tests-v3/test_fieldset_sampling.py index 291c27b88..5f25e355c 100644 --- a/tests-v3/test_fieldset_sampling.py +++ b/tests-v3/test_fieldset_sampling.py @@ -773,7 +773,7 @@ def test_multiple_grid_addlater_error(): assert fail -def test_fieldset_sampling_updating_order(tmp_zarrfile): +def test_fieldset_sampling_updating_order(tmp_parquet): def calc_p(t, y, x): return 10 * t + x + 0.2 * y @@ -805,10 +805,10 @@ def SampleP(particle, fieldset, time): # pragma: no 
cover kernels = [AdvectionRK4, SampleP] - pfile = pset.ParticleFile(tmp_zarrfile, outputdt=1) + pfile = pset.ParticleFile(tmp_parquet, outputdt=1) pset.execute(kernels, endtime=1, dt=1, output_file=pfile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) for t in range(len(ds["obs"])): for i in range(len(ds["trajectory"])): assert np.isclose( diff --git a/tests-v3/test_particlesets.py b/tests-v3/test_particlesets.py index ed884f595..5c0f2495f 100644 --- a/tests-v3/test_particlesets.py +++ b/tests-v3/test_particlesets.py @@ -39,7 +39,7 @@ def test_pset_create_list_with_customvariable(fieldset): @pytest.mark.parametrize("restart", [True, False]) -def test_pset_create_fromparticlefile(fieldset, restart, tmp_zarrfile): +def test_pset_create_fromparticlefile(fieldset, restart, tmp_parquet): lon = np.linspace(0, 1, 10, dtype=np.float32) lat = np.linspace(1, 0, 10, dtype=np.float32) @@ -48,7 +48,7 @@ def test_pset_create_fromparticlefile(fieldset, restart, tmp_zarrfile): TestParticle = TestParticle.add_variable("p3", np.float64, to_write="once") pset = ParticleSet(fieldset, lon=lon, lat=lat, depth=[4] * len(lon), pclass=TestParticle, p3=np.arange(len(lon))) - pfile = pset.ParticleFile(tmp_zarrfile, outputdt=1) + pfile = pset.ParticleFile(tmp_parquet, outputdt=1) def Kernel(particle, fieldset, time): # pragma: no cover particle.p = 2.0 @@ -58,7 +58,7 @@ def Kernel(particle, fieldset, time): # pragma: no cover pset.execute(Kernel, runtime=2, dt=1, output_file=pfile) pset_new = ParticleSet.from_particlefile( - fieldset, pclass=TestParticle, filename=tmp_zarrfile, restart=restart, repeatdt=1 + fieldset, pclass=TestParticle, filename=tmp_parquet, restart=restart, repeatdt=1 ) for var in ["lon", "lat", "depth", "time", "p", "p2", "p3"]: diff --git a/tests/conftest.py b/tests/conftest.py index 3308d7e3e..56bbfa480 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,7 @@ @pytest.fixture() -def tmp_zarrfile(tmp_path, request): +def 
tmp_parquet(tmp_path, request): test_name = request.node.name yield tmp_path / f"{test_name}-output.zarr" diff --git a/tests/test_fieldset.py b/tests/test_fieldset.py index e51d13f38..d9b8d6000 100644 --- a/tests/test_fieldset.py +++ b/tests/test_fieldset.py @@ -95,8 +95,8 @@ def test_fieldset_gridset(fieldset): assert len(fieldset.gridset) == 2 -@pytest.mark.uses_old_zarr -def test_fieldset_no_UV(tmp_zarrfile): + +def test_fieldset_no_UV(tmp_parquet): grid = XGrid.from_dataset(ds, mesh="flat") fieldset = FieldSet([Field("P", ds["U_A_grid"], grid, interp_method=XLinear)]) @@ -104,11 +104,11 @@ def SampleP(particles, fieldset): particles.dlon += fieldset.P[particles] pset = ParticleSet(fieldset, lon=0, lat=0) - ofile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(1, "s")) + ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(SampleP, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - ds_out = xr.open_zarr(tmp_zarrfile) - assert ds_out["lon"].shape == (1, 2) + df = xr.open_zarr(tmp_parquet) + assert len(df["lon"]) == 2 @pytest.mark.parametrize("ds", [pytest.param(ds, id=k) for k, ds in datasets_structured.items()]) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 5c5f6d566..67775fa30 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -46,18 +46,18 @@ def fieldset() -> FieldSet: # TODO v4: Move into a `conftest.py` file and remov ) -@pytest.mark.uses_old_zarr -def test_metadata(fieldset, tmp_zarrfile): + +def test_metadata(fieldset, tmp_parquet): pset = ParticleSet(fieldset, pclass=Particle, lon=0, lat=0) - ofile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(1, "s")) + ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) assert ds.attrs["parcels_kernels"].lower() == 
"DoNothing".lower() -@pytest.mark.uses_old_zarr + def test_pfile_array_write_zarr_memorystore(fieldset): """Check that writing to a Zarr MemoryStore works.""" npart = 10 @@ -76,8 +76,8 @@ def test_pfile_array_write_zarr_memorystore(fieldset): assert ds.sizes["trajectory"] == npart -@pytest.mark.uses_old_zarr -def test_write_fieldset_without_time(tmp_zarrfile): + +def test_write_fieldset_without_time(tmp_parquet): ds = peninsula_dataset() # DataSet without time assert "time" not in ds.dims grid = XGrid.from_dataset(ds, mesh="flat") @@ -85,15 +85,15 @@ def test_write_fieldset_without_time(tmp_zarrfile): pset = ParticleSet(fieldset, pclass=Particle, lon=0, lat=0) - ofile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(1, "s")) + ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) assert ds.time.values[0, 1] == np.timedelta64(1, "s") -@pytest.mark.uses_old_zarr -def test_pfile_array_remove_particles(fieldset, tmp_zarrfile): + +def test_pfile_array_remove_particles(fieldset, tmp_parquet): npart = 10 pset = ParticleSet( fieldset, @@ -102,21 +102,21 @@ def test_pfile_array_remove_particles(fieldset, tmp_zarrfile): lat=0.5 * np.ones(npart), time=fieldset.time_interval.left, ) - pfile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(1, "s")) + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset._data["time"][:] = 0 pfile.write(pset, time=fieldset.time_interval.left) pset.remove_indices(3) new_time = 86400 # s in a day pset._data["time"][:] = new_time pfile.write(pset, new_time) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) timearr = ds["time"][:] assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) -@pytest.mark.uses_old_zarr + @pytest.mark.parametrize("chunks_obs", [1, None]) -def test_pfile_array_remove_all_particles(fieldset, 
chunks_obs, tmp_zarrfile): +def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_parquet): npart = 10 pset = ParticleSet( fieldset, @@ -126,14 +126,14 @@ def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_zarrfile): time=fieldset.time_interval.left, ) chunks = (npart, chunks_obs) if chunks_obs else None - pfile = ParticleFile(tmp_zarrfile, chunks=chunks, outputdt=np.timedelta64(1, "s")) + pfile = ParticleFile(tmp_parquet, chunks=chunks, outputdt=np.timedelta64(1, "s")) pfile.write(pset, time=0) for _ in range(npart): pset.remove_indices(-1) pfile.write(pset, fieldset.time_interval.left + np.timedelta64(1, "D")) pfile.write(pset, fieldset.time_interval.left + np.timedelta64(2, "D")) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) np.testing.assert_allclose(ds["time"][:, 0] - fieldset.time_interval.left, np.timedelta64(0, "s")) if chunks_obs is not None: assert ds["time"][:].shape == chunks @@ -158,8 +158,8 @@ def Update_lon(particles, fieldset): # pragma: no cover assert isinstance(lons.values[0, 0], np.float64) -@pytest.mark.uses_old_zarr -def test_write_dtypes_pfile(fieldset, tmp_zarrfile): + +def test_write_dtypes_pfile(fieldset, tmp_parquet): dtypes = [ np.float32, np.float64, @@ -178,11 +178,11 @@ def test_write_dtypes_pfile(fieldset, tmp_zarrfile): MyParticle = Particle.add_variable(extra_vars) pset = ParticleSet(fieldset, pclass=MyParticle, lon=0, lat=0, time=fieldset.time_interval.left) - pfile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(1, "s")) + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pfile.write(pset, time=fieldset.time_interval.left) ds = xr.open_zarr( - tmp_zarrfile, mask_and_scale=False + tmp_parquet, mask_and_scale=False ) # Note masking issue at https://stackoverflow.com/questions/68460507/xarray-loading-int-data-as-float for d in dtypes: assert ds[f"v_{d.__name__}"].dtype == d @@ -196,7 +196,7 @@ def test_variable_written_once(): @pytest.mark.skip(reason="Pending 
ParticleFile refactor; see issue #2386") @pytest.mark.parametrize("dt", [-np.timedelta64(1, "s"), np.timedelta64(1, "s")]) @pytest.mark.parametrize("maxvar", [2, 4, 10]) -def test_pset_repeated_release_delayed_adding_deleting(fieldset, tmp_zarrfile, dt, maxvar): +def test_pset_repeated_release_delayed_adding_deleting(fieldset, tmp_parquet, dt, maxvar): """Tests that if particles are released and deleted based on age that resulting output file is correct.""" npart = 10 fieldset.add_constant("maxvar", maxvar) @@ -212,7 +212,7 @@ def test_pset_repeated_release_delayed_adding_deleting(fieldset, tmp_zarrfile, d pclass=MyParticle, time=fieldset.time_interval.left + [np.timedelta64(i + 1, "s") for i in range(npart)], ) - pfile = ParticleFile(tmp_zarrfile, outputdt=abs(dt), chunks=(1, 1)) + pfile = ParticleFile(tmp_parquet, outputdt=abs(dt), chunks=(1, 1)) def IncrLon(particles, fieldset): # pragma: no cover particles.sample_var += 1.0 @@ -225,20 +225,20 @@ def IncrLon(particles, fieldset): # pragma: no cover for _ in range(npart): pset.execute(IncrLon, dt=dt, runtime=np.timedelta64(1, "s"), output_file=pfile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) samplevar = ds["sample_var"][:] assert samplevar.shape == (npart, min(maxvar, npart + 1)) # test whether samplevar[:, k] = k for k in range(samplevar.shape[1]): assert np.allclose([p for p in samplevar[:, k] if np.isfinite(p)], k + 1) - filesize = os.path.getsize(str(tmp_zarrfile)) + filesize = os.path.getsize(str(tmp_parquet)) assert filesize < 1024 * 65 # test that chunking leads to filesize less than 65KB -@pytest.mark.uses_old_zarr -def test_file_warnings(fieldset, tmp_zarrfile): + +def test_file_warnings(fieldset, tmp_parquet): pset = ParticleSet(fieldset, lon=[0, 0], lat=[0, 0], time=[np.timedelta64(0, "s"), np.timedelta64(1, "s")]) - pfile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(2, "s")) + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(2, "s")) with 
pytest.warns(ParticleSetWarning, match="Some of the particles have a start time difference.*"): pset.execute(AdvectionRK4, runtime=3, dt=1, output_file=pfile) @@ -254,20 +254,20 @@ def test_file_warnings(fieldset, tmp_zarrfile): (-np.timedelta64(5, "s"), pytest.raises(ValueError)), ], ) -def test_outputdt_types(outputdt, expectation, tmp_zarrfile): +def test_outputdt_types(outputdt, expectation, tmp_parquet): with expectation: - pfile = ParticleFile(tmp_zarrfile, outputdt=outputdt) + pfile = ParticleFile(tmp_parquet, outputdt=outputdt) assert pfile.outputdt == timedelta_to_float(outputdt) -@pytest.mark.uses_old_zarr -def test_write_timebackward(fieldset, tmp_zarrfile): + +def test_write_timebackward(fieldset, tmp_parquet): release_time = fieldset.time_interval.left + [np.timedelta64(i + 1, "s") for i in range(3)] pset = ParticleSet(fieldset, lat=[0, 1, 2], lon=[0, 0, 0], time=release_time) - pfile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(1, "s")) + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(3, "s"), dt=-np.timedelta64(1, "s"), output_file=pfile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) trajs = ds["trajectory"][:] output_time = ds["time"][:].values @@ -280,7 +280,7 @@ def test_write_timebackward(fieldset, tmp_zarrfile): @pytest.mark.xfail @pytest.mark.v4alpha -def test_write_xiyi(fieldset, tmp_zarrfile): +def test_write_xiyi(fieldset, tmp_parquet): fieldset.U.data[:] = 1 # set a non-zero zonal velocity fieldset.add_field( Field(name="P", data=np.zeros((3, 20)), lon=np.linspace(0, 1, 20), lat=[-2, 0, 2], interp_method=XLinear) @@ -311,10 +311,10 @@ def SampleP(particles, fieldset): # pragma: no cover _ = fieldset.P[particles] # To trigger sampling of the P field pset = ParticleSet(fieldset, pclass=XiYiParticle, lon=[0, 0.2], lat=[0.2, 1]) - pfile = ParticleFile(tmp_zarrfile, outputdt=dt) + pfile = ParticleFile(tmp_parquet, outputdt=dt) pset.execute([SampleP, 
Get_XiYi, AdvectionRK4], endtime=10 * dt, dt=dt, output_file=pfile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) pxi0 = ds["pxi0"][:].values.astype(np.int32) pxi1 = ds["pxi1"][:].values.astype(np.int32) lons = ds["lon"][:].values @@ -333,9 +333,9 @@ def SampleP(particles, fieldset): # pragma: no cover assert fieldset.U.grid.lat[yi] <= lat < fieldset.U.grid.lat[yi + 1] -@pytest.mark.uses_old_zarr + @pytest.mark.parametrize("outputdt", [np.timedelta64(1, "s"), np.timedelta64(2, "s"), np.timedelta64(3, "s")]) -def test_time_is_age(fieldset, tmp_zarrfile, outputdt): +def test_time_is_age(fieldset, tmp_parquet, outputdt): # Test that particle age is same as time - initial_time npart = 10 @@ -346,11 +346,11 @@ def IncreaseAge(particles, fieldset): # pragma: no cover time = fieldset.time_interval.left + np.arange(npart) * np.timedelta64(1, "s") pset = ParticleSet(fieldset, pclass=AgeParticle, lon=npart * [0], lat=npart * [0], time=time) - ofile = ParticleFile(tmp_zarrfile, outputdt=outputdt) + ofile = ParticleFile(tmp_parquet, outputdt=outputdt) pset.execute(IncreaseAge, runtime=np.timedelta64(npart * 2, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) age = ds["age"][:].values.astype("timedelta64[s]") ds_timediff = np.zeros_like(age) for i in range(npart): @@ -358,8 +358,8 @@ def IncreaseAge(particles, fieldset): # pragma: no cover np.testing.assert_equal(age, ds_timediff) -@pytest.mark.uses_old_zarr -def test_reset_dt(fieldset, tmp_zarrfile): + +def test_reset_dt(fieldset, tmp_parquet): # Assert that p.dt gets reset when a write_time is not a multiple of dt # for p.dt=0.02 to reach outputdt=0.05 and endtime=0.1, the steps should be [0.2, 0.2, 0.1, 0.2, 0.2, 0.1], resulting in 6 kernel executions dt = np.timedelta64(20, "s") @@ -369,14 +369,14 @@ def Update_lon(particles, fieldset): # pragma: no cover particle = get_default_particle(np.float64) pset = ParticleSet(fieldset, 
pclass=particle, lon=[0], lat=[0]) - ofile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(50, "s")) + ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(50, "s")) pset.execute(Update_lon, runtime=5 * dt, dt=dt, output_file=ofile) assert np.allclose(pset.lon, 0.6) -@pytest.mark.uses_old_zarr -def test_correct_misaligned_outputdt_dt(fieldset, tmp_zarrfile): + +def test_correct_misaligned_outputdt_dt(fieldset, tmp_parquet): """Testing that outputdt does not need to be a multiple of dt.""" def Update_lon(particles, fieldset): # pragma: no cover @@ -384,10 +384,10 @@ def Update_lon(particles, fieldset): # pragma: no cover particle = get_default_particle(np.float64) pset = ParticleSet(fieldset, pclass=particle, lon=[0], lat=[0]) - ofile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(3, "s")) + ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(3, "s")) pset.execute(Update_lon, runtime=np.timedelta64(11, "s"), dt=np.timedelta64(2, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) assert np.allclose(ds.lon.values, [0, 3, 6, 9]) assert np.allclose(timedelta_to_float(ds.time.values - ds.time.values[0, 0]), [0, 3, 6, 9]) @@ -412,7 +412,7 @@ def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwarg return ds -@pytest.mark.uses_old_zarr + def test_pset_execute_outputdt_forwards(fieldset): """Testing output data dt matches outputdt in forward time.""" outputdt = timedelta(hours=1) @@ -424,7 +424,7 @@ def test_pset_execute_outputdt_forwards(fieldset): assert np.all(ds.isel(trajectory=0).time.diff(dim="obs").values == np.timedelta64(outputdt)) -@pytest.mark.uses_old_zarr + def test_pset_execute_output_time_forwards(fieldset): """Testing output times start at initial time and end at initial time + runtime.""" outputdt = np.timedelta64(1, "h") @@ -439,7 +439,7 @@ def test_pset_execute_output_time_forwards(fieldset): ) -@pytest.mark.uses_old_zarr + def 
test_pset_execute_outputdt_backwards(fieldset): """Testing output data dt matches outputdt in backwards time.""" outputdt = timedelta(hours=1) @@ -451,7 +451,7 @@ def test_pset_execute_outputdt_backwards(fieldset): assert np.all(file_outputdt == np.timedelta64(-outputdt)) -@pytest.mark.uses_old_zarr + def test_pset_execute_outputdt_backwards_fieldset_timevarying(): """test_pset_execute_outputdt_backwards() still passed despite #1722 as it doesn't account for time-varying fields, which for some reason #1722 @@ -487,7 +487,7 @@ def test_particlefile_init_invalid(tmp_store): # TODO: Add test for read only s ParticleFile(tmp_store, outputdt=np.timedelta64(1, "s"), chunks=1) -@pytest.mark.uses_old_zarr + def test_particlefile_write_particle_data(tmp_store): nparticles = 100 @@ -533,19 +533,19 @@ def test_pfile_write_custom_particle(): @pytest.mark.xfail( reason="set_variable_write_status should be removed - with Particle writing defined on the particle level. GH2186" ) -def test_pfile_set_towrite_False(fieldset, tmp_zarrfile): +def test_pfile_set_towrite_False(fieldset, tmp_parquet): npart = 10 pset = ParticleSet(fieldset, pclass=Particle, lon=np.linspace(0, 1, npart), lat=0.5 * np.ones(npart)) pset.set_variable_write_status("z", False) pset.set_variable_write_status("lat", False) - pfile = pset.ParticleFile(tmp_zarrfile, outputdt=1) + pfile = pset.ParticleFile(tmp_parquet, outputdt=1) def Update_lon(particles, fieldset): # pragma: no cover particles.dlon += 0.1 pset.execute(Update_lon, runtime=10, output_file=pfile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) assert "time" in ds assert "z" not in ds assert "lat" not in ds diff --git a/tests/test_uxadvection.py b/tests/test_uxadvection.py index 95e517140..04591019b 100644 --- a/tests/test_uxadvection.py +++ b/tests/test_uxadvection.py @@ -11,19 +11,19 @@ ) -@pytest.mark.uses_old_zarr + @pytest.mark.parametrize("integrator", [AdvectionEE, AdvectionRK2, AdvectionRK4]) -def 
test_ux_constant_flow_face_centered_2D(integrator, tmp_zarrfile): +def test_ux_constant_flow_face_centered_2D(integrator, tmp_parquet): ds = datasets_unstructured["ux_constant_flow_face_centered_2D"] T = np.timedelta64(3600, "s") dt = np.timedelta64(300, "s") fieldset = parcels.FieldSet.from_ugrid_conventions(ds, mesh="flat") pset = parcels.ParticleSet(fieldset, lon=[5.0], lat=[5.0]) - pfile = parcels.ParticleFile(store=tmp_zarrfile, outputdt=dt) + pfile = parcels.ParticleFile(store=tmp_parquet, outputdt=dt) pset.execute(integrator, runtime=T, dt=dt, output_file=pfile, verbose_progress=False) expected_lon = 8.6 np.testing.assert_allclose(pset.lon, expected_lon, atol=1e-5) - ds_out = xr.open_zarr(tmp_zarrfile) - np.testing.assert_allclose(ds_out["lon"][:, -1], expected_lon, atol=1e-5) + df = xr.open_zarr(tmp_parquet) + np.testing.assert_allclose(df["lon"][:, -1], expected_lon, atol=1e-5) From 5daec3bc131dfb1f2ee43e299c0d2b40b52cdb8a Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 19:33:49 +0200 Subject: [PATCH 06/34] Remove test_variable_write_double Covered by test_write_dtypes_pfile --- tests/test_particlefile.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 67775fa30..eeade646f 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -142,22 +142,6 @@ def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_parquet): assert np.all(np.isnan(ds["time"][:, 1:])) -@pytest.mark.uses_old_zarr -def test_variable_write_double(fieldset, tmp_zarrfile): - def Update_lon(particles, fieldset): # pragma: no cover - particles.dlon += 0.1 - - dt = np.timedelta64(1, "s") - particle = get_default_particle(np.float64) - pset = ParticleSet(fieldset, pclass=particle, lon=[0], lat=[0]) - ofile = ParticleFile(tmp_zarrfile, outputdt=dt) - pset.execute(Update_lon, runtime=np.timedelta64(10, "s"), dt=dt, 
output_file=ofile) - - ds = xr.open_zarr(tmp_zarrfile) - lons = ds["lon"][:] - assert isinstance(lons.values[0, 0], np.float64) - - def test_write_dtypes_pfile(fieldset, tmp_parquet): dtypes = [ From 2a07ced6996d5966a68ab005ca403128a8b6d0e5 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 19:25:35 +0200 Subject: [PATCH 07/34] Fixing tests --- src/parcels/_core/particlefile.py | 11 ++- tests-v3/test_advection.py | 1 + tests-v3/test_fieldset_sampling.py | 1 + tests/test_fieldset.py | 1 + tests/test_particlefile.py | 116 ++++++++--------------------- 5 files changed, 39 insertions(+), 91 deletions(-) diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 125a925c7..8d121e55d 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -9,6 +9,7 @@ import cftime import numpy as np import pyarrow as pa +from parcels._typing import PathLike import pyarrow.parquet as pq import parcels @@ -58,22 +59,24 @@ class ParticleFile: ParticleFile object that can be used to write particle data to file """ - def __init__(self, path: Path, outputdt): + def __init__(self, path: PathLike, outputdt): if not isinstance(outputdt, (np.timedelta64, timedelta, float)): raise ValueError( f"Expected outputdt to be a np.timedelta64, datetime.timedelta or float (in seconds), got {type(outputdt)}" ) outputdt = timedelta_to_float(outputdt) + path = Path(path) + + if path.suffix != ".parquet": + raise ValueError(f"ParticleFile data is stored in Parquet files - file extension must be '.parquet'. Got {path.suffix=!r}.") if outputdt <= 0: raise ValueError(f"outputdt must be positive/non-zero. Got {outputdt=!r}") self._outputdt = outputdt - self._path = Path( - path - ) # TODO v4: Consider https://arrow.apache.org/docs/python/getstarted.html#working-with-large-data - though a significant question becomes how to partition, perhaps using a particle variable "partition"? 
+ self._path = path # TODO v4: Consider https://arrow.apache.org/docs/python/getstarted.html#working-with-large-data - though a significant question becomes how to partition, perhaps using a particle variable "partition"? self._writer: pq.ParquetWriter | None = None if path.exists(): # TODO: Add logic for recovering/appending to existing parquet file diff --git a/tests-v3/test_advection.py b/tests-v3/test_advection.py index 77abee900..bdd7a4221 100644 --- a/tests-v3/test_advection.py +++ b/tests-v3/test_advection.py @@ -1,5 +1,6 @@ import numpy as np import pytest +import pandas as pd import xarray as xr from parcels import ( diff --git a/tests-v3/test_fieldset_sampling.py b/tests-v3/test_fieldset_sampling.py index 5f25e355c..176eedab1 100644 --- a/tests-v3/test_fieldset_sampling.py +++ b/tests-v3/test_fieldset_sampling.py @@ -3,6 +3,7 @@ from math import cos, pi import numpy as np +import pandas as pd import pytest import xarray as xr diff --git a/tests/test_fieldset.py b/tests/test_fieldset.py index d9b8d6000..8a64546f8 100644 --- a/tests/test_fieldset.py +++ b/tests/test_fieldset.py @@ -4,6 +4,7 @@ import cftime import numpy as np import pytest +import pandas as pd import xarray as xr from parcels import Field, ParticleFile, ParticleSet, VectorField, XGrid, convert diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index eeade646f..397ec6218 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -2,7 +2,8 @@ import tempfile from contextlib import nullcontext as does_not_raise from datetime import datetime, timedelta - +import pyarrow.parquet as pq +import pandas as pd import numpy as np import pyarrow as pa import pytest @@ -53,28 +54,8 @@ def test_metadata(fieldset, tmp_parquet): ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_parquet) - assert ds.attrs["parcels_kernels"].lower() 
== "DoNothing".lower() - - - -def test_pfile_array_write_zarr_memorystore(fieldset): - """Check that writing to a Zarr MemoryStore works.""" - npart = 10 - zarr_store = MemoryStore() - pset = ParticleSet( - fieldset, - pclass=Particle, - lon=np.linspace(0, 1, npart), - lat=0.5 * np.ones(npart), - time=fieldset.time_interval.left, - ) - pfile = ParticleFile(zarr_store, outputdt=np.timedelta64(1, "s")) - pfile.write(pset, time=fieldset.time_interval.left) - - ds = xr.open_zarr(zarr_store) - assert ds.sizes["trajectory"] == npart - + tab = pq.read_table(tmp_parquet) + assert tab.schema.metadata[b"parcels_kernels"].decode().lower() == "DoNothing".lower() def test_write_fieldset_without_time(tmp_parquet): @@ -88,12 +69,14 @@ def test_write_fieldset_without_time(tmp_parquet): ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_parquet) - assert ds.time.values[0, 1] == np.timedelta64(1, "s") - + df = pd.read_parquet(tmp_parquet) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") + assert df['time'][1] == np.timedelta64(1, "s") +@pytest.mark.xfail("Keep or remove? 
Introduced in 5d7dd6bba800baa0fe4bd38edfc17ca3e310062b ") def test_pfile_array_remove_particles(fieldset, tmp_parquet): + """If a particle from the middle of a particleset is removed, that writing doesn't crash""" npart = 10 pset = ParticleSet( fieldset, @@ -115,8 +98,7 @@ def test_pfile_array_remove_particles(fieldset, tmp_parquet): -@pytest.mark.parametrize("chunks_obs", [1, None]) -def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_parquet): +def test_pfile_array_remove_all_particles(fieldset, tmp_parquet): npart = 10 pset = ParticleSet( fieldset, @@ -125,21 +107,17 @@ def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_parquet): lat=0.5 * np.ones(npart), time=fieldset.time_interval.left, ) - chunks = (npart, chunks_obs) if chunks_obs else None - pfile = ParticleFile(tmp_parquet, chunks=chunks, outputdt=np.timedelta64(1, "s")) + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pfile.write(pset, time=0) for _ in range(npart): pset.remove_indices(-1) pfile.write(pset, fieldset.time_interval.left + np.timedelta64(1, "D")) pfile.write(pset, fieldset.time_interval.left + np.timedelta64(2, "D")) + pfile.close() - ds = xr.open_zarr(tmp_parquet) - np.testing.assert_allclose(ds["time"][:, 0] - fieldset.time_interval.left, np.timedelta64(0, "s")) - if chunks_obs is not None: - assert ds["time"][:].shape == chunks - else: - assert ds["time"][:].shape[0] == npart - assert np.all(np.isnan(ds["time"][:, 1:])) + df = pd.read_parquet(tmp_parquet) + # np.testing.assert_allclose(ds["time"][:, 0] - fieldset.time_interval.left, np.timedelta64(0, "s")) # TODO: Need to figure out how times work with parquet output (#2386) + assert df['trajectory'].nunique() == npart @@ -164,12 +142,11 @@ def test_write_dtypes_pfile(fieldset, tmp_parquet): pset = ParticleSet(fieldset, pclass=MyParticle, lon=0, lat=0, time=fieldset.time_interval.left) pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pfile.write(pset, 
time=fieldset.time_interval.left) + pfile.close() - ds = xr.open_zarr( - tmp_parquet, mask_and_scale=False - ) # Note masking issue at https://stackoverflow.com/questions/68460507/xarray-loading-int-data-as-float + tab = pq.read_table(tmp_parquet) for d in dtypes: - assert ds[f"v_{d.__name__}"].dtype == d + assert tab[f"v_{d.__name__}"].type == pa.from_numpy_dtype(d) def test_variable_written_once(): @@ -196,7 +173,7 @@ def test_pset_repeated_release_delayed_adding_deleting(fieldset, tmp_parquet, dt pclass=MyParticle, time=fieldset.time_interval.left + [np.timedelta64(i + 1, "s") for i in range(npart)], ) - pfile = ParticleFile(tmp_parquet, outputdt=abs(dt), chunks=(1, 1)) + pfile = ParticleFile(tmp_parquet, outputdt=abs(dt)) def IncrLon(particles, fieldset): # pragma: no cover particles.sample_var += 1.0 @@ -387,7 +364,7 @@ def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwarg ) with tempfile.TemporaryDirectory() as dir: - name = f"{dir}/test.zarr" + name = f"{dir}/tmp.parquet" output_file = ParticleFile(name, outputdt=outputdt) pset.execute(DoNothing, output_file=output_file, **execute_kwargs) @@ -455,59 +432,24 @@ def test_pset_execute_outputdt_backwards_fieldset_timevarying(): assert np.all(file_outputdt == np.timedelta64(-outputdt)), (file_outputdt, np.timedelta64(-outputdt)) -def test_particlefile_init(tmp_store): - ParticleFile(tmp_store, outputdt=np.timedelta64(1, "s"), chunks=(1, 3)) +def test_particlefile_init(tmp_parquet): + ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) -@pytest.mark.parametrize("name", ["store", "outputdt", "chunks", "create_new_zarrfile"]) -def test_particlefile_readonly_attrs(tmp_store, name): - pfile = ParticleFile(tmp_store, outputdt=np.timedelta64(1, "s"), chunks=(1, 3)) +@pytest.mark.parametrize("name", ["path", "outputdt"]) +def test_particlefile_readonly_attrs(tmp_parquet, name): + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) with pytest.raises(AttributeError, 
match="property .* of 'ParticleFile' object has no setter"): setattr(pfile, name, "something") -def test_particlefile_init_invalid(tmp_store): # TODO: Add test for read only store - with pytest.raises(ValueError, match="chunks must be a tuple"): - ParticleFile(tmp_store, outputdt=np.timedelta64(1, "s"), chunks=1) +def test_particlefile_init_invalid(tmp_path): + path = tmp_path / 'file.not-parquet' + with pytest.raises(ValueError, match="file extension must be '.parquet'"): + ParticleFile(path, outputdt=np.timedelta64(1, "s")) -def test_particlefile_write_particle_data(tmp_store): - nparticles = 100 - - pfile = ParticleFile(tmp_store, outputdt=np.timedelta64(1, "s"), chunks=(nparticles, 40)) - pclass = Particle - - left, right = np.datetime64("2019-05-30T12:00:00.000000000", "ns"), np.datetime64("2020-01-02", "ns") - time_interval = TimeInterval(left=left, right=right) - - initial_lon = np.linspace(0, 1, nparticles) - data = create_particle_data( - pclass=pclass, - nparticles=nparticles, - ngrids=4, - time_interval=time_interval, - initial={ - "time": np.full(nparticles, fill_value=0), - "lon": initial_lon, - "dt": np.full(nparticles, fill_value=1.0), - "trajectory": np.arange(nparticles), - }, - ) - np.testing.assert_array_equal(data["time"], 0) - pfile._write_particle_data( - particle_data=data, - pclass=pclass, - time_interval=time_interval, - time=left, - ) - ds = xr.open_zarr(tmp_store) - assert ds.time.dtype == "datetime64[ns]" - np.testing.assert_equal(ds["time"].isel(obs=0).values, left) - assert ds.sizes["trajectory"] == nparticles - np.testing.assert_allclose(ds["lon"].isel(obs=0).values, initial_lon) - - def test_pfile_write_custom_particle(): # Test the writing of a custom particle with variables that are to_write, some to_write once, and some not to_write # ? This is more of an integration test... Should it be housed here? 
From b8c547761a729ce0503a5986b7f5307ac472d737 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:11:50 +0200 Subject: [PATCH 08/34] More test fixing --- tests/test_particlefile.py | 64 +++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 397ec6218..a3fe47b00 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -47,7 +47,6 @@ def fieldset() -> FieldSet: # TODO v4: Move into a `conftest.py` file and remov ) - def test_metadata(fieldset, tmp_parquet): pset = ParticleSet(fieldset, pclass=Particle, lon=0, lat=0) @@ -71,10 +70,10 @@ def test_write_fieldset_without_time(tmp_parquet): df = pd.read_parquet(tmp_parquet) pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - assert df['time'][1] == np.timedelta64(1, "s") + assert df["time"][1] == np.timedelta64(1, "s") -@pytest.mark.xfail("Keep or remove? Introduced in 5d7dd6bba800baa0fe4bd38edfc17ca3e310062b ") +@pytest.mark.skip("Keep or remove? 
Introduced in 5d7dd6bba800baa0fe4bd38edfc17ca3e310062b ") def test_pfile_array_remove_particles(fieldset, tmp_parquet): """If a particle from the middle of a particleset is removed, that writing doesn't crash""" npart = 10 @@ -97,7 +96,6 @@ def test_pfile_array_remove_particles(fieldset, tmp_parquet): assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) - def test_pfile_array_remove_all_particles(fieldset, tmp_parquet): npart = 10 pset = ParticleSet( @@ -117,8 +115,7 @@ def test_pfile_array_remove_all_particles(fieldset, tmp_parquet): df = pd.read_parquet(tmp_parquet) # np.testing.assert_allclose(ds["time"][:, 0] - fieldset.time_interval.left, np.timedelta64(0, "s")) # TODO: Need to figure out how times work with parquet output (#2386) - assert df['trajectory'].nunique() == npart - + assert df["trajectory"].nunique() == npart def test_write_dtypes_pfile(fieldset, tmp_parquet): @@ -196,7 +193,6 @@ def IncrLon(particles, fieldset): # pragma: no cover assert filesize < 1024 * 65 # test that chunking leads to filesize less than 65KB - def test_file_warnings(fieldset, tmp_parquet): pset = ParticleSet(fieldset, lon=[0, 0], lat=[0, 0], time=[np.timedelta64(0, "s"), np.timedelta64(1, "s")]) pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(2, "s")) @@ -221,22 +217,22 @@ def test_outputdt_types(outputdt, expectation, tmp_parquet): assert pfile.outputdt == timedelta_to_float(outputdt) - def test_write_timebackward(fieldset, tmp_parquet): release_time = fieldset.time_interval.left + [np.timedelta64(i + 1, "s") for i in range(3)] pset = ParticleSet(fieldset, lat=[0, 1, 2], lon=[0, 0, 0], time=release_time) pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(3, "s"), dt=-np.timedelta64(1, "s"), output_file=pfile) - ds = xr.open_zarr(tmp_parquet) - trajs = ds["trajectory"][:] - - output_time = ds["time"][:].values + df = pd.read_parquet(tmp_parquet) - assert trajs.values.dtype == "int64" - assert 
np.all(np.diff(trajs.values) < 0) # all particles written in order of release - doutput_time = np.diff(output_time, axis=1) - assert np.all(doutput_time[~np.isnan(doutput_time)] < 0) # all times written in decreasing order + assert df["trajectory"].dtype == "int64" + assert bool( + df.groupby("trajectory") + .apply( + lambda x: (np.diff(x["time"]) < 0).all() # for each particle - set True if it has decreasing time + ) + .all() # ensure for all particles + ) @pytest.mark.xfail @@ -294,7 +290,6 @@ def SampleP(particles, fieldset): # pragma: no cover assert fieldset.U.grid.lat[yi] <= lat < fieldset.U.grid.lat[yi + 1] - @pytest.mark.parametrize("outputdt", [np.timedelta64(1, "s"), np.timedelta64(2, "s"), np.timedelta64(3, "s")]) def test_time_is_age(fieldset, tmp_parquet, outputdt): # Test that particle age is same as time - initial_time @@ -311,6 +306,7 @@ def IncreaseAge(particles, fieldset): # pragma: no cover pset.execute(IncreaseAge, runtime=np.timedelta64(npart * 2, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") ds = xr.open_zarr(tmp_parquet) age = ds["age"][:].values.astype("timedelta64[s]") ds_timediff = np.zeros_like(age) @@ -319,7 +315,6 @@ def IncreaseAge(particles, fieldset): # pragma: no cover np.testing.assert_equal(age, ds_timediff) - def test_reset_dt(fieldset, tmp_parquet): # Assert that p.dt gets reset when a write_time is not a multiple of dt # for p.dt=0.02 to reach outputdt=0.05 and endtime=0.1, the steps should be [0.2, 0.2, 0.1, 0.2, 0.2, 0.1], resulting in 6 kernel executions @@ -336,7 +331,6 @@ def Update_lon(particles, fieldset): # pragma: no cover assert np.allclose(pset.lon, 0.6) - def test_correct_misaligned_outputdt_dt(fieldset, tmp_parquet): """Testing that outputdt does not need to be a multiple of dt.""" @@ -348,9 +342,10 @@ def Update_lon(particles, fieldset): # pragma: no cover ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(3, "s")) 
pset.execute(Update_lon, runtime=np.timedelta64(11, "s"), dt=np.timedelta64(2, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_parquet) - assert np.allclose(ds.lon.values, [0, 3, 6, 9]) - assert np.allclose(timedelta_to_float(ds.time.values - ds.time.values[0, 0]), [0, 3, 6, 9]) + df = pd.read_parquet(tmp_parquet) + assert np.allclose(df['lon'].values, [0, 3, 6, 9]) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") + assert np.allclose(timedelta_to_float(df.time.values - df.time.values[0, 0]), [0, 3, 6, 9]) def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwargs, particle_class=Particle): @@ -368,10 +363,9 @@ def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwarg output_file = ParticleFile(name, outputdt=outputdt) pset.execute(DoNothing, output_file=output_file, **execute_kwargs) - ds = xr.open_zarr(name).load() - - return ds + df = pd.read_parquet(name) + return df def test_pset_execute_outputdt_forwards(fieldset): @@ -380,39 +374,37 @@ def test_pset_execute_outputdt_forwards(fieldset): runtime = timedelta(hours=5) dt = timedelta(minutes=5) - ds = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) - + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") assert np.all(ds.isel(trajectory=0).time.diff(dim="obs").values == np.timedelta64(outputdt)) - def test_pset_execute_output_time_forwards(fieldset): """Testing output times start at initial time and end at initial time + runtime.""" outputdt = np.timedelta64(1, "h") runtime = np.timedelta64(5, "h") dt = np.timedelta64(5, "m") - ds = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) - + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, 
dt=dt)) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") assert ( ds.time[0, 0].values == fieldset.time_interval.left and ds.time[0, -1].values == fieldset.time_interval.left + runtime ) - def test_pset_execute_outputdt_backwards(fieldset): """Testing output data dt matches outputdt in backwards time.""" outputdt = timedelta(hours=1) runtime = timedelta(days=2) dt = -timedelta(minutes=5) - ds = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values assert np.all(file_outputdt == np.timedelta64(-outputdt)) - def test_pset_execute_outputdt_backwards_fieldset_timevarying(): """test_pset_execute_outputdt_backwards() still passed despite #1722 as it doesn't account for time-varying fields, which for some reason #1722 @@ -427,7 +419,8 @@ def test_pset_execute_outputdt_backwards_fieldset_timevarying(): ds_fset = copernicusmarine_to_sgrid(fields=fields) fieldset = FieldSet.from_sgrid_conventions(ds_fset) - ds = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset) + df = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values assert np.all(file_outputdt == np.timedelta64(-outputdt)), (file_outputdt, np.timedelta64(-outputdt)) @@ -444,12 +437,11 @@ def test_particlefile_readonly_attrs(tmp_parquet, name): def test_particlefile_init_invalid(tmp_path): - path = tmp_path / 'file.not-parquet' + path = tmp_path / "file.not-parquet" with pytest.raises(ValueError, match="file extension must be 
'.parquet'"): ParticleFile(path, outputdt=np.timedelta64(1, "s")) - def test_pfile_write_custom_particle(): # Test the writing of a custom particle with variables that are to_write, some to_write once, and some not to_write # ? This is more of an integration test... Should it be housed here? From 2d438c5f7b9cad3e27264b51f30334d571328af3 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:41:07 +0200 Subject: [PATCH 09/34] Fix last tests --- tests/test_fieldset.py | 2 +- tests/test_uxadvection.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_fieldset.py b/tests/test_fieldset.py index 8a64546f8..c7131608d 100644 --- a/tests/test_fieldset.py +++ b/tests/test_fieldset.py @@ -108,7 +108,7 @@ def SampleP(particles, fieldset): ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(SampleP, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - df = xr.open_zarr(tmp_parquet) + df = pd.read_parquet(tmp_parquet) assert len(df["lon"]) == 2 diff --git a/tests/test_uxadvection.py b/tests/test_uxadvection.py index 04591019b..52509ba28 100644 --- a/tests/test_uxadvection.py +++ b/tests/test_uxadvection.py @@ -9,6 +9,7 @@ AdvectionRK2, AdvectionRK4, ) +import pandas as pd @@ -20,10 +21,10 @@ def test_ux_constant_flow_face_centered_2D(integrator, tmp_parquet): fieldset = parcels.FieldSet.from_ugrid_conventions(ds, mesh="flat") pset = parcels.ParticleSet(fieldset, lon=[5.0], lat=[5.0]) - pfile = parcels.ParticleFile(store=tmp_parquet, outputdt=dt) + pfile = parcels.ParticleFile(path=tmp_parquet, outputdt=dt) pset.execute(integrator, runtime=T, dt=dt, output_file=pfile, verbose_progress=False) expected_lon = 8.6 np.testing.assert_allclose(pset.lon, expected_lon, atol=1e-5) - df = xr.open_zarr(tmp_parquet) - np.testing.assert_allclose(df["lon"][:, -1], expected_lon, atol=1e-5) + df = pd.read_parquet(tmp_parquet) + 
np.testing.assert_allclose(df["lon"].iloc[-1], expected_lon, atol=1e-5) From 32a82fac4236a3824a7190d65b31b39f78cbbc00 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 20 Apr 2026 09:48:43 +0000 Subject: [PATCH 10/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/parcels/_core/particlefile.py | 10 ++++++---- src/parcels/_core/particleset.py | 2 +- src/parcels/_reprs.py | 2 -- tests/conftest.py | 4 ++-- tests/test_advection.py | 2 +- tests/test_fieldset.py | 3 +-- tests/test_particlefile.py | 12 ++++++------ tests/test_uxadvection.py | 4 +--- 8 files changed, 18 insertions(+), 21 deletions(-) diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 8d121e55d..c9cdfa62a 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -9,13 +9,13 @@ import cftime import numpy as np import pyarrow as pa -from parcels._typing import PathLike import pyarrow.parquet as pq import parcels from parcels._core.particle import ParticleClass from parcels._core.utils.time import timedelta_to_float from parcels._reprs import particlefile_repr +from parcels._typing import PathLike if TYPE_CHECKING: from parcels._core.particle import Variable @@ -69,14 +69,16 @@ def __init__(self, path: PathLike, outputdt): path = Path(path) if path.suffix != ".parquet": - raise ValueError(f"ParticleFile data is stored in Parquet files - file extension must be '.parquet'. Got {path.suffix=!r}.") + raise ValueError( + f"ParticleFile data is stored in Parquet files - file extension must be '.parquet'. Got {path.suffix=!r}." + ) if outputdt <= 0: raise ValueError(f"outputdt must be positive/non-zero. 
Got {outputdt=!r}") self._outputdt = outputdt - self._path = path # TODO v4: Consider https://arrow.apache.org/docs/python/getstarted.html#working-with-large-data - though a significant question becomes how to partition, perhaps using a particle variable "partition"? + self._path = path # TODO v4: Consider https://arrow.apache.org/docs/python/getstarted.html#working-with-large-data - though a significant question becomes how to partition, perhaps using a particle variable "partition"? self._writer: pq.ParquetWriter | None = None if path.exists(): # TODO: Add logic for recovering/appending to existing parquet file @@ -138,7 +140,7 @@ def write(self, pset: ParticleSet, time, indices=None): indices_to_write = _to_write_particles(particle_data, time) else: indices_to_write = indices - + self._writer.write_table( pa.table({v.name: pa.array(particle_data[v.name][indices_to_write]) for v in vars_to_write}), ) diff --git a/src/parcels/_core/particleset.py b/src/parcels/_core/particleset.py index e4ecb252b..f2e74112f 100644 --- a/src/parcels/_core/particleset.py +++ b/src/parcels/_core/particleset.py @@ -453,7 +453,7 @@ def execute( if output_file is not None: output_file.close() - + if verbose_progress: pbar.close() diff --git a/src/parcels/_reprs.py b/src/parcels/_reprs.py index 34b6814a0..d27eee379 100644 --- a/src/parcels/_reprs.py +++ b/src/parcels/_reprs.py @@ -7,7 +7,6 @@ import numpy as np import xarray as xr -from zarr.storage import DirectoryStore if TYPE_CHECKING: from parcels import Field, FieldSet, ParticleSet @@ -178,6 +177,5 @@ def _format_list_items_multiline(items: list[str] | dict, level: int = 1, with_b return "\n".join([textwrap.indent(e, indentation_str) for e in entries]) - def is_builtin_object(obj): return obj.__class__.__module__ == "builtins" diff --git a/tests/conftest.py b/tests/conftest.py index 56bbfa480..1853f9774 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,7 +12,7 @@ def tmp_parquet(tmp_path, request): def tmp_store(): 
return MemoryStore() + @pytest.fixture def tmp_parquet(tmp_path): - return tmp_path / 'tmp.parquet' - + return tmp_path / "tmp.parquet" diff --git a/tests/test_advection.py b/tests/test_advection.py index 365e9e6f4..95eca30f3 100644 --- a/tests/test_advection.py +++ b/tests/test_advection.py @@ -1,10 +1,10 @@ import numpy as np +import pandas as pd import pytest import xarray as xr import parcels import parcels.tutorial -import pandas as pd from parcels import ( Field, FieldSet, diff --git a/tests/test_fieldset.py b/tests/test_fieldset.py index c7131608d..6eeef20a6 100644 --- a/tests/test_fieldset.py +++ b/tests/test_fieldset.py @@ -3,8 +3,8 @@ import cf_xarray # noqa: F401 import cftime import numpy as np -import pytest import pandas as pd +import pytest import xarray as xr from parcels import Field, ParticleFile, ParticleSet, VectorField, XGrid, convert @@ -96,7 +96,6 @@ def test_fieldset_gridset(fieldset): assert len(fieldset.gridset) == 2 - def test_fieldset_no_UV(tmp_parquet): grid = XGrid.from_dataset(ds, mesh="flat") fieldset = FieldSet([Field("P", ds["U_A_grid"], grid, interp_method=XLinear)]) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index a3fe47b00..70ef5b505 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -2,13 +2,13 @@ import tempfile from contextlib import nullcontext as does_not_raise from datetime import datetime, timedelta -import pyarrow.parquet as pq -import pandas as pd + import numpy as np +import pandas as pd import pyarrow as pa +import pyarrow.parquet as pq import pytest import xarray as xr -from zarr.storage import MemoryStore import parcels.tutorial from parcels import ( @@ -22,9 +22,9 @@ VectorField, XGrid, ) -from parcels._core.particle import Particle, create_particle_data, get_default_particle +from parcels._core.particle import Particle, get_default_particle from parcels._core.particlefile import _get_schema -from parcels._core.utils.time import TimeInterval, timedelta_to_float 
+from parcels._core.utils.time import timedelta_to_float from parcels._datasets.structured.generated import peninsula_dataset from parcels._datasets.structured.generic import datasets from parcels.convert import copernicusmarine_to_sgrid @@ -343,7 +343,7 @@ def Update_lon(particles, fieldset): # pragma: no cover pset.execute(Update_lon, runtime=np.timedelta64(11, "s"), dt=np.timedelta64(2, "s"), output_file=ofile) df = pd.read_parquet(tmp_parquet) - assert np.allclose(df['lon'].values, [0, 3, 6, 9]) + assert np.allclose(df["lon"].values, [0, 3, 6, 9]) pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") assert np.allclose(timedelta_to_float(df.time.values - df.time.values[0, 0]), [0, 3, 6, 9]) diff --git a/tests/test_uxadvection.py b/tests/test_uxadvection.py index 52509ba28..d3db9aecd 100644 --- a/tests/test_uxadvection.py +++ b/tests/test_uxadvection.py @@ -1,6 +1,6 @@ import numpy as np +import pandas as pd import pytest -import xarray as xr import parcels from parcels._datasets.unstructured.generic import datasets as datasets_unstructured @@ -9,8 +9,6 @@ AdvectionRK2, AdvectionRK4, ) -import pandas as pd - @pytest.mark.parametrize("integrator", [AdvectionEE, AdvectionRK2, AdvectionRK4]) From 19fbd8dddac5272915a4d9a1d343ecb47a779b41 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:51:15 +0200 Subject: [PATCH 11/34] Remove old fixtures --- tests/conftest.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 1853f9774..0fd949880 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,16 +1,4 @@ import pytest -from zarr.storage import MemoryStore - - -@pytest.fixture() -def tmp_parquet(tmp_path, request): - test_name = request.node.name - yield tmp_path / f"{test_name}-output.zarr" - - -@pytest.fixture -def tmp_store(): - return MemoryStore() @pytest.fixture From e74672fefe40d7e75d67b256748df0ea47746f22 Mon Sep 
17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:52:04 +0200 Subject: [PATCH 12/34] Fix pre-commit errors --- tests/test_particlefile.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 70ef5b505..f2a3c4553 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -374,9 +374,9 @@ def test_pset_execute_outputdt_forwards(fieldset): runtime = timedelta(hours=5) dt = timedelta(minutes=5) - df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) # noqa: F841 pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - assert np.all(ds.isel(trajectory=0).time.diff(dim="obs").values == np.timedelta64(outputdt)) + assert np.all(ds.isel(trajectory=0).time.diff(dim="obs").values == np.timedelta64(outputdt)) # noqa: F821 def test_pset_execute_output_time_forwards(fieldset): @@ -385,11 +385,11 @@ def test_pset_execute_output_time_forwards(fieldset): runtime = np.timedelta64(5, "h") dt = np.timedelta64(5, "m") - df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) # noqa: F841 pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") assert ( - ds.time[0, 0].values == fieldset.time_interval.left - and ds.time[0, -1].values == fieldset.time_interval.left + runtime + ds.time[0, 0].values == fieldset.time_interval.left # noqa: F821 + and ds.time[0, -1].values == fieldset.time_interval.left + runtime # noqa: F821 ) @@ -399,9 +399,9 @@ def test_pset_execute_outputdt_backwards(fieldset): runtime = timedelta(days=2) dt = -timedelta(minutes=5) - 
df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) # noqa: F841 pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values + file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values # noqa: F821 assert np.all(file_outputdt == np.timedelta64(-outputdt)) @@ -419,9 +419,9 @@ def test_pset_execute_outputdt_backwards_fieldset_timevarying(): ds_fset = copernicusmarine_to_sgrid(fields=fields) fieldset = FieldSet.from_sgrid_conventions(ds_fset) - df = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset) + df = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset) # noqa: F841 pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values + file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values # noqa: F821 assert np.all(file_outputdt == np.timedelta64(-outputdt)), (file_outputdt, np.timedelta64(-outputdt)) From db9f9834296dfab2804a782a0ad28e936cfe0188 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 12:08:54 +0200 Subject: [PATCH 13/34] Cleanup This mark was only introduced during refactoring --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d0e6f7ba1..85aba3a67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,6 @@ markers = [ # can be skipped by doing `pytest -m "not slow"` etc. 
"v4alpha: failing tests that should work for v4alpha", "v4future: failing tests that should work for a future release of v4", "v4remove: failing tests that should probably be removed later", - "uses_old_zarr: tests that need to be migrated to the new particleset format", ] filterwarnings = [ From ac2a8309f197f4a98121d62b3ea8447791685086 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 13:56:37 +0200 Subject: [PATCH 14/34] Add pandas and pyarrow as explicit dependencies --- .github/ci/recipe.yaml | 2 ++ pixi.toml | 4 ++++ pyproject.toml | 2 ++ 3 files changed, 8 insertions(+) diff --git a/.github/ci/recipe.yaml b/.github/ci/recipe.yaml index 2a887e8b0..388c60b2d 100644 --- a/.github/ci/recipe.yaml +++ b/.github/ci/recipe.yaml @@ -36,6 +36,8 @@ requirements: - numpy >=2.1.0 - tqdm >=4.50.0 - xarray >=2025.8.0,<2026.4.0 # TODO: remove upper pin when https://github.com/UXARRAY/uxarray/issues/1490 is resolved + - pandas >=2.2 + - pyarrow >=20.0.0 - cf_xarray >=0.8.6 - xgcm >=0.9.0 - zarr >=2.15.0,!=2.18.0,<3 diff --git a/pixi.toml b/pixi.toml index a71be98ff..286d7f28c 100644 --- a/pixi.toml +++ b/pixi.toml @@ -24,6 +24,8 @@ netcdf4 = ">=1.6.0" numpy = ">=2.1.0" tqdm = ">=4.50.0" xarray = ">=2024.5.0,<2026.4.0" # TODO: remove upper pin when https://github.com/UXARRAY/uxarray/issues/1490 is resolved +pandas = ">=2.2" +pyarrow = ">=20.0.0" holoviews = ">=1.22.0" # https://github.com/prefix-dev/rattler-build/issues/2326 uxarray = ">=2025.3.0" dask = ">=2024.5.1" @@ -51,6 +53,8 @@ netcdf4 = "1.6.*" numpy = "2.1.*" tqdm = "4.50.*" xarray = "2025.8.*" +pandas = "2.2.*" +pyarrow = "20.0.*" uxarray = "2025.3.*" dask = "2024.6.*" zarr = "2.18.*" diff --git a/pyproject.toml b/pyproject.toml index 85aba3a67..072da3b77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,8 @@ dependencies = [ "zarr >=2.15.0,!=2.18.0,<3", "tqdm >=4.50.0", "xarray >=2024.5.0,<2026.4.0", # TODO: remove upper pin when 
https://github.com/UXARRAY/uxarray/issues/1490 is resolved + "pandas >= 2.2", + "pyarrow >=20.0.0", "uxarray >=2025.3.0", "pooch >=1.8.0", "xgcm >=0.9.0", From de464e561ca97627b0003cb54a1a112e44ffb707 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 13:47:35 +0200 Subject: [PATCH 15/34] Add assert_cftime_like_particlefile Remove temporary test_cftime.py file --- tests/test_utils.py | 29 +++++++++++++++++++++++++++++ tests/utils.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index b42e13330..65ce4ea7a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,9 @@ +import json + import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq +import pytest from tests import utils @@ -17,3 +22,27 @@ def test_round_and_hash_float_array(): arr_test = arr + 0.51 * delta h3 = utils.round_and_hash_float_array(arr_test, decimals=decimals) assert h3 != h + + +@pytest.mark.parametrize("cal", ["julian", "proleptic_gregorian", "365_day", "366_day", "360_day"]) +def test_assert_cftime_like_particlefile(tmp_path, cal): + path = tmp_path / "test.parquet" + attrs = {"units": "seconds since 2000-01-01 17:00:00", "calendar": cal} + field = pa.field("time", pa.float64(), metadata={"attrs": json.dumps(attrs)}) + schema = pa.schema([field]) + table = pa.table({"time": pa.array([-20.0, 1.0])}, schema=schema) + pq.write_table(table, path) + + utils.assert_cftime_like_particlefile(path) + + +def test_assert_cftime_like_particlefile_broken_parquet(tmp_path): + path = tmp_path / "test.parquet" + attrs = {"units": "broken-units", "calendar": "365_day"} + field = pa.field("time", pa.float64(), metadata={"attrs": json.dumps(attrs)}) + schema = pa.schema([field]) + table = pa.table({"time": pa.array([-20.0, 1.0])}, schema=schema) + pq.write_table(table, path) + + with pytest.raises(Exception, match="CF-time values in Parquet did not get 
properly decoded"): + utils.assert_cftime_like_particlefile(path) diff --git a/tests/utils.py b/tests/utils.py index 3213abd31..d20eaf845 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -151,3 +151,34 @@ def round_and_hash_float_array(arr, decimals=6): # Mimic Java's HashMap hash transformation h ^= (h >> 20) ^ (h >> 12) return h ^ (h >> 7) ^ (h >> 4) + + +def assert_cftime_like_particlefile(parquet_path: Path) -> None: + import json + + import cftime + import pyarrow as pa + import pyarrow.parquet as pq + + assert parquet_path.suffix == ".parquet", "Path must be a parquet file" + + table = pq.read_table(parquet_path) + assert "time" in table.schema.names, "Parquet file must have a 'time' column" + + time_field = table.schema.field("time") + assert pa.types.is_floating(time_field.type) or pa.types.is_integer(time_field.type), ( + f"'time' column must be numeric, got {time_field.type}" + ) + + raw_meta = time_field.metadata + attrs = json.loads(raw_meta[b"attrs"]) + + values = table.column("time").to_pylist() + v = xr.Variable(("time",), values, attrs) + decoded = xr.coders.CFDatetimeCoder(time_unit="s").decode(v) + + # check first value (and hence rest of array) is what we expect + assert isinstance(decoded.values[0], (cftime.datetime, np.datetime64)), ( + "CF-time values in Parquet did not get properly decoded. Are the attributes correct?" 
+ ) + return From 57ccf6f9b49d20417dfe4972589b1ba2d2df6995 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 14:31:47 +0200 Subject: [PATCH 16/34] MAINT: Cleanup create_particle_data This function is now independent of the time_interval as time is now stored as float --- src/parcels/_core/particle.py | 19 +++++-------------- src/parcels/_core/particleset.py | 1 - tests/test_particle.py | 5 +---- 3 files changed, 6 insertions(+), 19 deletions(-) diff --git a/src/parcels/_core/particle.py b/src/parcels/_core/particle.py index dc39a063c..0e1be1792 100644 --- a/src/parcels/_core/particle.py +++ b/src/parcels/_core/particle.py @@ -8,7 +8,6 @@ from parcels._compat import _attrgetter_helper from parcels._core.statuscodes import StatusCode from parcels._core.utils.string import _assert_str_and_python_varname -from parcels._core.utils.time import TimeInterval from parcels._reprs import particleclass_repr, variable_repr __all__ = ["Particle", "ParticleClass", "Variable"] @@ -176,7 +175,6 @@ def create_particle_data( pclass: ParticleClass, nparticles: int, ngrids: int, - time_interval: TimeInterval, initial: dict[str, np.ndarray] | None = None, ): if initial is None: @@ -207,16 +205,9 @@ def create_particle_data( name_to_copy = var.initial(_attrgetter_helper) data[var.name] = data[name_to_copy].copy() else: - data[var.name] = _create_array_for_variable(var, nparticles, time_interval) + data[var.name] = np.full( + shape=(nparticles,), + fill_value=var.initial, + dtype=var.dtype, + ) return data - - -def _create_array_for_variable(variable: Variable, nparticles: int, time_interval: TimeInterval): - assert not isinstance(variable.initial, operator.attrgetter), ( - "This function cannot handle attrgetter initial values." 
- ) - return np.full( - shape=(nparticles,), - fill_value=variable.initial, - dtype=variable.dtype, - ) diff --git a/src/parcels/_core/particleset.py b/src/parcels/_core/particleset.py index f2e74112f..e59e2860b 100644 --- a/src/parcels/_core/particleset.py +++ b/src/parcels/_core/particleset.py @@ -111,7 +111,6 @@ def __init__( pclass=pclass, nparticles=lon.size, ngrids=len(fieldset.gridset), - time_interval=fieldset.time_interval, initial=dict( lon=lon, lat=lat, diff --git a/tests/test_particle.py b/tests/test_particle.py index dabe6944c..62eb65cff 100644 --- a/tests/test_particle.py +++ b/tests/test_particle.py @@ -7,8 +7,6 @@ Variable, create_particle_data, ) -from parcels._core.utils.time import TimeInterval -from parcels._datasets.structured.generic import TIME def test_variable_init(): @@ -140,9 +138,8 @@ def test_particleclass_add_variable_collision(): ) @pytest.mark.parametrize("nparticles", [5, 10]) def test_create_particle_data(particle, nparticles): - time_interval = TimeInterval(TIME[0], TIME[-1]) ngrids = 4 - data = create_particle_data(pclass=particle, nparticles=nparticles, ngrids=ngrids, time_interval=time_interval) + data = create_particle_data(pclass=particle, nparticles=nparticles, ngrids=ngrids) assert isinstance(data, dict) assert len(data) == len(particle.variables) + 1 # ei variable is separate From b2bde50c568a985d1c954e7da9a533d5a6f6e48a Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 15:05:18 +0200 Subject: [PATCH 17/34] Add cftime metadata serialization --- src/parcels/_core/particle.py | 6 +++++- src/parcels/_core/particlefile.py | 26 ++++++++++++++++--------- src/parcels/_core/utils/time.py | 32 ++++++++++++++++++++++++++++++- tests/utils/test_time.py | 12 +++++++++++- 4 files changed, 64 insertions(+), 12 deletions(-) diff --git a/src/parcels/_core/particle.py b/src/parcels/_core/particle.py index 0e1be1792..5152b1130 100644 --- a/src/parcels/_core/particle.py +++ 
b/src/parcels/_core/particle.py @@ -148,7 +148,11 @@ def get_default_particle(spatial_dtype: type[np.float32] | type[np.float64]) -> Variable( "time", dtype=np.float64, - attrs={"standard_name": "time", "units": "seconds", "axis": "T"}, + attrs={ + "standard_name": "time", + "units": "seconds", + "axis": "T", + }, # "units" and "calendar" gets updated/set if working with cftime time domain ), Variable( "trajectory", diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index c9cdfa62a..439db2668 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -25,18 +25,24 @@ __all__ = ["ParticleFile"] -def _get_schema(particle: parcels.ParticleClass, file_metadata: dict[Any, Any]) -> pa.Schema: - return pa.schema( - [ +def _get_schema( + particle: parcels.ParticleClass, file_metadata: dict[Any, Any], fset_time_interval: TimeInterval | None +) -> pa.Schema: + + fields = [] + for v in _get_vars_to_write(particle): + attrs = v.attrs.copy() + if v.name == "time": + if fset_time_interval is not None: + attrs.update(fset_time_interval.get_cf_attrs()) + fields.append( pa.field( v.name, pa.from_numpy_dtype(v.dtype), - metadata=v.attrs, + metadata=attrs, ) - for v in _get_vars_to_write(particle) - ], - metadata=file_metadata.copy(), - ) + ) + return pa.schema(fields, metadata=file_metadata.copy()) class ParticleFile: @@ -131,7 +137,9 @@ def write(self, pset: ParticleSet, time, indices=None): if self._writer is None: assert not self.path.exists(), "If the file exists, the writer should already be set" - self._writer = pq.ParquetWriter(self.path, _get_schema(pclass, self.extra_metadata)) + self._writer = pq.ParquetWriter( + self.path, _get_schema(pclass, self.extra_metadata, pset.fieldset.time_interval) + ) if isinstance(time, (np.timedelta64, np.datetime64)): time = timedelta_to_float(time - time_interval.left) diff --git a/src/parcels/_core/utils/time.py b/src/parcels/_core/utils/time.py index b76473a3f..7b9299f3a 
100644 --- a/src/parcels/_core/utils/time.py +++ b/src/parcels/_core/utils/time.py @@ -1,7 +1,7 @@ from __future__ import annotations from datetime import datetime, timedelta -from typing import TYPE_CHECKING, TypeVar +from typing import TYPE_CHECKING, Literal, TypeVar, cast import cftime import numpy as np @@ -85,6 +85,36 @@ def intersection(self, other: TimeInterval) -> TimeInterval | None: return TimeInterval(start, end) if start <= end else None + def get_cf_attrs(self) -> dict[Literal["units", "calendar"], str]: + """Return the cf-attrs that would correspond to x seconds from the left edge.""" + return _get_cf_attrs(self.left) + + +def _get_cf_attrs(dt: TimeLike) -> dict[Literal["units", "calendar"], str]: + if isinstance(dt, cftime.datetime): + dt = cast(cftime.datetime, dt) + return {"units": f"seconds since {dt.strftime(dt.format)}", "calendar": dt.calendar} + + from pandas import Timestamp + + if isinstance(dt, np.datetime64): + dt = Timestamp(dt) + + if isinstance(dt, (Timestamp, datetime)): + dt_cf = cftime.datetime( + year=dt.year, + month=dt.month, + day=dt.day, + hour=dt.hour, + minute=dt.minute, + second=dt.second, + microsecond=dt.microsecond, + calendar="gregorian", # What is the cftime proleptic_gregorian calendar? is that relevant here? 
+ ) + return _get_cf_attrs(dt_cf) + + raise NotImplementedError(f"Not implemented for time object {type(dt)=!r}") + def is_compatible( t1: datetime | cftime.datetime | np.timedelta64, t2: datetime | cftime.datetime | np.timedelta64 diff --git a/tests/utils/test_time.py b/tests/utils/test_time.py index ef1f39346..2734b3e91 100644 --- a/tests/utils/test_time.py +++ b/tests/utils/test_time.py @@ -8,7 +8,12 @@ from hypothesis import given from hypothesis import strategies as st -from parcels._core.utils.time import TimeInterval, maybe_convert_python_timedelta_to_numpy, timedelta_to_float +from parcels._core.utils.time import ( + TimeInterval, + _get_cf_attrs, + maybe_convert_python_timedelta_to_numpy, + timedelta_to_float, +) calendar_strategy = st.sampled_from( [ @@ -215,3 +220,8 @@ def test_timedelta_to_float(input, expected): def test_timedelta_to_float_exceptions(): with pytest.raises((ValueError, TypeError)): timedelta_to_float("invalid_type") + + +@given(datetime_strategy()) +def test_datetime_get_cf_attrs(dt): + _get_cf_attrs(dt) From 55493a9d8b5d4353ac90a3fc068a0b843bb76b3d Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 15:58:26 +0200 Subject: [PATCH 18/34] Add np.timedelta64 support --- src/parcels/_core/utils/time.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/parcels/_core/utils/time.py b/src/parcels/_core/utils/time.py index 7b9299f3a..bdce7fc13 100644 --- a/src/parcels/_core/utils/time.py +++ b/src/parcels/_core/utils/time.py @@ -95,6 +95,9 @@ def _get_cf_attrs(dt: TimeLike) -> dict[Literal["units", "calendar"], str]: dt = cast(cftime.datetime, dt) return {"units": f"seconds since {dt.strftime(dt.format)}", "calendar": dt.calendar} + if isinstance(dt, np.timedelta64): + return {"units": "seconds"} + from pandas import Timestamp if isinstance(dt, np.datetime64): From bab4d5d7fc6f56d569b9a64513eb962f4ed243c6 Mon Sep 17 00:00:00 2001 From: Vecko 
<36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 16:04:20 +0200 Subject: [PATCH 19/34] Fix assert_cftime_like_particlefile Remove nested key - save on root instead --- tests/test_advection.py | 3 ++- tests/test_utils.py | 6 ++---- tests/utils.py | 4 +--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/test_advection.py b/tests/test_advection.py index 95eca30f3..e5c000132 100644 --- a/tests/test_advection.py +++ b/tests/test_advection.py @@ -37,7 +37,7 @@ AdvectionRK4_3D, AdvectionRK45, ) -from tests.utils import DEFAULT_PARTICLES +from tests.utils import DEFAULT_PARTICLES, assert_cftime_like_particlefile @pytest.mark.parametrize("mesh", ["spherical", "flat"]) @@ -76,6 +76,7 @@ def test_advection_zonal_with_particlefile(tmp_parquet): df = pd.read_parquet(tmp_parquet) final_time = df["time"].max() np.testing.assert_allclose(df[df["time"] == final_time]["lon"].values, pset.lon, atol=1e-5) + assert_cftime_like_particlefile(tmp_parquet) def periodicBC(particles, fieldset): diff --git a/tests/test_utils.py b/tests/test_utils.py index 65ce4ea7a..d8d695c18 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,3 @@ -import json - import numpy as np import pyarrow as pa import pyarrow.parquet as pq @@ -28,7 +26,7 @@ def test_round_and_hash_float_array(): def test_assert_cftime_like_particlefile(tmp_path, cal): path = tmp_path / "test.parquet" attrs = {"units": "seconds since 2000-01-01 17:00:00", "calendar": cal} - field = pa.field("time", pa.float64(), metadata={"attrs": json.dumps(attrs)}) + field = pa.field("time", pa.float64(), metadata=attrs) schema = pa.schema([field]) table = pa.table({"time": pa.array([-20.0, 1.0])}, schema=schema) pq.write_table(table, path) @@ -39,7 +37,7 @@ def test_assert_cftime_like_particlefile(tmp_path, cal): def test_assert_cftime_like_particlefile_broken_parquet(tmp_path): path = tmp_path / "test.parquet" attrs = {"units": "broken-units", "calendar": "365_day"} - field = 
pa.field("time", pa.float64(), metadata={"attrs": json.dumps(attrs)}) + field = pa.field("time", pa.float64(), metadata=attrs) schema = pa.schema([field]) table = pa.table({"time": pa.array([-20.0, 1.0])}, schema=schema) pq.write_table(table, path) diff --git a/tests/utils.py b/tests/utils.py index d20eaf845..f474aca2f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -154,7 +154,6 @@ def round_and_hash_float_array(arr, decimals=6): def assert_cftime_like_particlefile(parquet_path: Path) -> None: - import json import cftime import pyarrow as pa @@ -170,8 +169,7 @@ def assert_cftime_like_particlefile(parquet_path: Path) -> None: f"'time' column must be numeric, got {time_field.type}" ) - raw_meta = time_field.metadata - attrs = json.loads(raw_meta[b"attrs"]) + attrs = {k.decode(): v.decode() for k, v in time_field.metadata.items()} values = table.column("time").to_pylist() v = xr.Variable(("time",), values, attrs) From b28665c4d8375c9390c42330e42d7f4c2284115a Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 17:27:04 +0200 Subject: [PATCH 20/34] Move imports --- tests/utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index f474aca2f..374c8ecc3 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -6,7 +6,10 @@ from collections import defaultdict from pathlib import Path +import cftime import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq import xarray as xr import parcels @@ -154,11 +157,6 @@ def round_and_hash_float_array(arr, decimals=6): def assert_cftime_like_particlefile(parquet_path: Path) -> None: - - import cftime - import pyarrow as pa - import pyarrow.parquet as pq - assert parquet_path.suffix == ".parquet", "Path must be a parquet file" table = pq.read_table(parquet_path) From 7184e1f3c7b541c68e598fda71895b8e152a8c5a Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 
Apr 2026 17:29:23 +0200 Subject: [PATCH 21/34] Fixing tests --- tests/test_particlefile.py | 41 +++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index f2a3c4553..0a9534e05 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -68,9 +68,10 @@ def test_write_fieldset_without_time(tmp_parquet): ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - df = pd.read_parquet(tmp_parquet) - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - assert df["time"][1] == np.timedelta64(1, "s") + table = pq.read_table(tmp_parquet) + assert table.schema.field("time").metadata[b"units"] == b"seconds" + assert b"calendar" not in table.schema.field("time").metadata + assert table["time"].to_numpy()[1] == 1.0 @pytest.mark.skip("Keep or remove? 
Introduced in 5d7dd6bba800baa0fe4bd38edfc17ca3e310062b ") @@ -114,7 +115,6 @@ def test_pfile_array_remove_all_particles(fieldset, tmp_parquet): pfile.close() df = pd.read_parquet(tmp_parquet) - # np.testing.assert_allclose(ds["time"][:, 0] - fieldset.time_interval.left, np.timedelta64(0, "s")) # TODO: Need to figure out how times work with parquet output (#2386) assert df["trajectory"].nunique() == npart @@ -344,8 +344,7 @@ def Update_lon(particles, fieldset): # pragma: no cover df = pd.read_parquet(tmp_parquet) assert np.allclose(df["lon"].values, [0, 3, 6, 9]) - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - assert np.allclose(timedelta_to_float(df.time.values - df.time.values[0, 0]), [0, 3, 6, 9]) + assert np.allclose(df.time - df.time.min(), [0, 3, 6, 9]) def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwargs, particle_class=Particle): @@ -374,9 +373,10 @@ def test_pset_execute_outputdt_forwards(fieldset): runtime = timedelta(hours=5) dt = timedelta(minutes=5) - df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) # noqa: F841 - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - assert np.all(ds.isel(trajectory=0).time.diff(dim="obs").values == np.timedelta64(outputdt)) # noqa: F821 + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + particle_0_times = df[df.trajectory == 0].time.values + + np.testing.assert_equal(np.diff(particle_0_times), outputdt.seconds) def test_pset_execute_output_time_forwards(fieldset): @@ -385,12 +385,9 @@ def test_pset_execute_output_time_forwards(fieldset): runtime = np.timedelta64(5, "h") dt = np.timedelta64(5, "m") - df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) # noqa: F841 - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") 
- assert ( - ds.time[0, 0].values == fieldset.time_interval.left # noqa: F821 - and ds.time[0, -1].values == fieldset.time_interval.left + runtime # noqa: F821 - ) + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + assert df.time.min() == 0.0 + assert df.time.max() == runtime / np.timedelta64(1, "s") def test_pset_execute_outputdt_backwards(fieldset): @@ -399,10 +396,9 @@ def test_pset_execute_outputdt_backwards(fieldset): runtime = timedelta(days=2) dt = -timedelta(minutes=5) - df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) # noqa: F841 - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values # noqa: F821 - assert np.all(file_outputdt == np.timedelta64(-outputdt)) + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + particle_0_times = df[df.trajectory == 0].time.values + np.testing.assert_equal(np.diff(particle_0_times), -outputdt.seconds) def test_pset_execute_outputdt_backwards_fieldset_timevarying(): @@ -419,10 +415,9 @@ def test_pset_execute_outputdt_backwards_fieldset_timevarying(): ds_fset = copernicusmarine_to_sgrid(fields=fields) fieldset = FieldSet.from_sgrid_conventions(ds_fset) - df = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset) # noqa: F841 - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values # noqa: F821 - assert np.all(file_outputdt == np.timedelta64(-outputdt)), (file_outputdt, np.timedelta64(-outputdt)) + df = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset) + particle_0_times = df[df.trajectory == 0].time.values + np.testing.assert_equal(np.diff(particle_0_times), 
-outputdt.seconds) def test_particlefile_init(tmp_parquet): From e7e37ef3a81228e6280787c39d98b9389df0189f Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 18:31:39 +0200 Subject: [PATCH 22/34] Fix test_time_is_age test --- tests/test_particlefile.py | 15 ++++++++------- tests/utils.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 0a9534e05..0afc81f44 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -30,6 +30,7 @@ from parcels.convert import copernicusmarine_to_sgrid from parcels.interpolators import XLinear, XLinear_Velocity from parcels.kernels import AdvectionRK4 +from tests import utils from tests.common_kernels import DoNothing @@ -306,13 +307,13 @@ def IncreaseAge(particles, fieldset): # pragma: no cover pset.execute(IncreaseAge, runtime=np.timedelta64(npart * 2, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - ds = xr.open_zarr(tmp_parquet) - age = ds["age"][:].values.astype("timedelta64[s]") - ds_timediff = np.zeros_like(age) - for i in range(npart): - ds_timediff[i, :] = ds.time.values[i, :] - time[i] - np.testing.assert_equal(age, ds_timediff) + # df = pd.read_parquet(tmp_parquet) + df = utils.read_particlefile(tmp_parquet) + + # Map sorted trajectory IDs to release times (0, 1, ..., npart-1 seconds) + for index, df_traj in df.groupby("trajectory"): + release_time = time[index] + np.testing.assert_equal(df_traj["age"].astype("timedelta64[s]").values, (df_traj["time"] - release_time).values) def test_reset_dt(fieldset, tmp_parquet): diff --git a/tests/utils.py b/tests/utils.py index 374c8ecc3..f2d88919f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -8,6 +8,7 @@ import cftime import numpy as np +import pandas as pd import pyarrow as pa import pyarrow.parquet 
as pq import xarray as xr @@ -178,3 +179,35 @@ def assert_cftime_like_particlefile(parquet_path: Path) -> None: "CF-time values in Parquet did not get properly decoded. Are the attributes correct?" ) return + + +def read_particlefile(path: Path, decode_times: bool = True) -> pd.DataFrame: + assert path.suffix == ".parquet", "Only Parquet files are supported" + + table = pq.read_table(path) + + try: + time_field = table.field("time") + except KeyError as e: + raise ValueError( + f"Could not find 'time' column in parquet file. Are you sure {path=!r} is a particlefile?" + ) from e + + try: + assert b"units" in time_field.metadata + except AssertionError as e: + raise ValueError(f"Could not find 'units' in the 'time' column metadata for parquet {path=!r}.") from e + + attrs = {k.decode(): v.decode() for k, v in time_field.metadata.items()} + + df = pd.read_parquet(path) + if not decode_times: + return df + + values = table.column("time").to_numpy() + var = xr.Variable(("time",), values, attrs) + values = xr.coders.CFDatetimeCoder(time_unit="s").decode(var).values + + df["time"] = values + + return df From 54c829a7dd04584827b4395da33b61741c4d9dae Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Thu, 23 Apr 2026 13:03:09 +0200 Subject: [PATCH 23/34] Refactor assert_cftime_like_particlefile --- tests/utils.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index f2d88919f..166710bec 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -160,22 +160,10 @@ def round_and_hash_float_array(arr, decimals=6): def assert_cftime_like_particlefile(parquet_path: Path) -> None: assert parquet_path.suffix == ".parquet", "Path must be a parquet file" - table = pq.read_table(parquet_path) - assert "time" in table.schema.names, "Parquet file must have a 'time' column" - - time_field = table.schema.field("time") - assert pa.types.is_floating(time_field.type) or 
pa.types.is_integer(time_field.type), ( - f"'time' column must be numeric, got {time_field.type}" - ) - - attrs = {k.decode(): v.decode() for k, v in time_field.metadata.items()} - - values = table.column("time").to_pylist() - v = xr.Variable(("time",), values, attrs) - decoded = xr.coders.CFDatetimeCoder(time_unit="s").decode(v) + df = read_particlefile(parquet_path, decode_times=True) # check first value (and hence rest of array) is what we expect - assert isinstance(decoded.values[0], (cftime.datetime, np.datetime64)), ( + assert isinstance(df["time"].values[0], (cftime.datetime, np.datetime64)), ( "CF-time values in Parquet did not get properly decoded. Are the attributes correct?" ) return @@ -193,6 +181,10 @@ def read_particlefile(path: Path, decode_times: bool = True) -> pd.DataFrame: f"Could not find 'time' column in parquet file. Are you sure {path=!r} is a particlefile?" ) from e + assert pa.types.is_floating(time_field.type) or pa.types.is_integer(time_field.type), ( + f"'time' column must be numeric, got {time_field.type}" + ) + try: assert b"units" in time_field.metadata except AssertionError as e: From 8626d486cbceaf2f79cc5c51b2941cba133268fc Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Thu, 23 Apr 2026 13:37:45 +0200 Subject: [PATCH 24/34] Self-review feedback --- tests/test_particlefile.py | 6 +++--- tests/utils.py | 5 ++++- tests/utils/test_time.py | 3 ++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 0afc81f44..d8cd8fd1f 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -363,7 +363,7 @@ def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwarg output_file = ParticleFile(name, outputdt=outputdt) pset.execute(DoNothing, output_file=output_file, **execute_kwargs) - df = pd.read_parquet(name) + df = utils.read_particlefile(name) return df @@ -387,8 +387,8 @@ def 
test_pset_execute_output_time_forwards(fieldset): dt = np.timedelta64(5, "m") df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) - assert df.time.min() == 0.0 - assert df.time.max() == runtime / np.timedelta64(1, "s") + assert df.time.min() == pd.Timestamp(fieldset.time_interval.left) + assert df.time.max() - df.time.min() == runtime def test_pset_execute_outputdt_backwards(fieldset): diff --git a/tests/utils.py b/tests/utils.py index 166710bec..93214848d 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -17,6 +17,7 @@ from parcels import FieldSet, Particle, Variable from parcels._core.xgrid import _FIELD_DATA_ORDERING, XGrid, get_axis_from_dim_name from parcels._datasets.structured.generated import simple_UV_dataset +from parcels._typing import PathLike PROJECT_ROOT = Path(__file__).resolve().parents[1] TEST_ROOT = PROJECT_ROOT / "tests" @@ -169,7 +170,9 @@ def assert_cftime_like_particlefile(parquet_path: Path) -> None: return -def read_particlefile(path: Path, decode_times: bool = True) -> pd.DataFrame: +def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame: + path = Path(path) + assert path.suffix == ".parquet", "Only Parquet files are supported" table = pq.read_table(path) diff --git a/tests/utils/test_time.py b/tests/utils/test_time.py index 2734b3e91..26cc39c1c 100644 --- a/tests/utils/test_time.py +++ b/tests/utils/test_time.py @@ -224,4 +224,5 @@ def test_timedelta_to_float_exceptions(): @given(datetime_strategy()) def test_datetime_get_cf_attrs(dt): - _get_cf_attrs(dt) + attrs = _get_cf_attrs(dt) + assert "seconds" in attrs["units"] From 3693329fcf6fcb0bc44a499d19cddbfbf1b15b8d Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Thu, 23 Apr 2026 14:34:53 +0200 Subject: [PATCH 25/34] Fix test_particle_schema --- tests/test_particlefile.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git 
a/tests/test_particlefile.py b/tests/test_particlefile.py index d8cd8fd1f..fb5b34bf7 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -24,7 +24,7 @@ ) from parcels._core.particle import Particle, get_default_particle from parcels._core.particlefile import _get_schema -from parcels._core.utils.time import timedelta_to_float +from parcels._core.utils.time import TimeInterval, timedelta_to_float from parcels._datasets.structured.generated import peninsula_dataset from parcels._datasets.structured.generic import datasets from parcels.convert import copernicusmarine_to_sgrid @@ -496,7 +496,7 @@ def Update_lon(particles, fieldset): # pragma: no cover ], ) def test_particle_schema(particle): - s = _get_schema(particle, {}) + s = _get_schema(particle, {}, TimeInterval(datetime(2023, 1, 1, 12, 0), datetime(2023, 1, 2, 12, 0))) written_variables = [v for v in particle.variables if v.to_write] @@ -510,5 +510,9 @@ def test_particle_schema(particle): strict=False, ): assert variable.name == pyarrow_field.name - assert variable.attrs == {k.decode(): v.decode() for k, v in pyarrow_field.metadata.items()} + if variable.name != "time": + assert variable.attrs == {k.decode(): v.decode() for k, v in pyarrow_field.metadata.items()} + else: + assert b"units" in pyarrow_field.metadata + assert b"calendar" in pyarrow_field.metadata assert pa.from_numpy_dtype(variable.dtype) == pyarrow_field.type From 81f127bdb4d6d696b49d6b4e39752d77f46b6e20 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Thu, 23 Apr 2026 14:49:08 +0200 Subject: [PATCH 26/34] Make read_particlefile public --- src/parcels/__init__.py | 3 ++- src/parcels/_core/particlefile.py | 40 ++++++++++++++++++++++++++++ tests/test_particlefile.py | 5 ++-- tests/utils.py | 44 +------------------------------ 4 files changed, 45 insertions(+), 47 deletions(-) diff --git a/src/parcels/__init__.py b/src/parcels/__init__.py index 2a7854cde..7ae1f6928 100644 --- 
a/src/parcels/__init__.py +++ b/src/parcels/__init__.py @@ -11,7 +11,7 @@ from parcels._core.fieldset import FieldSet from parcels._core.particleset import ParticleSet -from parcels._core.particlefile import ParticleFile +from parcels._core.particlefile import ParticleFile, read_particlefile from parcels._core.particle import ( Variable, Particle, @@ -67,6 +67,7 @@ "ParticleSetWarning", # Utilities "logger", + "read_particlefile", ] _stdlib_warnings.warn( diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 439db2668..b43d4fad2 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -8,8 +8,10 @@ import cftime import numpy as np +import pandas as pd import pyarrow as pa import pyarrow.parquet as pq +import xarray as xr import parcels from parcels._core.particle import ParticleClass @@ -207,3 +209,41 @@ def _get_calendar_and_units(time_interval: TimeInterval) -> dict[str, str]: # T attrs["calendar"] = calendar return attrs + + +def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame: + path = Path(path) + + assert path.suffix == ".parquet", "Only Parquet files are supported" + + table = pq.read_table(path) + + try: + time_field = table.field("time") + except KeyError as e: + raise ValueError( + f"Could not find 'time' column in parquet file. Are you sure {path=!r} is a particlefile?" 
+ ) from e + + assert pa.types.is_floating(time_field.type) or pa.types.is_integer(time_field.type), ( + f"'time' column must be numeric, got {time_field.type}" + ) + + try: + assert b"units" in time_field.metadata + except AssertionError as e: + raise ValueError(f"Could not find 'units' in the 'time' column metadata for parquet {path=!r}.") from e + + attrs = {k.decode(): v.decode() for k, v in time_field.metadata.items()} + + df = pd.read_parquet(path) + if not decode_times: + return df + + values = table.column("time").to_numpy() + var = xr.Variable(("time",), values, attrs) + values = xr.coders.CFDatetimeCoder(time_unit="s").decode(var).values + + df["time"] = values + + return df diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index fb5b34bf7..4fac1a809 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -30,7 +30,6 @@ from parcels.convert import copernicusmarine_to_sgrid from parcels.interpolators import XLinear, XLinear_Velocity from parcels.kernels import AdvectionRK4 -from tests import utils from tests.common_kernels import DoNothing @@ -308,7 +307,7 @@ def IncreaseAge(particles, fieldset): # pragma: no cover pset.execute(IncreaseAge, runtime=np.timedelta64(npart * 2, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) # df = pd.read_parquet(tmp_parquet) - df = utils.read_particlefile(tmp_parquet) + df = parcels.read_particlefile(tmp_parquet) # Map sorted trajectory IDs to release times (0, 1, ..., npart-1 seconds) for index, df_traj in df.groupby("trajectory"): @@ -363,7 +362,7 @@ def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwarg output_file = ParticleFile(name, outputdt=outputdt) pset.execute(DoNothing, output_file=output_file, **execute_kwargs) - df = utils.read_particlefile(name) + df = parcels.read_particlefile(name) return df diff --git a/tests/utils.py b/tests/utils.py index 93214848d..33d6e0012 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -8,16 +8,12 @@ import cftime 
import numpy as np -import pandas as pd -import pyarrow as pa -import pyarrow.parquet as pq import xarray as xr import parcels from parcels import FieldSet, Particle, Variable from parcels._core.xgrid import _FIELD_DATA_ORDERING, XGrid, get_axis_from_dim_name from parcels._datasets.structured.generated import simple_UV_dataset -from parcels._typing import PathLike PROJECT_ROOT = Path(__file__).resolve().parents[1] TEST_ROOT = PROJECT_ROOT / "tests" @@ -161,48 +157,10 @@ def round_and_hash_float_array(arr, decimals=6): def assert_cftime_like_particlefile(parquet_path: Path) -> None: assert parquet_path.suffix == ".parquet", "Path must be a parquet file" - df = read_particlefile(parquet_path, decode_times=True) + df = parcels.read_particlefile(parquet_path, decode_times=True) # check first value (and hence rest of array) is what we expect assert isinstance(df["time"].values[0], (cftime.datetime, np.datetime64)), ( "CF-time values in Parquet did not get properly decoded. Are the attributes correct?" ) return - - -def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame: - path = Path(path) - - assert path.suffix == ".parquet", "Only Parquet files are supported" - - table = pq.read_table(path) - - try: - time_field = table.field("time") - except KeyError as e: - raise ValueError( - f"Could not find 'time' column in parquet file. Are you sure {path=!r} is a particlefile?" 
- ) from e - - assert pa.types.is_floating(time_field.type) or pa.types.is_integer(time_field.type), ( - f"'time' column must be numeric, got {time_field.type}" - ) - - try: - assert b"units" in time_field.metadata - except AssertionError as e: - raise ValueError(f"Could not find 'units' in the 'time' column metadata for parquet {path=!r}.") from e - - attrs = {k.decode(): v.decode() for k, v in time_field.metadata.items()} - - df = pd.read_parquet(path) - if not decode_times: - return df - - values = table.column("time").to_numpy() - var = xr.Variable(("time",), values, attrs) - values = xr.coders.CFDatetimeCoder(time_unit="s").decode(var).values - - df["time"] = values - - return df From 9fcb5bfbc8f89534d4191d17b332929a14034942 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Thu, 23 Apr 2026 14:52:55 +0200 Subject: [PATCH 27/34] Add docstring to read_particlefile --- src/parcels/_core/particlefile.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index b43d4fad2..e5b7ee5af 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -212,6 +212,31 @@ def _get_calendar_and_units(time_interval: TimeInterval) -> dict[str, str]: # T def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame: + """Read a Parcels particlefile (Parquet format) into a pandas DataFrame. + + Parameters + ---------- + path : PathLike + Path to the ``.parquet`` particlefile. + decode_times : bool, optional + If ``True`` (default), use Xarray to decode the numeric ``time`` column from CF + conventions into ``datetime`` or ``cftime.datetime`` values using the units stored in + the column metadata. If ``False``, the raw numeric values are + returned unchanged. + + Returns + ------- + pd.DataFrame + DataFrame containing the particle data. 
When *decode_times* is + ``True``, the ``time`` column contains datetime-like values; + otherwise it contains the original numeric representation. + + Notes + ----- + For larger datasets, consider using `Polars `_ directly, + e.g. ``polars.read_parquet(path)``, which offers better performance and lower + memory usage than pandas for large Parquet files. + """ path = Path(path) assert path.suffix == ".parquet", "Only Parquet files are supported" From 04d8676f113bea63dd85a2229b9f2e5574fbc852 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 24 Apr 2026 10:30:17 +0200 Subject: [PATCH 28/34] Review feedback --- src/parcels/_core/particlefile.py | 35 +++--------------------------- tests-v3/test_advection.py | 7 +++--- tests-v3/test_fieldset_sampling.py | 7 +++--- tests-v3/test_particlesets.py | 6 ++--- tests/test_particlefile.py | 1 - 5 files changed, 12 insertions(+), 44 deletions(-) diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index e5b7ee5af..58483572d 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -2,11 +2,10 @@ from __future__ import annotations -from datetime import datetime, timedelta +from datetime import timedelta from pathlib import Path from typing import TYPE_CHECKING, Any, Literal -import cftime import numpy as np import pandas as pd import pyarrow as pa @@ -52,10 +51,8 @@ class ParticleFile: Parameters ---------- - name : str - Basename of the output file. This can also be a Zarr store object. - particleset : - ParticleSet to output + path : PathLike + Path of the output Parquet file. outputdt : Interval which dictates the update frequency of file output while ParticleFile is given as an argument of ParticleSet.execute() @@ -94,12 +91,8 @@ def __init__(self, path: PathLike, outputdt): if not path.parent.exists(): raise ValueError(f"Folder location for {path=!r} does not exist. 
Create the folder location first.") - self._maxids = 0 - self._pids_written = {} self.extra_metadata = {} - # TODO v4: Add check that if create_new_zarrfile is False, the store already exists - def __repr__(self) -> str: return particlefile_repr(self) @@ -155,9 +148,6 @@ def write(self, pset: ParticleSet, time, indices=None): pa.table({v.name: pa.array(particle_data[v.name][indices_to_write]) for v in vars_to_write}), ) - # if len(indices_to_write) == 0: # TODO: Remove this? - # return - def close(self): if self._writer is not None: self._writer.close() @@ -192,25 +182,6 @@ def _to_write_particles(particle_data, time): )[0] -def _get_calendar_and_units(time_interval: TimeInterval) -> dict[str, str]: # TODO: Remove? - calendar = None - units = "seconds" - if time_interval: - if isinstance(time_interval.left, (np.datetime64, datetime)): - calendar = "standard" - elif isinstance(time_interval.left, cftime.datetime): - calendar = time_interval.left.calendar - - if calendar is not None: - units += f" since {time_interval.left}" - - attrs = {"units": units} - if calendar is not None: - attrs["calendar"] = calendar - - return attrs - - def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame: """Read a Parcels particlefile (Parquet format) into a pandas DataFrame. 
diff --git a/tests-v3/test_advection.py b/tests-v3/test_advection.py index bdd7a4221..3d8f06bac 100644 --- a/tests-v3/test_advection.py +++ b/tests-v3/test_advection.py @@ -1,6 +1,5 @@ import numpy as np import pytest -import pandas as pd import xarray as xr from parcels import ( @@ -80,7 +79,7 @@ def test_analyticalAgrid(): @pytest.mark.parametrize("v", [1, -0.3, 0, -1]) @pytest.mark.parametrize("w", [None, 1, -0.3, 0, -1]) @pytest.mark.parametrize("direction", [1, -1]) -def test_uniform_analytical(u, v, w, direction, tmp_parquet): +def test_uniform_analytical(u, v, w, direction, tmp_zarrfile): lon = np.arange(0, 15, dtype=np.float32) lat = np.arange(0, 15, dtype=np.float32) if w is not None: @@ -100,14 +99,14 @@ def test_uniform_analytical(u, v, w, direction, tmp_parquet): x0, y0, z0 = 6.1, 6.2, 20 pset = ParticleSet(fieldset, pclass=Particle, lon=x0, lat=y0, depth=z0) - outfile = pset.ParticleFile(name=tmp_parquet, outputdt=1, chunks=(1, 1)) + outfile = pset.ParticleFile(name=tmp_zarrfile, outputdt=1, chunks=(1, 1)) pset.execute(AdvectionAnalytical, runtime=4, dt=direction, output_file=outfile) assert np.abs(pset.lon - x0 - pset.time * u) < 1e-6 assert np.abs(pset.lat - y0 - pset.time * v) < 1e-6 if w is not None: assert np.abs(pset.depth - z0 - pset.time * w) < 1e-4 - ds = xr.open_zarr(tmp_parquet) + ds = xr.open_zarr(tmp_zarrfile) times = (direction * ds["time"][:]).values.astype("timedelta64[s]")[0] timeref = np.arange(1, 5).astype("timedelta64[s]") assert np.allclose(times, timeref, atol=np.timedelta64(1, "ms")) diff --git a/tests-v3/test_fieldset_sampling.py b/tests-v3/test_fieldset_sampling.py index 176eedab1..291c27b88 100644 --- a/tests-v3/test_fieldset_sampling.py +++ b/tests-v3/test_fieldset_sampling.py @@ -3,7 +3,6 @@ from math import cos, pi import numpy as np -import pandas as pd import pytest import xarray as xr @@ -774,7 +773,7 @@ def test_multiple_grid_addlater_error(): assert fail -def test_fieldset_sampling_updating_order(tmp_parquet): +def 
test_fieldset_sampling_updating_order(tmp_zarrfile): def calc_p(t, y, x): return 10 * t + x + 0.2 * y @@ -806,10 +805,10 @@ def SampleP(particle, fieldset, time): # pragma: no cover kernels = [AdvectionRK4, SampleP] - pfile = pset.ParticleFile(tmp_parquet, outputdt=1) + pfile = pset.ParticleFile(tmp_zarrfile, outputdt=1) pset.execute(kernels, endtime=1, dt=1, output_file=pfile) - ds = xr.open_zarr(tmp_parquet) + ds = xr.open_zarr(tmp_zarrfile) for t in range(len(ds["obs"])): for i in range(len(ds["trajectory"])): assert np.isclose( diff --git a/tests-v3/test_particlesets.py b/tests-v3/test_particlesets.py index 5c0f2495f..ed884f595 100644 --- a/tests-v3/test_particlesets.py +++ b/tests-v3/test_particlesets.py @@ -39,7 +39,7 @@ def test_pset_create_list_with_customvariable(fieldset): @pytest.mark.parametrize("restart", [True, False]) -def test_pset_create_fromparticlefile(fieldset, restart, tmp_parquet): +def test_pset_create_fromparticlefile(fieldset, restart, tmp_zarrfile): lon = np.linspace(0, 1, 10, dtype=np.float32) lat = np.linspace(1, 0, 10, dtype=np.float32) @@ -48,7 +48,7 @@ def test_pset_create_fromparticlefile(fieldset, restart, tmp_parquet): TestParticle = TestParticle.add_variable("p3", np.float64, to_write="once") pset = ParticleSet(fieldset, lon=lon, lat=lat, depth=[4] * len(lon), pclass=TestParticle, p3=np.arange(len(lon))) - pfile = pset.ParticleFile(tmp_parquet, outputdt=1) + pfile = pset.ParticleFile(tmp_zarrfile, outputdt=1) def Kernel(particle, fieldset, time): # pragma: no cover particle.p = 2.0 @@ -58,7 +58,7 @@ def Kernel(particle, fieldset, time): # pragma: no cover pset.execute(Kernel, runtime=2, dt=1, output_file=pfile) pset_new = ParticleSet.from_particlefile( - fieldset, pclass=TestParticle, filename=tmp_parquet, restart=restart, repeatdt=1 + fieldset, pclass=TestParticle, filename=tmp_zarrfile, restart=restart, repeatdt=1 ) for var in ["lon", "lat", "depth", "time", "p", "p2", "p3"]: diff --git a/tests/test_particlefile.py 
b/tests/test_particlefile.py index 4fac1a809..d53ff6aca 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -306,7 +306,6 @@ def IncreaseAge(particles, fieldset): # pragma: no cover pset.execute(IncreaseAge, runtime=np.timedelta64(npart * 2, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - # df = pd.read_parquet(tmp_parquet) df = parcels.read_particlefile(tmp_parquet) # Map sorted trajectory IDs to release times (0, 1, ..., npart-1 seconds) From b4e221412749cf0d3532bbd6383a51c02fec63ff Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 24 Apr 2026 10:54:48 +0200 Subject: [PATCH 29/34] Remove obs_written --- src/parcels/_core/particle.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/parcels/_core/particle.py b/src/parcels/_core/particle.py index 5152b1130..15c55a519 100644 --- a/src/parcels/_core/particle.py +++ b/src/parcels/_core/particle.py @@ -163,7 +163,6 @@ def get_default_particle(spatial_dtype: type[np.float32] | type[np.float64]) -> "cf_role": "trajectory_id", }, ), - Variable("obs_written", dtype=np.int32, initial=0, to_write=False), Variable("dt", dtype=np.float64, initial=1.0, to_write=False), Variable("state", dtype=np.int32, initial=StatusCode.Evaluate, to_write=False), ] From 663f80ca14d8297443f7289eb2456aec316c38d8 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 24 Apr 2026 10:56:16 +0200 Subject: [PATCH 30/34] Update migration guide --- docs/user_guide/v4-migration.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/user_guide/v4-migration.md b/docs/user_guide/v4-migration.md index 7e33be83c..655f1e9c1 100644 --- a/docs/user_guide/v4-migration.md +++ b/docs/user_guide/v4-migration.md @@ -36,7 +36,10 @@ Version 4 of Parcels is unreleased at the moment. 
The information in this migrat ## ParticleFile - Particlefiles should be created by `ParticleFile(...)` instead of `pset.ParticleFile(...)` -- The `name` argument in `ParticleFile` has been replaced by `store` and can now be a string, a Path or a zarr store. +- `ParticleFile` output is now in Parquet format +- `ParticleFile` writing behaviour now errors out if there's existing output (this is being further discussed in https://github.com/Parcels-code/Parcels/issues/2593 ) +- A utility to read in ParticleFile output is now available. `parcels.read_particlefile()` + ## Field From 5be22aa1adf4fb7386dd101bb5708fcddd1fbe94 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Apr 2026 09:15:56 +0000 Subject: [PATCH 31/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/user_guide/v4-migration.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/user_guide/v4-migration.md b/docs/user_guide/v4-migration.md index 655f1e9c1..0e06bbc7b 100644 --- a/docs/user_guide/v4-migration.md +++ b/docs/user_guide/v4-migration.md @@ -40,7 +40,6 @@ Version 4 of Parcels is unreleased at the moment. The information in this migrat - `ParticleFile` writing behaviour now errors out if there's existing output (this is being further discussed in https://github.com/Parcels-code/Parcels/issues/2593 ) - A utility to read in ParticleFile output is now available.
`parcels.read_particlefile()` - ## Field - `Field.eval()` returns an array of floats instead of a single float (related to the vectorization) From 002b8f2a97e73a7baa059b01cabdc6dd719e8a9a Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 24 Apr 2026 11:18:52 +0200 Subject: [PATCH 32/34] Revert from extra_metadata to metadata --- src/parcels/_core/particlefile.py | 8 +++----- src/parcels/_core/particleset.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 58483572d..cb0bf4050 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -91,13 +91,13 @@ def __init__(self, path: PathLike, outputdt): if not path.parent.exists(): raise ValueError(f"Folder location for {path=!r} does not exist. Create the folder location first.") - self.extra_metadata = {} + self.metadata = {} def __repr__(self) -> str: return particlefile_repr(self) def set_metadata(self, parcels_grid_mesh: Literal["spherical", "flat"]): - self.extra_metadata.update( + self.metadata.update( { "feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", @@ -132,9 +132,7 @@ def write(self, pset: ParticleSet, time, indices=None): if self._writer is None: assert not self.path.exists(), "If the file exists, the writer should already be set" - self._writer = pq.ParquetWriter( - self.path, _get_schema(pclass, self.extra_metadata, pset.fieldset.time_interval) - ) + self._writer = pq.ParquetWriter(self.path, _get_schema(pclass, self.metadata, pset.fieldset.time_interval)) if isinstance(time, (np.timedelta64, np.datetime64)): time = timedelta_to_float(time - time_interval.left) diff --git a/src/parcels/_core/particleset.py b/src/parcels/_core/particleset.py index e59e2860b..b25c269a6 100644 --- a/src/parcels/_core/particleset.py +++ b/src/parcels/_core/particleset.py @@ -394,7 +394,7 @@ def execute( if output_file is not None: 
output_file.set_metadata(self.fieldset.gridset[0]._mesh) - output_file.extra_metadata["parcels_kernels"] = self._kernel.funcname + output_file.metadata["parcels_kernels"] = self._kernel.funcname dt, sign_dt = _convert_dt_to_float(dt) self._data["dt"][:] = dt From 3c5264798044135b7f603a0a1b8a1ff529fc7f7d Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 24 Apr 2026 11:28:20 +0200 Subject: [PATCH 33/34] Fix test_pfile_array_remove_particles --- tests/test_particlefile.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index d53ff6aca..d782fb171 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -1,4 +1,3 @@ -import os import tempfile from contextlib import nullcontext as does_not_raise from datetime import datetime, timedelta @@ -74,7 +73,6 @@ def test_write_fieldset_without_time(tmp_parquet): assert table["time"].to_numpy()[1] == 1.0 -@pytest.mark.skip("Keep or remove? 
Introduced in 5d7dd6bba800baa0fe4bd38edfc17ca3e310062b ") def test_pfile_array_remove_particles(fieldset, tmp_parquet): """If a particle from the middle of a particleset is removed, that writing doesn't crash""" npart = 10 @@ -92,9 +90,7 @@ def test_pfile_array_remove_particles(fieldset, tmp_parquet): new_time = 86400 # s in a day pset._data["time"][:] = new_time pfile.write(pset, new_time) - ds = xr.open_zarr(tmp_parquet) - timearr = ds["time"][:] - assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) + pfile.close() def test_pfile_array_remove_all_particles(fieldset, tmp_parquet): @@ -189,8 +185,6 @@ def IncrLon(particles, fieldset): # pragma: no cover # test whether samplevar[:, k] = k for k in range(samplevar.shape[1]): assert np.allclose([p for p in samplevar[:, k] if np.isfinite(p)], k + 1) - filesize = os.path.getsize(str(tmp_parquet)) - assert filesize < 1024 * 65 # test that chunking leads to filesize less than 65KB def test_file_warnings(fieldset, tmp_parquet): From ae375b372a7d332a67dbbfe132174c022a4a973a Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 24 Apr 2026 13:44:24 +0200 Subject: [PATCH 34/34] Fix numpy warning https://github.com/Parcels-code/Parcels/pull/2583#discussion_r3131150923 --- src/parcels/_core/particlefile.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index cb0bf4050..90384f356 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -164,15 +164,17 @@ def _to_write_particles(particle_data, time): time - np.abs(particle_data["dt"] / 2), particle_data["time"], where=np.isfinite(particle_data["time"]), + out=None, ) & np.greater_equal( time + np.abs(particle_data["dt"] / 2), particle_data["time"], where=np.isfinite(particle_data["time"]), + out=None, ) # check time - dt/2 <= particle_data["time"] <= time + dt/2 | ( (np.isnan(particle_data["dt"])) - 
& np.equal(time, particle_data["time"], where=np.isfinite(particle_data["time"])) + & np.equal(time, particle_data["time"], where=np.isfinite(particle_data["time"]), out=None) ) # or dt is NaN and time matches particle_data["time"] ) & (np.isfinite(particle_data["trajectory"]))