Merge pull request #976 from OceanParcels/dask_updates

erikvansebille · web-flow · commit 23c4fe4e8c9d · 2021-01-12T08:03:26.000+01:00
Fixing autochunking.
diff --git a/parcels/examples/example_dask_chunk_OCMs.py b/parcels/examples/example_dask_chunk_OCMs.py
@@ -248,7 +248,7 @@ def test_nemo_3D(mode, chunk_mode):
         assert (len(field_set.U.grid.load_chunk) != 1)
         assert (len(field_set.U.grid.load_chunk) == (1 * int(math.ceil(75.0/25.0)) * int(math.ceil(201.0/201.0)) * int(math.ceil(151.0/151.0))))
         assert (len(field_set.V.grid.load_chunk) != 1)
-        assert (len(field_set.V.grid.load_chunk) == (1 * int(math.ceil(75.0/1.0)) * int(math.ceil(201.0/8.0)) * int(math.ceil(151.0/8.0))))
+        assert (len(field_set.V.grid.load_chunk) == (1 * int(math.ceil(75.0/75.0)) * int(math.ceil(201.0/8.0)) * int(math.ceil(151.0/8.0))))
 
 
 @pytest.mark.parametrize('mode', ['jit'])
@@ -318,7 +318,7 @@ def test_pop(mode, chunk_mode):
         assert (len(field_set.U.grid.load_chunk) != 1)
         assert (len(field_set.V.grid.load_chunk) != 1)
         assert (len(field_set.W.grid.load_chunk) != 1)
-        assert (len(field_set.U.grid.load_chunk) == (int(math.ceil(21.0/8.0)) * int(math.ceil(60.0/8.0)) * int(math.ceil(60.0/8.0))))
+        assert (len(field_set.U.grid.load_chunk) == (int(math.ceil(21.0/3.0)) * int(math.ceil(60.0/8.0)) * int(math.ceil(60.0/8.0))))
 
 
 @pytest.mark.parametrize('mode', ['jit'])
diff --git a/parcels/field.py b/parcels/field.py
@@ -1146,17 +1146,14 @@ def chunk_setup(self):
     def chunk_data(self):
         if not self.chunk_set:
             self.chunk_setup()
-        # self.grid.load_chunk code:
-        # 0: not loaded
-        # 1: was asked to load by kernel in JIT
-        # 2: is loaded and was touched last C call
-        # 3: is loaded
+        g = self.grid
         if isinstance(self.data, da.core.Array):
             for block_id in range(len(self.grid.load_chunk)):
-                if self.grid.load_chunk[block_id] == 1 or self.grid.load_chunk[block_id] > 1 and self.data_chunks[block_id] is None:
+                if g.load_chunk[block_id] == g.chunk_loading_requested \
+                        or g.load_chunk[block_id] in g.chunk_loaded and self.data_chunks[block_id] is None:
                     block = self.get_block(block_id)
                     self.data_chunks[block_id] = np.array(self.data.blocks[(slice(self.grid.tdim),) + block])
-                elif self.grid.load_chunk[block_id] == 0:
+                elif g.load_chunk[block_id] == g.chunk_not_loaded:
                     if isinstance(self.data_chunks, list):
                         self.data_chunks[block_id] = None
                     else:
@@ -1168,7 +1165,7 @@ def chunk_data(self):
             else:
                 self.data_chunks[0, :] = None
             self.c_data_chunks[0] = None
-            self.grid.load_chunk[0] = 2
+            self.grid.load_chunk[0] = g.chunk_loaded_touched
             self.data_chunks[0] = np.array(self.data)
 
     @property
@@ -1189,9 +1186,9 @@ class CField(Structure):
         allow_time_extrapolation = 1 if self.allow_time_extrapolation else 0
         time_periodic = 1 if self.time_periodic else 0
         for i in range(len(self.grid.load_chunk)):
-            if self.grid.load_chunk[i] == 1:
+            if self.grid.load_chunk[i] == self.grid.chunk_loading_requested:
                 raise ValueError('data_chunks should have been loaded by now if requested. grid.load_chunk[bid] cannot be 1')
-            if self.grid.load_chunk[i] > 1:
+            if self.grid.load_chunk[i] in self.grid.chunk_loaded:
                 if not self.data_chunks[i].flags.c_contiguous:
                     self.data_chunks[i] = self.data_chunks[i].copy()
                 self.c_data_chunks[i] = self.data_chunks[i].ctypes.data_as(POINTER(POINTER(c_float)))
@@ -1357,12 +1354,13 @@ def computeTimeChunk(self, data, tindex):
                 ti = g.ti + tindex
             timestamp = self.timestamps[np.where(ti < summedlen)[0][0]]
 
+        rechunk_callback_fields = self.chunk_setup if isinstance(tindex, list) else None
         filebuffer = self._field_fb_class(self.dataFiles[g.ti + tindex], self.dimensions, self.indices,
                                           netcdf_engine=self.netcdf_engine, timestamp=timestamp,
                                           interp_method=self.interp_method,
                                           data_full_zdim=self.data_full_zdim,
                                           chunksize=self.chunksize,
-                                          rechunk_callback_fields=self.chunk_setup,
+                                          rechunk_callback_fields=rechunk_callback_fields,
                                           chunkdims_name_map=self.netcdf_chunkdims_name_map)
         filebuffer.__enter__()
         time_data = filebuffer.time
diff --git a/parcels/fieldfilebuffer.py b/parcels/fieldfilebuffer.py
@@ -3,6 +3,7 @@
 from dask import utils as da_utils
 import numpy as np
 import xarray as xr
+from netCDF4 import Dataset as ncDataset
 
 import datetime
 import math
@@ -188,7 +189,12 @@ def __init__(self, *args, **kwargs):
 
 
 class DaskFileBuffer(NetcdfFileBuffer):
-    _static_name_map = ['time', 'depth', 'lat', 'lon']
+    _static_name_maps = {'time': ['time', 'time_count', 'time_counter', 'timer_count', 't'],
+                         'depth': ['depth', 'depthu', 'depthv', 'depthw', 'depths', 'deptht', 'depthx', 'depthy',
+                                   'depthz', 'z', 'z_u', 'z_v', 'z_w', 'd', 'k', 'w_dep', 'w_deps', 'Z', 'Zp1',
+                                   'Zl', 'Zu', 'level'],
+                         'lat': ['lat', 'nav_lat', 'y', 'latitude', 'la', 'lt', 'j', 'YC', 'YG'],
+                         'lon': ['lon', 'nav_lon', 'x', 'longitude', 'lo', 'ln', 'i', 'XC', 'XG']}
     _min_dim_chunksize = 16
 
     """ Class that encapsulates and manages deferred access to file data. """
@@ -267,6 +273,45 @@ def close(self):
         self.chunking_finalized = False
         self.chunk_mapping = None
 
+    @classmethod
+    def add_to_dimension_name_map_global(self, name_map):
+        """
+        [externally callable]
+        This function adds entries to the name map from parcels_dim -> netcdf_dim. This is required if you want to
+        use auto-chunking on large fields whose map parameters are not defined. This function must be called before
+        entering the filebuffer object. Example:
+        DaskFileBuffer.add_to_dimension_name_map_global({'lat': 'nydim',
+                                                         'lon': 'nxdim',
+                                                         'time': 'ntdim',
+                                                         'depth': 'nddim'})
+        fieldset = FieldSet(..., chunksize='auto')
+        [...]
+        Note that not all parcels dimensions need to be present in 'name_map'.
+        """
+        assert isinstance(name_map, dict)
+        for pcls_dim_name in name_map.keys():
+            if isinstance(name_map[pcls_dim_name], list):
+                for nc_dim_name in name_map[pcls_dim_name]:
+                    self._static_name_maps[pcls_dim_name].append(nc_dim_name)
+            elif isinstance(name_map[pcls_dim_name], str):
+                self._static_name_maps[pcls_dim_name].append(name_map[pcls_dim_name])
+
+    def add_to_dimension_name_map(self, name_map):
+        """
+        [externally callable]
+        This function adds entries to the name map from parcels_dim -> netcdf_dim. This is required if you want to
+        use auto-chunking on large fields whose map parameters are not defined. This function must be called after
+        constructing a filebuffer object and before entering the filebuffer. Example:
+        fb = DaskFileBuffer(...)
+        fb.add_to_dimension_name_map({'lat': 'nydim', 'lon': 'nxdim', 'time': 'ntdim', 'depth': 'nddim'})
+        with fb:
+            [do_stuff]
+        Note that not all parcels dimensions need to be present in 'name_map'.
+        """
+        assert isinstance(name_map, dict)
+        for pcls_dim_name in name_map.keys():
+            self._static_name_maps[pcls_dim_name].append(name_map[pcls_dim_name])
+
     def _get_available_dims_indices_by_request(self):
         """
         [private function - not to be called from outside the class]
@@ -278,7 +323,7 @@ def _get_available_dims_indices_by_request(self):
         neg_offset = 0
         tpl_offset = 0
         for name in ['time', 'depth', 'lat', 'lon']:
-            i = self._static_name_map.index(name)
+            i = list(self._static_name_maps.keys()).index(name)
             if (name not in self.dimensions):
                 result[name] = None
                 tpl_offset += 1
@@ -300,7 +345,32 @@ def _get_available_dims_indices_by_namemap(self):
         """
         result = {}
         for name in ['time', 'depth', 'lat', 'lon']:
-            result[name] = self._static_name_map.index(name)
+            result[name] = list(self._static_name_maps.keys()).index(name)
+        return result
+
+    def _get_available_dims_indices_by_netcdf_file(self):
+        """
+        [private function - not to be called from outside the class]
+        [File needs to be open (i.e. self.dataset is not None) for this to work - otherwise an IOError is raised]
+        Returns a dict mapping 'parcels_dimname' -> int index of the matching dimension in the opened dataset.
+        This dictionary is based on the dimensions present in the NetCDF file; unmatched Parcels dimensions are omitted.
+        Example: {'time': 0, 'depth': 5, 'lat': 3, 'lon': 1}
+                 for NetCDF with dimensions:
+                     timer: 1
+                     x: [0 4000]
+                     xr: [0 3999]
+                     y: [0 2140]
+                     yr: [0 2139]
+                     z: [0 75]
+        """
+        if self.dataset is None:
+            raise IOError("Trying to parse NetCDF header information before opening the file.")
+        result = {}
+        for pcls_dimname in ['time', 'depth', 'lat', 'lon']:
+            for nc_dimname in self._static_name_maps[pcls_dimname]:
+                if nc_dimname not in self.dataset.dims.keys():
+                    continue
+                result[pcls_dimname] = list(self.dataset.dims.keys()).index(nc_dimname)
         return result
 
     def _is_dimension_available(self, dimension_name):
@@ -346,6 +416,14 @@ def _is_dimension_in_dataset(self, parcels_dimension_name, netcdf_dimension_name
         if netcdf_dimension_name is not None and netcdf_dimension_name in self.dataset.dims.keys():
             value = self.dataset.dims[netcdf_dimension_name]
             k, dname, dvalue = i, netcdf_dimension_name, value
+        elif self.dimensions is None or self.dataset is None:
+            return k, dname, dvalue
+        else:
+            for name in self._static_name_maps[dimension_name]:
+                if name in self.dataset.dims:
+                    value = self.dataset.dims[name]
+                    k, dname, dvalue = i, name, value
+                    break
         return k, dname, dvalue
 
     def _is_dimension_in_chunksize_request(self, parcels_dimension_name):
@@ -467,6 +545,53 @@ def _get_initial_chunk_dictionary_by_dict_(self):
                 self.chunksize.pop('lon')
         return chunk_dict, chunk_index_map
 
+    def _failsafe_parse_(self):
+        """
+        [private function - not to be called from outside the class]
+        ['name' needs to be initialised]
+        """
+        # ==== failsafe - open the file with netCDF4 and deduce the dimensions from its variables.         ==== #
+        # ==== done by scanning ALL variables in the NetCDF, picking the one with the most dimensions,     ==== #
+        # ==== and comparing its dimension names with the name map available here.                         ==== #
+        init_chunk_dict = {}
+        self.dataset = ncDataset(str(self.filename))
+        refdims = self.dataset.dimensions.keys()
+        max_field = ""
+        max_dim_names = ()
+        max_coincide_dims = 0
+        for vname in self.dataset.variables:
+            var = self.dataset.variables[vname]
+            coincide_dims = []
+            for vdname in var.dimensions:
+                if vdname in refdims:
+                    coincide_dims.append(vdname)
+            n_coincide_dims = len(coincide_dims)
+            if n_coincide_dims > max_coincide_dims:
+                max_field = vname
+                max_dim_names = tuple(coincide_dims)
+                max_coincide_dims = n_coincide_dims
+        self.name = max_field
+        for nc_dname in max_dim_names:
+            pcls_dname = None
+            for dname in self._static_name_maps.keys():
+                if nc_dname in self._static_name_maps[dname]:
+                    pcls_dname = dname
+                    break
+            nc_dimsize = None
+            pcls_dim_chunksize = None
+            if pcls_dname is not None and pcls_dname in self.dimensions:
+                pcls_dim_chunksize = self._min_dim_chunksize
+            if isinstance(self.chunksize, dict) and pcls_dname is not None:
+                nc_dimsize = self.dataset.dimensions[nc_dname].size
+                if pcls_dname in self.chunksize.keys():
+                    pcls_dim_chunksize = self.chunksize[pcls_dname][1]
+            if pcls_dname is not None and nc_dname is not None and nc_dimsize is not None and pcls_dim_chunksize is not None:
+                init_chunk_dict[nc_dname] = pcls_dim_chunksize
+
+        # ==== because in this case it has shown that the requested chunksize setup cannot be used, ==== #
+        # ==== replace the requested chunksize with this auto-derived version.                      ==== #
+        return init_chunk_dict
+
     def _get_initial_chunk_dictionary(self):
         """
         [private function - not to be called from outside the class]
@@ -532,8 +657,10 @@ def _get_initial_chunk_dictionary(self):
         except:
             logger.warning("Chunking with init_chunk_dict = {} failed - Executing Dask chunking 'failsafe'...".format(init_chunk_dict))
             self.autochunkingfailed = True
-            self.dataset.close()
-            raise DaskChunkingError(self.__class__.__name__, "No correct mapping found between Parcels- and NetCDF dimensions! Please correct the 'FieldSet(..., chunksize={...})' parameter and try again.")
+            if not self.autochunkingfailed:
+                init_chunk_dict = self._failsafe_parse_()
+            if isinstance(self.chunksize, dict):
+                self.chunksize = init_chunk_dict
         finally:
             self.dataset.close()
             self.chunk_mapping = init_chunk_map
@@ -572,8 +699,6 @@ def data_access(self):
                         self.rechunk_callback_fields()
                         self.chunking_finalized = True
                 else:
-                    if not self.autochunkingfailed:
-                        data = data.rechunk(self.chunk_mapping)
                     self.chunking_finalized = True
         else:
             da_data = da.from_array(data, chunks=self.chunksize)
diff --git a/parcels/fieldset.py b/parcels/fieldset.py
@@ -1069,9 +1069,11 @@ def computeTimeChunk(self, time, dt):
                 f.data = f.reshape(data)
                 if not f.chunk_set:
                     f.chunk_setup()
-                if len(g.load_chunk) > 0:
-                    g.load_chunk = np.where(g.load_chunk == 2, 1, g.load_chunk)
-                    g.load_chunk = np.where(g.load_chunk == 3, 0, g.load_chunk)
+                if len(g.load_chunk) > g.chunk_not_loaded:
+                    g.load_chunk = np.where(g.load_chunk == g.chunk_loaded_touched,
+                                            g.chunk_loading_requested, g.load_chunk)
+                    g.load_chunk = np.where(g.load_chunk == g.chunk_deprecated,
+                                            g.chunk_not_loaded, g.load_chunk)
 
             elif g.update_status == 'updated':
                 lib = np if isinstance(f.data, np.ndarray) else da
@@ -1119,11 +1121,14 @@ def computeTimeChunk(self, time, dt):
                                 f.data[2, :] = None
                         f.data[1:, :] = f.data[:2, :]
                         f.data[0, :] = data
-                g.load_chunk = np.where(g.load_chunk == 3, 0, g.load_chunk)
+                g.load_chunk = np.where(g.load_chunk == g.chunk_loaded_touched,
+                                        g.chunk_loading_requested, g.load_chunk)
+                g.load_chunk = np.where(g.load_chunk == g.chunk_deprecated,
+                                        g.chunk_not_loaded, g.load_chunk)
                 if isinstance(f.data, da.core.Array) and len(g.load_chunk) > 0:
                     if signdt >= 0:
                         for block_id in range(len(g.load_chunk)):
-                            if g.load_chunk[block_id] == 2:
+                            if g.load_chunk[block_id] == g.chunk_loaded_touched:
                                 if f.data_chunks[block_id] is None:
                                     # file chunks were never loaded.
                                     # happens when field not called by kernel, but shares a grid with another field called by kernel
@@ -1134,7 +1139,7 @@ def computeTimeChunk(self, time, dt):
                                 f.data_chunks[block_id][2] = np.array(f.data.blocks[(slice(3),)+block][2])
                     else:
                         for block_id in range(len(g.load_chunk)):
-                            if g.load_chunk[block_id] == 2:
+                            if g.load_chunk[block_id] == g.chunk_loaded_touched:
                                 if f.data_chunks[block_id] is None:
                                     # file chunks were never loaded.
                                     # happens when field not called by kernel, but shares a grid with another field called by kernel
diff --git a/parcels/grid.py b/parcels/grid.py
@@ -233,6 +233,26 @@ def computeTimeChunk(self, f, time, signdt):
                 nextTime_loc = self.time[0] + periods*(self.time_full[-1]-self.time_full[0])
         return nextTime_loc
 
+    @property
+    def chunk_not_loaded(self):
+        return 0
+
+    @property
+    def chunk_loading_requested(self):
+        return 1
+
+    @property
+    def chunk_loaded_touched(self):
+        return 2
+
+    @property
+    def chunk_deprecated(self):
+        return 3
+
+    @property
+    def chunk_loaded(self):
+        return [2, 3]
+
 
 class RectilinearGrid(Grid):
     """Rectilinear Grid
diff --git a/parcels/kernel.py b/parcels/kernel.py
@@ -252,8 +252,9 @@ def execute_jit(self, pset, endtime, dt):
                         f.c_data_chunks[block_id] = None
 
             for g in pset.fieldset.gridset.grids:
-                g.load_chunk = np.where(g.load_chunk == 1, 2, g.load_chunk)
-                if len(g.load_chunk) > 0:  # not the case if a field in not called in the kernel
+                g.load_chunk = np.where(g.load_chunk == g.chunk_loading_requested,
+                                        g.chunk_loaded_touched, g.load_chunk)
+                if len(g.load_chunk) > g.chunk_not_loaded:  # not the case if a field in not called in the kernel
                     if not g.load_chunk.flags.c_contiguous:
                         g.load_chunk = g.load_chunk.copy()
                 if not g.depth.flags.c_contiguous:
@@ -410,8 +411,9 @@ def execute(self, pset, endtime, dt, recovery=None, output_file=None, execute_on
 
         if pset.fieldset is not None:
             for g in pset.fieldset.gridset.grids:
-                if len(g.load_chunk) > 0:  # not the case if a field in not called in the kernel
-                    g.load_chunk = np.where(g.load_chunk == 2, 3, g.load_chunk)
+                if len(g.load_chunk) > g.chunk_not_loaded:  # not the case if a field in not called in the kernel
+                    g.load_chunk = np.where(g.load_chunk == g.chunk_loaded_touched,
+                                            g.chunk_deprecated, g.load_chunk)
 
         # Execute the kernel over the particle set
         if self.ptype.uses_jit: