diff --git a/Changelog.rst b/Changelog.rst
index cbc00cf8d3..a5eefa214e 100644
--- a/Changelog.rst
+++ b/Changelog.rst
@@ -1,8 +1,15 @@
 Version NEXTVERSION
 -------------------
 
 **2026-01-??**
 
+* Write Zarr v3 datasets with `cf.write`
+  (https://github.com/NCAS-CMS/cf-python/issues/895)
+* Read Zarr v2 and v3 datasets that contain a group hierarchy with
+  `cf.read` (https://github.com/NCAS-CMS/cf-python/issues/894)
+* Reduce the time taken to import `cf`
+  (https://github.com/NCAS-CMS/cf-python/issues/902)
+* New optional dependency: ``zarr>=3.1.3``
 * New function to control the creation of cached elements during data
   display: `cf.display_data`
   (https://github.com/NCAS-CMS/cf-python/issues/913)
diff --git a/README.md b/README.md
index 8829d96f6a..140e6faeb9 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,8 @@ of its array manipulation and can:
 * read field constructs from netCDF, CDL, Zarr, PP and UM datasets with a
   choice of netCDF backends, and in local, http, and s3 locations,
 * create new field constructs in memory,
-* write and append field and domain constructs to netCDF datasets on disk,
+* write and append field and domain constructs to netCDF and Zarr v3
+  datasets on disk,
 * read, create, and manipulate UGRID mesh topologies,
 * read, write, and create coordinates defined by geometry cells,
 * read netCDF and CDL datasets containing hierarchical groups,
diff --git a/cf/aggregate.py b/cf/aggregate.py
index ff53f31aa3..149be58d67 100644
--- a/cf/aggregate.py
+++ b/cf/aggregate.py
@@ -4112,7 +4112,7 @@ def _get_hfl(
         # Record the bounds of the first and last (sorted) cells
         first, last = hfl_cache.flb.get(hash_value, (None, None))
         if first is None:
-            cached_elements = d._get_cached_elements()
+            cached_elements = d.get_cached_elements()
             x = []
             for i in (0, 1, -2, -1):
                 value = cached_elements.get(i)
diff --git a/cf/data/data.py b/cf/data/data.py
index e906087c93..7b75d16008 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -2761,7 +2761,7 @@ def Units(self, value):
         self._set_dask(dx, clear=self._ALL ^ self._CACHE ^ self._CFA)
 
         # Adjust cached values for the new units
-        cache = self._get_cached_elements()
+        cache = self.get_cached_elements()
         if cache:
             self._set_cached_elements(
                 {index: cf_func(value) for index, value in cache.items()}
diff --git a/cf/dimensioncoordinate.py b/cf/dimensioncoordinate.py
index 644709bbda..c1a061e55f 100644
--- a/cf/dimensioncoordinate.py
+++ b/cf/dimensioncoordinate.py
@@ -165,7 +165,7 @@ def _infer_direction(self):
         if data is not None:
             # Infer the direction from the data
             if data.size > 1:
-                c = data._get_cached_elements()
+                c = data.get_cached_elements()
                 if c:
                     try:
                         return bool(c.get(0) <= c.get(1))
@@ -179,7 +179,7 @@ def _infer_direction(self):
         data = self.get_bounds_data(None, _fill_value=False)
         if data is not None:
             # Infer the direction from the bounds
-            c = data._get_cached_elements()
+            c = data.get_cached_elements()
             if c:
                 try:
                     return bool(c.get(0) <= c.get(1))
diff --git a/cf/field.py b/cf/field.py
index 53b889259d..b571158ea9 100644
--- a/cf/field.py
+++ b/cf/field.py
@@ -6994,7 +6994,7 @@ def collapse(
             else:
                 b = dim.data
 
-            cached_elements = b._get_cached_elements()
+            cached_elements = b.get_cached_elements()
             try:
                 # Try to set the new bounds from cached values
                 bounds_data = Data(
diff --git a/cf/functions.py b/cf/functions.py
index 7fa8a3d08a..6b1850a388 100644
--- a/cf/functions.py
+++ b/cf/functions.py
@@ -3198,7 +3198,7 @@ def environment(display=True, paths=True):
         netCDF4: 1.7.2 /home/miniconda3/lib/python3.12/site-packages/netCDF4/__init__.py
         h5netcdf: 1.3.0 /home/miniconda3/lib/python3.12/site-packages/h5netcdf/__init__.py
         h5py: 3.12.1 /home/miniconda3/lib/python3.12/site-packages/h5py/__init__.py
-        zarr: 3.0.8 /home/miniconda3/lib/python3.12/site-packages/zarr/__init__.py
+        zarr: 3.1.3 /home/miniconda3/lib/python3.12/site-packages/zarr/__init__.py
         s3fs: 2024.12.0 /home/miniconda3/lib/python3.12/site-packages/s3fs/__init__.py
         scipy: 1.15.1 /home/miniconda3/lib/python3.12/site-packages/scipy/__init__.py
         dask: 2025.5.1 /home/miniconda3/lib/python3.12/site-packages/dask/__init__.py
@@ -3224,7 +3224,7 @@ def environment(display=True, paths=True):
         netCDF4: 1.7.2
         h5netcdf: 1.3.0
         h5py: 3.12.1
-        zarr: 3.0.8
+        zarr: 3.1.3
         s3fs: 2024.12.0
         scipy: 1.15.1
         dask: 2025.5.1
diff --git a/cf/read_write/read.py b/cf/read_write/read.py
index ba3cd8469b..52cc132e7e 100644
--- a/cf/read_write/read.py
+++ b/cf/read_write/read.py
@@ -316,6 +316,10 @@ class read(cfdm.read):
 
             .. versionadded:: 3.17.0
 
+        {{read store_dataset_shards: `bool`, optional}}
+
+            .. versionadded:: NEXTVERSION
+
         {{read cfa: `dict`, optional}}
 
             .. versionadded:: 3.15.0
@@ -328,6 +332,10 @@ class read(cfdm.read):
 
             .. versionadded:: 3.17.0
 
+        {{read group_dimension_search: `str`, optional}}
+
+            .. versionadded:: NEXTVERSION
+
         umversion: deprecated at version 3.0.0
             Use the *um* parameter instead.
@@ -434,6 +442,7 @@ def __new__(
         warn_valid=False,
         dask_chunks="storage-aligned",
         store_dataset_chunks=True,
+        store_dataset_shards=True,
         domain=False,
         cfa=None,
         cfa_write=None,
@@ -445,6 +454,7 @@
         ignore_read_error=False,
         fmt=None,
         file_type=None,
+        group_dimension_search="closest_ancestor",
     ):
         """Read field or domain constructs from a dataset."""
         kwargs = locals()
diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py
index 21bdff254d..d64ebdbeba 100644
--- a/cf/test/test_Data.py
+++ b/cf/test/test_Data.py
@@ -4457,28 +4457,30 @@ def test_Data__init__datetime(self):
         self.assertTrue((q == d).array.all())
         self.assertTrue((d == q).array.all())
 
-    def test_Data__str__(self):
-        """Test `Data.__str__`"""
-        elements0 = (0, -1, 1)
-        for array in ([1], [1, 2], [1, 2, 3]):
-            d = cf.Data(array)
-            d[0] = 1
-            self.assertEqual(str(d), str(array))
-            d += 0
-            self.assertEqual(str(d), str(array))
-
-        # Test when size > 3, i.e. second element is not there.
-        d = cf.Data([1, 2, 3, 4])
-
-        self.assertEqual(str(d), "[1, ..., 4]")
-        cache = d.get_cached_elements()
-        self.assertNotIn(1, cache)
-        for element in elements0[:2]:
-            self.assertIn(element, cache)
-
-        d[0] = 1
-        for element in elements0:
-            self.assertNotIn(element, d.get_cached_elements())
+    def test_Data__repr__str(self):
+        """Test all means of Data inspection."""
+        for d in [
+            cf.Data(9, units="km"),
+            cf.Data([9], units="km"),
+            cf.Data([[9]], units="km"),
+            cf.Data([8, 9], units="km"),
+            cf.Data([[8, 9]], units="km"),
+            cf.Data([7, 8, 9], units="km"),
+            cf.Data([[7, 8, 9]], units="km"),
+            cf.Data([6, 7, 8, 9], units="km"),
+            cf.Data([[6, 7, 8, 9]], units="km"),
+            cf.Data([[6, 7], [8, 9]], units="km"),
+            cf.Data([[6, 7, 8, 9], [6, 7, 8, 9]], units="km"),
+        ]:
+            _ = repr(d)
+            _ = str(d)
+
+        # Test when the data contains date-times with the first
+        # element masked
+        dt = np.ma.array([10, 20], mask=[True, False])
+        d = cf.Data(dt, units="days since 2000-01-01")
+        self.assertEqual(str(d), "[--, 2000-01-21 00:00:00]")
+        self.assertEqual(repr(d), "<CF Data(2): [--, 2000-01-21 00:00:00] days since 2000-01-01>")
 
     def test_Data_cull_graph(self):
         """Test Data.cull_graph."""
diff --git a/cf/test/test_zarr.py b/cf/test/test_zarr.py
new file mode 100644
index 0000000000..2310a83953
--- /dev/null
+++ b/cf/test/test_zarr.py
@@ -0,0 +1,333 @@
+import atexit
+import datetime
+import faulthandler
+import os
+import shutil
+import tempfile
+import unittest
+
+faulthandler.enable()  # to debug seg faults and timeouts
+
+import zarr
+
+import cf
+
+warnings = False
+
+# Set up temporary directories
+tmpdirs = [
+    tempfile.mkdtemp("_test_zarr.zarr", dir=os.getcwd()) for i in range(2)
+]
+[tmpdir1, tmpdir2] = tmpdirs
+
+# Set up temporary files
+tmpfiles = [
+    tempfile.mkstemp("_test_zarr.nc", dir=os.getcwd())[1] for i in range(2)
+]
+[tmpfile1, tmpfile2] = tmpfiles
+
+
+def _remove_tmpdirs():
+    """Remove temporary files and directories created during tests."""
+    for f in tmpfiles:
+        try:
+            os.remove(f)
+        except OSError:
+            pass
+
+    for d in tmpdirs:
+        try:
+            shutil.rmtree(d)
+        except OSError:
+            pass
+
+
+atexit.register(_remove_tmpdirs)
+
+
+class read_writeTest(unittest.TestCase):
+    """Test the reading and writing of field constructs from/to disk."""
+
+    f0 = cf.example_field(0)
+
+    def setUp(self):
+        """Preparations called immediately before each test method."""
+        # Disable log messages to silence expected warnings
+        cf.LOG_LEVEL("DISABLE")
+        # Note: to enable all messages for given methods, lines or
+        # calls (those without a 'verbose' option to do the same)
+        # e.g. to debug them, wrap them (for methods, start-to-end
+        # internally) as follows:
+        #
+        # cf.LOG_LEVEL('DEBUG')
+        # < ... test code ... >
+        # cf.log_level('DISABLE')
+
+    def test_zarr_read_write_1(self):
+        """Test Zarr read/write on example fields."""
+        for i, f in enumerate(cf.example_fields()):
+            if i in (8, 9, 10):
+                # Can't write UGRID yet
+                continue
+
+            cf.write(f, tmpdir1, fmt="ZARR3")
+            z = cf.read(tmpdir1)
+            self.assertEqual(len(z), 1)
+            z = z[0]
+            self.assertTrue(z.equals(f))
+
+            # Check that the Zarr and netCDF4 encodings are equivalent
+            cf.write(f, tmpfile1, fmt="NETCDF4")
+            n = cf.read(tmpfile1)[0]
+            self.assertTrue(z.equals(n))
+
+    def test_zarr_read_write_2(self):
+        """Test Zarr read/write on various netCDF files."""
+        for filename in (
+            "DSG_timeSeries_contiguous.nc",
+            "DSG_timeSeries_indexed.nc",
+            "DSG_timeSeriesProfile_indexed_contiguous.nc",
+            "gathered.nc",
+            "geometry_1.nc",
+            "geometry_2.nc",
+            "geometry_3.nc",
+            "geometry_4.nc",
+            "string_char.nc",
+        ):
+            n = cf.read(filename)
+            cf.write(n, tmpdir1, fmt="ZARR3")
+            z = cf.read(tmpdir1)
+            self.assertEqual(len(z), len(n))
+            for a, b in zip(z, n):
+                self.assertTrue(a.equals(b))
+
+    def test_zarr_read_write_chunks_shards(self):
+        """Test Zarr read/write with chunks and shards."""
+        f = self.f0.copy()
+        f.data.nc_set_dataset_chunksizes([2, 3])
+
+        cf.write(f, tmpdir1, fmt="ZARR3")
+        z = cf.read(tmpdir1)[0]
+        self.assertTrue(z.equals(f))
+
+        z = zarr.open(tmpdir1)
+        self.assertEqual(z["q"].chunks, (2, 3))
+        self.assertIsNone(z["q"].shards)
+
+        # Make shards comprising 4 chunks
+        cf.write(f, tmpdir1, fmt="ZARR3", dataset_shards=4)
+        z = cf.read(tmpdir1, store_dataset_shards=False)[0]
+        self.assertTrue(z.equals(f))
+        self.assertIsNone(z.data.nc_dataset_shards())
+
+        z = zarr.open(tmpdir1)
+        self.assertEqual(z["q"].chunks, (2, 3))
+        self.assertEqual(z["q"].shards, (4, 6))
+
+        for shards in (4, [2, 2]):
+            f.data.nc_set_dataset_shards(shards)
+            cf.write(f, tmpdir1, fmt="ZARR3")
+            z = cf.read(tmpdir1)[0]
+            self.assertTrue(z.equals(f))
+            self.assertEqual(z.data.nc_dataset_shards(), (2, 2))
+
+            z = zarr.open(tmpdir1)
+            self.assertEqual(z["q"].chunks, (2, 3))
+            self.assertEqual(z["q"].shards, (4, 6))
+
+    def test_zarr_read_write_CFA(self):
+        """Test CF aggregation in Zarr."""
+        f = self.f0
+
+        cf.write(f, tmpdir1, fmt="ZARR3")
+        cf.write(f, tmpfile1, fmt="NETCDF4")
+
+        z = cf.read(tmpdir1, cfa_write="field")[0]
+        n = cf.read(tmpfile1, cfa_write="field")[0]
+
+        self.assertTrue(z.equals(f))
+        self.assertTrue(z.equals(n))
+
+        cf.write(z, tmpdir2, fmt="ZARR3", cfa="field")
+        cf.write(n, tmpfile2, fmt="NETCDF4", cfa="field")
+
+        z = cf.read(tmpdir2)[0]
+        n = cf.read(tmpfile2)[0]
+
+        self.assertTrue(z.equals(f))
+        self.assertTrue(z.equals(n))
+
+    def test_zarr_groups_1(self):
+        """Test the general handling of Zarr hierarchical groups."""
+        f = cf.example_field(1)
+
+        # Add a second grid mapping
+        datum = cf.Datum(parameters={"earth_radius": 7000000})
+        conversion = cf.CoordinateConversion(
+            parameters={"grid_mapping_name": "latitude_longitude"}
+        )
+
+        grid = cf.CoordinateReference(
+            coordinate_conversion=conversion,
+            datum=datum,
+            coordinates=["auxiliarycoordinate0", "auxiliarycoordinate1"],
+        )
+
+        f.set_construct(grid)
+
+        grid0 = f.construct("grid_mapping_name:rotated_latitude_longitude")
+        grid0.del_coordinate("auxiliarycoordinate0")
+        grid0.del_coordinate("auxiliarycoordinate1")
+
+        grouped_dir = tmpdir1
+        grouped_file = tmpfile1
+
+        # Set some groups
+        f.nc_set_variable_groups(["forecast", "model"])
+        f.construct("grid_latitude").bounds.nc_set_variable_groups(
+            ["forecast"]
+        )
+        for name in (
+            "longitude",  # Auxiliary coordinate
+            "latitude",  # Auxiliary coordinate
+            "long_name=Grid latitude name",  # Auxiliary coordinate
+            "measure:area",  # Cell measure
+            "surface_altitude",  # Domain ancillary
+            "air_temperature standard_error",  # Field ancillary
+            "grid_mapping_name:rotated_latitude_longitude",
+            "time",  # Dimension coordinate
+            "grid_latitude",  # Dimension coordinate
+        ):
+            f.construct(name).nc_set_variable_groups(["forecast"])
+
+        # Check the groups
+        cf.write(f, grouped_file, fmt="NETCDF4")
+        cf.write(f, grouped_dir, fmt="ZARR3")
+
+        n = cf.read(grouped_file)[0]
+        z = cf.read(grouped_dir)[0]
+        self.assertTrue(z.equals(n))
+        self.assertTrue(z.equals(f))
+
+        # Directly check the groups in the Zarr dataset
+        x = zarr.open(grouped_dir)
+        self.assertEqual(list(x.group_keys()), ["forecast"])
+        self.assertEqual(list(x["forecast"].group_keys()), ["model"])
+
+        cf.write(z, tmpdir2, fmt="ZARR3")
+        z1 = cf.read(tmpdir2)[0]
+        self.assertTrue(z1.equals(f))
+
+    def test_zarr_groups_dimension(self):
+        """Test dimension groups in Zarr datasets."""
+        f = self.f0.copy()
+
+        grouped_dir = tmpdir1
+        grouped_file = tmpfile1
+
+        # Set some groups
+        f.nc_set_variable_groups(["forecast", "model"])
+        for construct in f.constructs.filter_by_data().values():
+            construct.nc_set_variable_groups(["forecast"])
+
+        for construct in f.coordinates().values():
+            try:
+                construct.bounds.nc_set_variable_groups(["forecast"])
+            except ValueError:
+                pass
+
+        domain_axis = f.domain_axis("latitude")
+        domain_axis.nc_set_dimension_groups(["forecast"])
+
+        # Check the groups
+        cf.write(f, grouped_file, fmt="NETCDF4")
+        cf.write(f, grouped_dir, fmt="ZARR3")
+
+        n = cf.read(grouped_file)[0]
+        z = cf.read(grouped_dir)[0]
+        self.assertTrue(z.equals(n))
+        self.assertTrue(z.equals(f))
+
+        # Check that grouped netCDF datasets can only be read with
+        # 'closest_ancestor'
+        cf.read(grouped_file, group_dimension_search="closest_ancestor")
+        for gsn in ("furthest_ancestor", "local", "BAD VALUE"):
+            with self.assertRaises(ValueError):
+                cf.read(grouped_file, group_dimension_search=gsn)
+
+    def test_zarr_groups_DSG(self):
+        """Test Zarr groups containing DSGs."""
+        f = cf.example_field(4)
+
+        grouped_dir = tmpdir1
+        grouped_file = tmpfile1
+
+        f.compress("indexed_contiguous", inplace=True)
+        f.data.get_count().nc_set_variable("count")
+        f.data.get_index().nc_set_variable("index")
+
+        # Set some groups. (Write then read the field first, to
+        # create the compression variables on disk.)
+        cf.write(f, tmpfile2)
+        f = cf.read(tmpfile2)[0]
+
+        # Set some groups
+        f.nc_set_variable_groups(["forecast", "model"])
+        f.data.get_count().nc_set_variable_groups(["forecast"])
+        f.data.get_index().nc_set_variable_groups(["forecast"])
+        f.construct("altitude").nc_set_variable_groups(["forecast"])
+        f.data.get_count().nc_set_sample_dimension_groups(["forecast"])
+
+        cf.write(f, grouped_file, fmt="NETCDF4")
+        cf.write(f, grouped_dir, fmt="ZARR3")
+
+        n = cf.read(grouped_file)
+        z = cf.read(grouped_dir)
+
+        n = n[0]
+        z = z[0]
+        self.assertTrue(z.equals(n))
+        self.assertTrue(z.equals(f))
+
+    def test_zarr_groups_geometry(self):
+        """Test Zarr groups containing cell geometries."""
+        f = cf.example_field(6)
+
+        grouped_dir = tmpdir1
+        grouped_file = tmpfile1
+
+        cf.write(f, tmpfile2)
+        f = cf.read(tmpfile2)[0]
+
+        # Set some groups
+        f.nc_set_variable_groups(["forecast", "model"])
+        f.nc_set_geometry_variable_groups(["forecast"])
+        f.coordinate("longitude").bounds.nc_set_variable_groups(["forecast"])
+        f.nc_set_component_variable_groups("node_count", ["forecast"])
+        f.nc_set_component_variable_groups("part_node_count", ["forecast"])
+        f.nc_set_component_variable("interior_ring", "interior_ring")
+        f.nc_set_component_variable_groups("interior_ring", ["forecast"])
+
+        # Check the groups
+        cf.write(f, grouped_file, fmt="NETCDF4")
+        cf.write(f, grouped_dir, fmt="ZARR3")
+
+        n = cf.read(grouped_file)[0]
+        z = cf.read(grouped_dir)[0]
+        self.assertTrue(z.equals(n))
+        self.assertTrue(z.equals(f))
+
+    def test_zarr_read_v2(self):
+        """Test reading Zarr v2."""
+        f2 = cf.read("example_field_0.zarr2")
+        f3 = cf.read("example_field_0.zarr3")
+        self.assertEqual(len(f2), len(f3))
+        self.assertEqual(len(f2), 1)
+        self.assertTrue(f2[0].equals(f3[0]))
+
+
+if __name__ == "__main__":
+    print("Run date:", datetime.datetime.now())
+    cf.environment()
+    print("")
+    unittest.main(verbosity=2)
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index b029966c48..4ec36e0d67 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -265,6 +265,12 @@ Some further dependencies that enable further functionality are optional. This
 to facilitate cf-python being installed in restricted environments for
 which these features are not required.
 
+.. rubric:: Zarr
+
+* `zarr `_, version 3.1.3 or newer.
+
+  For reading and writing Zarr datasets.
+
 .. rubric:: Regridding
 
 * `esmpy `_, previously
diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst
index 30514a0e5c..9e9b3208a9 100644
--- a/docs/source/introduction.rst
+++ b/docs/source/introduction.rst
@@ -67,8 +67,8 @@ may nonetheless be modified in memory.
 
 The `cf` package can:
 
 * read :term:`field constructs <field construct>` and :term:`domain
-  constructs <domain construct>` from netCDF, CDL, PP and UM datasets
-  with a choice of netCDF backends,
+  constructs <domain construct>` from netCDF, CDL, Zarr, PP and UM
+  datasets with a choice of netCDF backends,
 
 * read files from OPeNDAP servers and S3 object stores,
 
@@ -76,7 +76,8 @@ The `cf` package can:
 
 * create new field constructs in memory,
 
-* write and append field constructs to netCDF datasets on disk,
+* write and append field and domain constructs to netCDF and Zarr
+  datasets on disk,
 
 * read, write, and manipulate UGRID mesh topologies,
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index bc3b6671c6..a696524e8f 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -5275,9 +5275,9 @@ Method Classes
 
-**Writing to a netCDF dataset**
--------------------------------
+**Writing to disk**
+-------------------
 
 The `cf.write` function writes a field construct, or a sequence of
-field constructs, to a new netCDF file on disk:
+field constructs, to a netCDF or Zarr dataset on disk:
 
 .. code-block:: python
    :caption: *Write a field construct to a netCDF dataset on disk.*
@@ -5345,8 +5345,8 @@
 By default the output file will be for CF-|version|.
 
 The `cf.write` function has optional parameters to
 
-* set the output netCDF format (all netCDF3 and netCDF4 formats are
-  possible);
+* set the output format (all netCDF3 and netCDF4 formats, as well as
+  Zarr v3, are possible);
 
 * append to the netCDF file rather than over-writing it by default;
diff --git a/setup.py b/setup.py
index f5ddbed1f7..2ed416976d 100755
--- a/setup.py
+++ b/setup.py
@@ -178,13 +178,13 @@ def compile():
 
 The ``cf`` package can:
 
-* read field constructs from netCDF, CDL, Zarr, PP and UM datasets,
+* read field and domain constructs from netCDF, CDL, Zarr, PP and UM datasets,
 
 * be fully flexible with respect to dataset storage chunking,
 
 * create new field constructs in memory,
 
-* write and append field constructs to netCDF datasets on disk,
+* write and append field and domain constructs to netCDF and Zarr v3 datasets on disk,
 
 * read, write, and create coordinates defined by geometry cells,
@@ -263,6 +263,9 @@ def compile():
             "docformatter",
             "flake8",
         ],
+        "zarr": [
+            "zarr>=3.1.3",
+        ],
     }
 
 setup(
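
For reviewers, a minimal sketch of the workflow that this patch enables. It uses only calls exercised by the new tests (`fmt="ZARR3"` and the `dataset_shards` keyword to `cf.write`; the `store_dataset_shards` and `group_dimension_search` keywords to `cf.read`); the `example.zarr` path and the choice of `cf.example_field(0)` are illustrative, not part of the patch:

```python
import cf

# Build a small example field and give it an explicit dataset
# chunk layout (illustrative; any field would do)
f = cf.example_field(0)
f.data.nc_set_dataset_chunksizes([2, 3])

# Write it as a Zarr v3 dataset, with each shard comprising 4 chunks
cf.write(f, "example.zarr", fmt="ZARR3", dataset_shards=4)

# Read it back; by default the dataset chunk and shard layout is
# stored on the returned field's data
g = cf.read("example.zarr")[0]
assert g.equals(f)

# Alternatively, discard the shard information on read, and use the
# default "closest_ancestor" strategy when searching group
# hierarchies for dimensions
h = cf.read(
    "example.zarr",
    store_dataset_shards=False,
    group_dimension_search="closest_ancestor",
)[0]
assert h.equals(f)
```

As the tests show, shards may also be set per-data via `f.data.nc_set_dataset_shards(...)` before writing, in which case `cf.write` needs no `dataset_shards` argument.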