Xarray: propagate bounds coordinate with an IntervalIndex in DataArray¶
It should also work with any other coordinate associated with an Xarray index that shares at least one dimension with dataarray's dimensions.
1 2 3 4 5 | import numpy as np import pandas as pd import xarray as xr xr.set_options(display_expand_indexes=True); |
Example of an Xarray IntervalIndex that can be associated to a dimension coordinate and its CF bounds coordinate companion.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | from collections.abc import Hashable from xarray.core.indexes import Index, PandasIndex from xarray import Variable class IntervalIndex(Index): # adapted from https://github.com/dcherian/xindexes/blob/main/interval-array.ipynb def __init__(self, index: PandasIndex, bounds_name: Hashable, bounds_dim: str): assert isinstance(index.index, pd.IntervalIndex) self._index = index self._bounds_name = bounds_name self._bounds_dim = bounds_dim @classmethod def from_variables(cls, variables, options): assert len(variables) == 2 for k, v in variables.items(): if v.ndim == 2: bounds_name, bounds = k, v elif v.ndim == 1: dim, _ = k, v bounds = bounds.transpose(..., dim) left, right = bounds.data.tolist() index = PandasIndex(pd.IntervalIndex.from_arrays(left, right), dim) bounds_dim = (set(bounds.dims) - set(dim)).pop() return cls(index, bounds_name, bounds_dim) @classmethod def concat(self, indexes, dim, positions=None): new_index = self._index.concat([idx._index for idx in indexes], dim, positions=positions) if indexes: bounds_name0 = indexes[0]._bounds_name bounds_dim0 = indexes[0]._bounds_dim if any(idx._bounds_name != bounds_name0 or idx._bounds_dim != bounds_dim0 for idx in indexes): raise ValueError( f"Cannot concatenate along dimension {dim!r} indexes with different " "boundary coordinate or dimension names" ) return type(self)(new_index, self._bounds_name, self._bounds_dim) def create_variables(self, variables=None): empty_var = Variable((), 0) bounds_attrs = variables.get(self._bounds_name, empty_var).attrs mid_attrs = variables.get(self._index.dim, empty_var).attrs bounds_var = Variable( dims=(self._bounds_dim, self._index.dim), data=np.stack([self._index.index.left, self._index.index.right], axis=0), attrs=bounds_attrs, ) mid_var = Variable( dims=(self._index.dim,), data=self._index.index.mid, attrs=mid_attrs, ) return {self._index.dim: mid_var, self._bounds_name: bounds_var} def validate_dataarray_coord(self, name, var, dims): # check the "mid" coordinate is enough here if var.ndim == 1 and var.dims[0] not in dims: raise xr.CoordinateValidationError( f"interval coordinate {name!r} has dimensions {var.dims}, but these " "are not a subset of the DataArray " f"dimensions {tuple(dims)}" ) def sel(self, labels, **kwargs): return self._index.sel(labels, **kwargs) def isel(self, indexers): new_index = self._index.isel(indexers) if new_index is not None: return type(self)(new_index, self._bounds_name, self._bounds_dim) else: return None def roll(self, shifts): new_index = self._index.roll(shifts) return type(self)(new_index, self._bounds_name, self._bounds_dim) def rename(self, name_dict, dims_dict): new_index = self._index.rename(name_dict, dims_dict) bounds_name = name_dict.get(self._bounds_name, self._bounds_name) bounds_dim = dims_dict.get(self._bounds_dim, self._bounds_dim) return type(self)(new_index, bounds_name, bounds_dim) def __repr__(self): string = f"{self._index!r}" return string |
Create an example dataset with an IntervalIndex
1 2 3 4 5 6 7 8 9 10 11 | left = np.arange(0.5, 3.6, 1) right = np.arange(1.5, 4.6, 1) bounds = np.stack([left, right]) ds = xr.Dataset( {"foo": ("x", [1, 2, 3, 4])}, coords={"x": [1, 2, 3, 4], "x_bounds": (("bnds", "x"), bounds)}, ) newds = ds.drop_indexes("x").set_xindex(("x", "x_bounds"), IntervalIndex) newds |
<xarray.Dataset> Size: 128B Dimensions: (x: 4, bnds: 2) Coordinates: * x (x) float64 32B 1.0 2.0 3.0 4.0 * x_bounds (bnds, x) float64 64B 0.5 1.5 2.5 3.5 1.5 2.5 3.5 4.5 Dimensions without coordinates: bnds Data variables: foo (x) int64 32B 1 2 3 4 Indexes: ┌ x IntervalIndex └ x_bounds
Extract a DataArray from a Dataset with a bounds coordinate¶
1 | newds.foo |
<xarray.DataArray 'foo' (x: 4)> Size: 32B array([1, 2, 3, 4]) Coordinates: * x (x) float64 32B 1.0 2.0 3.0 4.0 * x_bounds (bnds, x) float64 64B 0.5 1.5 2.5 3.5 1.5 2.5 3.5 4.5 Indexes: ┌ x IntervalIndex └ x_bounds
1 | newds.x |
<xarray.DataArray 'x' (x: 4)> Size: 32B array([1., 2., 3., 4.]) Coordinates: * x (x) float64 32B 1.0 2.0 3.0 4.0 * x_bounds (bnds, x) float64 64B 0.5 1.5 2.5 3.5 1.5 2.5 3.5 4.5 Indexes: ┌ x IntervalIndex └ x_bounds
The bounds coordinate is not propagated if it is not associated with an index! The IntervalIndex is also dropped.
1 | newds.drop_indexes(["x", "x_bounds"]).foo |
<xarray.DataArray 'foo' (x: 4)> Size: 32B array([1, 2, 3, 4]) Coordinates: x (x) float64 32B 1.0 2.0 3.0 4.0
Create a new DataArray with a bounds coordinate¶
1 2 3 4 5 | # note: the IntervalIndex associated with the "x" and "x_bounds" coordinates # is passed to the DataArray constructor below via the `coords` argument da = xr.DataArray(newds.foo, newds.coords, dims="x") da |
<xarray.DataArray 'foo' (x: 4)> Size: 32B array([1, 2, 3, 4]) Coordinates: * x (x) float64 32B 1.0 2.0 3.0 4.0 * x_bounds (bnds, x) float64 64B 0.5 1.5 2.5 3.5 1.5 2.5 3.5 4.5 Indexes: ┌ x IntervalIndex └ x_bounds
The bounds coordinate cannot be included if it is not associated with an index!
1 2 | # note: convert `newds.coords` to a dict removes all indexes xr.DataArray(newds.foo, dict(newds.coords), dims="x") |
--------------------------------------------------------------------------- CoordinateValidationError Traceback (most recent call last) Cell In[8], line 2 1 # note: convert `newds.coords` to a dict removes all indexes ----> 2 xr.DataArray(newds.foo, dict(newds.coords), dims="x") File ~/Git/github/benbovy/xarray/xarray/core/dataarray.py:469, in DataArray.__init__(self, data, coords, dims, name, attrs, indexes, fastpath) 467 data = _check_data_shape(data, coords, dims) 468 data = as_compatible_data(data) --> 469 coords, dims = _infer_coords_and_dims(data.shape, coords, dims) 470 variable = Variable(dims, data, attrs, fastpath=True) 472 if not isinstance(coords, Coordinates): File ~/Git/github/benbovy/xarray/xarray/core/dataarray.py:199, in _infer_coords_and_dims(shape, coords, dims) 196 var.dims = (dim,) 197 new_coords[dim] = var.to_index_variable() --> 199 validate_dataarray_coords(shape, new_coords, dims_tuple) 201 return new_coords, dims_tuple File ~/Git/github/benbovy/xarray/xarray/core/coordinates.py:1182, in validate_dataarray_coords(shape, coords, dim) 1180 indexes[k].validate_dataarray_coord(k, v, dim_set) 1181 elif any(d not in dim for d in v.dims): -> 1182 raise CoordinateValidationError( 1183 f"coordinate {k} has dimensions {v.dims}, but these " 1184 "are not a subset of the DataArray " 1185 f"dimensions {dim}" 1186 ) 1188 for d, s in v.sizes.items(): 1189 if d in sizes and s != sizes[d]: CoordinateValidationError: coordinate x_bounds has dimensions ('bnds', 'x'), but these are not a subset of the DataArray dimensions ('x',)
Main dimension name of the bounds coordinate must still match with array dimensions
1 2 3 | renamed = newds.rename(x="y", x_bounds="y_bounds") xr.DataArray(newds.foo, renamed.coords, dims="x") |
--------------------------------------------------------------------------- CoordinateValidationError Traceback (most recent call last) Cell In[9], line 3 1 renamed = newds.rename(x="y", x_bounds="y_bounds") ----> 3 xr.DataArray(newds.foo, renamed.coords, dims="x") File ~/Git/github/benbovy/xarray/xarray/core/dataarray.py:469, in DataArray.__init__(self, data, coords, dims, name, attrs, indexes, fastpath) 467 data = _check_data_shape(data, coords, dims) 468 data = as_compatible_data(data) --> 469 coords, dims = _infer_coords_and_dims(data.shape, coords, dims) 470 variable = Variable(dims, data, attrs, fastpath=True) 472 if not isinstance(coords, Coordinates): File ~/Git/github/benbovy/xarray/xarray/core/dataarray.py:199, in _infer_coords_and_dims(shape, coords, dims) 196 var.dims = (dim,) 197 new_coords[dim] = var.to_index_variable() --> 199 validate_dataarray_coords(shape, new_coords, dims_tuple) 201 return new_coords, dims_tuple File ~/Git/github/benbovy/xarray/xarray/core/coordinates.py:1180, in validate_dataarray_coords(shape, coords, dim) 1178 for k, v in coords.items(): 1179 if k in indexes: -> 1180 indexes[k].validate_dataarray_coord(k, v, dim_set) 1181 elif any(d not in dim for d in v.dims): 1182 raise CoordinateValidationError( 1183 f"coordinate {k} has dimensions {v.dims}, but these " 1184 "are not a subset of the DataArray " 1185 f"dimensions {dim}" 1186 ) Cell In[2], line 69, in IntervalIndex.validate_dataarray_coord(self, name, var, dims) 66 def validate_dataarray_coord(self, name, var, dims): 67 # check the "mid" coordinate is enough here 68 if var.ndim == 1 and var.dims[0] not in dims: ---> 69 raise xr.CoordinateValidationError( 70 f"interval coordinate {name!r} has dimensions {var.dims}, but these " 71 "are not a subset of the DataArray " 72 f"dimensions {tuple(dims)}" 73 ) CoordinateValidationError: interval coordinate 'y' has dimensions ('y',), but these are not a subset of the DataArray dimensions ('x',)
Assign bounds coordinate to an existing DataArray¶
1 | xr.DataArray(range(4), dims="x").assign_coords(newds.coords) |
<xarray.DataArray (x: 4)> Size: 32B array([0, 1, 2, 3]) Coordinates: * x (x) float64 32B 1.0 2.0 3.0 4.0 * x_bounds (bnds, x) float64 64B 0.5 1.5 2.5 3.5 1.5 2.5 3.5 4.5 Indexes: ┌ x IntervalIndex └ x_bounds
The bounds coordinate cannot be assigned if it is not associated with an index!
1 | xr.DataArray(range(4), dims="x").assign_coords(dict(newds.coords)) |
--------------------------------------------------------------------------- CoordinateValidationError Traceback (most recent call last) Cell In[11], line 1 ----> 1 xr.DataArray(range(4), dims="x").assign_coords(dict(newds.coords)) File ~/Git/github/benbovy/xarray/xarray/core/common.py:664, in DataWithCoords.assign_coords(self, coords, **coords_kwargs) 661 else: 662 results = self._calc_assign_results(coords_combined) --> 664 data.coords.update(results) 665 return data File ~/Git/github/benbovy/xarray/xarray/core/coordinates.py:603, in Coordinates.update(self, other) 597 # special case for PandasMultiIndex: updating only its dimension coordinate 598 # is still allowed but depreciated. 599 # It is the only case where we need to actually drop coordinates here (multi-index levels) 600 # TODO: remove when removing PandasMultiIndex's dimension coordinate. 601 self._drop_coords(self._names - coords_to_align._names) --> 603 self._update_coords(coords, indexes) File ~/Git/github/benbovy/xarray/xarray/core/coordinates.py:969, in DataArrayCoordinates._update_coords(self, coords, indexes) 966 def _update_coords( 967 self, coords: dict[Hashable, Variable], indexes: dict[Hashable, Index] 968 ) -> None: --> 969 validate_dataarray_coords( 970 self._data.shape, Coordinates._construct_direct(coords, indexes), self.dims 971 ) 973 self._data._coords = coords 974 self._data._indexes = indexes File ~/Git/github/benbovy/xarray/xarray/core/coordinates.py:1182, in validate_dataarray_coords(shape, coords, dim) 1180 indexes[k].validate_dataarray_coord(k, v, dim_set) 1181 elif any(d not in dim for d in v.dims): -> 1182 raise CoordinateValidationError( 1183 f"coordinate {k} has dimensions {v.dims}, but these " 1184 "are not a subset of the DataArray " 1185 f"dimensions {dim}" 1186 ) 1188 for d, s in v.sizes.items(): 1189 if d in sizes and s != sizes[d]: CoordinateValidationError: coordinate x_bounds has dimensions ('bnds', 'x'), but these are not a subset of the DataArray dimensions ('x',)
Ensure common operations on DataArray propagate the bounds coordinate¶
1 | da.x_bounds |
<xarray.DataArray 'x_bounds' (bnds: 2, x: 4)> Size: 64B array([[0.5, 1.5, 2.5, 3.5], [1.5, 2.5, 3.5, 4.5]]) Coordinates: * x (x) float64 32B 1.0 2.0 3.0 4.0 * x_bounds (bnds, x) float64 64B 0.5 1.5 2.5 3.5 1.5 2.5 3.5 4.5 Dimensions without coordinates: bnds Indexes: ┌ x IntervalIndex └ x_bounds
1 | da.to_dataset(name="foo") |
<xarray.Dataset> Size: 128B Dimensions: (x: 4, bnds: 2) Coordinates: * x (x) float64 32B 1.0 2.0 3.0 4.0 * x_bounds (bnds, x) float64 64B 0.5 1.5 2.5 3.5 1.5 2.5 3.5 4.5 Dimensions without coordinates: bnds Data variables: foo (x) int64 32B 1 2 3 4 Indexes: ┌ x IntervalIndex └ x_bounds
1 | da.sel(x=[0.8, 3.25]) |
<xarray.DataArray 'foo' (x: 2)> Size: 16B array([1, 3]) Coordinates: * x (x) float64 16B 1.0 3.0 * x_bounds (bnds, x) float64 32B 0.5 2.5 1.5 3.5 Indexes: ┌ x IntervalIndex └ x_bounds
1 | da + da |
<xarray.DataArray 'foo' (x: 4)> Size: 32B array([2, 4, 6, 8]) Coordinates: * x (x) float64 32B 1.0 2.0 3.0 4.0 * x_bounds (bnds, x) float64 64B 0.5 1.5 2.5 3.5 1.5 2.5 3.5 4.5 Indexes: ┌ x IntervalIndex └ x_bounds
1 | da.roll(x=1, roll_coords=True) |
<xarray.DataArray 'foo' (x: 4)> Size: 32B array([4, 1, 2, 3]) Coordinates: * x (x) float64 32B 4.0 1.0 2.0 3.0 * x_bounds (bnds, x) float64 64B 3.5 0.5 1.5 2.5 4.5 1.5 2.5 3.5 Indexes: ┌ x IntervalIndex └ x_bounds
1 | da.rename(x="y", x_bounds="y_bounds") |
<xarray.DataArray 'foo' (y: 4)> Size: 32B array([1, 2, 3, 4]) Coordinates: * y (y) float64 32B 1.0 2.0 3.0 4.0 * y_bounds (bnds, y) float64 64B 0.5 1.5 2.5 3.5 1.5 2.5 3.5 4.5 Indexes: ┌ y IntervalIndex └ y_bounds
1 | xr.merge([da, da]) |
<xarray.Dataset> Size: 128B Dimensions: (x: 4, bnds: 2) Coordinates: * x (x) float64 32B 1.0 2.0 3.0 4.0 * x_bounds (bnds, x) float64 64B 0.5 1.5 2.5 3.5 1.5 2.5 3.5 4.5 Dimensions without coordinates: bnds Data variables: foo (x) int64 32B 1 2 3 4 Indexes: ┌ x IntervalIndex └ x_bounds
IntervalIndex.concat()
is implemented and should normally work but Xarray current alignment rules prevents it to be applied.
1 | xr.concat([da, da], "x", coords=["x", "x_bounds"]) |
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[19], line 1 ----> 1 xr.concat([da, da], "x", coords=["x", "x_bounds"]) File ~/Git/github/benbovy/xarray/xarray/structure/concat.py:264, in concat(objs, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs, create_index_for_new_dim) 259 raise ValueError( 260 f"compat={compat!r} invalid: must be 'broadcast_equals', 'equals', 'identical', 'no_conflicts' or 'override'" 261 ) 263 if isinstance(first_obj, DataArray): --> 264 return _dataarray_concat( 265 objs, 266 dim=dim, 267 data_vars=data_vars, 268 coords=coords, 269 compat=compat, 270 positions=positions, 271 fill_value=fill_value, 272 join=join, 273 combine_attrs=combine_attrs, 274 create_index_for_new_dim=create_index_for_new_dim, 275 ) 276 elif isinstance(first_obj, Dataset): 277 return _dataset_concat( 278 objs, 279 dim=dim, (...) 287 create_index_for_new_dim=create_index_for_new_dim, 288 ) File ~/Git/github/benbovy/xarray/xarray/structure/concat.py:755, in _dataarray_concat(arrays, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs, create_index_for_new_dim) 752 arr = arr.rename(name) 753 datasets.append(arr._to_temp_dataset()) --> 755 ds = _dataset_concat( 756 datasets, 757 dim, 758 data_vars, 759 coords, 760 compat, 761 positions, 762 fill_value=fill_value, 763 join=join, 764 combine_attrs=combine_attrs, 765 create_index_for_new_dim=create_index_for_new_dim, 766 ) 768 merged_attrs = merge_attrs([da.attrs for da in arrays], combine_attrs) 770 result = arrays[0]._from_temp_dataset(ds, name) File ~/Git/github/benbovy/xarray/xarray/structure/concat.py:516, in _dataset_concat(datasets, dim, data_vars, coords, compat, positions, fill_value, join, combine_attrs, create_index_for_new_dim) 513 # Make sure we're working on a copy (we'll be loading variables) 514 datasets = [ds.copy() for ds in datasets] 515 datasets = list( --> 516 align( 517 *datasets, join=join, copy=False, exclude=[dim_name], fill_value=fill_value 518 ) 519 ) 521 dim_coords, dims_sizes, coord_names, data_names, vars_order = _parse_datasets( 522 datasets 523 ) 524 dim_names = set(dim_coords) File ~/Git/github/benbovy/xarray/xarray/structure/alignment.py:883, in align(join, copy, indexes, exclude, fill_value, *objects) 687 """ 688 Given any number of Dataset and/or DataArray objects, returns new 689 objects with aligned indexes and dimension sizes. (...) 873 874 """ 875 aligner = Aligner( 876 objects, 877 join=join, (...) 881 fill_value=fill_value, 882 ) --> 883 aligner.align() 884 return aligner.results File ~/Git/github/benbovy/xarray/xarray/structure/alignment.py:572, in Aligner.align(self) 569 self.results = (obj.copy(deep=self.copy),) 570 return --> 572 self.find_matching_indexes() 573 self.find_matching_unindexed_dims() 574 self.assert_no_index_conflict() File ~/Git/github/benbovy/xarray/xarray/structure/alignment.py:254, in Aligner.find_matching_indexes(self) 251 objects_matching_indexes = [] 253 for obj in self.objects: --> 254 obj_indexes, obj_index_vars = self._normalize_indexes(obj.xindexes) 255 objects_matching_indexes.append(obj_indexes) 256 for key, idx in obj_indexes.items(): File ~/Git/github/benbovy/xarray/xarray/structure/alignment.py:230, in Aligner._normalize_indexes(self, indexes) 228 excl_dims_str = ", ".join(str(d) for d in exclude_dims) 229 incl_dims_str = ", ".join(str(d) for d in all_dims - exclude_dims) --> 230 raise ValueError( 231 f"cannot exclude dimension(s) {excl_dims_str} from alignment because " 232 "these are used by an index together with non-excluded dimensions " 233 f"{incl_dims_str}" 234 ) 236 key = (tuple(coord_names_and_dims), type(idx)) 237 normalized_indexes[key] = idx ValueError: cannot exclude dimension(s) x from alignment because these are used by an index together with non-excluded dimensions bnds
1 |