Skip to content

Commit 52d4845

Browse files
keewis authored and dcherian committed
Add DatasetGroupBy.quantile (#3527)
* move the implementation of DataArrayGroupBy.quantile to GroupBy
* add tests for DatasetGroupBy
* update whats-new.rst
* move the item in whats-new.rst into New Features
* don't drop scalar quantile coords
1 parent 68b004f commit 52d4845

File tree

3 files changed

+184
-68
lines changed

3 files changed

+184
-68
lines changed

doc/whats-new.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ New Features
8080
invoked. (:issue:`3378`, :pull:`3446`, :pull:`3515`)
8181
By `Deepak Cherian <https://github.com/dcherian>`_ and
8282
`Guido Imperiale <https://github.com/crusaderky>`_.
83+
- Add the documented-but-missing :py:meth:`xarray.core.groupby.DatasetGroupBy.quantile`.
84+
(:issue:`3525`, :pull:`3527`). By `Justus Magin <https://github.com/keewis>`_.
8385

8486
Bug fixes
8587
~~~~~~~~~

xarray/core/groupby.py

Lines changed: 53 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,59 @@ def fillna(self, value):
557557
out = ops.fillna(self, value)
558558
return out
559559

560+
def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None):
561+
"""Compute the qth quantile over each array in the groups and
562+
concatenate them together into a new array.
563+
564+
Parameters
565+
----------
566+
q : float in range of [0,1] (or sequence of floats)
567+
Quantile to compute, which must be between 0 and 1
568+
inclusive.
569+
dim : `...`, str or sequence of str, optional
570+
Dimension(s) over which to apply quantile.
571+
Defaults to the grouped dimension.
572+
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
573+
This optional parameter specifies the interpolation method to
574+
use when the desired quantile lies between two data points
575+
``i < j``:
576+
* linear: ``i + (j - i) * fraction``, where ``fraction`` is
577+
the fractional part of the index surrounded by ``i`` and
578+
``j``.
579+
* lower: ``i``.
580+
* higher: ``j``.
581+
* nearest: ``i`` or ``j``, whichever is nearest.
582+
* midpoint: ``(i + j) / 2``.
583+
584+
Returns
585+
-------
586+
quantiles : Variable
587+
If `q` is a single quantile, then the result is a
588+
scalar. If multiple percentiles are given, first axis of
589+
the result corresponds to the quantile. In either case a
590+
quantile dimension is added to the return array. The other
591+
dimensions are the dimensions that remain after the
592+
reduction of the array.
593+
594+
See Also
595+
--------
596+
numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile,
597+
DataArray.quantile
598+
"""
599+
if dim is None:
600+
dim = self._group_dim
601+
602+
out = self.map(
603+
self._obj.__class__.quantile,
604+
shortcut=False,
605+
q=q,
606+
dim=dim,
607+
interpolation=interpolation,
608+
keep_attrs=keep_attrs,
609+
)
610+
611+
return out
612+
560613
def where(self, cond, other=dtypes.NA):
561614
"""Return elements from `self` or `other` depending on `cond`.
562615
@@ -737,60 +790,6 @@ def _combine(self, applied, restore_coord_dims=False, shortcut=False):
737790
combined = self._maybe_unstack(combined)
738791
return combined
739792

740-
def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None):
741-
"""Compute the qth quantile over each array in the groups and
742-
concatenate them together into a new array.
743-
744-
Parameters
745-
----------
746-
q : float in range of [0,1] (or sequence of floats)
747-
Quantile to compute, which must be between 0 and 1
748-
inclusive.
749-
dim : `...`, str or sequence of str, optional
750-
Dimension(s) over which to apply quantile.
751-
Defaults to the grouped dimension.
752-
interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
753-
This optional parameter specifies the interpolation method to
754-
use when the desired quantile lies between two data points
755-
``i < j``:
756-
* linear: ``i + (j - i) * fraction``, where ``fraction`` is
757-
the fractional part of the index surrounded by ``i`` and
758-
``j``.
759-
* lower: ``i``.
760-
* higher: ``j``.
761-
* nearest: ``i`` or ``j``, whichever is nearest.
762-
* midpoint: ``(i + j) / 2``.
763-
764-
Returns
765-
-------
766-
quantiles : Variable
767-
If `q` is a single quantile, then the result
768-
is a scalar. If multiple percentiles are given, first axis of
769-
the result corresponds to the quantile and a quantile dimension
770-
is added to the return array. The other dimensions are the
771-
dimensions that remain after the reduction of the array.
772-
773-
See Also
774-
--------
775-
numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile,
776-
DataArray.quantile
777-
"""
778-
if dim is None:
779-
dim = self._group_dim
780-
781-
out = self.map(
782-
self._obj.__class__.quantile,
783-
shortcut=False,
784-
q=q,
785-
dim=dim,
786-
interpolation=interpolation,
787-
keep_attrs=keep_attrs,
788-
)
789-
790-
if np.asarray(q, dtype=np.float64).ndim == 0:
791-
out = out.drop_vars("quantile")
792-
return out
793-
794793
def reduce(
795794
self, func, dim=None, axis=None, keep_attrs=None, shortcut=True, **kwargs
796795
):

xarray/tests/test_groupby.py

Lines changed: 129 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -137,57 +137,73 @@ def test_da_groupby_empty():
137137

138138
def test_da_groupby_quantile():
139139

140-
array = xr.DataArray([1, 2, 3, 4, 5, 6], [("x", [1, 1, 1, 2, 2, 2])])
140+
array = xr.DataArray(
141+
data=[1, 2, 3, 4, 5, 6], coords={"x": [1, 1, 1, 2, 2, 2]}, dims="x"
142+
)
141143

142144
# Scalar quantile
143-
expected = xr.DataArray([2, 5], [("x", [1, 2])])
145+
expected = xr.DataArray(
146+
data=[2, 5], coords={"x": [1, 2], "quantile": 0.5}, dims="x"
147+
)
144148
actual = array.groupby("x").quantile(0.5)
145149
assert_identical(expected, actual)
146150

147151
# Vector quantile
148-
expected = xr.DataArray([[1, 3], [4, 6]], [("x", [1, 2]), ("quantile", [0, 1])])
152+
expected = xr.DataArray(
153+
data=[[1, 3], [4, 6]],
154+
coords={"x": [1, 2], "quantile": [0, 1]},
155+
dims=("x", "quantile"),
156+
)
149157
actual = array.groupby("x").quantile([0, 1])
150158
assert_identical(expected, actual)
151159

152160
# Multiple dimensions
153161
array = xr.DataArray(
154-
[[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]],
155-
[("x", [1, 1, 1, 2, 2]), ("y", [0, 0, 1])],
162+
data=[[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]],
163+
coords={"x": [1, 1, 1, 2, 2], "y": [0, 0, 1]},
164+
dims=("x", "y"),
156165
)
157166

158167
actual_x = array.groupby("x").quantile(0, dim=...)
159-
expected_x = xr.DataArray([1, 4], [("x", [1, 2])])
168+
expected_x = xr.DataArray(
169+
data=[1, 4], coords={"x": [1, 2], "quantile": 0}, dims="x"
170+
)
160171
assert_identical(expected_x, actual_x)
161172

162173
actual_y = array.groupby("y").quantile(0, dim=...)
163-
expected_y = xr.DataArray([1, 22], [("y", [0, 1])])
174+
expected_y = xr.DataArray(
175+
data=[1, 22], coords={"y": [0, 1], "quantile": 0}, dims="y"
176+
)
164177
assert_identical(expected_y, actual_y)
165178

166179
actual_xx = array.groupby("x").quantile(0)
167180
expected_xx = xr.DataArray(
168-
[[1, 11, 22], [4, 15, 24]], [("x", [1, 2]), ("y", [0, 0, 1])]
181+
data=[[1, 11, 22], [4, 15, 24]],
182+
coords={"x": [1, 2], "y": [0, 0, 1], "quantile": 0},
183+
dims=("x", "y"),
169184
)
170185
assert_identical(expected_xx, actual_xx)
171186

172187
actual_yy = array.groupby("y").quantile(0)
173188
expected_yy = xr.DataArray(
174-
[[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]],
175-
[("x", [1, 1, 1, 2, 2]), ("y", [0, 1])],
189+
data=[[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]],
190+
coords={"x": [1, 1, 1, 2, 2], "y": [0, 1], "quantile": 0},
191+
dims=("x", "y"),
176192
)
177193
assert_identical(expected_yy, actual_yy)
178194

179195
times = pd.date_range("2000-01-01", periods=365)
180196
x = [0, 1]
181197
foo = xr.DataArray(
182198
np.reshape(np.arange(365 * 2), (365, 2)),
183-
coords=dict(time=times, x=x),
199+
coords={"time": times, "x": x},
184200
dims=("time", "x"),
185201
)
186202
g = foo.groupby(foo.time.dt.month)
187203

188204
actual = g.quantile(0, dim=...)
189205
expected = xr.DataArray(
190-
[
206+
data=[
191207
0.0,
192208
62.0,
193209
120.0,
@@ -201,12 +217,111 @@ def test_da_groupby_quantile():
201217
610.0,
202218
670.0,
203219
],
204-
[("month", np.arange(1, 13))],
220+
coords={"month": np.arange(1, 13), "quantile": 0},
221+
dims="month",
205222
)
206223
assert_identical(expected, actual)
207224

208225
actual = g.quantile(0, dim="time")[:2]
209-
expected = xr.DataArray([[0.0, 1], [62.0, 63]], [("month", [1, 2]), ("x", [0, 1])])
226+
expected = xr.DataArray(
227+
data=[[0.0, 1], [62.0, 63]],
228+
coords={"month": [1, 2], "x": [0, 1], "quantile": 0},
229+
dims=("month", "x"),
230+
)
231+
assert_identical(expected, actual)
232+
233+
234+
def test_ds_groupby_quantile():
235+
ds = xr.Dataset(
236+
data_vars={"a": ("x", [1, 2, 3, 4, 5, 6])}, coords={"x": [1, 1, 1, 2, 2, 2]}
237+
)
238+
239+
# Scalar quantile
240+
expected = xr.Dataset(
241+
data_vars={"a": ("x", [2, 5])}, coords={"quantile": 0.5, "x": [1, 2]}
242+
)
243+
actual = ds.groupby("x").quantile(0.5)
244+
assert_identical(expected, actual)
245+
246+
# Vector quantile
247+
expected = xr.Dataset(
248+
data_vars={"a": (("x", "quantile"), [[1, 3], [4, 6]])},
249+
coords={"x": [1, 2], "quantile": [0, 1]},
250+
)
251+
actual = ds.groupby("x").quantile([0, 1])
252+
assert_identical(expected, actual)
253+
254+
# Multiple dimensions
255+
ds = xr.Dataset(
256+
data_vars={
257+
"a": (
258+
("x", "y"),
259+
[[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]],
260+
)
261+
},
262+
coords={"x": [1, 1, 1, 2, 2], "y": [0, 0, 1]},
263+
)
264+
265+
actual_x = ds.groupby("x").quantile(0, dim=...)
266+
expected_x = xr.Dataset({"a": ("x", [1, 4])}, coords={"x": [1, 2], "quantile": 0})
267+
assert_identical(expected_x, actual_x)
268+
269+
actual_y = ds.groupby("y").quantile(0, dim=...)
270+
expected_y = xr.Dataset({"a": ("y", [1, 22])}, coords={"y": [0, 1], "quantile": 0})
271+
assert_identical(expected_y, actual_y)
272+
273+
actual_xx = ds.groupby("x").quantile(0)
274+
expected_xx = xr.Dataset(
275+
{"a": (("x", "y"), [[1, 11, 22], [4, 15, 24]])},
276+
coords={"x": [1, 2], "y": [0, 0, 1], "quantile": 0},
277+
)
278+
assert_identical(expected_xx, actual_xx)
279+
280+
actual_yy = ds.groupby("y").quantile(0)
281+
expected_yy = xr.Dataset(
282+
{"a": (("x", "y"), [[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]])},
283+
coords={"x": [1, 1, 1, 2, 2], "y": [0, 1], "quantile": 0},
284+
).transpose()
285+
assert_identical(expected_yy, actual_yy)
286+
287+
times = pd.date_range("2000-01-01", periods=365)
288+
x = [0, 1]
289+
foo = xr.Dataset(
290+
{"a": (("time", "x"), np.reshape(np.arange(365 * 2), (365, 2)))},
291+
coords=dict(time=times, x=x),
292+
)
293+
g = foo.groupby(foo.time.dt.month)
294+
295+
actual = g.quantile(0, dim=...)
296+
expected = xr.Dataset(
297+
{
298+
"a": (
299+
"month",
300+
[
301+
0.0,
302+
62.0,
303+
120.0,
304+
182.0,
305+
242.0,
306+
304.0,
307+
364.0,
308+
426.0,
309+
488.0,
310+
548.0,
311+
610.0,
312+
670.0,
313+
],
314+
)
315+
},
316+
coords={"month": np.arange(1, 13), "quantile": 0},
317+
)
318+
assert_identical(expected, actual)
319+
320+
actual = g.quantile(0, dim="time").isel(month=slice(None, 2))
321+
expected = xr.Dataset(
322+
data_vars={"a": (("month", "x"), [[0.0, 1], [62.0, 63]])},
323+
coords={"month": [1, 2], "x": [0, 1], "quantile": 0},
324+
)
210325
assert_identical(expected, actual)
211326

212327

0 commit comments

Comments
 (0)