Merge pull request #137 from jhamman/dataset_reductions

shoyer · shoyer · commit fd5268f7bbf9 · 2014-05-21T13:23:36.000-07:00
Dataset.reduce methods
diff --git a/doc/api.rst b/doc/api.rst
@@ -70,6 +70,23 @@ Selecting
    Dataset.squeeze
    Dataset.groupby
 
+Computations
+~~~~~~~~~~~~
+
+.. autosummary::
+   :toctree: generated/
+
+   Dataset.all
+   Dataset.any
+   Dataset.argmax
+   Dataset.argmin
+   Dataset.max
+   Dataset.min
+   Dataset.mean
+   Dataset.std
+   Dataset.sum
+   Dataset.var
+
 IO / Conversion
 ~~~~~~~~~~~~~~~
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
@@ -159,6 +159,23 @@ contents of the ``Dataset`` will still be the same underlying
 :py:class:`xray.Variable`. You can copy all data by supplying the argument
 ``deep=True``.
 
+Dataset reductions
+~~~~~~~~~~~~~~~~~~
+We can apply numpy reduction functions to the entire dataset, returning a new
+``Dataset``.
+
+.. ipython:: python
+
+    bar = ds.mean()
+    bar
+
+The ``dimension`` keyword (default ``None``) limits the reduction to only the dimension(s) provided.
+
+.. ipython:: python
+
+    spam = ds.mean(dimension='time')
+    spam
+
 ``DataArray`` objects
 ---------------------
 
diff --git a/test/test_dataset.py b/test/test_dataset.py
@@ -656,3 +656,45 @@ def test_lazy_load(self):
             # these should not raise UnexpectedDataAccess:
             ds.indexed(time=10)
             ds.indexed(time=slice(10), dim1=[0]).indexed(dim1=0, dim2=-1)
+
+    def test_reduce(self):
+        data = create_test_data()
+
+        self.assertEqual(len(data.mean().coordinates), 0)
+
+        # Reducing the whole dataset must match reducing each array alone.
+        reduced = data.max()
+        for var in data.noncoordinates:
+            expected = data[var].max()
+            self.assertDataArrayEqual(expected, reduced[var])
+
+        self.assertDatasetEqual(data.min(dimension=['dim1']),
+                                data.min(dimension='dim1'))
+
+        # Each entry: (dimension argument, dimensions expected to remain).
+        for reduct, expected in [('dim2', ['dim1', 'dim3', 'time']),
+                                 (['dim2', 'time'], ['dim1', 'dim3']),
+                                 (('dim2', 'time'), ['dim1', 'dim3']),
+                                 ((), ['dim1', 'dim2', 'dim3', 'time'])]:
+            actual = data.min(dimension=reduct).dimensions
+            self.assertItemsEqual(actual, expected)
+
+        self.assertDatasetEqual(data.mean(dimension=[]), data)
+
+    def test_reduce_bad_dimension(self):
+        data = create_test_data()
+        with self.assertRaisesRegexp(ValueError, 'Dataset does not contain'):
+            data.mean(dimension='bad_dim')  # name expected to be absent
+
+    def test_reduce_non_numeric(self):
+        # A string-valued variable cannot be averaged, so mean() drops it.
+        mixed = create_test_data(seed=44)
+        numeric = create_test_data(seed=44)
+        for name, dims in sorted({'var4': ['dim1', 'dim2']}.items()):
+            shape = tuple(_dims[d] for d in dims)
+            values = np.random.random_integers(0, 100, size=shape)
+            mixed[name] = (dims, values.astype(np.str_), {'foo': 'variable'})
+        self.assertTrue('var4' not in mixed.mean())
+        self.assertDatasetEqual(mixed.mean(), numeric.mean())
+        self.assertDatasetEqual(mixed.mean(dimension='dim1'),
+                                numeric.mean(dimension='dim1'))
diff --git a/xray/dataset.py b/xray/dataset.py
@@ -15,6 +15,7 @@
 from . import variable
 from . import utils
 from . import data_array
+from . import ops
 from .utils import (FrozenOrderedDict, Frozen, SortedKeysDict, ChainMap,
                    multi_index_from_product)
 from .pycompat import iteritems, basestring
@@ -973,6 +974,86 @@ def squeeze(self, dimension=None):
         """
         return utils.squeeze(self, self.dimensions, dimension)
 
+    # Template filled in by `_reduce_method` (placeholders: cls, name).
+    _reduce_method_docstring = \
+        """Reduce this {cls} by applying `{name}` along some dimension(s).
+
+        Parameters
+        ----------
+        dimension : str or sequence of str, optional
+            Dimension(s) over which to apply `{name}`.  By default `{name}`
+            is applied over all dimensions.
+        **kwargs : dict
+            Additional keyword arguments passed on to `{name}`.
+
+        Returns
+        -------
+        reduced : {cls}
+            New {cls} object with `{name}` applied to its data and the
+            indicated dimension(s) removed.
+        """
+
+    @classmethod
+    def _reduce_method(cls, f, name=None, module=None):
+        """Build a method that applies `f` through ``self.reduce``."""
+        method_name = f.__name__ if name is None else name
+        prefix = '' if module is None else module + '.'
+        def func(self, dimension=None, **kwargs):
+            return self.reduce(f, dimension, **kwargs)
+        func.__name__ = method_name
+        func.__doc__ = cls._reduce_method_docstring.format(
+            name=prefix + method_name, cls=cls.__name__)
+        return func
+
+    def reduce(self, func, dimension=None, **kwargs):
+        """Reduce this dataset's variables by applying `func` along dimension(s).
+
+        Parameters
+        ----------
+        func : function
+            Callable invoked as `f(x, axis=axis, **kwargs)`; it must return
+            the reduction of an np.ndarray over an integer valued axis.
+        dimension : str or sequence of str, optional
+            Dimension name(s) to reduce over; all dimensions when omitted.
+        **kwargs : dict
+            Extra keyword arguments forwarded to `func`.
+
+        Returns
+        -------
+        reduced : Dataset
+            New Dataset with `func` applied to its variables and the
+            indicated dimension(s) removed.
+
+        Raises
+        ------
+        ValueError
+            If a requested dimension is not in this dataset's coordinates.
+        """
+        if dimension is None:
+            dims = set(self.coordinates)
+        elif isinstance(dimension, basestring):
+            dims = set([dimension])
+        else:
+            dims = set(dimension)
+
+        missing = [dim for dim in dims if dim not in self.coordinates]
+        if missing:
+            raise ValueError('Dataset does not contain the dimensions: '
+                             '{0}'.format(missing))
+
+        reduced = OrderedDict()
+        for name, var in iteritems(self.variables):
+            targets = [dim for dim in var.dimensions if dim in dims]
+            if not targets:
+                reduced[name] = var  # shares no dimension being reduced
+            elif name not in self.dimensions:
+                try:
+                    reduced[name] = var.reduce(func, dimension=targets,
+                                               **kwargs)
+                except TypeError:
+                    pass  # e.g. non-numeric data: leave the variable out
+        return Dataset(variables=reduced)
+
     @classmethod
     def concat(cls, datasets, dimension='concat_dimension', indexers=None,
                mode='different', concat_over=None, compat='equals'):
@@ -1166,3 +1247,5 @@ def from_dataframe(cls, dataframe):
             data = series.values.reshape(shape)
             obj[name] = (dimensions, data)
         return obj
+
+ops.inject_reduce_methods(Dataset)  # presumably installs mean(), max(), etc.