diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py
index efe8affb2a3..0c5ac822e81 100644
--- a/xarray/core/coordinates.py
+++ b/xarray/core/coordinates.py
@@ -3,6 +3,7 @@
 from collections import Mapping
 from contextlib import contextmanager
 
+import numpy as np
 import pandas as pd
 
 from . import formatting, indexing
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 17af3cf2cd1..916d6efac8f 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -21,9 +21,10 @@
 from .utils import (
     _check_inplace, decode_numpy_dict_values, either_dict_or_kwargs,
     ensure_us_time_resolution)
+from .merge import expand_variable_dicts, merge_variables
 from .variable import (
     IndexVariable, Variable, as_compatible_data, as_variable,
-    assert_unique_multiindex_level_names)
+    assert_unique_multiindex_level_names, maybe_expand_multiindex)
 
 
 def _infer_coords_and_dims(shape, coords, dims):
@@ -58,19 +59,24 @@ def _infer_coords_and_dims(shape, coords, dims):
             if not isinstance(d, basestring):
                 raise TypeError('dimension %s is not a string' % d)
 
-    new_coords = OrderedDict()
-
-    if utils.is_dict_like(coords):
-        for k, v in coords.items():
-            new_coords[k] = as_variable(v, name=k)
-    elif coords is not None:
+    if coords is None:
+        coords = OrderedDict()
+    elif not utils.is_dict_like(coords):
+        # Convert list-like coords into a dict
+        coords_dict = OrderedDict()
         for dim, coord in zip(dims, coords):
             var = as_variable(coord, name=dim)
             var.dims = (dim,)
-            new_coords[dim] = var
+            coords_dict[dim] = var
+        coords = coords_dict
+
+    # Combine coordinates, including MultiIndex levels
+    expanded = expand_variable_dicts([coords])
+    coords = merge_variables(expanded, compat='equals')
+    # Check consistent
 
     sizes = dict(zip(dims, shape))
-    for k, v in new_coords.items():
+    for k, v in coords.items():
         if any(d not in dims for d in v.dims):
             raise ValueError('coordinate %s has dimensions %s, but these '
                              'are not a subset of the DataArray '
@@ -88,9 +94,9 @@
                              'matching the dimension size'
                              % (k, v.shape, (sizes[k],)))
 
-    assert_unique_multiindex_level_names(new_coords)
+    # assert_unique_multiindex_level_names(coords)
 
-    return new_coords, dims
+    return coords, dims
 
 
 class _LocIndexer(object):
@@ -462,8 +468,7 @@ def _getitem_coord(self, key):
             var = self._coords[key]
         except KeyError:
             dim_sizes = dict(zip(self.dims, self.shape))
-            _, key, var = _get_virtual_variable(
-                self._coords, key, self._level_coords, dim_sizes)
+            _, key, var = _get_virtual_variable(self._coords, key, dim_sizes)
 
         return self._replace_maybe_drop_dims(var, name=key)
 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 4f9c61b3269..aa9e302061e 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -44,12 +44,10 @@
                              'quarter']
 
 
-def _get_virtual_variable(variables, key, level_vars=None, dim_sizes=None):
-    """Get a virtual variable (e.g., 'time.year' or a MultiIndex level)
+def _get_virtual_variable(variables, key, dim_sizes=None):
+    """Get a virtual variable (e.g., 'time.year')
     from a dict of xarray.Variable objects (if possible)
     """
-    if level_vars is None:
-        level_vars = {}
     if dim_sizes is None:
         dim_sizes = {}
 
@@ -69,11 +67,7 @@ def _get_virtual_variable(variables, key, level_vars=None, dim_sizes=None):
     else:
         raise KeyError(key)
 
-    if ref_name in level_vars:
-        dim_var = variables[level_vars[ref_name]]
-        ref_var = dim_var.to_index_variable().get_level_variable(ref_name)
-    else:
-        ref_var = variables[ref_name]
+    ref_var = variables[ref_name]
 
     if var_name is None:
         virtual_var = ref_var
@@ -843,21 +837,6 @@ def _subset_with_all_valid_coords(self, variables, coord_names, attrs):
 
         return self._construct_direct(variables, coord_names, dims, attrs)
 
-    @property
-    def _level_coords(self):
-        """Return a mapping of all MultiIndex levels and their corresponding
-        coordinate name.
-        """
-        level_coords = OrderedDict()
-        for cname in self._coord_names:
-            var = self.variables[cname]
-            if var.ndim == 1 and isinstance(var, IndexVariable):
-                level_names = var.level_names
-                if level_names is not None:
-                    dim, = var.dims
-                    level_coords.update({lname: dim for lname in level_names})
-        return level_coords
-
     def _copy_listed(self, names):
         """Create a new Dataset with the listed variables from this dataset
         and the all relevant coordinates. Skips all validation.
@@ -870,7 +849,7 @@ def _copy_listed(self, names):
                 variables[name] = self._variables[name]
             except KeyError:
                 ref_name, var_name, var = _get_virtual_variable(
-                    self._variables, name, self._level_coords, self.dims)
+                    self._variables, name, self.dims)
                 variables[var_name] = var
                 if ref_name in self._coord_names or ref_name in self.dims:
                     coord_names.add(var_name)
@@ -887,7 +866,7 @@ def _construct_dataarray(self, name):
             variable = self._variables[name]
         except KeyError:
             _, name, variable = _get_virtual_variable(
-                self._variables, name, self._level_coords, self.dims)
+                self._variables, name, self.dims)
 
         coords = OrderedDict()
         needed_dims = set(variable.dims)
diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py
index 5dd3cf06025..a17ca3013a3 100644
--- a/xarray/core/formatting.py
+++ b/xarray/core/formatting.py
@@ -253,7 +253,9 @@ def summarize_variable(name, var, col_width, show_values=True,
 
 def _summarize_coord_multiindex(coord, col_width, marker):
     first_col = pretty_print(u'  %s %s ' % (marker, coord.name), col_width)
-    return u'%s(%s) MultiIndex' % (first_col, unicode_type(coord.dims[0]))
+    level_names_str = ', '.join(map(str, coord.level_names))
+    return (u'%s(%s) MultiIndex[%s]' %
+            (first_col, unicode_type(coord.dims[0]), level_names_str))
 
 
 def _summarize_coord_levels(coord, col_width, marker=u'-'):
@@ -277,9 +279,7 @@ def summarize_coord(name, var, col_width):
     if is_index:
         coord = var.variable.to_index_variable()
         if coord.level_names is not None:
-            return u'\n'.join(
-                [_summarize_coord_multiindex(coord, col_width, marker),
-                 _summarize_coord_levels(coord, col_width)])
+            return _summarize_coord_multiindex(coord, col_width, marker)
     return summarize_variable(
         name, var.variable, col_width, show_values, marker)
 
diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py
new file mode 100644
index 00000000000..dcf738627cc
--- /dev/null
+++ b/xarray/core/indexes.py
@@ -0,0 +1,73 @@
+from __future__ import absolute_import, division, print_function
+
+import numpy as np
+import pandas as pd
+
+
+def normalize_indexes(coords, sizes, indexes=None):
+    """Normalize indexes for Dataset/DataArray.
+
+    Validates that all indexes are pd.Index instances (or at least satisfy
+    the Index API we need for xarray). Creates default indexes for variables
+    whose name matches their sole dimension.
+
+    Eventually: consider combining indexes along the same dimension into a
+    MultiIndex.
+
+    Parameters
+    ----------
+    coords : Mapping[Any, xarray.Variable]
+        Coordinate variables from which to draw default indexes.
+    sizes : Mapping[Any, int]
+        Integer sizes for each Dataset/DataArray dimension.
+    indexes : Optional[Dict[Any, pandas.Index]]
+        Explicitly supplied indexes, if any.
+
+    Returns
+    -------
+    Mapping[Any, pandas.Index] mapping indexing keys (levels/dimension names)
+    to indexes used for indexing along that dimension.
+    """
+    indexes = {} if indexes is None else dict(indexes)
+
+    # default indexes
+    for key in sizes:
+        if key not in indexes:
+            if key in coords:
+                indexes[key] = coords[key].to_index()
+            else:
+                # need to ensure dtype=int64 in case range is empty on Python 2
+                indexes[key] = pd.Index(
+                    range(sizes[key]), name=key, dtype=np.int64)
+
+    return indexes
+
+
+def result_indexes(input_indexes, output_coords):
+    """Combine indexes from inputs into indexes for an operation result.
+
+    Drops indexes corresponding to dropped coordinates.
+
+    IMPORTANT: Assumes outputs are already aligned!
+
+    Parameters
+    ----------
+    input_indexes : Sequence[Mapping[Any, pandas.Index]]
+        Sequence of mappings of indexes to combine.
+    output_coords : Sequence[Mapping[Any, xarray.Variable]]
+        Sequence of mappings of provided output coordinates.
+
+    Returns
+    -------
+    List[Mapping[Any, pandas.Index]] mapping variable names to indexes,
+    for each requested mapping of output coordinates.
+    """
+    output_indexes = []
+    for output_coords_item in output_coords:
+        indexes = {}
+        for input_indexes_item in input_indexes:
+            for k, v in input_indexes_item.items():
+                if k in output_coords_item:
+                    indexes[k] = v
+        output_indexes.append(indexes)
+    return output_indexes
diff --git a/xarray/core/merge.py b/xarray/core/merge.py
index 984dd2fa204..a2ea6fe9083 100644
--- a/xarray/core/merge.py
+++ b/xarray/core/merge.py
@@ -5,7 +5,8 @@
 from .alignment import deep_align
 from .pycompat import OrderedDict, basestring
 from .utils import Frozen
-from .variable import as_variable, assert_unique_multiindex_level_names
+from .variable import (
+    as_variable, assert_unique_multiindex_level_names, maybe_expand_multiindex)
 
 PANDAS_TYPES = (pd.Series, pd.DataFrame, pd.Panel)
 
@@ -197,11 +198,10 @@ def expand_variable_dicts(list_of_variable_dicts):
 
     for variables in list_of_variable_dicts:
         if isinstance(variables, Dataset):
-            sanitized_vars = variables.variables
+            var_dicts.append(variables.variables)
         else:
-            # append coords to var_dicts before appending sanitized_vars,
-            # because we want coords to appear first
             sanitized_vars = OrderedDict()
+            var_dicts.append(sanitized_vars)
 
             for name, var in variables.items():
                 if isinstance(var, DataArray):
@@ -211,10 +211,13 @@
                     coords.pop(name, None)
                     var_dicts.append(coords)
 
+                multiindex_vars = maybe_expand_multiindex(var, name)
+                if multiindex_vars is not None:
+                    var_dicts.append(multiindex_vars)
+
                 var = as_variable(var, name=name)
                 sanitized_vars[name] = var
 
-        var_dicts.append(sanitized_vars)
 
     return var_dicts
 
@@ -253,6 +256,10 @@
                     coords.discard(name)
                     coord_names.update(coords)
 
+                multiindex_vars = maybe_expand_multiindex(var, name)
+                if multiindex_vars is not None:
+                    coord_names.update(multiindex_vars)
+
     return coord_names, noncoord_names
 
 
@@ -296,7 +303,7 @@ def merge_coords_for_inplace_math(objs, priority_vars=None):
     """
     expanded = expand_variable_dicts(objs)
    variables = merge_variables(expanded, priority_vars)
-    assert_unique_multiindex_level_names(variables)
+    # assert_unique_multiindex_level_names(variables)
     return variables
 
 
@@ -443,7 +450,7 @@
 
     priority_vars = _get_priority_vars(aligned, priority_arg, compat=compat)
     variables = merge_variables(expanded, priority_vars, compat=compat)
-    assert_unique_multiindex_level_names(variables)
+    # assert_unique_multiindex_level_names(variables)
 
     dims = calculate_dimensions(variables)
 
diff --git a/xarray/core/options.py b/xarray/core/options.py
index ab461ca86bc..5801f029cd5 100644
--- a/xarray/core/options.py
+++ b/xarray/core/options.py
@@ -18,7 +18,7 @@
     FILE_CACHE_MAXSIZE: 128,
     CMAP_SEQUENTIAL: 'viridis',
     CMAP_DIVERGENT: 'RdBu_r',
-    KEEP_ATTRS: 'default'
+    KEEP_ATTRS: 'default',
 }
 
 _JOIN_OPTIONS = frozenset(['inner', 'outer', 'left', 'right', 'exact'])
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 0bff06e7546..9ede83a676d 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -114,6 +114,63 @@ def as_variable(obj, name=None):
     return obj
 
 
+def maybe_expand_multiindex(obj, name):
+    """Expand a MultiIndex into one Variable per level, if applicable.
+
+    Parameters
+    ----------
+    obj : object
+        Object to convert into a variable or variables. Like the obj argument
+        to as_variable(), but if data is a MultiIndex, each level is extracted
+        as a separate Variable.
+    name : any
+        Name of this object, when used as a key in a dictionary. This is used
+        to set a default dimension name.
+
+    Returns
+    -------
+    OrderedDict mapping level names to Variable objects if the input data is
+    a MultiIndex, or None if there is nothing to expand.
+
+    Examples
+    --------
+    >>> maybe_expand_multiindex([1, 2, 3], name='x') is None
+    True
+
+    >>> maybe_expand_multiindex(('y', [1, 2, 3]), name='x') is None
+    True
+
+    >>> idx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)], names=['y', 'z'])
+    >>> maybe_expand_multiindex(idx, name='x')
+    OrderedDict([('y', Variable(('x',), array(['a', 'b']))),
+                 ('z', Variable(('x',), array([1, 2])))])
+    """
+    tuple_with_multiindex = (isinstance(obj, tuple) and len(obj) > 1 and
+                             isinstance(obj[1], pd.MultiIndex))
+    if tuple_with_multiindex or isinstance(obj, pd.MultiIndex):
+        if isinstance(obj, tuple):
+            dims, index = obj[:2]
+        else:
+            dims = (name,)
+            index = obj
+        if any(level_name is None for level_name in index.names):
+            raise ValueError(
+                'cannot convert a MultiIndex with unknown level names {} into '
+                'xarray variables: {}'.format(index.names, index))
+        if len(set(index.names)) != len(index.names):
+            raise ValueError(
+                'cannot convert a MultiIndex with non-unique level names {} '
+                'into xarray variables: {}'.format(index.names, index))
+        multiindex_vars = OrderedDict()
+        for level_name in index.names:
+            multiindex_vars[level_name] = Variable(
+                dims, index.get_level_values(level_name))
+    else:
+        multiindex_vars = None
+
+    return multiindex_vars
+
+
 def _maybe_wrap_data(data):
     """
     Put pandas.Index and numpy.ndarray arguments in adapter objects to ensure
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index 87ee60715a1..015ee4c2ce2 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -59,9 +59,9 @@ def test_repr_multiindex(self):
         <xarray.DataArray (x: 4)>
         array([0, 1, 2, 3])
         Coordinates:
-          * x        (x) MultiIndex
-          - level_1  (x) object 'a' 'a' 'b' 'b'
-          - level_2  (x) int64 1 2 1 2""")
+          * x        (x) MultiIndex[level_1, level_2]
+            level_1  (x) object 'a' 'a' 'b' 'b'
+            level_2  (x) int64 1 2 1 2""")
         assert expected == repr(self.mda)
 
     def test_properties(self):
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 89ea3ba78a0..652df3d1491 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -144,9 +144,9 @@ def test_repr_multiindex(self):
         <xarray.Dataset>
         Dimensions:  (x: 4)
         Coordinates:
-          * x        (x) MultiIndex
-          - level_1  (x) object 'a' 'a' 'b' 'b'
-          - level_2  (x) int64 1 2 1 2
+          * x        (x) MultiIndex[level_1, level_2]
+            level_1  (x) object 'a' 'a' 'b' 'b'
+            level_2  (x) int64 1 2 1 2
         Data variables:
             *empty*""")
         actual = '\n'.join(x.rstrip() for x in repr(data).split('\n'))
@@ -162,9 +162,9 @@ def test_repr_multiindex(self):
         <xarray.Dataset>
         Dimensions:                  (x: 4)
         Coordinates:
-          * x                        (x) MultiIndex
-          - a_quite_long_level_name  (x) object 'a' 'a' 'b' 'b'
-          - level_2                  (x) int64 1 2 1 2
+          * x                        (x) MultiIndex[a_quite_long_level_name, level_2]
+            a_quite_long_level_name  (x) object 'a' 'a' 'b' 'b'
+            level_2                  (x) int64 1 2 1 2
         Data variables:
             *empty*""")
         actual = '\n'.join(x.rstrip() for x in repr(data).split('\n'))
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
index 0bd440781ac..24374dc6d75 100644
--- a/xarray/tests/test_variable.py
+++ b/xarray/tests/test_variable.py
@@ -21,7 +21,8 @@
     OuterIndexer, PandasIndexAdapter, VectorizedIndexer)
 from xarray.core.pycompat import PY3, OrderedDict
 from xarray.core.utils import NDArrayMixin
-from xarray.core.variable import as_compatible_data, as_variable
+from xarray.core.variable import (
+    as_compatible_data, as_variable, maybe_expand_multiindex)
 from xarray.tests import requires_bottleneck
 
 from . import (
@@ -1949,6 +1950,44 @@ class CustomIndexable(CustomArray, indexing.ExplicitlyIndexed):
     assert isinstance(orig._data, CustomIndexable)
 
 
+def assert_dict_identical(expected, actual):
+    assert expected.keys() == actual.keys()
+    for k in expected:
+        assert_identical(expected[k], actual[k])
+
+
+def test_maybe_expand_multiindex():
+
+    result = maybe_expand_multiindex([1, 2, 3], name='x')
+    assert result is None
+
+    result = maybe_expand_multiindex(('y', [1, 2, 3]), name='x')
+    assert result is None
+
+    index = pd.MultiIndex.from_arrays([[1, 2, 3]], names=['x'])
+    result = maybe_expand_multiindex(index, name='y')
+    expected = OrderedDict([('x', Variable(('y',), [1, 2, 3]))])
+    assert_dict_identical(expected, result)
+
+    result = maybe_expand_multiindex(('y', index), name='y')
+    expected = OrderedDict([('x', Variable(('y',), [1, 2, 3]))])
+    assert_dict_identical(expected, result)
+
+    index = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)], names=['y', 'z'])
+    result = maybe_expand_multiindex(index, name='x')
+    expected = OrderedDict([('y', Variable(('x',), ['a', 'b'])),
+                            ('z', Variable(('x',), [1, 2]))])
+    assert_dict_identical(expected, result)
+
+    index = pd.MultiIndex.from_arrays([[1, 2, 3]])
+    with raises_regex(ValueError, 'unknown level names'):
+        maybe_expand_multiindex(index, 'foo')
+
+    index = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)], names=['A', 'A'])
+    with raises_regex(ValueError, 'non-unique level names'):
+        maybe_expand_multiindex(index, 'foo')
+
+
 def test_raise_no_warning_for_nan_in_binary_ops():
     with pytest.warns(None) as record:
         Variable('x', [1, 2, np.NaN]) > 0
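
For reference, a minimal usage sketch of the new maybe_expand_multiindex helper introduced by this patch. It mirrors the behaviour exercised by test_maybe_expand_multiindex above and assumes the patched xarray tree is importable; all names and values below are illustrative only, not part of the patch:

import pandas as pd

from xarray.core.variable import maybe_expand_multiindex

# Non-MultiIndex input: nothing to expand, so the helper returns None and
# callers fall back to as_variable().
assert maybe_expand_multiindex([1, 2, 3], name='x') is None

# MultiIndex input: one Variable per level, keyed by level name, all sharing
# the dimension implied by the coordinate name ('x' here).
idx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)], names=['y', 'z'])
expanded = maybe_expand_multiindex(idx, name='x')
print(list(expanded))      # ['y', 'z']
print(expanded['y'].dims)  # ('x',)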