Skip to content
Closed
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ Bug fixes
(:issue:`3402`). By `Deepak Cherian <https://github.com/dcherian/>`_
- Allow appending datetime and bool data variables to zarr stores.
(:issue:`3480`). By `Akihiro Matsukawa <https://github.com/amatsukawa/>`_.
- Make :py:func:`~xarray.concat` more robust when concatenating variables present in some datasets but
not others (:issue:`508`). By `Scott Chamberlin <https://github.com/scottcha>`_.

Documentation
~~~~~~~~~~~~~
Expand Down
114 changes: 97 additions & 17 deletions xarray/core/concat.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import pandas as pd
from collections import OrderedDict
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just plain dict should be fine now since we are python 3.6+

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I didn't realize that it was 3.6+ only. Will change to dict.


from . import dtypes, utils
from .alignment import align
from .common import full_like
from .duck_array_ops import lazy_array_equiv
from .merge import _VALID_COMPAT, unique_variable
from .variable import IndexVariable, Variable, as_variable
Expand All @@ -26,7 +28,7 @@ def concat(
xarray objects to concatenate together. Each object is expected to
consist of variables and coordinates with matching shapes except for
along the concatenated dimension.
dim : str or DataArray or pandas.Index
dim : str, DataArray, Variable, or pandas.Index
Name of the dimension to concatenate along. This can either be a new
dimension name, in which case it is added along axis=0, or an existing
dimension name, in which case the location of the dimension is
Expand Down Expand Up @@ -77,7 +79,8 @@ def concat(
to assign each dataset along the concatenated dimension. If not
supplied, objects are concatenated in the provided order.
fill_value : scalar, optional
Value to use for newly missing values
Value to use for newly missing values as well as to fill values where the
variable is not present in all datasets.
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
String indicating how to combine differing indexes
(excluding dim) in objects
Expand Down Expand Up @@ -129,6 +132,7 @@ def concat(
"can only concatenate xarray Dataset and DataArray "
"objects, got %s" % type(first_obj)
)

return f(objs, dim, data_vars, coords, compat, positions, fill_value, join)


Expand Down Expand Up @@ -366,25 +370,101 @@ def ensure_common_dims(vars):
var = var.set_dims(common_dims, common_shape)
yield var

# stack up each variable to fill-out the dataset (in order)
# n.b. this loop preserves variable order, needed for groupby.
for k in datasets[0].variables:
if k in concat_over:
try:
vars = ensure_common_dims([ds.variables[k] for ds in datasets])
except KeyError:
raise ValueError("%r is not present in all datasets." % k)
# Find union of all data variables (preserving order)
# assumes all datasets are relatively in the same order
# and missing variables are inserted in the correct position
# if datasets have variables in drastically different orders
# the resulting order will be dependent on the order they are in the list
# passed to concat
union_of_variables = OrderedDict()
union_of_coordinates = OrderedDict()
for ds in datasets:
var_list = list(ds.variables.keys())
# this logic maintains the order of the variable list and runs in
# O(n^2) where n is number of variables in the uncommon worst case
# where there are no missing variables this will be O(n)
for i in range(0, len(var_list)):
if var_list[i] not in union_of_variables:
# need to determine the correct place
# first add the new item which will be at the end
union_of_variables[var_list[i]] = None
union_of_variables.move_to_end(var_list[i])
# move any items after this in the variables list to the end
# this will only happen for missing variables
for j in range(i + 1, len(var_list)):
if var_list[j] in union_of_variables:
union_of_variables.move_to_end(var_list[j])
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shoyer if this is the code you are referring to this have two purposes:

  1. Find a complete set of variables even if the first dataset in the concat list has a missing variable (the previous implementation assumes the first dataset has all variables)
  2. Maintains the order of those variables (which is essentially the sorting operations happening when a missing variable is encountered) which was documented as a requirement for groupby in the previous implementation.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if preserving ordering is really essential, though I guess it would be nice to have.

The fundamental problem here is efficiently determining a consistent merge order between lists. This is pretty similar to some code I once wrote in TensorFlow. It only handles merging two lists efficiently, but hopefully is a good model. The fundamental idea is to simultaneously consume elements across all the lists at once.

I think there is no reason why it could not be extended to N-lists (though it would also need to be changed to fall-back to order of appearance rather than raising an error):
https://github.com/tensorflow/tensorflow/blob/v1.15.0/tensorflow/contrib/labeled_tensor/python/ops/core.py#L919

Either way, the logic should definitely live in a separate helper function, which makes it easier to test.

Copy link
Contributor Author

@scottcha scottcha Jan 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding ordering I was going off the previous comment which said
# stack up each variable to fill-out the dataset (in order) # n.b. this loop preserves variable order, needed for groupby.
I believe one of the groupby test was also checking for that but I can't really recall at this point (regardless all existing groupby tests are currently passing in the pr).

I liked the code you linked and took a little detour to to try to incorporate a version of it in to my PR. I pulled it back out once I realized two things:

  1. The conversion of the input list to a set I thought was a bit risky since the order isn't necessarily guaranteed (esp < python 3.7 where dicts weren't ordered by default) and it's why my implementation was relying on ordered dicts. I'm sure the code you linked is likely ok I just was unsure about taking a dependency on what seemed an undocumented assumption.
  2. The case where no consistent ordering was possible returned None while I didn't necessarily think that was appropriate for this code since there isn't really a strict necessity for variable ordering and I'm not sure you want to go deeper down that path. Removing this assumption was forcing me in to more complex code.

I did spend a bit of time trying to write the generalized n dimension version of the consistent_ordering code but it was getting quite complex and was potentially hiding some complexity under some syntactic sugar. I ended up refactoring the piece of code in question to an internal method (as its still fairly tied to the implementation of the public method) and put a note that its a potential candidate for a refactor.

The PR is updated with these changes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@scottcha @shoyer I've tested the different approaches again. If there are only occasional misses it works quite well. But in corner cases (two neighboring variables missing in consecutive datasets) it can have unwanted results. I'll add some code after the weekend.

From what I read this problem is closely related to the shortest common supersequence problem. I've checked on the implementations and it works very well in terms of result, but is (currently) quite slow.

There should be some checks to possibly find one Dataset which contains all variables and can be used for output sorting. If none such is available then...

If there is a correct solution possible, the code should find it. Just my 2c.

Copy link
Contributor Author

@scottcha scottcha Jan 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right that this is a special case of shortest common supersequence though since there shouldn't be repeated values in any sequence it might be easier to solve.

@kmuehlbauer can you provide a case where you think the ordering determined by the current algorithm isn't providing the expected results? I just updated the PR with a test case for the multiple neighboring missing variables (as well as explicit asserts on the data_var ordering) and I'm still getting expected results. It would be great to see what you observed.

It may be time to actually ask what you want the behavior to really be in this case before introducing additional complexity. I just read through some of the pandas issues and looked like they dealt with this as well pandas-dev/pandas#4588. Is that the behavior you would like in xarray? I like the alignment at least for the default behavior with the pandas behavior but I think its really up to the xarray owners? Pandas allows a sort option which is also something to consider for an explicit alphabetical ordering.

(edited as I think the statement in the linked article about SQL behavior was incorrect; also made the description of the pandas behavior clearer)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@scottcha This is from the top of my head, so bear with me, if this isn't creating the unwanted effects.

ds1 = ['d1', 'd3' , 'd4' , 'd5' , 'd6' ] 
ds2 = ['d1', 'd2' , 'd4' , 'd5' , 'd6' ] 
ds3 = ['d1', 'd2' , 'd3' , 'd5' , 'd6' ] 
ds4 = ['d1', 'd2' , 'd3' , 'd4' , 'd6' ] 

This is an example where one variable is missing in each Dataset, but the correct ordering is obvious. I hope I got it right. If not, I have to look it up on Monday earliest.

I'll test your additions/changes next week, currently travelling.


# check that all datasets have the same coordinate set
if len(union_of_coordinates) > 0:
coord_set_diff = (
union_of_coordinates.keys() ^ ds.coords.keys()
) & concat_over
if len(coord_set_diff) > 0:
raise ValueError(
"Variables %r are coordinates in some datasets but not others."
% coord_set_diff
)

union_of_coordinates = dict(
union_of_coordinates.items() | dict.fromkeys(ds.coords).items()
)

# we don't want to fill coordinate variables so remove them
for k in union_of_coordinates.keys():
union_of_variables.pop(k, None)

# Cache a filled tmp variable with correct dims for filling missing variables
# doing this here allows us to concat with variables missing from any dataset
# will only run until it finds one prototype for each variable in the concat list
# we will also only fill defaults for data_vars not coordinates

# optimization to allow us to break when filling variable
def find_fill_variable_from_ds(variable_key, union_of_variables, datasets):
for ds in datasets:
if union_of_variables[variable_key] is not None:
continue

if variable_key not in ds.variables:
continue

v_fill_value = fill_value
dtype, v_fill_value = dtypes.get_fill_value_for_variable(
ds[variable_key], fill_value
)

union_of_variables[variable_key] = full_like(
ds[variable_key], fill_value=v_fill_value, dtype=dtype
Copy link
Contributor

@kmuehlbauer kmuehlbauer Jan 28, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This need to be ds.variables[variable_key], fill_value=v_fill_value, dtype=dtype, otherwise it will fail later (DataArray has no set_dim)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the feedback and the above test. I'll try to incorporate your suggested test as well as the rest of the pending comments in the next update.

)
return

for v in union_of_variables.keys():
find_fill_variable_from_ds(v, union_of_variables, datasets)

# create the concat list filling in missing variables
while len(union_of_variables) > 0 or len(union_of_coordinates) > 0:
k = None
# get the variables in order
if len(union_of_variables) > 0:
k = union_of_variables.popitem(last=False)
elif len(union_of_coordinates) > 0:
k = union_of_coordinates.popitem()

if k[0] in concat_over:
variables = []
for ds in datasets:
if k[0] in ds.variables:
variables.append(ds.variables[k[0]])
else:
# var is missing, fill with cached value
variables.append(k[1])

vars = ensure_common_dims(variables)
combined = concat_vars(vars, dim, positions)
assert isinstance(combined, Variable)
result_vars[k] = combined
result_vars[k[0]] = combined

result = Dataset(result_vars, attrs=result_attrs)
absent_coord_names = coord_names - set(result.variables)
if absent_coord_names:
raise ValueError(
"Variables %r are coordinates in some datasets but not others."
% absent_coord_names
)
result = result.set_coords(coord_names)
result.encoding = result_encoding

Expand Down
32 changes: 32 additions & 0 deletions xarray/core/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from . import utils


# Use as a sentinel value to indicate a dtype appropriate NA value.
NA = utils.ReprObject("<NA>")

Expand Down Expand Up @@ -96,6 +97,37 @@ def get_fill_value(dtype):
return fill_value


def get_fill_value_for_variable(variable, fill_value=NA):
    """Return an appropriate fill value (and dtype) for this variable.

    Parameters
    ----------
    variable : Dataset or DataArray
        Object whose dtype is inspected to choose/promote the fill value.
    fill_value : optional
        A suggested fill value. When left as the ``NA`` sentinel, the
        variable's dtype is promoted (via ``maybe_promote``) so it can hold
        a missing value; otherwise the suggested value is used as-is and
        the dtype is kept unchanged.

    Returns
    -------
    dtype : Promoted dtype for fill value.
    new_fill_value : Missing value corresponding to this dtype.

    Raises
    ------
    TypeError
        If ``variable`` is not an xarray Dataset or DataArray.
    """
    # Imported here to avoid a circular import at module load time.
    from .dataset import Dataset
    from .dataarray import DataArray

    if not isinstance(variable, (DataArray, Dataset)):
        raise TypeError(
            "can only get fill value for xarray Dataset and DataArray "
            "objects, got %s" % type(variable)
        )

    # Only promote the dtype when the caller did not supply an explicit
    # fill value; an explicit value is trusted to fit the existing dtype.
    if fill_value is NA:
        dtype, new_fill_value = maybe_promote(variable.dtype)
    else:
        dtype = variable.dtype
        new_fill_value = fill_value

    return dtype, new_fill_value


def get_pos_infinity(dtype):
"""Return an appropriate positive infinity for this dtype.

Expand Down
4 changes: 3 additions & 1 deletion xarray/tests/test_combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,7 +755,9 @@ def test_auto_combine(self):
auto_combine(objs)

objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})]
with raises_regex(ValueError, "'y' is not present in all datasets"):
with raises_regex(
ValueError, ".* are coordinates in some datasets but not others"
):
auto_combine(objs)

def test_auto_combine_previously_failed(self):
Expand Down
Loading