
Added convenience method for saving DataArray to netCDF file #990


Status: Merged (10 commits, Sep 6, 2016)
2 changes: 2 additions & 0 deletions doc/api.rst
@@ -405,7 +405,9 @@ DataArray methods
.. autosummary::
:toctree: generated/

open_dataarray
DataArray.to_dataset
DataArray.to_netcdf
DataArray.to_pandas
DataArray.to_series
DataArray.to_dataframe
8 changes: 8 additions & 0 deletions doc/io.rst
@@ -117,6 +117,14 @@ We can load netCDF files to create a new Dataset using
ds_disk = xr.open_dataset('saved_on_disk.nc')
ds_disk

Similarly, a DataArray can be saved to disk using the
:py:meth:`DataArray.to_netcdf <xarray.DataArray.to_netcdf>` method, and loaded
from disk using the :py:func:`~xarray.open_dataarray` function. As netCDF files
correspond to :py:class:`~xarray.Dataset` objects, these functions internally
convert the ``DataArray`` to a ``Dataset`` before saving, and then convert back
when loading, ensuring that the ``DataArray`` that is loaded is always exactly
the same as the one that was saved.
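
For example, a minimal roundtrip looks like this (an illustrative sketch; the
file name is arbitrary):

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(4), name='foo')
    da.to_netcdf('saved_da.nc')                   # converted to a Dataset internally
    loaded_da = xr.open_dataarray('saved_da.nc')  # converted back on load
    assert loaded_da.identical(da)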

A dataset can also be loaded or written to a specific group within a netCDF
file. To load from a group, pass a ``group`` keyword argument to the
``open_dataset`` function. The group can be specified as a path-like
7 changes: 7 additions & 0 deletions doc/whats-new.rst
@@ -43,6 +43,13 @@ Enhancements
error messages if they are invalid. (:issue:`911`).
By `Robin Wilson <https://github.com/robintw>`_.

- Added ability to save ``DataArray`` objects directly to netCDF files using
:py:meth:`~xarray.DataArray.to_netcdf`, and to load directly from netCDF files
using :py:func:`~xarray.open_dataarray` (:issue:`915`). These remove the need
to convert a ``DataArray`` to a ``Dataset`` before saving as a netCDF file,
and handle names to ensure a perfect 'roundtrip' capability.
By `Robin Wilson <https://github.com/robintw>`_.

Bug fixes
~~~~~~~~~

3 changes: 2 additions & 1 deletion xarray/__init__.py
@@ -8,7 +8,8 @@
from .core.merge import merge, MergeError
from .core.options import set_options

from .backends.api import open_dataset, open_mfdataset, save_mfdataset
from .backends.api import (open_dataset, open_dataarray, open_mfdataset,
save_mfdataset)
from .conventions import decode_cf

try:
103 changes: 103 additions & 0 deletions xarray/backends/api.py
@@ -15,6 +15,8 @@
from ..core.utils import close_on_error, is_remote_uri
from ..core.pycompat import basestring

DATAARRAY_NAME = '__xarray_dataarray_name__'
DATAARRAY_VARIABLE = '__xarray_dataarray_variable__'

def _get_default_engine(path, allow_remote=False):
if allow_remote and is_remote_uri(path): # pragma: no cover
@@ -267,6 +269,107 @@ def maybe_decode_store(store, lock=False):
return maybe_decode_store(store)


def open_dataarray(filename_or_obj, group=None, decode_cf=True,
mask_and_scale=True, decode_times=True,
concat_characters=True, decode_coords=True, engine=None,
chunks=None, lock=None, drop_variables=None):
"""
Open a DataArray from a netCDF file containing a single data variable.

This is designed to read netCDF files with only one data variable. If
multiple variables are present then a ValueError is raised.

Parameters
----------
filename_or_obj : str, file or xarray.backends.*DataStore
Strings are interpreted as a path to a netCDF file or an OpenDAP URL
and opened with python-netCDF4, unless the filename ends with .gz, in
which case the file is gunzipped and opened with scipy.io.netcdf (only
netCDF3 supported). File-like objects are opened with scipy.io.netcdf
(only netCDF3 supported).
group : str, optional
Path to the netCDF4 group in the given file to open (only works for
netCDF4 files).
decode_cf : bool, optional
Whether to decode these variables, assuming they were saved according
to CF conventions.
mask_and_scale : bool, optional
If True, replace array values equal to `_FillValue` with NA and scale
values according to the formula `original_values * scale_factor +
add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
taken from variable attributes (if they exist). If the `_FillValue` or
`missing_value` attribute contains multiple values a warning will be
issued and all array values matching one of the multiple values will
be replaced by NA.
decode_times : bool, optional
If True, decode times encoded in the standard NetCDF datetime format
into datetime objects. Otherwise, leave them encoded as numbers.
concat_characters : bool, optional
If True, concatenate along the last dimension of character arrays to
form string arrays. Dimensions will only be concatenated over (and
removed) if they have no corresponding variable and if they are only
used as the last dimension of character arrays.
decode_coords : bool, optional
If True, decode the 'coordinates' attribute to identify coordinates in
the resulting dataset.
engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio'}, optional
Engine to use when reading files. If not provided, the default engine
is chosen based on available dependencies, with a preference for
'netcdf4'.
chunks : int or dict, optional
If chunks is provided, it is used to load the new dataset into dask
arrays. This is an experimental feature; see the documentation for more
details.
lock : False, True or threading.Lock, optional
If chunks is provided, this argument is passed on to
:py:func:`dask.array.from_array`. By default, a per-variable lock is
used when reading data from netCDF files with the netcdf4 and h5netcdf
engines to avoid issues with concurrent access when using dask's
multithreaded backend.
drop_variables : string or iterable, optional
A variable or list of variables to exclude from being parsed from the
dataset. This may be useful to drop variables with problems or
inconsistent values.

Notes
-----
This is designed to be fully compatible with `DataArray.to_netcdf`. Saving
using `DataArray.to_netcdf` and then loading with this function will
produce an identical result.

All parameters are passed directly to `xarray.open_dataset`. See that
documentation for further details.

See also
--------
open_dataset
"""
dataset = open_dataset(filename_or_obj, group, decode_cf,
mask_and_scale, decode_times,
concat_characters, decode_coords, engine,
chunks, lock, drop_variables)

if len(dataset.data_vars) != 1:
raise ValueError('Given file dataset contains more than one data '
'variable. Please read with xarray.open_dataset and '
'then select the variable you want.')
else:
data_array, = dataset.data_vars.values()

data_array._file_obj = dataset._file_obj

# Reset names if they were changed during saving
# to ensure that we can 'roundtrip' perfectly
if DATAARRAY_NAME in dataset.attrs:
data_array.name = dataset.attrs[DATAARRAY_NAME]
del dataset.attrs[DATAARRAY_NAME]

if data_array.name == DATAARRAY_VARIABLE:
data_array.name = None

return data_array


class _MultiFileCloser(object):
def __init__(self, file_objs):
self.file_objs = file_objs
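
A short sketch of the single-variable restriction in open_dataarray
(hypothetical file name; the fallback follows the advice in the error message):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'a': ('x', np.arange(3)),
                     'b': ('x', np.arange(3))})
    ds.to_netcdf('two_vars.nc')

    try:
        xr.open_dataarray('two_vars.nc')  # two data variables -> ValueError
    except ValueError:
        # fall back to open_dataset and select the variable explicitly
        da = xr.open_dataset('two_vars.nc')['a']
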
13 changes: 13 additions & 0 deletions xarray/core/common.py
@@ -578,6 +578,19 @@ def where(self, cond, other=None, drop=False):

return outobj._where(outcond)

def close(self):
"""Close any files linked to this object
"""
if self._file_obj is not None:
self._file_obj.close()
self._file_obj = None

def __enter__(self):
return self

def __exit__(self, exc_type, exc_value, traceback):
self.close()

# this has no runtime function - these are listed so IDEs know these methods
# are defined and don't warn on these operations
__lt__ = __le__ = __ge__ = __gt__ = __add__ = __sub__ = __mul__ = \
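
Because close(), __enter__ and __exit__ now live on this shared base class, a
DataArray returned by open_dataarray supports the same resource handling as a
Dataset. A sketch (reusing the hypothetical file from the earlier example):

    import xarray as xr

    # context manager: the underlying file handle is closed on exit
    with xr.open_dataarray('saved_da.nc') as da:
        total = da.sum()

    # or manage the handle explicitly; close() is a no-op once the
    # file object has been cleared
    da = xr.open_dataarray('saved_da.nc')
    da.load()   # read the data into memory before closing the file
    da.close()
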
76 changes: 76 additions & 0 deletions xarray/core/dataarray.py
@@ -221,8 +221,12 @@ def __init__(self, data, coords=None, dims=None, name=None,
self._variable = variable
self._coords = coords
self._name = name

self._file_obj = None

self._initialized = True


__default = object()

def _replace(self, variable=None, coords=None, name=__default):
@@ -1092,6 +1096,78 @@ def to_masked_array(self, copy=True):
isnull = pd.isnull(self.values)
return np.ma.MaskedArray(data=self.values, mask=isnull, copy=copy)

def to_netcdf(self, *args, **kwargs):
"""
Write DataArray contents to a netCDF file.

Parameters
----------
path : str, optional
Path to which to save this DataArray. If no path is provided, this
function returns the resulting netCDF file as a bytes object; in
this case, we need to use scipy.io.netcdf, which does not support
netCDF version 4 (the default format becomes NETCDF3_64BIT).
mode : {'w', 'a'}, optional
Write ('w') or append ('a') mode. If mode='w', any existing file at
this location will be overwritten.
format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional
File format for the resulting netCDF file:

* NETCDF4: Data is stored in an HDF5 file, using netCDF4 API
features.
* NETCDF4_CLASSIC: Data is stored in an HDF5 file, using only
netCDF 3 compatible API features.
* NETCDF3_64BIT: 64-bit offset version of the netCDF 3 file format,
which fully supports 2+ GB files, but is only compatible with
clients linked against netCDF version 3.6.0 or later.
* NETCDF3_CLASSIC: The classic netCDF 3 file format. It does not
handle 2+ GB files very well.

All formats are supported by the netCDF4-python library.
scipy.io.netcdf only supports the last two formats.

The default format is NETCDF4 if you are saving a file to disk and
have the netCDF4-python library available. Otherwise, xarray falls
back to using scipy to write netCDF files and defaults to the
NETCDF3_64BIT format (scipy does not support netCDF4).
group : str, optional
Path to the netCDF4 group in the given file to open (only works for
format='NETCDF4'). The group(s) will be created if necessary.
engine : {'netcdf4', 'scipy', 'h5netcdf'}, optional
Engine to use when writing netCDF files. If not provided, the
default engine is chosen based on available dependencies, with a
preference for 'netcdf4' if writing to a file on disk.
encoding : dict, optional
Nested dictionary with variable names as keys and dictionaries of
variable specific encodings as values, e.g.,
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1, 'zlib': True}, ...}``

Notes
-----
Only xarray.Dataset objects can be written to netCDF files, so
the xarray.DataArray is converted to a xarray.Dataset object
containing a single variable. If the DataArray has no name, or if the
name is the same as a coordinate name, then it is given the name
'__xarray_dataarray_variable__'.

All parameters are passed directly to `xarray.Dataset.to_netcdf`.
"""
from ..backends.api import DATAARRAY_NAME, DATAARRAY_VARIABLE

if not self.name:
# If no name is set then use a generic xarray name
dataset = self.to_dataset(name=DATAARRAY_VARIABLE)
elif self.name in list(self.coords):
# The name matches one of the coordinate names, which a Dataset
# doesn't allow, so rename it but keep track of the old name
dataset = self.to_dataset(name=DATAARRAY_VARIABLE)
dataset.attrs[DATAARRAY_NAME] = self.name
else:
# No problems with the name - so we're fine!
dataset = self.to_dataset()

dataset.to_netcdf(*args, **kwargs)

def to_dict(self):
"""
Convert this xarray.DataArray into a dictionary following xarray
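
The name handling described in the to_netcdf notes can be seen end to end in a
sketch (hypothetical file name; the renamed variable and attribute are the
DATAARRAY_VARIABLE and DATAARRAY_NAME constants from backends/api.py):

    import numpy as np
    import xarray as xr

    # the array's name 'x' clashes with its coordinate name
    da = xr.DataArray(np.arange(3), dims=['x'],
                      coords={'x': [10, 20, 30]}, name='x')
    da.to_netcdf('clash.nc')

    # on disk, the variable is renamed and the real name kept in an attribute
    ds = xr.open_dataset('clash.nc')
    print(list(ds.data_vars))                     # ['__xarray_dataarray_variable__']
    print(ds.attrs['__xarray_dataarray_name__'])  # 'x'

    # open_dataarray undoes the renaming, restoring the original name
    assert xr.open_dataarray('clash.nc').identical(da)
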
13 changes: 0 additions & 13 deletions xarray/core/dataset.py
@@ -240,19 +240,6 @@ def load_store(cls, store, decoder=None):
obj._file_obj = store
return obj

def close(self):
"""Close any files linked to this dataset
"""
if self._file_obj is not None:
self._file_obj.close()
self._file_obj = None

def __enter__(self):
return self

def __exit__(self, exc_type, exc_value, traceback):
self.close()

def __getstate__(self):
"""Always load data in-memory before pickling"""
self.load()
39 changes: 37 additions & 2 deletions xarray/test/test_backends.py
@@ -13,7 +13,8 @@
import pandas as pd

import xarray as xr
from xarray import Dataset, open_dataset, open_mfdataset, backends, save_mfdataset
from xarray import (Dataset, DataArray, open_dataset, open_dataarray,
open_mfdataset, backends, save_mfdataset)
from xarray.backends.common import robust_getitem
from xarray.backends.netCDF4_ import _extract_nc4_encoding
from xarray.core.pycompat import iteritems, PY3
@@ -1071,7 +1072,6 @@ def test_extract_h5nc_encoding(self):
with self.assertRaisesRegexp(ValueError, 'unexpected encoding'):
_extract_nc4_encoding(var, raise_on_invalid=True)


class MiscObject:
pass

@@ -1172,3 +1172,38 @@ def new_dataset_and_coord_attrs():
attrs['test'] = np.arange(12).reshape(3, 4)
with create_tmp_file() as tmp_file:
ds.to_netcdf(tmp_file)

@requires_netCDF4
class TestDataArrayToNetCDF(TestCase):

def test_dataarray_to_netcdf_no_name(self):
original_da = DataArray(np.arange(12).reshape((3, 4)))

with create_tmp_file() as tmp:
original_da.to_netcdf(tmp)

with open_dataarray(tmp) as loaded_da:
self.assertDataArrayIdentical(original_da, loaded_da)


def test_dataarray_to_netcdf_with_name(self):
original_da = DataArray(np.arange(12).reshape((3, 4)),
name='test')

with create_tmp_file() as tmp:
original_da.to_netcdf(tmp)

with open_dataarray(tmp) as loaded_da:
self.assertDataArrayIdentical(original_da, loaded_da)


def test_dataarray_to_netcdf_coord_name_clash(self):
original_da = DataArray(np.arange(12).reshape((3, 4)),
dims=['x', 'y'],
name='x')

with create_tmp_file() as tmp:
original_da.to_netcdf(tmp)

with open_dataarray(tmp) as loaded_da:
self.assertDataArrayIdentical(original_da, loaded_da)