
Commit 8bf415a

martindurant, keewis, and raybellwaves authored
Allow fsspec URLs in open_(mf)dataset (#4823)
Co-authored-by: keewis <[email protected]>
Co-authored-by: Ray Bell <[email protected]>
1 parent 735a359 commit 8bf415a

File tree

10 files changed: +138 -10 lines changed

ci/requirements/environment.yml

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@ channels:
   - conda-forge
   - nodefaults
 dependencies:
+  - aiobotocore
   - boto3
   - bottleneck
   - cartopy

ci/requirements/py38-all-but-dask.yml

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,8 @@ channels:
   - nodefaults
 dependencies:
   - python=3.8
+  - black
+  - aiobotocore
   - boto3
   - bottleneck
   - cartopy

doc/io.rst

Lines changed: 31 additions & 4 deletions
@@ -890,17 +890,44 @@ Cloud Storage Buckets
 
 It is possible to read and write xarray datasets directly from / to cloud
 storage buckets using zarr. This example uses the `gcsfs`_ package to provide
-a ``MutableMapping`` interface to `Google Cloud Storage`_, which we can then
-pass to xarray::
+an interface to `Google Cloud Storage`_.
+
+From v0.16.2: general `fsspec`_ URLs are parsed and the store set up for you
+automatically when reading, such that you can open a dataset in a single
+call. You should include any arguments to the storage backend as the
+key ``storage_options``, part of ``backend_kwargs``.
+
+.. code:: python
+
+    ds_gcs = xr.open_dataset(
+        "gcs://<bucket-name>/path.zarr",
+        backend_kwargs={
+            "storage_options": {"project": "<project-name>", "token": None}
+        },
+        engine="zarr",
+    )
+
+
+This also works with ``open_mfdataset``, allowing you to pass a list of paths or
+a URL to be interpreted as a glob string.
+
+For older versions, and for writing, you must explicitly set up a ``MutableMapping``
+instance and pass this, as follows:
+
+.. code:: python
 
     import gcsfs
-    fs = gcsfs.GCSFileSystem(project='<project-name>', token=None)
-    gcsmap = gcsfs.mapping.GCSMap('<bucket-name>', gcs=fs, check=True, create=False)
+
+    fs = gcsfs.GCSFileSystem(project="<project-name>", token=None)
+    gcsmap = gcsfs.mapping.GCSMap("<bucket-name>", gcs=fs, check=True, create=False)
     # write to the bucket
     ds.to_zarr(store=gcsmap)
     # read it back
     ds_gcs = xr.open_zarr(gcsmap)
 
+(or use the utility function ``fsspec.get_mapper()``).
+
+.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/
 .. _Zarr: http://zarr.readthedocs.io/
 .. _Amazon S3: https://aws.amazon.com/s3/
 .. _Google Cloud Storage: https://cloud.google.com/storage/
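
The same ``storage_options`` mechanism applies to the multi-file case mentioned in the diff; a hedged sketch, where the bucket, path, and project names are placeholders rather than values from this commit:

    import xarray as xr

    # Hypothetical bucket/project; storage_options are forwarded to gcsfs.
    ds = xr.open_mfdataset(
        "gcs://<bucket-name>/collection/*.zarr",  # glob expanded by fsspec
        engine="zarr",
        backend_kwargs={
            "storage_options": {"project": "<project-name>", "token": None}
        },
    )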

doc/whats-new.rst

Lines changed: 5 additions & 0 deletions
@@ -76,6 +76,11 @@ New Features
   in the form of kwargs as well as a dict, like most similar methods.
   By `Maximilian Roos <https://github.com/max-sixty>`_.
 
+- :py:func:`open_dataset` and :py:func:`open_mfdataset` now accept ``fsspec`` URLs
+  (including globs for the latter) for ``engine="zarr"``, and so allow reading from
+  many remote and other file systems (:pull:`4461`)
+  By `Martin Durant <https://github.com/martindurant>`_
+
 Bug fixes
 ~~~~~~~~~
 - :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` do not trigger computations anymore if :py:meth:`Dataset.weighted` or :py:meth:`DataArray.weighted` are applied (:issue:`4625`, :pull:`4668`). By `Julius Busecke <https://github.com/jbusecke>`_.

setup.cfg

Lines changed: 2 additions & 0 deletions
@@ -185,6 +185,8 @@ ignore_missing_imports = True
 ignore_missing_imports = True
 [mypy-distributed.*]
 ignore_missing_imports = True
+[mypy-fsspec.*]
+ignore_missing_imports = True
 [mypy-h5netcdf.*]
 ignore_missing_imports = True
 [mypy-h5py.*]

xarray/backends/api.py

Lines changed: 24 additions & 3 deletions
@@ -643,7 +643,9 @@ def open_dataarray(
     backend_kwargs: dict, optional
         A dictionary of keyword arguments to pass on to the backend. This
         may be useful when backend options would improve performance or
-        allow user control of dataset processing.
+        allow user control of dataset processing. If using fsspec URLs,
+        include the key "storage_options" to pass arguments to the
+        storage layer.
     use_cftime: bool, optional
         Only relevant if encoded dates come from a standard calendar
         (e.g. "gregorian", "proleptic_gregorian", "standard", or not
@@ -869,14 +871,33 @@ def open_mfdataset(
     .. [2] http://xarray.pydata.org/en/stable/dask.html#chunking-and-performance
     """
     if isinstance(paths, str):
-        if is_remote_uri(paths):
+        if is_remote_uri(paths) and engine == "zarr":
+            try:
+                from fsspec.core import get_fs_token_paths
+            except ImportError as e:
+                raise ImportError(
+                    "The use of remote URLs for opening zarr requires the package fsspec"
+                ) from e
+
+            fs, _, _ = get_fs_token_paths(
+                paths,
+                mode="rb",
+                storage_options=kwargs.get("backend_kwargs", {}).get(
+                    "storage_options", {}
+                ),
+                expand=False,
+            )
+            paths = fs.glob(fs._strip_protocol(paths))  # finds directories
+            paths = [fs.get_mapper(path) for path in paths]
+        elif is_remote_uri(paths):
             raise ValueError(
                 "cannot do wild-card matching for paths that are remote URLs: "
                 "{!r}. Instead, supply paths as an explicit list of strings.".format(
                     paths
                 )
             )
-        paths = sorted(glob(_normalize_path(paths)))
+        else:
+            paths = sorted(glob(_normalize_path(paths)))
     else:
         paths = [str(p) if isinstance(p, Path) else p for p in paths]

xarray/backends/zarr.py

Lines changed: 15 additions & 1 deletion
@@ -1,5 +1,6 @@
 import os
 import pathlib
+from distutils.version import LooseVersion
 
 import numpy as np
 
@@ -295,6 +296,7 @@ def open_group(
         consolidated=False,
         consolidate_on_close=False,
         chunk_store=None,
+        storage_options=None,
         append_dim=None,
         write_region=None,
     ):
@@ -303,7 +305,15 @@ def open_group(
         if isinstance(store, pathlib.Path):
             store = os.fspath(store)
 
-        open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group)
+        open_kwargs = dict(
+            mode=mode,
+            synchronizer=synchronizer,
+            path=group,
+        )
+        if LooseVersion(zarr.__version__) >= "2.5.0":
+            open_kwargs["storage_options"] = storage_options
+        elif storage_options:
+            raise ValueError("Storage options only compatible with zarr>=2.5.0")
         if chunk_store:
             open_kwargs["chunk_store"] = chunk_store
 
@@ -537,6 +547,7 @@ def open_zarr(
     consolidated=False,
     overwrite_encoded_chunks=False,
     chunk_store=None,
+    storage_options=None,
     decode_timedelta=None,
     use_cftime=None,
     **kwargs,
@@ -649,6 +660,7 @@ def open_zarr(
         "consolidated": consolidated,
         "overwrite_encoded_chunks": overwrite_encoded_chunks,
         "chunk_store": chunk_store,
+        "storage_options": storage_options,
     }
 
     ds = open_dataset(
@@ -687,6 +699,7 @@ def open_dataset(
     consolidated=False,
     consolidate_on_close=False,
     chunk_store=None,
+    storage_options=None,
 ):
     store = ZarrStore.open_group(
         filename_or_obj,
@@ -696,6 +709,7 @@ def open_dataset(
         consolidated=consolidated,
         consolidate_on_close=consolidate_on_close,
         chunk_store=chunk_store,
+        storage_options=storage_options,
     )
 
     store_entrypoint = StoreBackendEntrypoint()
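
The version gate above can be exercised on its own; a minimal sketch, assuming zarr>=2.5.0 and fsspec are installed (the memory:// URL is arbitrary):

    from distutils.version import LooseVersion

    import zarr

    storage_options = {}  # fsspec filesystem arguments; none needed for memory://
    open_kwargs = dict(mode="w", path=None)
    if LooseVersion(zarr.__version__) >= "2.5.0":
        # zarr resolves the URL to an FSStore and forwards storage_options to fsspec
        open_kwargs["storage_options"] = storage_options
    elif storage_options:
        raise ValueError("Storage options only compatible with zarr>=2.5.0")
    zgroup = zarr.open_group("memory://example.zarr", **open_kwargs)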

xarray/core/utils.py

Lines changed: 6 additions & 1 deletion
@@ -645,7 +645,12 @@ def close_on_error(f):
 
 
 def is_remote_uri(path: str) -> bool:
-    return bool(re.search(r"^https?\://", path))
+    """Finds URLs of the form protocol:// or protocol::
+
+    This also matches for http[s]://, which were the only remote URLs
+    supported in <=v0.16.2.
+    """
+    return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path))
 
 
 def read_magic_number(filename_or_obj, count=8):
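
A few quick, runnable checks show what the broadened regex now treats as remote:

    import re

    def is_remote_uri(path: str) -> bool:
        return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path))

    assert is_remote_uri("http://some/remote/uri")          # matched before and after
    assert is_remote_uri("gcs://bucket/path.zarr")          # any fsspec protocol
    assert is_remote_uri("simplecache::memory://out.zarr")  # chained URLs use "::"
    assert not is_remote_uri("relative/path/file.nc")       # local paths do not match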

xarray/tests/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -74,6 +74,7 @@ def LooseVersion(vstring):
 has_nc_time_axis, requires_nc_time_axis = _importorskip("nc_time_axis")
 has_rasterio, requires_rasterio = _importorskip("rasterio")
 has_zarr, requires_zarr = _importorskip("zarr")
+has_fsspec, requires_fsspec = _importorskip("fsspec")
 has_iris, requires_iris = _importorskip("iris")
 has_cfgrib, requires_cfgrib = _importorskip("cfgrib")
 has_numbagg, requires_numbagg = _importorskip("numbagg")
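
For context, ``_importorskip`` is the usual optional-dependency helper; a minimal sketch of the pattern (the real helper also accepts a minimum version, omitted here):

    import importlib

    import pytest

    def _importorskip(modname):
        try:
            importlib.import_module(modname)
            has = True
        except ImportError:
            has = False
        # Tests decorated with the returned marker are skipped when the
        # module is missing.
        func = pytest.mark.skipif(not has, reason=f"requires {modname}")
        return has, func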

xarray/tests/test_backends.py

Lines changed: 51 additions & 1 deletion
@@ -54,6 +54,7 @@
     requires_cfgrib,
     requires_cftime,
     requires_dask,
+    requires_fsspec,
     requires_h5netcdf,
     requires_netCDF4,
     requires_pseudonetcdf,
@@ -3040,10 +3041,17 @@ def test_open_mfdataset(self):
 
         with raises_regex(IOError, "no files to open"):
             open_mfdataset("foo-bar-baz-*.nc")
-
         with raises_regex(ValueError, "wild-card"):
             open_mfdataset("http://some/remote/uri")
 
+    @requires_fsspec
+    def test_open_mfdataset_no_files(self):
+        pytest.importorskip("aiobotocore")
+
+        # glob is attempted as of #4823, but finds no files
+        with raises_regex(OSError, "no files"):
+            open_mfdataset("http://some/remote/uri", engine="zarr")
+
     def test_open_mfdataset_2d(self):
         original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))})
         with create_tmp_file() as tmp1:
@@ -4799,6 +4807,48 @@ def test_extract_zarr_variable_encoding():
     )
 
 
+@requires_zarr
+@requires_fsspec
+def test_open_fsspec():
+    import fsspec
+    import zarr
+
+    if not hasattr(zarr.storage, "FSStore") or not hasattr(
+        zarr.storage.FSStore, "getitems"
+    ):
+        pytest.skip("zarr too old")
+
+    ds = open_dataset(os.path.join(os.path.dirname(__file__), "data", "example_1.nc"))
+
+    m = fsspec.filesystem("memory")
+    mm = m.get_mapper("out1.zarr")
+    ds.to_zarr(mm)  # old interface
+    ds0 = ds.copy()
+    ds0["time"] = ds.time + pd.to_timedelta("1 day")
+    mm = m.get_mapper("out2.zarr")
+    ds0.to_zarr(mm)  # old interface
+
+    # single dataset
+    url = "memory://out2.zarr"
+    ds2 = open_dataset(url, engine="zarr")
+    assert ds0 == ds2
+
+    # single dataset with caching
+    url = "simplecache::memory://out2.zarr"
+    ds2 = open_dataset(url, engine="zarr")
+    assert ds0 == ds2
+
+    # multi dataset
+    url = "memory://out*.zarr"
+    ds2 = open_mfdataset(url, engine="zarr")
+    assert xr.concat([ds, ds0], dim="time") == ds2
+
+    # multi dataset with caching
+    url = "simplecache::memory://out*.zarr"
+    ds2 = open_mfdataset(url, engine="zarr")
+    assert xr.concat([ds, ds0], dim="time") == ds2
+
+
 @requires_h5netcdf
 def test_load_single_value_h5netcdf(tmp_path):
     """Test that numeric single-element vector attributes are handled fine.
