diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index dd73ef19658..045053d8ad1 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -18,7 +18,7 @@ dependencies: - hdf5 - hypothesis - iris - - lxml # Optional dep of pydap + - lxml # Optional dep of pydap - matplotlib-base - nc-time-axis - netcdf4 @@ -46,3 +46,5 @@ dependencies: - toolz - typing_extensions - zarr + - pip: + - cftime_rs diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 039fe371100..c6f5ee9efcd 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -29,7 +29,7 @@ from xarray.core.variable import Variable try: - import cftime + import cftime_rs as cftime except ImportError: cftime = None @@ -231,11 +231,21 @@ def _decode_datetime_with_cftime( num_dates: np.ndarray, units: str, calendar: str ) -> np.ndarray: if cftime is None: - raise ModuleNotFoundError("No module named 'cftime'") + raise ModuleNotFoundError("No module named 'cftime_rs'") if num_dates.size > 0: - return np.asarray( - cftime.num2date(num_dates, units, calendar, only_use_cftime_datetimes=True) - ) + try: + res = cftime.num2pydate( + num_dates, + units, + calendar, + ) + except ValueError: + res = cftime.num2date( + num_dates, + units, + calendar, + ) + return np.asarray(res) else: return np.array([], dtype=object) @@ -248,7 +258,6 @@ def _decode_datetime_with_pandas( f"Cannot decode times from a non-standard calendar, {calendar!r}, using " "pandas." ) - time_units, ref_date = _unpack_netcdf_time_units(units) time_units = _netcdf_to_numpy_timeunit(time_units) try: @@ -259,14 +268,12 @@ def _decode_datetime_with_pandas( # ValueError is raised by pd.Timestamp for non-ISO timestamp # strings, in which case we fall back to using cftime raise OutOfBoundsDatetime - with warnings.catch_warnings(): warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) if flat_num_dates.size > 0: # avoid size 0 datetimes GH1329 pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date - # To avoid integer overflow when converting to nanosecond units for integer # dtypes smaller than np.int64 cast all integer and unsigned integer dtype # arrays to np.int64 (GH 2002, GH 6589). Note this is safe even in the case @@ -321,7 +328,6 @@ def decode_cf_datetime( dates = _decode_datetime_with_cftime( flat_num_dates.astype(float), units, calendar ) - if ( dates[np.nanargmin(num_dates)].year < 1678 or dates[np.nanargmax(num_dates)].year >= 2262 @@ -410,7 +416,7 @@ def infer_calendar_name(dates) -> CFCalendar: sample = sample.compute() if isinstance(sample, np.ndarray): sample = sample.item() - if isinstance(sample, cftime.datetime): + if isinstance(sample, cftime.PyCFDatetime): return sample.calendar # Error raise if dtype is neither datetime or "O", if cftime is not importable, and if element of 'O' dtype is not cftime. @@ -464,8 +470,10 @@ def infer_timedelta_units(deltas) -> str: return _infer_time_units_from_diff(unique_timedeltas) -def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: - """Given an array of cftime.datetime objects, return an array of +def cftime_to_nptime( + times: list[cftime.PyCFDatetime], raise_on_invalid: bool = True +) -> np.ndarray: + """Given an array of cftime_rs.PyCFDatetime objects, return an array of numpy.datetime64 objects of the same size If raise_on_invalid is True (default), invalid dates trigger a ValueError. @@ -480,6 +488,7 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: # NumPy casts it safely it np.datetime64[ns] for dates outside # 1678 to 2262 (this is not currently the case for # datetime.datetime). + datetime dt = nanosecond_precision_timestamp( t.year, t.month, t.day, t.hour, t.minute, t.second, t.microsecond ) @@ -619,34 +628,46 @@ def _cleanup_netcdf_time_units(units: str) -> str: return units -def _encode_datetime_with_cftime(dates, units: str, calendar: str) -> np.ndarray: +def encode_datetime_with_cftime( + dates, units: str, calendar: str +) -> np.ndarray[int | float]: """Fallback method for encoding dates using cftime. This method is more flexible than xarray's parsing using datetime64[ns] arrays but also slower because it loops over each element. """ + if cftime is None: - raise ModuleNotFoundError("No module named 'cftime'") + raise ModuleNotFoundError("No module named 'cftime-rs'") + + dates = np.array(dates) if np.issubdtype(dates.dtype, np.datetime64): # numpy's broken datetime conversion only works for us precision dates = dates.astype("M8[us]").astype(datetime) - def encode_datetime(d): - # Since netCDF files do not support storing float128 values, we ensure - # that float64 values are used by setting longdouble=False in num2date. - # This try except logic can be removed when xarray's minimum version of - # cftime is at least 1.6.2. - try: - return ( - np.nan - if d is None - else cftime.date2num(d, units, calendar, longdouble=False) - ) - except TypeError: - return np.nan if d is None else cftime.date2num(d, units, calendar) + # Find all the none or NaN position + none_position = np.equal(dates, None) - return np.array([encode_datetime(d) for d in dates.ravel()]).reshape(dates.shape) + # Remove NaN from the dates + filtered_dates = dates[~none_position] + print(filtered_dates) + # encoded_nums will be the same size as filtered_dates + # Try converting to f64 first to avoid unnecessary conversion to i64 + try: + encoded_nums = cftime.pydate2num( + filtered_dates.tolist(), units, calendar, dtype="f64" + ) + except TypeError: + encoded_nums = cftime.pydate2num( + filtered_dates.tolist(), units, calendar, dtype="i64" + ) + + # Create a full matrix of NaN + # And fill the num dates in the not NaN or None position + result = np.full(dates.shape, np.nan) + result[np.nonzero(~none_position)] = encoded_nums + return result def cast_to_int_if_safe(num) -> np.ndarray: @@ -683,7 +704,6 @@ def encode_cf_datetime( cftime.date2num """ dates = np.asarray(dates) - data_units = infer_datetime_units(dates) if units is None: @@ -694,63 +714,10 @@ def encode_cf_datetime( if calendar is None: calendar = infer_calendar_name(dates) - try: - if not _is_standard_calendar(calendar) or dates.dtype.kind == "O": - # parse with cftime instead - raise OutOfBoundsDatetime - assert dates.dtype == "datetime64[ns]" - - time_units, ref_date = _unpack_time_units_and_ref_date(units) - time_delta = _time_units_to_timedelta64(time_units) - - # Wrap the dates in a DatetimeIndex to do the subtraction to ensure - # an OverflowError is raised if the ref_date is too far away from - # dates to be encoded (GH 2272). - dates_as_index = pd.DatetimeIndex(dates.ravel()) - time_deltas = dates_as_index - ref_date - - # retrieve needed units to faithfully encode to int64 - needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units) - if data_units != units: - # this accounts for differences in the reference times - ref_delta = abs(data_ref_date - ref_date).to_timedelta64() - data_delta = _time_units_to_timedelta64(needed_units) - if (ref_delta % data_delta) > np.timedelta64(0, "ns"): - needed_units = _infer_time_units_from_diff(ref_delta) - - # needed time delta to encode faithfully to int64 - needed_time_delta = _time_units_to_timedelta64(needed_units) - - floor_division = True - if time_delta > needed_time_delta: - floor_division = False - if dtype is None: - emit_user_level_warning( - f"Times can't be serialized faithfully to int64 with requested units {units!r}. " - f"Resolution of {needed_units!r} needed. Serializing times to floating point instead. " - f"Set encoding['dtype'] to integer dtype to serialize to int64. " - f"Set encoding['dtype'] to floating point dtype to silence this warning." - ) - elif np.issubdtype(dtype, np.integer): - new_units = f"{needed_units} since {format_timestamp(ref_date)}" - emit_user_level_warning( - f"Times can't be serialized faithfully to int64 with requested units {units!r}. " - f"Serializing with units {new_units!r} instead. " - f"Set encoding['dtype'] to floating point dtype to serialize with units {units!r}. " - f"Set encoding['units'] to {new_units!r} to silence this warning ." - ) - units = new_units - time_delta = needed_time_delta - floor_division = True - - num = _division(time_deltas, time_delta, floor_division) - num = num.values.reshape(dates.shape) - - except (OutOfBoundsDatetime, OverflowError, ValueError): - num = _encode_datetime_with_cftime(dates, units, calendar) - # do it now only for cftime-based flow - # we already covered for this in pandas-based flow - num = cast_to_int_if_safe(num) + num = encode_datetime_with_cftime(dates, units, calendar) + # do it now only for cftime-based flow + # we already covered for this in pandas-based flow + num = cast_to_int_if_safe(num) return (num, units, calendar) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 423e48bd155..402c2d4d905 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -3,7 +3,9 @@ import warnings from datetime import timedelta from itertools import product +from typing import Any, Union +import cftime_rs import numpy as np import pandas as pd import pytest @@ -19,12 +21,12 @@ decode_cf, ) from xarray.coding.times import ( - _encode_datetime_with_cftime, _numpy_to_netcdf_timeunit, _should_cftime_be_used, cftime_to_nptime, decode_cf_datetime, encode_cf_datetime, + encode_datetime_with_cftime, to_timedelta_unboxed, ) from xarray.coding.variables import SerializationWarning @@ -60,8 +62,8 @@ (np.arange(10).astype("float32"), "days since 2000-01-01"), (np.arange(10).reshape(2, 5), "days since 2000-01-01"), (12300 + np.arange(5), "hours since 1680-01-01 00:00:00"), - # here we add a couple minor formatting errors to test - # the robustness of the parsing algorithm. + # # here we add a couple minor formatting errors to test + # # the robustness of the parsing algorithm. (12300 + np.arange(5), "hour since 1680-01-01 00:00:00"), (12300 + np.arange(5), "Hour since 1680-01-01 00:00:00"), (12300 + np.arange(5), " Hour since 1680-01-01 00:00:00 "), @@ -80,8 +82,8 @@ ([0.5, 1.5], "hours since 1900-01-01T00:00:00"), (0, "milliseconds since 2000-01-01T00:00:00"), (0, "microseconds since 2000-01-01T00:00:00"), - (np.int32(788961600), "seconds since 1981-01-01"), # GH2002 - (12300 + np.arange(5), "hour since 1680-01-01 00:00:00.500000"), + ([np.int32(788961600)], "seconds since 1981-01-01"), # GH2002 + # (12300 + np.arange(5), "hour since 1680-01-01 00:00:00.500000"), (164375, "days since 1850-01-01 00:00:00"), (164374.5, "days since 1850-01-01 00:00:00"), ([164374.5, 168360.5], "days since 1850-01-01 00:00:00"), @@ -109,16 +111,28 @@ def _all_cftime_date_types(): } +POSSIBLE_NUM_DATES = Union[Any, list[Any], list[list[Any]]] + + +def _num_dates_to_array_1d(num_dates: POSSIBLE_NUM_DATES) -> np.ndarray: + """cftime_rs functions only accept 1d arrays""" + if not isinstance(num_dates, (list, np.ndarray)): + return np.array([num_dates]) + elif isinstance(num_dates, (list, np.ndarray)): + return np.array(num_dates).flatten() + else: + raise TypeError("Invalid type for num_dates") + + @requires_cftime @pytest.mark.filterwarnings("ignore:Ambiguous reference date string") @pytest.mark.filterwarnings("ignore:Times can't be serialized faithfully") @pytest.mark.parametrize(["num_dates", "units", "calendar"], _CF_DATETIME_TESTS) def test_cf_datetime(num_dates, units, calendar) -> None: - import cftime + num_dates = _num_dates_to_array_1d(num_dates) + units = coding.times._cleanup_netcdf_time_units(units) + expected = cftime_rs.num2pydate(num_dates, units, calendar) - expected = cftime.num2date( - num_dates, units, calendar, only_use_cftime_datetimes=True - ) min_y = np.ravel(np.atleast_1d(expected))[np.nanargmin(num_dates)].year max_y = np.ravel(np.atleast_1d(expected))[np.nanargmax(num_dates)].year if min_y >= 1678 and max_y < 2262: @@ -153,20 +167,29 @@ def test_decode_cf_datetime_overflow() -> None: # checks for # https://github.com/pydata/pandas/issues/14068 # https://github.com/pydata/xarray/issues/975 - from cftime import DatetimeGregorian - - datetime = DatetimeGregorian + calendar = cftime_rs.PyCFCalendar.from_str("standard") units = "days since 2000-01-01 00:00:00" # date after 2262 and before 1678 days = (-117608, 95795) - expected = (datetime(1677, 12, 31), datetime(2262, 4, 12)) - + expected = [ + cftime_rs.PyCFDatetime.from_ymd(1677, 12, 31, calendar=calendar), + cftime_rs.PyCFDatetime.from_ymd(2262, 4, 12, calendar=calendar), + ] for i, day in enumerate(days): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") result = coding.times.decode_cf_datetime(day, units) - assert result == expected[i] + dt = result.tolist() + (year, month, day, hour, minute, second) = ( + dt.year, + dt.month, + dt.day, + dt.hour, + dt.minute, + dt.second, + ) + assert (year, month, day, hour, minute, second) == expected[i].ymd_hms() def test_decode_cf_datetime_non_standard_units() -> None: @@ -200,15 +223,16 @@ def test_decode_cf_datetime_non_iso_strings() -> None: @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) def test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: - import cftime - units = "days since 0001-01-01" times = pd.date_range("2001-04-01-00", end="2001-04-30-23", freq="H") - time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) + encoded_numbers = cftime_rs.pydate2num( + times.to_pydatetime().tolist(), units, calendar=calendar, dtype="f64" + ) expected = times.values expected_dtype = np.dtype("M8[ns]") - actual = coding.times.decode_cf_datetime(time, units, calendar=calendar) + actual = coding.times.decode_cf_datetime(encoded_numbers, units, calendar=calendar) + assert actual.dtype == expected_dtype abs_diff = abs(actual - expected) # once we no longer support versions of netCDF4 older than 1.1.5, @@ -220,15 +244,13 @@ def test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: @requires_cftime @pytest.mark.parametrize("calendar", _NON_STANDARD_CALENDARS) def test_decode_non_standard_calendar_inside_timestamp_range(calendar) -> None: - import cftime - units = "days since 0001-01-01" times = pd.date_range("2001-04-01-00", end="2001-04-30-23", freq="H") - non_standard_time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) - - expected = cftime.num2date( - non_standard_time, units, calendar=calendar, only_use_cftime_datetimes=True + non_standard_time = cftime_rs.pydate2num( + times.to_pydatetime().tolist(), units, calendar=calendar, dtype="f64" ) + + expected = cftime_rs.num2pydate(non_standard_time, units, calendar=calendar) expected_dtype = np.dtype("O") actual = coding.times.decode_cf_datetime( @@ -247,15 +269,10 @@ def test_decode_non_standard_calendar_inside_timestamp_range(calendar) -> None: def test_decode_dates_outside_timestamp_range(calendar) -> None: from datetime import datetime - import cftime - units = "days since 0001-01-01" times = [datetime(1, 4, 1, h) for h in range(1, 5)] - time = cftime.date2num(times, units, calendar=calendar) - - expected = cftime.num2date( - time, units, calendar=calendar, only_use_cftime_datetimes=True - ) + time = cftime_rs.pydate2num(times, units, calendar=calendar, dtype="f64") + expected = cftime_rs.num2date(time, units, calendar=calendar) expected_date_type = type(expected[0]) with warnings.catch_warnings(): @@ -298,8 +315,6 @@ def test_decode_non_standard_calendar_single_element_inside_timestamp_range( @requires_cftime @pytest.mark.parametrize("calendar", _NON_STANDARD_CALENDARS) def test_decode_single_element_outside_timestamp_range(calendar) -> None: - import cftime - units = "days since 0001-01-01" for days in [1, 1470376]: for num_time in [days, [days], [[days]]]: @@ -308,11 +323,9 @@ def test_decode_single_element_outside_timestamp_range(calendar) -> None: actual = coding.times.decode_cf_datetime( num_time, units, calendar=calendar ) - - expected = cftime.num2date( - days, units, calendar, only_use_cftime_datetimes=True - ) - assert isinstance(actual.item(), type(expected)) + _days = _num_dates_to_array_1d(days) + expected = cftime_rs.num2pydate(_days, units, calendar) + assert isinstance(actual.item(), type(expected[0])) @requires_cftime @@ -320,13 +333,15 @@ def test_decode_single_element_outside_timestamp_range(calendar) -> None: def test_decode_standard_calendar_multidim_time_inside_timestamp_range( calendar, ) -> None: - import cftime - units = "days since 0001-01-01" times1 = pd.date_range("2001-04-01", end="2001-04-05", freq="D") times2 = pd.date_range("2001-05-01", end="2001-05-05", freq="D") - time1 = cftime.date2num(times1.to_pydatetime(), units, calendar=calendar) - time2 = cftime.date2num(times2.to_pydatetime(), units, calendar=calendar) + time1 = cftime_rs.pydate2num( + times1.to_pydatetime().tolist(), units, calendar=calendar, dtype="i64" + ) + time2 = cftime_rs.pydate2num( + times2.to_pydatetime().tolist(), units, calendar=calendar, dtype="i64" + ) mdim_time = np.empty((len(time1), 2)) mdim_time[:, 0] = time1 mdim_time[:, 1] = time2 @@ -351,27 +366,21 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( def test_decode_nonstandard_calendar_multidim_time_inside_timestamp_range( calendar, ) -> None: - import cftime - units = "days since 0001-01-01" times1 = pd.date_range("2001-04-01", end="2001-04-05", freq="D") times2 = pd.date_range("2001-05-01", end="2001-05-05", freq="D") - time1 = cftime.date2num(times1.to_pydatetime(), units, calendar=calendar) - time2 = cftime.date2num(times2.to_pydatetime(), units, calendar=calendar) + time1 = cftime_rs.pydate2num( + times1.to_pydatetime().tolist(), units, calendar=calendar, dtype="i64" + ) + time2 = cftime_rs.pydate2num( + times2.to_pydatetime().tolist(), units, calendar=calendar, dtype="i64" + ) mdim_time = np.empty((len(time1), 2)) mdim_time[:, 0] = time1 mdim_time[:, 1] = time2 - if cftime.__name__ == "cftime": - expected1 = cftime.num2date( - time1, units, calendar, only_use_cftime_datetimes=True - ) - expected2 = cftime.num2date( - time2, units, calendar, only_use_cftime_datetimes=True - ) - else: - expected1 = cftime.num2date(time1, units, calendar) - expected2 = cftime.num2date(time2, units, calendar) + expected1 = cftime_rs.num2pydate(time1, units, calendar) + expected2 = cftime_rs.num2pydate(time2, units, calendar) expected_dtype = np.dtype("O") @@ -392,19 +401,17 @@ def test_decode_nonstandard_calendar_multidim_time_inside_timestamp_range( def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: from datetime import datetime - import cftime - units = "days since 0001-01-01" times1 = [datetime(1, 4, day) for day in range(1, 6)] times2 = [datetime(1, 5, day) for day in range(1, 6)] - time1 = cftime.date2num(times1, units, calendar=calendar) - time2 = cftime.date2num(times2, units, calendar=calendar) + time1 = cftime_rs.pydate2num(times1, units, calendar=calendar, dtype="i64") + time2 = cftime_rs.pydate2num(times2, units, calendar=calendar, dtype="i64") mdim_time = np.empty((len(time1), 2)) mdim_time[:, 0] = time1 mdim_time[:, 1] = time2 - expected1 = cftime.num2date(time1, units, calendar, only_use_cftime_datetimes=True) - expected2 = cftime.num2date(time2, units, calendar, only_use_cftime_datetimes=True) + expected1 = cftime_rs.num2pydate(time1, units, calendar) + expected2 = cftime_rs.num2pydate(time2, units, calendar) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") @@ -427,32 +434,28 @@ def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: [("360_day", 720058.0), ("all_leap", 732059.0), ("366_day", 732059.0)], ) def test_decode_non_standard_calendar_single_element(calendar, num_time) -> None: - import cftime - units = "days since 0001-01-01" + _num_time = _num_dates_to_array_1d(num_time) + try: + expected = np.asarray(cftime_rs.num2pydate(_num_time, units, calendar)) + except ValueError: + expected = np.asarray(cftime_rs.num2date(_num_time, units, calendar)) actual = coding.times.decode_cf_datetime(num_time, units, calendar=calendar) - expected = np.asarray( - cftime.num2date(num_time, units, calendar, only_use_cftime_datetimes=True) - ) assert actual.dtype == np.dtype("O") assert expected == actual @requires_cftime def test_decode_360_day_calendar() -> None: - import cftime - calendar = "360_day" # ensure leap year doesn't matter for year in [2010, 2011, 2012, 2013, 2014]: units = f"days since {year}-01-01" num_times = np.arange(100) - expected = cftime.num2date( - num_times, units, calendar, only_use_cftime_datetimes=True - ) + expected = cftime_rs.num2date(num_times, units, calendar) with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") @@ -468,12 +471,13 @@ def test_decode_360_day_calendar() -> None: @requires_cftime def test_decode_abbreviation() -> None: """Test making sure we properly fall back to cftime on abbreviated units.""" - import cftime val = np.array([1586628000000.0]) - units = "msecs since 1970-01-01T00:00:00Z" - actual = coding.times.decode_cf_datetime(val, units) - expected = coding.times.cftime_to_nptime(cftime.num2date(val, units)) + units = "ms since 1970-01-01 00:00:00" + actual = coding.times.decode_cf_datetime(val, units, calendar="standard") + expected = coding.times.cftime_to_nptime( + cftime_rs.num2date(val, units, calendar="standard") + ) assert_array_equal(actual, expected) @@ -1089,7 +1093,7 @@ def test__encode_datetime_with_cftime() -> None: expected = cftime.date2num(times, encoding_units, calendar, longdouble=False) except TypeError: expected = cftime.date2num(times, encoding_units, calendar) - result = _encode_datetime_with_cftime(times, encoding_units, calendar) + result = encode_datetime_with_cftime(times, encoding_units, calendar) np.testing.assert_equal(result, expected)