Skip to content

Commit ffc1828

Browse files
Implement time_unit option for decode_cf_timedelta (pydata#3)
* Fix timedelta encoding overflow issue; always decode to ns resolution * Implement time_unit for decode_cf_timedelta * Reduce diff
1 parent 0556376 commit ffc1828

File tree

2 files changed

+72
-25
lines changed

2 files changed

+72
-25
lines changed

xarray/coding/times.py

Lines changed: 38 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,21 @@ def _check_date_for_units_since_refdate(
365365
return pd.Timestamp("NaT")
366366

367367

368+
def _check_timedelta_range(value, data_unit, time_unit):
369+
if value > np.iinfo("int64").max or value < np.iinfo("int64").min:
370+
OutOfBoundsTimedelta(f"Value {value} can't be represented as Timedelta.")
371+
delta = value * np.timedelta64(1, data_unit)
372+
if not np.isnan(delta):
373+
# this will raise on dtype overflow for integer dtypes
374+
if value.dtype.kind in "u" and not np.int64(delta) == value:
375+
raise OutOfBoundsTimedelta(
376+
"DType overflow in Datetime/Timedelta calculation."
377+
)
378+
# this will raise on overflow if delta cannot be represented with the
379+
# resolutions supported by pandas.
380+
pd.to_timedelta(delta)
381+
382+
368383
def _align_reference_date_and_unit(
369384
ref_date: pd.Timestamp, unit: NPDatetimeUnitOptions
370385
) -> pd.Timestamp:
@@ -542,19 +557,6 @@ def decode_cf_datetime(
542557
return reshape(dates, num_dates.shape)
543558

544559

545-
def to_timedelta_unboxed(value, **kwargs):
    """Convert ``value`` with ``pd.to_timedelta`` and return a numpy array.

    The unit of the returned ``timedelta64`` array is derived from the unique
    non-null converted values via ``_infer_time_units_from_diff`` and
    ``_netcdf_to_numpy_timeunit``; any unit other than "s", "ms", "us" or
    "ns" is replaced by "ns". Extra keyword arguments are forwarded to
    ``pd.to_timedelta``.
    """
    # todo: check, if the procedure here is correct
    unboxed = pd.to_timedelta(value, **kwargs).to_numpy()
    non_null = unboxed[pd.notnull(unboxed)]
    inferred_units = _infer_time_units_from_diff(np.unique(non_null))
    numpy_unit = _netcdf_to_numpy_timeunit(inferred_units)
    # default to "ns", when not specified
    resolution = numpy_unit if numpy_unit in {"s", "ms", "us", "ns"} else "ns"
    unboxed = unboxed.astype(f"timedelta64[{resolution}]")
    assert np.issubdtype(unboxed.dtype, "timedelta64")
    return unboxed
556-
557-
558560
def to_datetime_unboxed(value, **kwargs):
559561
result = pd.to_datetime(value, **kwargs).to_numpy()
560562
assert np.issubdtype(result.dtype, "datetime64")
@@ -604,22 +606,36 @@ def _numbers_to_timedelta(
604606
return flat_num.astype(f"timedelta64[{time_unit}]")
605607

606608

607-
def decode_cf_timedelta(
    num_timedeltas, units: str, time_unit: str = "ns"
) -> np.ndarray:
    """Given an array of numeric timedeltas in netCDF format, convert it into a
    numpy timedelta64 ["s", "ms", "us", "ns"] array.

    Parameters
    ----------
    num_timedeltas : array-like
        Numeric timedelta values; converted with ``np.asarray``.
    units : str
        netCDF time unit of ``num_timedeltas``; translated with
        ``_netcdf_to_numpy_timeunit``.
    time_unit : str, default "ns"
        Requested resolution of the result. When the unit pandas reports for
        the converted values is finer than this, that finer unit is used
        instead.

    Returns
    -------
    np.ndarray
        ``timedelta64`` array with the same shape as ``num_timedeltas``.

    Raises
    ------
    OutOfBoundsTimedelta
        Raised by ``_check_timedelta_range`` when the smallest or largest
        non-NaN input cannot be represented.
    ValueError
        If the resolution finally selected is not "s", "ms", "us" or "ns".
    """
    import warnings

    num_timedeltas = np.asarray(num_timedeltas)
    unit = _netcdf_to_numpy_timeunit(units)

    # Zero-size arrays have no minimum/maximum to reduce over, so there is
    # nothing to range-check for them.
    if num_timedeltas.size:
        # NaN-aware reductions: with plain min()/max() a single NaN would make
        # both extremes NaN and the range check would be skipped entirely.
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", r"All-NaN (slice|axis) encountered", RuntimeWarning
            )
            smallest = np.nanmin(num_timedeltas)
            largest = np.nanmax(num_timedeltas)
        _check_timedelta_range(smallest, unit, time_unit)
        _check_timedelta_range(largest, unit, time_unit)

    timedeltas = _numbers_to_timedelta(num_timedeltas, unit, "s", "timedelta")
    timedeltas = pd.to_timedelta(ravel(timedeltas))

    # With only NaT values there is no data-driven unit to honor, so fall
    # back to the requested resolution.
    if np.isnat(timedeltas).all():
        empirical_unit = time_unit
    else:
        empirical_unit = timedeltas.unit

    # Never decode to a coarser resolution than the data itself carries.
    if np.timedelta64(1, time_unit) > np.timedelta64(1, empirical_unit):
        time_unit = empirical_unit

    if time_unit not in {"s", "ms", "us", "ns"}:
        raise ValueError(
            f"time_unit must be one of 's', 'ms', 'us', or 'ns'. Got: {time_unit}"
        )

    result = timedeltas.as_unit(time_unit).to_numpy()
    return reshape(result, num_timedeltas.shape)
623639

624640

625641
def _unit_timedelta_cftime(units: str) -> timedelta:
@@ -700,7 +716,7 @@ def infer_timedelta_units(deltas) -> str:
700716
{'days', 'hours', 'minutes', 'seconds'} (the first one that can evenly
701717
divide all unique time deltas in `deltas`)
702718
"""
703-
deltas = to_timedelta_unboxed(ravel(np.asarray(deltas)))
719+
deltas = ravel(deltas)
704720
unique_timedeltas = np.unique(deltas[pd.notnull(deltas)])
705721
return _infer_time_units_from_diff(unique_timedeltas)
706722

xarray/tests/test_coding_times.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
format_cftime_datetime,
3535
infer_datetime_units,
3636
infer_timedelta_units,
37-
to_timedelta_unboxed,
3837
)
3938
from xarray.coding.variables import SerializationWarning
4039
from xarray.conventions import _update_bounds_attributes, cf_encoder
@@ -635,7 +634,7 @@ def test_cf_timedelta(timedeltas, units, numbers) -> None:
635634
if timedeltas == "NaT":
636635
timedeltas = np.timedelta64("NaT", "ns")
637636
else:
638-
timedeltas = to_timedelta_unboxed(timedeltas)
637+
timedeltas = pd.to_timedelta(timedeltas).to_numpy()
639638
numbers = np.array(numbers)
640639

641640
expected = numbers
@@ -659,14 +658,46 @@ def test_cf_timedelta_2d() -> None:
659658
units = "days"
660659
numbers = np.atleast_2d([1, 2, 3])
661660

662-
timedeltas = np.atleast_2d(to_timedelta_unboxed(["1D", "2D", "3D"]))
661+
timedeltas = np.atleast_2d(pd.to_timedelta(["1D", "2D", "3D"]).to_numpy())
663662
expected = timedeltas
664663

665664
actual = decode_cf_timedelta(numbers, units)
666665
assert_array_equal(expected, actual)
667666
assert expected.dtype == actual.dtype
668667

669668

669+
@pytest.mark.parametrize("encoding_unit", FREQUENCIES_TO_ENCODING_UNITS.values())
def test_decode_cf_timedelta_time_unit(time_unit, encoding_unit) -> None:
    """Decoding the value 1 keeps the finer of ``time_unit`` and the encoding unit."""
    raw = 1
    numpy_unit = _netcdf_to_numpy_timeunit(encoding_unit)
    expected = np.timedelta64(raw, numpy_unit)
    requested_is_coarser = np.timedelta64(1, time_unit) > np.timedelta64(
        1, numpy_unit
    )
    if not requested_is_coarser:
        # The requested resolution is at least as fine as the encoding unit,
        # so the result is expected in the requested resolution.
        expected = expected.astype(f"timedelta64[{time_unit}]")
    result = decode_cf_timedelta(raw, encoding_unit, time_unit)
    assert result == expected
    assert result.dtype == expected.dtype
682+
683+
684+
def test_decode_cf_timedelta_time_unit_out_of_bounds(time_unit):
    """Decoding a day count scaled past the range of ``time_unit`` must raise."""
    # Number of nanoseconds per ``time_unit`` step; scaling the day count by
    # it is meant to guarantee overflow with the given time_unit.
    ns_per_step = np.timedelta64(1, time_unit) // np.timedelta64(1, "ns")
    too_many_days = ns_per_step * 300 * 365
    with pytest.raises(OutOfBoundsTimedelta):
        decode_cf_timedelta(too_many_days, "days", time_unit)
691+
692+
693+
def test_cf_timedelta_roundtrip_large_value(time_unit):
    """The largest int64 timedelta in ``time_unit`` survives encode then decode."""
    largest = np.iinfo(np.int64).max
    original = np.timedelta64(largest, time_unit)
    encoded, encoded_units = encode_cf_timedelta(original)
    roundtripped = decode_cf_timedelta(encoded, encoded_units, time_unit=time_unit)
    assert original == roundtripped
    assert original.dtype == roundtripped.dtype
699+
700+
670701
@pytest.mark.parametrize(
671702
["deltas", "expected"],
672703
[

0 commit comments

Comments
 (0)