Skip to content

Commit 083c6b1

Browse files
committed
Combine CF Unsigned and Mask handling
1 parent e3f78e5 commit 083c6b1

File tree

3 files changed

+171
-116
lines changed

3 files changed

+171
-116
lines changed

xarray/coding/variables.py

Lines changed: 117 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -305,11 +305,18 @@ def encode(self, variable: Variable, name: T_Name = None):
305305
dims, data, attrs, encoding = unpack_for_encoding(variable)
306306

307307
dtype = np.dtype(encoding.get("dtype", data.dtype))
308+
# from netCDF best practices
309+
# https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
310+
# "_Unsigned = "true" to indicate that
311+
# integer data should be treated as unsigned"
312+
is_unsigned = encoding.get("_Unsigned", "false") == "true"
313+
# only used for _Unsigned cases
314+
signed_dtype = np.dtype(
315+
encoding.get("dtype", f"i{dtype.itemsize}" if is_unsigned else dtype)
316+
)
308317
fv = encoding.get("_FillValue")
309318
mv = encoding.get("missing_value")
310-
# to properly handle _FillValue/missing_value below [a], [b]
311-
# we need to check if unsigned data is written as signed data
312-
unsigned = encoding.get("_Unsigned") is not None
319+
fill_value = None
313320

314321
fv_exists = fv is not None
315322
mv_exists = mv is not None
@@ -324,23 +331,28 @@ def encode(self, variable: Variable, name: T_Name = None):
324331

325332
if fv_exists:
326333
# Ensure _FillValue is cast to same dtype as data's
327-
# [a] need to skip this if _Unsigned is available
328-
if not unsigned:
329-
encoding["_FillValue"] = dtype.type(fv)
334+
encoding["_FillValue"] = (
335+
self._encode_unsigned_fill_value(name, fv, signed_dtype)
336+
if is_unsigned
337+
else dtype.type(fv)
338+
)
330339
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
331340

332341
if mv_exists:
333342
# try to use _FillValue, if it exists to align both values
334343
# or use missing_value and ensure it's cast to same dtype as data's
335-
# [b] need to provide mv verbatim if _Unsigned is available
336344
encoding["missing_value"] = attrs.get(
337345
"_FillValue",
338-
(dtype.type(mv) if not unsigned else mv),
346+
(
347+
self._encode_unsigned_fill_value(name, fv, signed_dtype)
348+
if is_unsigned
349+
else dtype.type(mv)
350+
),
339351
)
340352
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
341353

342354
# apply fillna
343-
if not pd.isnull(fill_value):
355+
if fill_value is not None and not pd.isnull(fill_value):
344356
# special case DateTime to properly handle NaT
345357
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
346358
data = duck_array_ops.where(
@@ -349,46 +361,112 @@ def encode(self, variable: Variable, name: T_Name = None):
349361
else:
350362
data = duck_array_ops.fillna(data, fill_value)
351363

364+
if fill_value is not None and is_unsigned:
365+
pop_to(encoding, attrs, "_Unsigned")
366+
# XXX: Is this actually needed? Doesn't the backend handle this?
367+
data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)
368+
attrs["_FillValue"] = fill_value
369+
352370
return Variable(dims, data, attrs, encoding, fastpath=True)
353371

372+
def _encode_unsigned_fill_value(
373+
self, name: T_Name, fill_value: Any, signed_dtype: np.typing.DTypeLike
374+
) -> Any:
375+
try:
376+
# user provided the on-disk signed fill
377+
if hasattr(fill_value, "item"):
378+
# if numpy type, convert to python native integer to determine overflow
379+
# otherwise numpy unsigned ints will silently cast to the signed counterpart
380+
fill_value = fill_value.item()
381+
new_fill = signed_dtype.type(fill_value)
382+
except OverflowError:
383+
warnings.warn(
384+
f"variable {name!r} will be stored as signed integers "
385+
f"but _FillValue attribute can't be represented as a "
386+
f"signed integer.",
387+
SerializationWarning,
388+
stacklevel=3,
389+
)
390+
# user provided the in-memory unsigned fill, convert to signed type
391+
unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}")
392+
# use view here to prevent OverflowError
393+
new_fill = (
394+
np.array(fill_value, dtype=unsigned_dtype).view(signed_dtype).item()
395+
)
396+
return new_fill
397+
354398
def decode(self, variable: Variable, name: T_Name = None):
355399
raw_fill_dict, encoded_fill_values = _check_fill_values(
356400
variable.attrs, name, variable.dtype
357401
)
402+
if "_Unsigned" not in variable.attrs and not raw_fill_dict:
403+
return variable
358404

359-
if raw_fill_dict:
360-
dims, data, attrs, encoding = unpack_for_decoding(variable)
361-
[
362-
safe_setitem(encoding, attr, value, name=name)
363-
for attr, value in raw_fill_dict.items()
364-
]
365-
366-
if encoded_fill_values:
367-
# special case DateTime to properly handle NaT
368-
dtype: np.typing.DTypeLike
369-
decoded_fill_value: Any
370-
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
371-
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
372-
else:
373-
if "scale_factor" not in attrs and "add_offset" not in attrs:
374-
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
375-
else:
376-
dtype, decoded_fill_value = (
377-
_choose_float_dtype(data.dtype, attrs),
378-
np.nan,
379-
)
405+
dims, data, attrs, encoding = unpack_for_decoding(variable)
406+
407+
# dims, data, attrs, encoding = unpack_for_decoding(variable)
408+
# Even if _Unsigned is used, retain on-disk _FillValue
409+
[
410+
safe_setitem(encoding, attr, value, name=name)
411+
for attr, value in raw_fill_dict.items()
412+
]
380413

381-
transform = partial(
382-
_apply_mask,
383-
encoded_fill_values=encoded_fill_values,
384-
decoded_fill_value=decoded_fill_value,
385-
dtype=dtype,
414+
if "_Unsigned" in attrs:
415+
unsigned = pop_to(attrs, encoding, "_Unsigned")
416+
417+
if data.dtype.kind == "i":
418+
if unsigned == "true":
419+
unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
420+
transform = partial(np.asarray, dtype=unsigned_dtype)
421+
if "_FillValue" in raw_fill_dict:
422+
new_fill = np.array(
423+
raw_fill_dict["_FillValue"], dtype=data.dtype
424+
)
425+
encoded_fill_values.remove(raw_fill_dict["_FillValue"])
426+
# use view here to prevent OverflowError
427+
encoded_fill_values.add(new_fill.view(unsigned_dtype).item())
428+
data = lazy_elemwise_func(data, transform, unsigned_dtype)
429+
elif data.dtype.kind == "u":
430+
if unsigned == "false":
431+
signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
432+
transform = partial(np.asarray, dtype=signed_dtype)
433+
data = lazy_elemwise_func(data, transform, signed_dtype)
434+
if "_FillValue" in raw_fill_dict:
435+
new_fill = signed_dtype.type(raw_fill_dict["_FillValue"])
436+
encoded_fill_values.remove(raw_fill_dict["_FillValue"])
437+
encoded_fill_values.add(new_fill)
438+
else:
439+
warnings.warn(
440+
f"variable {name!r} has _Unsigned attribute but is not "
441+
"of integer type. Ignoring attribute.",
442+
SerializationWarning,
443+
stacklevel=3,
386444
)
387-
data = lazy_elemwise_func(data, transform, dtype)
388445

389-
return Variable(dims, data, attrs, encoding, fastpath=True)
390-
else:
391-
return variable
446+
if encoded_fill_values:
447+
# special case DateTime to properly handle NaT
448+
dtype: np.typing.DTypeLike
449+
decoded_fill_value: Any
450+
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
451+
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
452+
else:
453+
if "scale_factor" not in attrs and "add_offset" not in attrs:
454+
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
455+
else:
456+
dtype, decoded_fill_value = (
457+
_choose_float_dtype(data.dtype, attrs),
458+
np.nan,
459+
)
460+
461+
transform = partial(
462+
_apply_mask,
463+
encoded_fill_values=encoded_fill_values,
464+
decoded_fill_value=decoded_fill_value,
465+
dtype=dtype,
466+
)
467+
data = lazy_elemwise_func(data, transform, dtype)
468+
469+
return Variable(dims, data, attrs, encoding, fastpath=True)
392470

393471

394472
def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike):
@@ -506,74 +584,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
506584
return variable
507585

508586

509-
class UnsignedIntegerCoder(VariableCoder):
510-
def encode(self, variable: Variable, name: T_Name = None) -> Variable:
511-
# from netCDF best practices
512-
# https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
513-
# "_Unsigned = "true" to indicate that
514-
# integer data should be treated as unsigned"
515-
if variable.encoding.get("_Unsigned", "false") == "true":
516-
dims, data, attrs, encoding = unpack_for_encoding(variable)
517-
518-
pop_to(encoding, attrs, "_Unsigned")
519-
# we need the on-disk type here
520-
# trying to get it from encoding, resort to an int with the same precision as data.dtype if not available
521-
signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}"))
522-
if "_FillValue" in attrs:
523-
try:
524-
# user provided the on-disk signed fill
525-
new_fill = signed_dtype.type(attrs["_FillValue"])
526-
except OverflowError:
527-
# user provided the in-memory unsigned fill, convert to signed type
528-
unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}")
529-
# use view here to prevent OverflowError
530-
new_fill = (
531-
np.array(attrs["_FillValue"], dtype=unsigned_dtype)
532-
.view(signed_dtype)
533-
.item()
534-
)
535-
attrs["_FillValue"] = new_fill
536-
data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)
537-
538-
return Variable(dims, data, attrs, encoding, fastpath=True)
539-
else:
540-
return variable
541-
542-
def decode(self, variable: Variable, name: T_Name = None) -> Variable:
543-
if "_Unsigned" in variable.attrs:
544-
dims, data, attrs, encoding = unpack_for_decoding(variable)
545-
unsigned = pop_to(attrs, encoding, "_Unsigned")
546-
547-
if data.dtype.kind == "i":
548-
if unsigned == "true":
549-
unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
550-
transform = partial(np.asarray, dtype=unsigned_dtype)
551-
if "_FillValue" in attrs:
552-
new_fill = np.array(attrs["_FillValue"], dtype=data.dtype)
553-
# use view here to prevent OverflowError
554-
attrs["_FillValue"] = new_fill.view(unsigned_dtype).item()
555-
data = lazy_elemwise_func(data, transform, unsigned_dtype)
556-
elif data.dtype.kind == "u":
557-
if unsigned == "false":
558-
signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
559-
transform = partial(np.asarray, dtype=signed_dtype)
560-
data = lazy_elemwise_func(data, transform, signed_dtype)
561-
if "_FillValue" in attrs:
562-
new_fill = signed_dtype.type(attrs["_FillValue"])
563-
attrs["_FillValue"] = new_fill
564-
else:
565-
warnings.warn(
566-
f"variable {name!r} has _Unsigned attribute but is not "
567-
"of integer type. Ignoring attribute.",
568-
SerializationWarning,
569-
stacklevel=3,
570-
)
571-
572-
return Variable(dims, data, attrs, encoding, fastpath=True)
573-
else:
574-
return variable
575-
576-
577587
class DefaultFillvalueCoder(VariableCoder):
578588
"""Encode default _FillValue if needed."""
579589

xarray/conventions.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,6 @@ def encode_cf_variable(
187187
times.CFTimedeltaCoder(),
188188
variables.CFScaleOffsetCoder(),
189189
variables.CFMaskCoder(),
190-
variables.UnsignedIntegerCoder(),
191190
variables.NativeEnumCoder(),
192191
variables.NonStringCoder(),
193192
variables.DefaultFillvalueCoder(),
@@ -279,7 +278,6 @@ def decode_cf_variable(
279278

280279
if mask_and_scale:
281280
for coder in [
282-
variables.UnsignedIntegerCoder(),
283281
variables.CFMaskCoder(),
284282
variables.CFScaleOffsetCoder(),
285283
]:

0 commit comments

Comments
 (0)