Combine CF Unsigned and Mask handling

djhoese · djhoese · commit 083c6b1a8283 · 2024-07-24T14:23:55.000-05:00
diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
@@ -305,11 +305,18 @@ def encode(self, variable: Variable, name: T_Name = None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)
 
         dtype = np.dtype(encoding.get("dtype", data.dtype))
+        # from netCDF best practices
+        # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
+        #     "_Unsigned = "true" to indicate that
+        #      integer data should be treated as unsigned"
+        is_unsigned = encoding.get("_Unsigned", "false") == "true"
+        # only used for _Unsigned cases
+        signed_dtype = np.dtype(
+            encoding.get("dtype", f"i{dtype.itemsize}" if is_unsigned else dtype)
+        )
         fv = encoding.get("_FillValue")
         mv = encoding.get("missing_value")
-        # to properly handle _FillValue/missing_value below [a], [b]
-        # we need to check if unsigned data is written as signed data
-        unsigned = encoding.get("_Unsigned") is not None
+        fill_value = None
 
         fv_exists = fv is not None
         mv_exists = mv is not None
@@ -324,23 +331,28 @@ def encode(self, variable: Variable, name: T_Name = None):
 
         if fv_exists:
             # Ensure _FillValue is cast to same dtype as data's
-            # [a] need to skip this if _Unsigned is available
-            if not unsigned:
-                encoding["_FillValue"] = dtype.type(fv)
+            encoding["_FillValue"] = (
+                self._encode_unsigned_fill_value(name, fv, signed_dtype)
+                if is_unsigned
+                else dtype.type(fv)
+            )
             fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
 
         if mv_exists:
             # try to use _FillValue, if it exists to align both values
             # or use missing_value and ensure it's cast to same dtype as data's
-            # [b] need to provide mv verbatim if _Unsigned is available
             encoding["missing_value"] = attrs.get(
                 "_FillValue",
-                (dtype.type(mv) if not unsigned else mv),
+                (
+                    self._encode_unsigned_fill_value(name, mv, signed_dtype)  # not fv
+                    if is_unsigned
+                    else dtype.type(mv)
+                ),
             )
             fill_value = pop_to(encoding, attrs, "missing_value", name=name)
 
         # apply fillna
-        if not pd.isnull(fill_value):
+        if fill_value is not None and not pd.isnull(fill_value):
             # special case DateTime to properly handle NaT
             if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
                 data = duck_array_ops.where(
@@ -349,46 +361,112 @@ def encode(self, variable: Variable, name: T_Name = None):
             else:
                 data = duck_array_ops.fillna(data, fill_value)
 
+        if fill_value is not None and is_unsigned:
+            pop_to(encoding, attrs, "_Unsigned")
+            # XXX: Is this actually needed? Doesn't the backend handle this?
+            data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)
+            attrs["_FillValue"] = fill_value
+
         return Variable(dims, data, attrs, encoding, fastpath=True)
 
+    def _encode_unsigned_fill_value(
+        self, name: T_Name, fill_value: Any, signed_dtype: np.dtype
+    ) -> Any:
+        """Cast fill_value to the on-disk signed dtype, wrapping on overflow."""
+        try:
+            # user provided the on-disk signed fill
+            if hasattr(fill_value, "item"):
+                # numpy uints cast silently to signed; a Python int raises on overflow
+                fill_value = fill_value.item()
+            new_fill = signed_dtype.type(fill_value)
+        except OverflowError:
+            warnings.warn(
+                f"variable {name!r} will be stored as signed integers "
+                f"but _FillValue attribute can't be represented as a "
+                f"signed integer.",
+                SerializationWarning,
+                stacklevel=3,
+            )
+            # user provided the in-memory unsigned fill, convert to signed type
+            unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}")
+            # use view here to prevent OverflowError
+            new_fill = (
+                np.array(fill_value, dtype=unsigned_dtype).view(signed_dtype).item()
+            )
+        return new_fill
+
     def decode(self, variable: Variable, name: T_Name = None):
+        """Decode _Unsigned integer data and mask fill values of variable."""
         raw_fill_dict, encoded_fill_values = _check_fill_values(
             variable.attrs, name, variable.dtype
         )
+        if "_Unsigned" not in variable.attrs and not raw_fill_dict:
+            return variable
 
-        if raw_fill_dict:
-            dims, data, attrs, encoding = unpack_for_decoding(variable)
-            [
-                safe_setitem(encoding, attr, value, name=name)
-                for attr, value in raw_fill_dict.items()
-            ]
-
-            if encoded_fill_values:
-                # special case DateTime to properly handle NaT
-                dtype: np.typing.DTypeLike
-                decoded_fill_value: Any
-                if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
-                    dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
-                else:
-                    if "scale_factor" not in attrs and "add_offset" not in attrs:
-                        dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
-                    else:
-                        dtype, decoded_fill_value = (
-                            _choose_float_dtype(data.dtype, attrs),
-                            np.nan,
-                        )
+        dims, data, attrs, encoding = unpack_for_decoding(variable)
+
+        # Even if _Unsigned is used, retain the on-disk fill values in encoding
+        [
+            safe_setitem(encoding, attr, value, name=name)
+            for attr, value in raw_fill_dict.items()
+        ]
 
-                transform = partial(
-                    _apply_mask,
-                    encoded_fill_values=encoded_fill_values,
-                    decoded_fill_value=decoded_fill_value,
-                    dtype=dtype,
+        if "_Unsigned" in attrs:
+            unsigned = pop_to(attrs, encoding, "_Unsigned")
+
+            if data.dtype.kind == "i":
+                if unsigned == "true":
+                    unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
+                    transform = partial(np.asarray, dtype=unsigned_dtype)
+                    if "_FillValue" in raw_fill_dict:
+                        new_fill = np.array(
+                            raw_fill_dict["_FillValue"], dtype=data.dtype
+                        )
+                        encoded_fill_values.remove(raw_fill_dict["_FillValue"])
+                        # use view here to prevent OverflowError
+                        encoded_fill_values.add(new_fill.view(unsigned_dtype).item())
+                    data = lazy_elemwise_func(data, transform, unsigned_dtype)
+            elif data.dtype.kind == "u":
+                if unsigned == "false":
+                    signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
+                    transform = partial(np.asarray, dtype=signed_dtype)
+                    data = lazy_elemwise_func(data, transform, signed_dtype)
+                    if "_FillValue" in raw_fill_dict:
+                        new_fill = signed_dtype.type(raw_fill_dict["_FillValue"])
+                        encoded_fill_values.remove(raw_fill_dict["_FillValue"])
+                        encoded_fill_values.add(new_fill)
+            else:
+                warnings.warn(
+                    f"variable {name!r} has _Unsigned attribute but is not "
+                    "of integer type. Ignoring attribute.",
+                    SerializationWarning,
+                    stacklevel=3,
                 )
-                data = lazy_elemwise_func(data, transform, dtype)
 
-            return Variable(dims, data, attrs, encoding, fastpath=True)
-        else:
-            return variable
+        if encoded_fill_values:
+            # special case DateTime to properly handle NaT
+            dtype: np.typing.DTypeLike
+            decoded_fill_value: Any
+            if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
+                dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
+            else:
+                if "scale_factor" not in attrs and "add_offset" not in attrs:
+                    dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
+                else:
+                    dtype, decoded_fill_value = (
+                        _choose_float_dtype(data.dtype, attrs),
+                        np.nan,
+                    )
+
+            transform = partial(
+                _apply_mask,
+                encoded_fill_values=encoded_fill_values,
+                decoded_fill_value=decoded_fill_value,
+                dtype=dtype,
+            )
+            data = lazy_elemwise_func(data, transform, dtype)
+
+        return Variable(dims, data, attrs, encoding, fastpath=True)
 
 
 def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike):
@@ -506,74 +584,6 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
             return variable
 
 
-class UnsignedIntegerCoder(VariableCoder):
-    def encode(self, variable: Variable, name: T_Name = None) -> Variable:
-        # from netCDF best practices
-        # https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Unsigned-Data
-        #     "_Unsigned = "true" to indicate that
-        #      integer data should be treated as unsigned"
-        if variable.encoding.get("_Unsigned", "false") == "true":
-            dims, data, attrs, encoding = unpack_for_encoding(variable)
-
-            pop_to(encoding, attrs, "_Unsigned")
-            # we need the on-disk type here
-            # trying to get it from encoding, resort to an int with the same precision as data.dtype if not available
-            signed_dtype = np.dtype(encoding.get("dtype", f"i{data.dtype.itemsize}"))
-            if "_FillValue" in attrs:
-                try:
-                    # user provided the on-disk signed fill
-                    new_fill = signed_dtype.type(attrs["_FillValue"])
-                except OverflowError:
-                    # user provided the in-memory unsigned fill, convert to signed type
-                    unsigned_dtype = np.dtype(f"u{signed_dtype.itemsize}")
-                    # use view here to prevent OverflowError
-                    new_fill = (
-                        np.array(attrs["_FillValue"], dtype=unsigned_dtype)
-                        .view(signed_dtype)
-                        .item()
-                    )
-                attrs["_FillValue"] = new_fill
-            data = duck_array_ops.astype(duck_array_ops.around(data), signed_dtype)
-
-            return Variable(dims, data, attrs, encoding, fastpath=True)
-        else:
-            return variable
-
-    def decode(self, variable: Variable, name: T_Name = None) -> Variable:
-        if "_Unsigned" in variable.attrs:
-            dims, data, attrs, encoding = unpack_for_decoding(variable)
-            unsigned = pop_to(attrs, encoding, "_Unsigned")
-
-            if data.dtype.kind == "i":
-                if unsigned == "true":
-                    unsigned_dtype = np.dtype(f"u{data.dtype.itemsize}")
-                    transform = partial(np.asarray, dtype=unsigned_dtype)
-                    if "_FillValue" in attrs:
-                        new_fill = np.array(attrs["_FillValue"], dtype=data.dtype)
-                        # use view here to prevent OverflowError
-                        attrs["_FillValue"] = new_fill.view(unsigned_dtype).item()
-                    data = lazy_elemwise_func(data, transform, unsigned_dtype)
-            elif data.dtype.kind == "u":
-                if unsigned == "false":
-                    signed_dtype = np.dtype(f"i{data.dtype.itemsize}")
-                    transform = partial(np.asarray, dtype=signed_dtype)
-                    data = lazy_elemwise_func(data, transform, signed_dtype)
-                    if "_FillValue" in attrs:
-                        new_fill = signed_dtype.type(attrs["_FillValue"])
-                        attrs["_FillValue"] = new_fill
-            else:
-                warnings.warn(
-                    f"variable {name!r} has _Unsigned attribute but is not "
-                    "of integer type. Ignoring attribute.",
-                    SerializationWarning,
-                    stacklevel=3,
-                )
-
-            return Variable(dims, data, attrs, encoding, fastpath=True)
-        else:
-            return variable
-
-
 class DefaultFillvalueCoder(VariableCoder):
     """Encode default _FillValue if needed."""
 
diff --git a/xarray/conventions.py b/xarray/conventions.py
@@ -187,7 +187,6 @@ def encode_cf_variable(
         times.CFTimedeltaCoder(),
         variables.CFScaleOffsetCoder(),
         variables.CFMaskCoder(),
-        variables.UnsignedIntegerCoder(),
         variables.NativeEnumCoder(),
         variables.NonStringCoder(),
         variables.DefaultFillvalueCoder(),
@@ -279,7 +278,6 @@ def decode_cf_variable(
 
     if mask_and_scale:
         for coder in [
-            variables.UnsignedIntegerCoder(),
             variables.CFMaskCoder(),
             variables.CFScaleOffsetCoder(),
         ]:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py