zarr-developers · d-v-b · Jun 27, 2025 · Jun 25, 2025 · Jun 25, 2025 · Jun 25, 2025
diff --git a/changes/3170.bugfix.rst b/changes/3170.bugfix.rst
@@ -0,0 +1,6 @@
+Fixes a variety of issues related to string data types.
+
+- Brings the ``VariableLengthUTF8`` data type Zarr V3 identifier in alignment with Zarr Python 3.0.8
+- Disallows creation of 0-length fixed-length data types
+- Adds a regression test for the ``VariableLengthUTF8`` data type that checks against version 3.0.8
+- Allows users to request the ``VariableLengthUTF8`` data type with ``str``, ``"str"``, or ``"string"``.
diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py
@@ -109,6 +109,12 @@
     VariableLengthBytes,
 )
 
+# These are aliases for variable-length UTF-8 strings
+# We handle them when a user requests a data type instead of using NumPy's dtype inferece because
+# the default NumPy behavior -- to inspect the user-provided array data and choose
+# an appropriately sized U dtype -- is unworkable for Zarr.
+VLEN_UTF8_ALIAS: Final = ("str", str, "string")
+
 # This type models inputs that can be coerced to a ZDType
 ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str
 
@@ -157,6 +163,10 @@
     # dict and zarr_format 3 means that we have a JSON object representation of the dtype
     if zarr_format == 3 and isinstance(dtype_spec, Mapping):
         return get_data_type_from_json(dtype_spec, zarr_format=3)
+    if dtype_spec in VLEN_UTF8_ALIAS:
+        # If the dtype request is one of the aliases for variable-length UTF-8 strings,
+        # return that dtype.
+        return VariableLengthUTF8()  # type: ignore[return-value]
     # otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case
     # we can create a numpy dtype from it, and do the dtype inference from that
     return get_data_type_from_native_dtype(dtype_spec)  # type: ignore[arg-type]
diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py
@@ -37,6 +37,14 @@
     dtype_cls = np.dtypes.BytesDType
     _zarr_v3_name: ClassVar[Literal["null_terminated_bytes"]] = "null_terminated_bytes"
 
+    def __post_init__(self) -> None:
+        """
+        We don't allow instances of this class with length less than 1 because there is no way such
+        a data type can contain actual data.
+        """
+        if self.length < 1:
+            raise ValueError(f"length must be >= 1, got {self.length}.")
+
     @classmethod
     def from_native_dtype(cls, dtype: TBaseDType) -> Self:
         if cls._check_native_dtype(dtype):
@@ -155,6 +163,14 @@
     dtype_cls = np.dtypes.VoidDType  # type: ignore[assignment]
     _zarr_v3_name: ClassVar[Literal["raw_bytes"]] = "raw_bytes"
 
+    def __post_init__(self) -> None:
+        """
+        We don't allow instances of this class with length less than 1 because there is no way such
+        a data type can contain actual data.
+        """
+        if self.length < 1:
+            raise ValueError(f"length must be >= 1, got {self.length}.")
+
     @classmethod
     def _check_native_dtype(
         cls: type[Self], dtype: TBaseDType

diff --git a/src/zarr/core/dtype/npy/string.py b/src/zarr/core/dtype/npy/string.py
@@ -63,6 +63,14 @@
     _zarr_v3_name: ClassVar[Literal["fixed_length_utf32"]] = "fixed_length_utf32"
     code_point_bytes: ClassVar[int] = 4  # utf32 is 4 bytes per code point
 
+    def __post_init__(self) -> None:
+        """
+        We don't allow instances of this class with length less than 1 because there is no way such
+        a data type can contain actual data.
+        """
+        if self.length < 1:
+            raise ValueError(f"length must be >= 1, got {self.length}.")
+
     @classmethod
     def from_native_dtype(cls, dtype: TBaseDType) -> Self:
         if cls._check_native_dtype(dtype):
@@ -195,7 +203,7 @@
     as data type, but as a base class for other variable length string data types.
     """
 
-    _zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8"
+    _zarr_v3_name: ClassVar[Literal["string"]] = "string"
     object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
 
     @classmethod
@@ -222,7 +230,7 @@
         )
 
     @classmethod
-    def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_utf8"]]:
+    def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["string"]]:
         return data == cls._zarr_v3_name
 
     @classmethod
@@ -246,15 +254,14 @@
         self, zarr_format: Literal[2]
     ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]: ...
     @overload
-    def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ...
+    def to_json(self, zarr_format: Literal[3]) -> Literal["string"]: ...
 
     def to_json(
         self, zarr_format: ZarrFormat
-    ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["variable_length_utf8"]:
+    ) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["string"]:
         if zarr_format == 2:
             return {"name": "|O", "object_codec_id": self.object_codec_id}
         elif zarr_format == 3:
-            v3_unstable_dtype_warning(self)
             return self._zarr_v3_name
         raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}")  # pragma: no cover
 

diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py
@@ -37,6 +37,10 @@
     _zarr_v3_name = "structured"
     fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...]
 
+    def __post_init__(self) -> None:
+        if len(self.fields) < 1:
+            raise ValueError(f"must have at least one field. Got {self.fields!r}")
+
     @classmethod
     def _check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]:
         """

diff --git a/tests/test_array.py b/tests/test_array.py
@@ -41,7 +41,7 @@
 from zarr.core.chunk_grids import _auto_partition
 from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams
 from zarr.core.common import JSON, MemoryOrder, ZarrFormat
-from zarr.core.dtype import get_data_type_from_native_dtype
+from zarr.core.dtype import parse_data_type
 from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr
 from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str
 from zarr.core.dtype.npy.float import Float32, Float64
@@ -1285,7 +1285,7 @@ async def test_v2_chunk_encoding(
             filters=filters,
         )
         filters_expected, compressor_expected = _parse_chunk_encoding_v2(
-            filters=filters, compressor=compressors, dtype=get_data_type_from_native_dtype(dtype)
+            filters=filters, compressor=compressors, dtype=parse_data_type(dtype, zarr_format=2)
         )
         assert arr.metadata.zarr_format == 2  # guard for mypy
         assert arr.metadata.compressor == compressor_expected

diff --git a/tests/test_dtype/test_npy/test_bytes.py b/tests/test_dtype/test_npy/test_bytes.py
@@ -15,7 +15,7 @@ class TestNullTerminatedBytes(BaseTestZDType):
         np.dtype("|U10"),
     )
     valid_json_v2 = (
-        {"name": "|S0", "object_codec_id": None},
+        {"name": "|S1", "object_codec_id": None},
         {"name": "|S2", "object_codec_id": None},
         {"name": "|S4", "object_codec_id": None},
     )
@@ -31,22 +31,22 @@ class TestNullTerminatedBytes(BaseTestZDType):
     )
 
     scalar_v2_params = (
-        (NullTerminatedBytes(length=0), ""),
+        (NullTerminatedBytes(length=1), "MA=="),
         (NullTerminatedBytes(length=2), "YWI="),
         (NullTerminatedBytes(length=4), "YWJjZA=="),
     )
     scalar_v3_params = (
-        (NullTerminatedBytes(length=0), ""),
+        (NullTerminatedBytes(length=1), "MA=="),
         (NullTerminatedBytes(length=2), "YWI="),
         (NullTerminatedBytes(length=4), "YWJjZA=="),
     )
     cast_value_params = (
-        (NullTerminatedBytes(length=0), "", np.bytes_("")),
+        (NullTerminatedBytes(length=1), "", np.bytes_("")),
         (NullTerminatedBytes(length=2), "ab", np.bytes_("ab")),
         (NullTerminatedBytes(length=4), "abcdefg", np.bytes_("abcd")),
     )
     item_size_params = (
-        NullTerminatedBytes(length=0),
+        NullTerminatedBytes(length=1),
         NullTerminatedBytes(length=4),
         NullTerminatedBytes(length=10),
     )
@@ -62,7 +62,7 @@ class TestRawBytes(BaseTestZDType):
     )
     valid_json_v2 = ({"name": "|V10", "object_codec_id": None},)
     valid_json_v3 = (
-        {"name": "raw_bytes", "configuration": {"length_bytes": 0}},
+        {"name": "raw_bytes", "configuration": {"length_bytes": 1}},
         {"name": "raw_bytes", "configuration": {"length_bytes": 8}},
     )
 
@@ -77,22 +77,22 @@ class TestRawBytes(BaseTestZDType):
     )
 
     scalar_v2_params = (
-        (RawBytes(length=0), ""),
+        (RawBytes(length=1), "AA=="),
         (RawBytes(length=2), "YWI="),
         (RawBytes(length=4), "YWJjZA=="),
     )
     scalar_v3_params = (
-        (RawBytes(length=0), ""),
+        (RawBytes(length=1), "AA=="),
         (RawBytes(length=2), "YWI="),
         (RawBytes(length=4), "YWJjZA=="),
     )
     cast_value_params = (
-        (RawBytes(length=0), b"", np.void(b"")),
+        (RawBytes(length=1), b"\x00", np.void(b"\x00")),
         (RawBytes(length=2), b"ab", np.void(b"ab")),
         (RawBytes(length=4), b"abcd", np.void(b"abcd")),
     )
     item_size_params = (
-        RawBytes(length=0),
+        RawBytes(length=1),
         RawBytes(length=4),
         RawBytes(length=10),
     )
@@ -152,3 +152,14 @@ def test_unstable_dtype_warning(
     """
     with pytest.raises(UnstableSpecificationWarning):
         zdtype.to_json(zarr_format=3)
+
+
+@pytest.mark.parametrize("zdtype_cls", [NullTerminatedBytes, RawBytes])
+def test_invalid_size(zdtype_cls: type[NullTerminatedBytes] | type[RawBytes]) -> None:
+    """
+    Test that it's impossible to create a data type that has no length
+    """
+    length = 0
+    msg = f"length must be >= 1, got {length}."
+    with pytest.raises(ValueError, match=msg):
+        zdtype_cls(length=length)
diff --git a/tests/test_dtype/test_npy/test_string.py b/tests/test_dtype/test_npy/test_string.py
@@ -19,7 +19,7 @@ class TestVariableLengthString(BaseTestZDType):
             np.dtype("|S10"),
         )
         valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},)
-        valid_json_v3 = ("variable_length_utf8",)
+        valid_json_v3 = ("string",)
         invalid_json_v2 = (
             "|S10",
             "|f8",
@@ -53,7 +53,7 @@ class TestVariableLengthString(BaseTestZDType):  # type: ignore[no-redef]
             np.dtype("|S10"),
         )
         valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},)
-        valid_json_v3 = ("variable_length_utf8",)
+        valid_json_v3 = ("string",)
         invalid_json_v2 = (
             "|S10",
             "|f8",
@@ -101,30 +101,45 @@ class TestFixedLengthUTF32(BaseTestZDType):
         {"name": "numpy.fixed_length_utf32", "configuration": {"length_bits": "invalid"}},
     )
 
-    scalar_v2_params = ((FixedLengthUTF32(length=0), ""), (FixedLengthUTF32(length=2), "hi"))
+    scalar_v2_params = ((FixedLengthUTF32(length=1), ""), (FixedLengthUTF32(length=2), "hi"))
     scalar_v3_params = (
-        (FixedLengthUTF32(length=0), ""),
+        (FixedLengthUTF32(length=1), ""),
         (FixedLengthUTF32(length=2), "hi"),
         (FixedLengthUTF32(length=4), "hihi"),
     )
 
     cast_value_params = (
-        (FixedLengthUTF32(length=0), "", np.str_("")),
+        (FixedLengthUTF32(length=1), "", np.str_("")),
         (FixedLengthUTF32(length=2), "hi", np.str_("hi")),
         (FixedLengthUTF32(length=4), "hihi", np.str_("hihi")),
     )
     item_size_params = (
-        FixedLengthUTF32(length=0),
+        FixedLengthUTF32(length=1),
         FixedLengthUTF32(length=4),
         FixedLengthUTF32(length=10),
     )
 
 
-@pytest.mark.parametrize("zdtype", [FixedLengthUTF32(length=10), VariableLengthUTF8()])
+@pytest.mark.parametrize(
+    "zdtype",
+    [
+        FixedLengthUTF32(length=10),
+    ],
+)
 def test_unstable_dtype_warning(zdtype: FixedLengthUTF32 | VariableLengthUTF8) -> None:
     """
     Test that we get a warning when serializing a dtype without a zarr v3 spec to json
     when zarr_format is 3
     """
     with pytest.raises(UnstableSpecificationWarning):
         zdtype.to_json(zarr_format=3)
+
+
+def test_invalid_size() -> None:
+    """
+    Test that it's impossible to create a data type that has no length
+    """
+    length = 0
+    msg = f"length must be >= 1, got {length}."
+    with pytest.raises(ValueError, match=msg):
+        FixedLengthUTF32(length=length)
diff --git a/tests/test_dtype/test_npy/test_structured.py b/tests/test_dtype/test_npy/test_structured.py
@@ -3,6 +3,7 @@
 from typing import Any
 
 import numpy as np
+import pytest
 
 from tests.test_dtype.test_wrapper import BaseTestZDType
 from zarr.core.dtype import (
@@ -106,3 +107,13 @@ def scalar_equals(self, scalar1: Any, scalar2: Any) -> bool:
         Structured(fields=(("field1", Int32()), ("field2", Float64()))),
         Structured(fields=(("field1", Int64()), ("field2", Int32()))),
     )
+
+
+def test_invalid_size() -> None:
+    """
+    Test that it's impossible to create a data type that has no fields
+    """
+    fields = ()
+    msg = f"must have at least one field. Got {fields!r}"
+    with pytest.raises(ValueError, match=msg):
+        Structured(fields=fields)
diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py
@@ -21,6 +21,7 @@
     Int16,
     TBaseDType,
     TBaseScalar,
+    VariableLengthUTF8,
     ZDType,
     data_type_registry,
     get_data_type_from_json,
@@ -176,6 +177,8 @@ def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None:
 @pytest.mark.parametrize(
     ("dtype_params", "expected", "zarr_format"),
     [
+        ("str", VariableLengthUTF8(), 2),
+        ("str", VariableLengthUTF8(), 3),
         ("int8", Int8(), 3),
         (Int8(), Int8(), 3),
         (">i2", Int16(endianness="big"), 2),