Skip to content

string dtype fixes #3170

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jun 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions changes/3170.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Fixes a variety of issues related to string data types.

- Brings the ``VariableLengthUTF8`` data type Zarr V3 identifier in alignment with Zarr Python 3.0.8
- Disallows creation of 0-length fixed-length data types
- Adds a regression test for the ``VariableLengthUTF8`` data type that checks against version 3.0.8
- Allows users to request the ``VariableLengthUTF8`` data type with ``str``, ``"str"``, or ``"string"``.
10 changes: 10 additions & 0 deletions src/zarr/core/dtype/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,12 @@
VariableLengthBytes,
)

# These are aliases for variable-length UTF-8 strings
# We handle them when a user requests a data type instead of using NumPy's dtype inferece because
# the default NumPy behavior -- to inspect the user-provided array data and choose
# an appropriately sized U dtype -- is unworkable for Zarr.
VLEN_UTF8_ALIAS: Final = ("str", str, "string")

# This type models inputs that can be coerced to a ZDType
ZDTypeLike: TypeAlias = npt.DTypeLike | ZDType[TBaseDType, TBaseScalar] | Mapping[str, JSON] | str

Expand Down Expand Up @@ -157,6 +163,10 @@
# dict and zarr_format 3 means that we have a JSON object representation of the dtype
if zarr_format == 3 and isinstance(dtype_spec, Mapping):
return get_data_type_from_json(dtype_spec, zarr_format=3)
if dtype_spec in VLEN_UTF8_ALIAS:
# If the dtype request is one of the aliases for variable-length UTF-8 strings,
# return that dtype.
return VariableLengthUTF8() # type: ignore[return-value]

Check warning on line 169 in src/zarr/core/dtype/__init__.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/dtype/__init__.py#L169

Added line #L169 was not covered by tests
# otherwise, we have either a numpy dtype string, or a zarr v3 dtype string, and in either case
# we can create a numpy dtype from it, and do the dtype inference from that
return get_data_type_from_native_dtype(dtype_spec) # type: ignore[arg-type]
16 changes: 16 additions & 0 deletions src/zarr/core/dtype/npy/bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@
dtype_cls = np.dtypes.BytesDType
_zarr_v3_name: ClassVar[Literal["null_terminated_bytes"]] = "null_terminated_bytes"

def __post_init__(self) -> None:
"""
We don't allow instances of this class with length less than 1 because there is no way such
a data type can contain actual data.
"""
if self.length < 1:
raise ValueError(f"length must be >= 1, got {self.length}.")

Check warning on line 46 in src/zarr/core/dtype/npy/bytes.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/dtype/npy/bytes.py#L46

Added line #L46 was not covered by tests

@classmethod
def from_native_dtype(cls, dtype: TBaseDType) -> Self:
if cls._check_native_dtype(dtype):
Expand Down Expand Up @@ -155,6 +163,14 @@
dtype_cls = np.dtypes.VoidDType # type: ignore[assignment]
_zarr_v3_name: ClassVar[Literal["raw_bytes"]] = "raw_bytes"

def __post_init__(self) -> None:
"""
We don't allow instances of this class with length less than 1 because there is no way such
a data type can contain actual data.
"""
if self.length < 1:
raise ValueError(f"length must be >= 1, got {self.length}.")

Check warning on line 172 in src/zarr/core/dtype/npy/bytes.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/dtype/npy/bytes.py#L172

Added line #L172 was not covered by tests

@classmethod
def _check_native_dtype(
cls: type[Self], dtype: TBaseDType
Expand Down
17 changes: 12 additions & 5 deletions src/zarr/core/dtype/npy/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@
_zarr_v3_name: ClassVar[Literal["fixed_length_utf32"]] = "fixed_length_utf32"
code_point_bytes: ClassVar[int] = 4 # utf32 is 4 bytes per code point

def __post_init__(self) -> None:
"""
We don't allow instances of this class with length less than 1 because there is no way such
a data type can contain actual data.
"""
if self.length < 1:
raise ValueError(f"length must be >= 1, got {self.length}.")

Check warning on line 72 in src/zarr/core/dtype/npy/string.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/dtype/npy/string.py#L72

Added line #L72 was not covered by tests

@classmethod
def from_native_dtype(cls, dtype: TBaseDType) -> Self:
if cls._check_native_dtype(dtype):
Expand Down Expand Up @@ -195,7 +203,7 @@
as data type, but as a base class for other variable length string data types.
"""

_zarr_v3_name: ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8"
_zarr_v3_name: ClassVar[Literal["string"]] = "string"
object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"

@classmethod
Expand All @@ -222,7 +230,7 @@
)

@classmethod
def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["variable_length_utf8"]]:
def _check_json_v3(cls, data: DTypeJSON) -> TypeGuard[Literal["string"]]:
return data == cls._zarr_v3_name

@classmethod
Expand All @@ -246,15 +254,14 @@
self, zarr_format: Literal[2]
) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]]: ...
@overload
def to_json(self, zarr_format: Literal[3]) -> Literal["variable_length_utf8"]: ...
def to_json(self, zarr_format: Literal[3]) -> Literal["string"]: ...

def to_json(
self, zarr_format: ZarrFormat
) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["variable_length_utf8"]:
) -> DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8"]] | Literal["string"]:
if zarr_format == 2:
return {"name": "|O", "object_codec_id": self.object_codec_id}
elif zarr_format == 3:
v3_unstable_dtype_warning(self)
return self._zarr_v3_name
raise ValueError(f"zarr_format must be 2 or 3, got {zarr_format}") # pragma: no cover

Expand Down
4 changes: 4 additions & 0 deletions src/zarr/core/dtype/npy/structured.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@
_zarr_v3_name = "structured"
fields: tuple[tuple[str, ZDType[TBaseDType, TBaseScalar]], ...]

def __post_init__(self) -> None:
if len(self.fields) < 1:
raise ValueError(f"must have at least one field. Got {self.fields!r}")

Check warning on line 42 in src/zarr/core/dtype/npy/structured.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/core/dtype/npy/structured.py#L42

Added line #L42 was not covered by tests

@classmethod
def _check_native_dtype(cls, dtype: TBaseDType) -> TypeGuard[np.dtypes.VoidDType[int]]:
"""
Expand Down
4 changes: 2 additions & 2 deletions tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
from zarr.core.chunk_grids import _auto_partition
from zarr.core.chunk_key_encodings import ChunkKeyEncodingParams
from zarr.core.common import JSON, MemoryOrder, ZarrFormat
from zarr.core.dtype import get_data_type_from_native_dtype
from zarr.core.dtype import parse_data_type
from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr
from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str
from zarr.core.dtype.npy.float import Float32, Float64
Expand Down Expand Up @@ -1285,7 +1285,7 @@ async def test_v2_chunk_encoding(
filters=filters,
)
filters_expected, compressor_expected = _parse_chunk_encoding_v2(
filters=filters, compressor=compressors, dtype=get_data_type_from_native_dtype(dtype)
filters=filters, compressor=compressors, dtype=parse_data_type(dtype, zarr_format=2)
)
assert arr.metadata.zarr_format == 2 # guard for mypy
assert arr.metadata.compressor == compressor_expected
Expand Down
31 changes: 21 additions & 10 deletions tests/test_dtype/test_npy/test_bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class TestNullTerminatedBytes(BaseTestZDType):
np.dtype("|U10"),
)
valid_json_v2 = (
{"name": "|S0", "object_codec_id": None},
{"name": "|S1", "object_codec_id": None},
{"name": "|S2", "object_codec_id": None},
{"name": "|S4", "object_codec_id": None},
)
Expand All @@ -31,22 +31,22 @@ class TestNullTerminatedBytes(BaseTestZDType):
)

scalar_v2_params = (
(NullTerminatedBytes(length=0), ""),
(NullTerminatedBytes(length=1), "MA=="),
(NullTerminatedBytes(length=2), "YWI="),
(NullTerminatedBytes(length=4), "YWJjZA=="),
)
scalar_v3_params = (
(NullTerminatedBytes(length=0), ""),
(NullTerminatedBytes(length=1), "MA=="),
(NullTerminatedBytes(length=2), "YWI="),
(NullTerminatedBytes(length=4), "YWJjZA=="),
)
cast_value_params = (
(NullTerminatedBytes(length=0), "", np.bytes_("")),
(NullTerminatedBytes(length=1), "", np.bytes_("")),
(NullTerminatedBytes(length=2), "ab", np.bytes_("ab")),
(NullTerminatedBytes(length=4), "abcdefg", np.bytes_("abcd")),
)
item_size_params = (
NullTerminatedBytes(length=0),
NullTerminatedBytes(length=1),
NullTerminatedBytes(length=4),
NullTerminatedBytes(length=10),
)
Expand All @@ -62,7 +62,7 @@ class TestRawBytes(BaseTestZDType):
)
valid_json_v2 = ({"name": "|V10", "object_codec_id": None},)
valid_json_v3 = (
{"name": "raw_bytes", "configuration": {"length_bytes": 0}},
{"name": "raw_bytes", "configuration": {"length_bytes": 1}},
{"name": "raw_bytes", "configuration": {"length_bytes": 8}},
)

Expand All @@ -77,22 +77,22 @@ class TestRawBytes(BaseTestZDType):
)

scalar_v2_params = (
(RawBytes(length=0), ""),
(RawBytes(length=1), "AA=="),
(RawBytes(length=2), "YWI="),
(RawBytes(length=4), "YWJjZA=="),
)
scalar_v3_params = (
(RawBytes(length=0), ""),
(RawBytes(length=1), "AA=="),
(RawBytes(length=2), "YWI="),
(RawBytes(length=4), "YWJjZA=="),
)
cast_value_params = (
(RawBytes(length=0), b"", np.void(b"")),
(RawBytes(length=1), b"\x00", np.void(b"\x00")),
(RawBytes(length=2), b"ab", np.void(b"ab")),
(RawBytes(length=4), b"abcd", np.void(b"abcd")),
)
item_size_params = (
RawBytes(length=0),
RawBytes(length=1),
RawBytes(length=4),
RawBytes(length=10),
)
Expand Down Expand Up @@ -152,3 +152,14 @@ def test_unstable_dtype_warning(
"""
with pytest.raises(UnstableSpecificationWarning):
zdtype.to_json(zarr_format=3)


@pytest.mark.parametrize("zdtype_cls", [NullTerminatedBytes, RawBytes])
def test_invalid_size(zdtype_cls: type[NullTerminatedBytes] | type[RawBytes]) -> None:
"""
Test that it's impossible to create a data type that has no length
"""
length = 0
msg = f"length must be >= 1, got {length}."
with pytest.raises(ValueError, match=msg):
zdtype_cls(length=length)
29 changes: 22 additions & 7 deletions tests/test_dtype/test_npy/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class TestVariableLengthString(BaseTestZDType):
np.dtype("|S10"),
)
valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},)
valid_json_v3 = ("variable_length_utf8",)
valid_json_v3 = ("string",)
invalid_json_v2 = (
"|S10",
"|f8",
Expand Down Expand Up @@ -53,7 +53,7 @@ class TestVariableLengthString(BaseTestZDType): # type: ignore[no-redef]
np.dtype("|S10"),
)
valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},)
valid_json_v3 = ("variable_length_utf8",)
valid_json_v3 = ("string",)
invalid_json_v2 = (
"|S10",
"|f8",
Expand Down Expand Up @@ -101,30 +101,45 @@ class TestFixedLengthUTF32(BaseTestZDType):
{"name": "numpy.fixed_length_utf32", "configuration": {"length_bits": "invalid"}},
)

scalar_v2_params = ((FixedLengthUTF32(length=0), ""), (FixedLengthUTF32(length=2), "hi"))
scalar_v2_params = ((FixedLengthUTF32(length=1), ""), (FixedLengthUTF32(length=2), "hi"))
scalar_v3_params = (
(FixedLengthUTF32(length=0), ""),
(FixedLengthUTF32(length=1), ""),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line leaves me a little confused on what the intended behavior for FIxedLengthString with an empty fill value is. or in general does the fill vlaue of a fixed length string need to be the same length?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm going with numpy's semantics, which assigns the empty string to a sequence of null bytes:

>>> np.array([''], dtype="U1").tobytes()
b'\x00\x00\x00\x00'

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a situation where the array data type and the scalar data type (the thing you get when you index that array) are not the same:

>>> np.dtype('U0').type('a').dtype
dtype('<U1')
>>> np.dtype('U0').type('').dtype 
dtype('<U')
>>> np.array([''], dtype="U0").dtype   
dtype('<U1')
>>> np.array([''], dtype="U0")[0].dtype
dtype('<U')

(FixedLengthUTF32(length=2), "hi"),
(FixedLengthUTF32(length=4), "hihi"),
)

cast_value_params = (
(FixedLengthUTF32(length=0), "", np.str_("")),
(FixedLengthUTF32(length=1), "", np.str_("")),
(FixedLengthUTF32(length=2), "hi", np.str_("hi")),
(FixedLengthUTF32(length=4), "hihi", np.str_("hihi")),
)
item_size_params = (
FixedLengthUTF32(length=0),
FixedLengthUTF32(length=1),
FixedLengthUTF32(length=4),
FixedLengthUTF32(length=10),
)


@pytest.mark.parametrize("zdtype", [FixedLengthUTF32(length=10), VariableLengthUTF8()])
@pytest.mark.parametrize(
"zdtype",
[
FixedLengthUTF32(length=10),
],
)
def test_unstable_dtype_warning(zdtype: FixedLengthUTF32 | VariableLengthUTF8) -> None:
"""
Test that we get a warning when serializing a dtype without a zarr v3 spec to json
when zarr_format is 3
"""
with pytest.raises(UnstableSpecificationWarning):
zdtype.to_json(zarr_format=3)


def test_invalid_size() -> None:
"""
Test that it's impossible to create a data type that has no length
"""
length = 0
msg = f"length must be >= 1, got {length}."
with pytest.raises(ValueError, match=msg):
FixedLengthUTF32(length=length)
11 changes: 11 additions & 0 deletions tests/test_dtype/test_npy/test_structured.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Any

import numpy as np
import pytest

from tests.test_dtype.test_wrapper import BaseTestZDType
from zarr.core.dtype import (
Expand Down Expand Up @@ -106,3 +107,13 @@ def scalar_equals(self, scalar1: Any, scalar2: Any) -> bool:
Structured(fields=(("field1", Int32()), ("field2", Float64()))),
Structured(fields=(("field1", Int64()), ("field2", Int32()))),
)


def test_invalid_size() -> None:
"""
Test that it's impossible to create a data type that has no fields
"""
fields = ()
msg = f"must have at least one field. Got {fields!r}"
with pytest.raises(ValueError, match=msg):
Structured(fields=fields)
3 changes: 3 additions & 0 deletions tests/test_dtype_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
Int16,
TBaseDType,
TBaseScalar,
VariableLengthUTF8,
ZDType,
data_type_registry,
get_data_type_from_json,
Expand Down Expand Up @@ -176,6 +177,8 @@ def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None:
@pytest.mark.parametrize(
("dtype_params", "expected", "zarr_format"),
[
("str", VariableLengthUTF8(), 2),
("str", VariableLengthUTF8(), 3),
("int8", Int8(), 3),
(Int8(), Int8(), 3),
(">i2", Int16(endianness="big"), 2),
Expand Down
Loading