Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/3973.removal.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Removed the NumPy 1.x implementation of the `VariableLengthUTF8` data type.
154 changes: 35 additions & 119 deletions src/zarr/core/dtype/npy/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,6 @@
from zarr.core.common import JSON, ZarrFormat
from zarr.core.dtype.wrapper import TBaseDType

_NUMPY_SUPPORTS_VLEN_STRING = hasattr(np.dtypes, "StringDType")


@runtime_checkable
class SupportsStr(Protocol):
Expand Down Expand Up @@ -450,36 +448,40 @@ class VariableLengthUTF8JSON_V2(DTypeConfig_V2[Literal["|O"], Literal["vlen-utf8
"""


# VariableLengthUTF8 is defined in two places, conditioned on the version of NumPy.
# If NumPy 2 is installed, then VariableLengthUTF8 is defined with the NumPy variable length
# string dtype as the native dtype. Otherwise, VariableLengthUTF8 is defined with the NumPy object
# dtype as the native dtype.
class UTF8Base[DType: TBaseDType](ZDType[DType, str], HasObjectCodec):
@dataclass(frozen=True, kw_only=True)
class VariableLengthUTF8(ZDType[np.dtypes.StringDType, str], HasObjectCodec): # type: ignore[type-var]
"""
A base class for variable-length UTF-8 string data types.
A Zarr data type for arrays containing variable-length UTF-8 strings.

Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances
of ``str``.

Not intended for direct use, but as a base for concrete implementations.

Attributes
----------
object_codec_id : ClassVar[Literal["vlen-utf8"]]
dtype_cls : Type[np.dtypes.StringDType]
The NumPy dtype class for this data type.
_zarr_v3_name : ClassVar[Literal["string"]] = "string"
The name of this data type in Zarr V3.
object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
The object codec ID for this data type.

References
----------
This data type does not have a Zarr V3 specification.
https://github.com/zarr-developers/zarr-extensions/tree/main/data-types/string

The Zarr V2 data type specification can be found [here](https://github.com/zarr-developers/zarr-specs/blob/main/docs/v2/v2.0.rst#data-type-encoding).
"""

dtype_cls = np.dtypes.StringDType # type: ignore[assignment]
_zarr_v3_name: ClassVar[Literal["string"]] = "string"
object_codec_id: ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"

@classmethod
def from_native_dtype(cls, dtype: TBaseDType) -> Self:
"""
Create an instance of this data type from a compatible NumPy data type.

We reject NumPy StringDType instances that have the `na_object` field set,
because this is not representable by the Zarr `string` data type.

Parameters
----------
Expand All @@ -495,13 +497,33 @@ def from_native_dtype(cls, dtype: TBaseDType) -> Self:
------
DataTypeValidationError
If the input is not compatible with this data type.
ValueError
If the input is `numpy.dtypes.StringDType` and has `na_object` set.
"""
if cls._check_native_dtype(dtype):
if hasattr(dtype, "na_object"):
msg = (
f"Zarr data type resolution from {dtype} failed. "
"Attempted to resolve a zarr data type from a `numpy.dtypes.StringDType` "
"with `na_object` set, which is not supported."
)
raise ValueError(msg)
return cls()
raise DataTypeValidationError(
f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}"
)

def to_native_dtype(self) -> np.dtypes.StringDType:
"""
Create a NumPy string dtype from this VariableLengthUTF8 ZDType.

Returns
-------
np.dtypes.StringDType
The NumPy string dtype.
"""
return self.dtype_cls()

@classmethod
def _check_json_v2(
cls,
Expand Down Expand Up @@ -718,109 +740,3 @@ def cast_scalar(self, data: object) -> str:
f"data type {self}."
)
raise TypeError(msg) # pragma: no cover


if _NUMPY_SUPPORTS_VLEN_STRING:

@dataclass(frozen=True, kw_only=True)
class VariableLengthUTF8(UTF8Base[np.dtypes.StringDType]): # type: ignore[type-var]
"""
A Zarr data type for arrays containing variable-length UTF-8 strings.

Wraps the ``np.dtypes.StringDType`` data type. Scalars for this data type are instances
of ``str``.


Attributes
----------
dtype_cls : Type[np.dtypes.StringDType]
The NumPy dtype class for this data type.
_zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8"
The name of this data type in Zarr V3.
object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
The object codec ID for this data type.
"""

dtype_cls = np.dtypes.StringDType # type: ignore[assignment]

@classmethod
def from_native_dtype(cls, dtype: TBaseDType) -> Self:
"""
Create an instance of this data type from a compatible NumPy data type.
We reject NumPy StringDType instances that have the `na_object` field set,
because this is not representable by the Zarr `string` data type.

Parameters
----------
dtype : TBaseDType
The native data type.

Returns
-------
Self
An instance of this data type.

Raises
------
DataTypeValidationError
If the input is not compatible with this data type.
ValueError
If the input is `numpy.dtypes.StringDType` and has `na_object` set.
"""
if cls._check_native_dtype(dtype):
if hasattr(dtype, "na_object"):
msg = (
f"Zarr data type resolution from {dtype} failed. "
"Attempted to resolve a zarr data type from a `numpy.dtypes.StringDType` "
"with `na_object` set, which is not supported."
)
raise ValueError(msg)
return cls()
raise DataTypeValidationError(
f"Invalid data type: {dtype}. Expected an instance of {cls.dtype_cls}"
)

def to_native_dtype(self) -> np.dtypes.StringDType:
"""
Create a NumPy string dtype from this VariableLengthUTF8 ZDType.

Returns
-------
np.dtypes.StringDType
The NumPy string dtype.
"""
return self.dtype_cls()

else:
# Numpy pre-2 does not have a variable length string dtype, so we use the Object dtype instead.
@dataclass(frozen=True, kw_only=True)
class VariableLengthUTF8(UTF8Base[np.dtypes.ObjectDType]): # type: ignore[no-redef]
"""
A Zarr data type for arrays containing variable-length UTF-8 strings.

Wraps the ``np.dtypes.ObjectDType`` data type. Scalars for this data type are instances
of ``str``.


Attributes
----------
dtype_cls : Type[np.dtypes.ObjectDType]
The NumPy dtype class for this data type.
_zarr_v3_name : ClassVar[Literal["variable_length_utf8"]] = "variable_length_utf8"
The name of this data type in Zarr V3.
object_codec_id : ClassVar[Literal["vlen-utf8"]] = "vlen-utf8"
The object codec ID for this data type.
"""

dtype_cls = np.dtypes.ObjectDType

def to_native_dtype(self) -> np.dtypes.ObjectDType:
"""
Create a NumPy object dtype from this VariableLengthUTF8 ZDType.

Returns
-------
np.dtypes.ObjectDType
The NumPy object dtype.
"""
return self.dtype_cls()
16 changes: 3 additions & 13 deletions tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@
)
from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr
from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str
from zarr.core.dtype.npy.string import UTF8Base
from zarr.core.group import AsyncGroup
from zarr.core.indexing import BasicIndexer, _iter_grid, _iter_regions
from zarr.core.metadata.v2 import ArrayV2Metadata
Expand Down Expand Up @@ -1981,19 +1980,10 @@ def test_array_repr(store: Store) -> None:
assert str(arr) == f"<Array {store} shape={shape} dtype={dtype}>"


class UnknownObjectDtype(UTF8Base[np.dtypes.ObjectDType]):
object_codec_id = "unknown" # type: ignore[assignment]

def to_native_dtype(self) -> np.dtypes.ObjectDType:
"""
Create a NumPy object dtype from this VariableLengthUTF8 ZDType.
class UnknownObjectDtype(VariableLengthUTF8):
"""A data type that requires an object codec with an unknown id, used for error-path tests."""

Returns
-------
np.dtypes.ObjectDType
The NumPy object dtype.
"""
return np.dtype("o") # type: ignore[return-value]
object_codec_id = "unknown" # type: ignore[assignment]


@pytest.mark.parametrize(
Expand Down
18 changes: 10 additions & 8 deletions tests/test_codecs/test_vlen.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,19 @@
from zarr.codecs import ZstdCodec
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
from zarr.core.dtype import get_data_type_from_native_dtype
from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING
from zarr.core.metadata.v3 import ArrayV3Metadata
from zarr.storage import StorePath

numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType, "S", "U"]
expected_array_string_dtype: np.dtype[Any]
if _NUMPY_SUPPORTS_VLEN_STRING:
numpy_str_dtypes.append(np.dtypes.StringDType)
expected_array_string_dtype = np.dtypes.StringDType()
else:
expected_array_string_dtype = np.dtype("O")
numpy_str_dtypes: list[type | str | None] = [
None,
str,
"str",
np.dtypes.StrDType,
"S",
"U",
np.dtypes.StringDType,
]
expected_array_string_dtype: np.dtype[Any] = np.dtypes.StringDType()


@pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning")
Expand Down
103 changes: 33 additions & 70 deletions tests/test_dtype/test_npy/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,80 +5,43 @@

from tests.test_dtype.test_wrapper import BaseTestZDType
from zarr.core.dtype import FixedLengthUTF32
from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING, VariableLengthUTF8
from zarr.core.dtype.npy.string import VariableLengthUTF8
from zarr.errors import UnstableSpecificationWarning

if _NUMPY_SUPPORTS_VLEN_STRING:

class TestVariableLengthString(BaseTestZDType):
test_cls = VariableLengthUTF8 # type: ignore[assignment]
valid_dtype = (np.dtypes.StringDType(),) # type: ignore[assignment]
invalid_dtype = (
np.dtype(np.int8),
np.dtype(np.float64),
np.dtype("|S10"),
)
valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},)
valid_json_v3 = ("string",)
invalid_json_v2 = (
"|S10",
"|f8",
"invalid",
)
invalid_json_v3 = (
{"name": "variable_length_utf8", "configuration": {"invalid_key": "value"}},
{"name": "invalid_name"},
)

scalar_v2_params = ((VariableLengthUTF8(), ""), (VariableLengthUTF8(), "hi"))
scalar_v3_params = (
(VariableLengthUTF8(), ""),
(VariableLengthUTF8(), "hi"),
)

cast_value_params = (
(VariableLengthUTF8(), "", np.str_("")),
(VariableLengthUTF8(), "hi", np.str_("hi")),
)
# anything can become a string
invalid_scalar_params = (None,)
item_size_params = (VariableLengthUTF8(),)

else:

class TestVariableLengthString(BaseTestZDType): # type: ignore[no-redef]
test_cls = VariableLengthUTF8 # type: ignore[assignment]
valid_dtype = (np.dtype("O"),)
invalid_dtype = (
np.dtype(np.int8),
np.dtype(np.float64),
np.dtype("|S10"),
)
valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},)
valid_json_v3 = ("string",)
invalid_json_v2 = (
"|S10",
"|f8",
"invalid",
)
invalid_json_v3 = (
{"name": "numpy.variable_length_utf8", "configuration": {"invalid_key": "value"}},
{"name": "invalid_name"},
)
class TestVariableLengthString(BaseTestZDType):
test_cls = VariableLengthUTF8 # type: ignore[assignment]
valid_dtype = (np.dtypes.StringDType(),) # type: ignore[assignment]
invalid_dtype = (
np.dtype(np.int8),
np.dtype(np.float64),
np.dtype("|S10"),
)
valid_json_v2 = ({"name": "|O", "object_codec_id": "vlen-utf8"},)
valid_json_v3 = ("string",)
invalid_json_v2 = (
"|S10",
"|f8",
"invalid",
)
invalid_json_v3 = (
{"name": "variable_length_utf8", "configuration": {"invalid_key": "value"}},
{"name": "invalid_name"},
)

scalar_v2_params = ((VariableLengthUTF8(), ""), (VariableLengthUTF8(), "hi"))
scalar_v3_params = (
(VariableLengthUTF8(), ""),
(VariableLengthUTF8(), "hi"),
)
scalar_v2_params = ((VariableLengthUTF8(), ""), (VariableLengthUTF8(), "hi"))
scalar_v3_params = (
(VariableLengthUTF8(), ""),
(VariableLengthUTF8(), "hi"),
)

cast_value_params = (
(VariableLengthUTF8(), "", np.str_("")),
(VariableLengthUTF8(), "hi", np.str_("hi")),
)
# anything can become a string
invalid_scalar_params = (None,)
item_size_params = (VariableLengthUTF8(),)
cast_value_params = (
(VariableLengthUTF8(), "", np.str_("")),
(VariableLengthUTF8(), "hi", np.str_("hi")),
)
# anything can become a string
invalid_scalar_params = (None,)
item_size_params = (VariableLengthUTF8(),)


class TestFixedLengthUTF32(BaseTestZDType):
Expand Down Expand Up @@ -131,7 +94,7 @@ class TestFixedLengthUTF32(BaseTestZDType):
FixedLengthUTF32(length=10),
],
)
def test_unstable_dtype_warning(zdtype: FixedLengthUTF32 | VariableLengthUTF8) -> None:
def test_unstable_dtype_warning(zdtype: FixedLengthUTF32) -> None:
"""
Test that serializing a dtype without a Zarr V3 spec to JSON emits a warning
when zarr_format is 3
Expand Down
Loading
Loading