Skip to content

Commit 5a92bc7

Browse files
committed
implement casefold and normalize str accessor functions
1 parent d1d1234 commit 5a92bc7

File tree

2 files changed

+76
-3
lines changed

2 files changed

+76
-3
lines changed

xarray/core/accessor_str.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import re
4242
import textwrap
4343
from typing import Any, Callable, Mapping, Union
44+
from unicodedata import normalize
4445

4546
import numpy as np
4647

@@ -279,6 +280,42 @@ def upper(self):
279280
"""
280281
return self._apply(lambda x: x.upper())
281282

283+
def casefold(self):
    """
    Casefold every string in the array.

    Casefolding resembles lowercasing, but it is intended to remove
    all case distinctions. That matters for languages whose case rules
    and case conversions are more involved.

    Returns
    -------
    casefolded : same type as values
    """

    def _fold(value):
        # Delegate to the element's own ``casefold`` method.
        return value.casefold()

    return self._apply(_fold)
297+
298+
def normalize(
    self,
    form: str,
) -> Any:
    """
    Return the Unicode normal form for the strings in the array.

    For more information on the forms, see the documentation for
    :func:`unicodedata.normalize`.

    Parameters
    ----------
    form : {"NFC", "NFKC", "NFD", "NFKD"}
        Unicode form.

    Returns
    -------
    normalized : same type as values
    """
    # This method is itself called ``normalize``; calling the standard
    # library function by its qualified name keeps the two apart.
    import unicodedata

    return self._apply(lambda x: unicodedata.normalize(form, x))
318+
282319
def isalnum(self):
283320
"""
284321
Check whether all characters in each string are alphanumeric.

xarray/tests/test_accessor_str.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# -*- coding: utf-8 -*-
2+
13
# Tests for the `str` accessor are derived from the original
24
# pandas string accessor tests.
35

@@ -93,18 +95,52 @@ def test_starts_ends_with(dtype):
9395
assert_equal(result, expected)
9496

9597

96-
def test_case(dtype):
97-
da = xr.DataArray(["SOme word"]).astype(dtype)
98+
def test_case_bytes(dtype):
    """Check the case-changing ``str`` accessor methods on a bytes array."""
    # NOTE(review): the incoming ``dtype`` argument is discarded right here,
    # so this test always runs against ``np.bytes_``.
    dtype = np.bytes_

    def as_array(text):
        # One-element DataArray holding ``text``, cast to the dtype under test.
        return xr.DataArray([text]).astype(dtype)

    da = as_array("SOme wOrd")

    # Accessor method name -> expected result, checked in insertion order.
    expected_by_method = {
        "capitalize": as_array("Some word"),
        "lower": as_array("some word"),
        "swapcase": as_array("soME WoRD"),
        "title": as_array("Some Word"),
        "upper": as_array("SOME WORD"),
    }

    for method_name, expected in expected_by_method.items():
        assert_equal(getattr(da.str, method_name)(), expected)
113+
114+
115+
def test_case_str(dtype):
    """Check case-changing, casefold and normalize accessors on a str array."""
    # NOTE(review): the incoming ``dtype`` argument is discarded right here,
    # so this test always runs against ``np.str_``.
    dtype = np.str_

    def as_array(text):
        # One-element DataArray holding ``text``, cast to the dtype under test.
        return xr.DataArray([text]).astype(dtype)

    # This string includes some unicode characters
    # that are common case management corner cases
    da = as_array("SOme wOrd DŽ ß ᾛ ΣΣ ffi⁵Å Ç Ⅰ")

    # NOTE(review): several literals below look alike when rendered and may
    # differ only in their code points -- keep them verbatim when editing.

    # Accessor method name -> expected result, checked in insertion order.
    expected_by_method = {
        "capitalize": as_array("Some word dž ß ᾓ σς ffi⁵å ç ⅰ"),
        "lower": as_array("some word dž ß ᾓ σς ffi⁵å ç ⅰ"),
        "swapcase": as_array("soME WoRD dž SS ᾛ σς FFI⁵å ç ⅰ"),
        "title": as_array("Some Word Dž Ss ᾛ Σς Ffi⁵Å Ç Ⅰ"),
        "upper": as_array("SOME WORD DŽ SS ἫΙ ΣΣ FFI⁵Å Ç Ⅰ"),
        "casefold": as_array("some word dž ss ἣι σσ ffi⁵å ç ⅰ"),
    }

    # Normalization form -> expected result, checked in insertion order.
    expected_by_form = {
        "NFC": as_array("SOme wOrd DŽ ß ᾛ ΣΣ ffi⁵Å Ç Ⅰ"),
        "NFKC": as_array("SOme wOrd DŽ ß ᾛ ΣΣ ffi5Å Ç I"),
        "NFD": as_array("SOme wOrd DŽ ß ᾛ ΣΣ ffi⁵Å Ç Ⅰ"),
        "NFKD": as_array("SOme wOrd DŽ ß ᾛ ΣΣ ffi5Å Ç I"),
    }

    for method_name, expected in expected_by_method.items():
        assert_equal(getattr(da.str, method_name)(), expected)

    for form, expected in expected_by_form.items():
        assert_equal(da.str.normalize(form), expected)
108144

109145

110146
def test_replace(dtype):

0 commit comments

Comments
 (0)