diff --git a/doc/api.rst b/doc/api.rst index 0e766f2cf9a..33c8d9d3ceb 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -324,6 +324,7 @@ Computation DataArray.quantile DataArray.differentiate DataArray.integrate + DataArray.str **Aggregation**: :py:attr:`~DataArray.all` @@ -557,6 +558,15 @@ Resample objects also implement the GroupBy interface core.resample.DatasetResample.nearest core.resample.DatasetResample.pad +Accessors +========= + +.. autosummary:: + :toctree: generated/ + + core.accessor_dt.DatetimeAccessor + core.accessor_str.StringAccessor + Custom Indexes ============== .. autosummary:: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 55773af92b3..48a94ee8724 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -49,6 +49,8 @@ Enhancements - Like :py:class:`pandas.DatetimeIndex`, :py:class:`CFTimeIndex` now supports a :py:meth:`~xarray.CFTimeIndex.strftime` method to return an index of string formatted datetimes. By `Alan Brammer `_. +- Add ``.str`` accessor to DataArrays for string related manipulations. + By `0x0L `_. 
Bug fixes ~~~~~~~~~ diff --git a/xarray/core/accessors.py b/xarray/core/accessor_dt.py similarity index 98% rename from xarray/core/accessors.py rename to xarray/core/accessor_dt.py index 806e1579c3a..01cddae188f 100644 --- a/xarray/core/accessors.py +++ b/xarray/core/accessor_dt.py @@ -165,13 +165,13 @@ class DatetimeAccessor: """ - def __init__(self, xarray_obj): - if not _contains_datetime_like_objects(xarray_obj): + def __init__(self, obj): + if not _contains_datetime_like_objects(obj): raise TypeError("'dt' accessor only available for " "DataArray with datetime64 timedelta64 dtype or " "for arrays containing cftime datetime " "objects.") - self._obj = xarray_obj + self._obj = obj def _tslib_field_accessor(name, docstring=None, dtype=None): def f(self, dtype=dtype): diff --git a/xarray/core/accessor_str.py b/xarray/core/accessor_str.py new file mode 100644 index 00000000000..564593a032e --- /dev/null +++ b/xarray/core/accessor_str.py @@ -0,0 +1,958 @@ +# The StringAccessor class defined below is an adaptation of the +# pandas string methods source code (see pd.core.strings) + +# For reference, here is a copy of the pandas copyright notice: + +# (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team +# All rights reserved. + +# Copyright (c) 2008-2011 AQR Capital Management, LLC +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. + +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. 
+
+# * Neither the name of the copyright holder nor the names of any
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import codecs
+import re
+import textwrap
+
+import numpy as np
+
+from .computation import apply_ufunc
+
+
+_cpython_optimized_encoders = (
+    "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii"
+)
+_cpython_optimized_decoders = _cpython_optimized_encoders + (
+    "utf-16", "utf-32"
+)
+
+
+def _is_str_like(x):
+    # True for both text (str) and binary (bytes) strings.
+    return isinstance(x, (str, bytes))
+
+
+class StringAccessor:
+    """Vectorized string functions for string-like arrays.
+
+    Similar to pandas, fields can be accessed through the `.str` attribute
+    for applicable DataArrays.
+
+        >>> da = xr.DataArray(['some', 'text', 'in', 'an', 'array'])
+        >>> da.str.len()
+        <xarray.DataArray (dim_0: 5)>
+        array([4, 4, 2, 2, 5])
+        Dimensions without coordinates: dim_0
+
+    """
+
+    def __init__(self, obj):
+        self._obj = obj
+
+    def _apply(self, f, dtype=None):
+        # Apply the scalar function ``f`` to every element of the wrapped
+        # array. ``dtype`` is the dtype of the result; when omitted, the
+        # dtype of the wrapped array is used.
+        # TODO handling of na values ?
+        if dtype is None:
+            dtype = self._obj.dtype
+
+        g = np.vectorize(f, otypes=[dtype])
+        return apply_ufunc(
+            g, self._obj, dask='parallelized', output_dtypes=[dtype])
+
+    def len(self):
+        '''
+        Compute the length of each element in the array.
+
+        Returns
+        -------
+        lengths array : array of int
+        '''
+        return self._apply(len, dtype=int)
+
+    def __getitem__(self, key):
+        # ``arr.str[a:b]`` dispatches to ``slice``; any other key is
+        # treated as a position and dispatches to ``get``.
+        if isinstance(key, slice):
+            return self.slice(start=key.start, stop=key.stop, step=key.step)
+        else:
+            return self.get(key)
+
+    def get(self, i, default=''):
+        '''
+        Extract element from indexable in each element in the array.
+
+        Parameters
+        ----------
+        i : int
+            Position of element to extract.
+        default : optional
+            Value for out-of-range index. If not specified defaults to
+            an empty string.
+
+        Returns
+        -------
+        items : array of objects
+        '''
+        # A one-element slice (rather than plain indexing) yields an empty
+        # string instead of raising when ``i`` is out of range. -1 is
+        # special-cased because slice(-1, 0) would always be empty.
+        obj = slice(-1, None) if i == -1 else slice(i, i + 1)
+        default = self._obj.dtype.type(default)
+
+        def f(x):
+            item = x[obj]
+            return item if len(item) else default
+
+        return self._apply(f)
+
+    def slice(self, start=None, stop=None, step=None):
+        '''
+        Slice substrings from each element in the array.
+
+        Parameters
+        ----------
+        start : int, optional
+            Start position for slice operation.
+        stop : int, optional
+            Stop position for slice operation.
+        step : int, optional
+            Step size for slice operation.
+
+        Returns
+        -------
+        sliced strings : same type as values
+        '''
+        s = slice(start, stop, step)
+        f = lambda x: x[s]
+        return self._apply(f)
+
+    def slice_replace(self, start=None, stop=None, repl=''):
+        '''
+        Replace a positional slice of a string with another value.
+
+        Parameters
+        ----------
+        start : int, optional
+            Left index position to use for the slice. If not specified (None),
+            the slice is unbounded on the left, i.e. slice from the start
+            of the string.
+        stop : int, optional
+            Right index position to use for the slice. If not specified (None),
+            the slice is unbounded on the right, i.e. slice until the
+            end of the string.
+        repl : str, optional
+            String for replacement. If not specified, the sliced region
+            is replaced with an empty string.
+
+        Returns
+        -------
+        replaced : same type as values
+        '''
+        repl = self._obj.dtype.type(repl)
+
+        def f(x):
+            # When the requested slice is empty, resume copying from
+            # ``start`` so that no characters are dropped.
+            if len(x[start:stop]) == 0:
+                local_stop = start
+            else:
+                local_stop = stop
+            y = self._obj.dtype.type('')
+            if start is not None:
+                y += x[:start]
+            y += repl
+            if stop is not None:
+                y += x[local_stop:]
+            return y
+
+        return self._apply(f)
+
+    def capitalize(self):
+        '''
+        Convert strings in the array to be capitalized.
+
+        Returns
+        -------
+        capitalized : same type as values
+        '''
+        return self._apply(lambda x: x.capitalize())
+
+    def lower(self):
+        '''
+        Convert strings in the array to lowercase.
+
+        Returns
+        -------
+        lowered : same type as values
+        '''
+        return self._apply(lambda x: x.lower())
+
+    def swapcase(self):
+        '''
+        Convert strings in the array to be swapcased.
+
+        Returns
+        -------
+        swapcased : same type as values
+        '''
+        return self._apply(lambda x: x.swapcase())
+
+    def title(self):
+        '''
+        Convert strings in the array to titlecase.
+
+        Returns
+        -------
+        titled : same type as values
+        '''
+        return self._apply(lambda x: x.title())
+
+    def upper(self):
+        '''
+        Convert strings in the array to uppercase.
+
+        Returns
+        -------
+        uppered : same type as values
+        '''
+        return self._apply(lambda x: x.upper())
+
+    def isalnum(self):
+        '''
+        Check whether all characters in each string are alphanumeric.
+
+        Returns
+        -------
+        isalnum : array of bool
+            Array of boolean values with the same shape as the original array.
+        '''
+        return self._apply(lambda x: x.isalnum(), dtype=bool)
+
+    def isalpha(self):
+        '''
+        Check whether all characters in each string are alphabetic.
+
+        Returns
+        -------
+        isalpha : array of bool
+            Array of boolean values with the same shape as the original array.
+        '''
+        return self._apply(lambda x: x.isalpha(), dtype=bool)
+
+    def isdecimal(self):
+        '''
+        Check whether all characters in each string are decimal.
+
+        Returns
+        -------
+        isdecimal : array of bool
+            Array of boolean values with the same shape as the original array.
+        '''
+        return self._apply(lambda x: x.isdecimal(), dtype=bool)
+
+    def isdigit(self):
+        '''
+        Check whether all characters in each string are digits.
+
+        Returns
+        -------
+        isdigit : array of bool
+            Array of boolean values with the same shape as the original array.
+        '''
+        return self._apply(lambda x: x.isdigit(), dtype=bool)
+
+    def islower(self):
+        '''
+        Check whether all characters in each string are lowercase.
+
+        Returns
+        -------
+        islower : array of bool
+            Array of boolean values with the same shape as the original array.
+        '''
+        return self._apply(lambda x: x.islower(), dtype=bool)
+
+    def isnumeric(self):
+        '''
+        Check whether all characters in each string are numeric.
+
+        Returns
+        -------
+        isnumeric : array of bool
+            Array of boolean values with the same shape as the original array.
+        '''
+        return self._apply(lambda x: x.isnumeric(), dtype=bool)
+
+    def isspace(self):
+        '''
+        Check whether all characters in each string are spaces.
+
+        Returns
+        -------
+        isspace : array of bool
+            Array of boolean values with the same shape as the original array.
+        '''
+        return self._apply(lambda x: x.isspace(), dtype=bool)
+
+    def istitle(self):
+        '''
+        Check whether all characters in each string are titlecase.
+
+        Returns
+        -------
+        istitle : array of bool
+            Array of boolean values with the same shape as the original array.
+        '''
+        return self._apply(lambda x: x.istitle(), dtype=bool)
+
+    def isupper(self):
+        '''
+        Check whether all characters in each string are uppercase.
+
+        Returns
+        -------
+        isupper : array of bool
+            Array of boolean values with the same shape as the original array.
+        '''
+        return self._apply(lambda x: x.isupper(), dtype=bool)
+
+    def count(self, pat, flags=0):
+        '''
+        Count occurrences of pattern in each string of the array.
+
+        This function is used to count the number of times a particular regex
+        pattern is repeated in each of the string elements of the
+        :class:`~xarray.DataArray`.
+
+        Parameters
+        ----------
+        pat : str
+            Valid regular expression.
+        flags : int, default 0, meaning no flags
+            Flags for the `re` module. For a complete list, `see here
+            <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
+
+        Returns
+        -------
+        counts : array of int
+        '''
+        # Coerce the pattern to the element type (str or bytes) so that
+        # the compiled regex can be applied to the array elements.
+        pat = self._obj.dtype.type(pat)
+        regex = re.compile(pat, flags=flags)
+        f = lambda x: len(regex.findall(x))
+        return self._apply(f, dtype=int)
+
+    def startswith(self, pat):
+        '''
+        Test if the start of each string element matches a pattern.
+
+        Parameters
+        ----------
+        pat : str
+            Character sequence. Regular expressions are not accepted.
+
+        Returns
+        -------
+        startswith : array of bool
+            An array of booleans indicating whether the given pattern matches
+            the start of each string element.
+        '''
+        pat = self._obj.dtype.type(pat)
+        f = lambda x: x.startswith(pat)
+        return self._apply(f, dtype=bool)
+
+    def endswith(self, pat):
+        '''
+        Test if the end of each string element matches a pattern.
+
+        Parameters
+        ----------
+        pat : str
+            Character sequence. Regular expressions are not accepted.
+
+        Returns
+        -------
+        endswith : array of bool
+            An array of booleans indicating whether the given pattern matches
+            the end of each string element.
+        '''
+        pat = self._obj.dtype.type(pat)
+        f = lambda x: x.endswith(pat)
+        return self._apply(f, dtype=bool)
+
+    def pad(self, width, side='left', fillchar=' '):
+        '''
+        Pad strings in the array up to width.
+
+        Parameters
+        ----------
+        width : int
+            Minimum width of resulting string; additional characters will be
+            filled with character defined in `fillchar`.
+        side : {'left', 'right', 'both'}, default 'left'
+            Side from which to fill resulting string.
+        fillchar : str, default ' '
+            Additional character for filling, default is whitespace.
+
+        Returns
+        -------
+        filled : same type as values
+            Array with a minimum number of char in each element.
+
+        Raises
+        ------
+        TypeError
+            If `fillchar` is not exactly one character long.
+        '''
+        width = int(width)
+        fillchar = self._obj.dtype.type(fillchar)
+        if len(fillchar) != 1:
+            raise TypeError('fillchar must be a character, not str')
+
+        # Padding on the left is the same as right-justifying, and
+        # vice versa.
+        if side == 'left':
+            f = lambda s: s.rjust(width, fillchar)
+        elif side == 'right':
+            f = lambda s: s.ljust(width, fillchar)
+        elif side == 'both':
+            f = lambda s: s.center(width, fillchar)
+        else:  # pragma: no cover
+            raise ValueError('Invalid side')
+
+        return self._apply(f)
+
+    def center(self, width, fillchar=' '):
+        '''
+        Filling left and right side of strings in the array with an
+        additional character.
+
+        Parameters
+        ----------
+        width : int
+            Minimum width of resulting string; additional characters will be
+            filled with ``fillchar``
+        fillchar : str
+            Additional character for filling, default is whitespace
+
+        Returns
+        -------
+        filled : same type as values
+        '''
+        return self.pad(width, side='both', fillchar=fillchar)
+
+    def ljust(self, width, fillchar=' '):
+        '''
+        Filling right side of strings in the array with an additional
+        character.
+
+        Parameters
+        ----------
+        width : int
+            Minimum width of resulting string; additional characters will be
+            filled with ``fillchar``
+        fillchar : str
+            Additional character for filling, default is whitespace
+
+        Returns
+        -------
+        filled : same type as values
+        '''
+        return self.pad(width, side='right', fillchar=fillchar)
+
+    def rjust(self, width, fillchar=' '):
+        '''
+        Filling left side of strings in the array with an additional character.
+
+        Parameters
+        ----------
+        width : int
+            Minimum width of resulting string; additional characters will be
+            filled with ``fillchar``
+        fillchar : str
+            Additional character for filling, default is whitespace
+
+        Returns
+        -------
+        filled : same type as values
+        '''
+        return self.pad(width, side='left', fillchar=fillchar)
+
+    def zfill(self, width):
+        '''
+        Pad strings in the array by prepending '0' characters.
+
+        Strings in the array are padded with '0' characters on the
+        left of the string to reach a total string length `width`. Strings
+        in the array with length greater or equal to `width` are unchanged.
+
+        Parameters
+        ----------
+        width : int
+            Minimum length of resulting string; strings with length less
+            than `width` will be prepended with '0' characters.
+
+        Returns
+        -------
+        filled : same type as values
+        '''
+        return self.pad(width, side='left', fillchar='0')
+
+    def contains(self, pat, case=True, flags=0, regex=True):
+        '''
+        Test if pattern or regex is contained within a string of the array.
+
+        Return boolean array based on whether a given pattern or regex is
+        contained within a string of the array.
+
+        Parameters
+        ----------
+        pat : str
+            Character sequence or regular expression.
+        case : bool, default True
+            If True, case sensitive.
+        flags : int, default 0 (no flags)
+            Flags to pass through to the re module, e.g. re.IGNORECASE.
+        regex : bool, default True
+            If True, assumes the pat is a regular expression.
+            If False, treats the pat as a literal string.
+
+        Returns
+        -------
+        contains : array of bool
+            An array of boolean values indicating whether the
+            given pattern is contained within the string of each element
+            of the array.
+        '''
+        pat = self._obj.dtype.type(pat)
+        if regex:
+            if not case:
+                flags |= re.IGNORECASE
+
+            regex = re.compile(pat, flags=flags)
+
+            if regex.groups > 0:  # pragma: no cover
+                raise ValueError("This pattern has match groups.")
+
+            f = lambda x: bool(regex.search(x))
+        else:
+            if case:
+                f = lambda x: pat in x
+            else:
+                # Case-insensitive literal search: upper-case both the
+                # array and the pattern, then do a plain containment test.
+                uppered = self._obj.str.upper()
+                return uppered.str.contains(pat.upper(), regex=False)
+
+        return self._apply(f, dtype=bool)
+
+    def match(self, pat, case=True, flags=0):
+        '''
+        Determine if each string matches a regular expression.
+
+        Parameters
+        ----------
+        pat : string
+            Character sequence or regular expression
+        case : boolean, default True
+            If True, case sensitive
+        flags : int, default 0 (no flags)
+            re module flags, e.g. re.IGNORECASE
+
+        Returns
+        -------
+        matched : array of bool
+        '''
+        if not case:
+            flags |= re.IGNORECASE
+
+        pat = self._obj.dtype.type(pat)
+        regex = re.compile(pat, flags=flags)
+        f = lambda x: bool(regex.match(x))
+        return self._apply(f, dtype=bool)
+
+    def strip(self, to_strip=None, side='both'):
+        '''
+        Remove leading and trailing characters.
+
+        Strip whitespaces (including newlines) or a set of specified characters
+        from each string in the array from left and/or right sides.
+
+        Parameters
+        ----------
+        to_strip : str or None, default None
+            Specifying the set of characters to be removed.
+            All combinations of this set of characters will be stripped.
+            If None then whitespaces are removed.
+        side : {'left', 'right', 'both'}, default 'both'
+            Side from which to strip.
+
+        Returns
+        -------
+        stripped : same type as values
+        '''
+        if to_strip is not None:
+            to_strip = self._obj.dtype.type(to_strip)
+
+        if side == 'both':
+            f = lambda x: x.strip(to_strip)
+        elif side == 'left':
+            f = lambda x: x.lstrip(to_strip)
+        elif side == 'right':
+            f = lambda x: x.rstrip(to_strip)
+        else:  # pragma: no cover
+            raise ValueError('Invalid side')
+
+        return self._apply(f)
+
+    def lstrip(self, to_strip=None):
+        '''
+        Remove leading characters.
+
+        Strip whitespaces (including newlines) or a set of specified characters
+        from each string in the array from the left side.
+
+        Parameters
+        ----------
+        to_strip : str or None, default None
+            Specifying the set of characters to be removed.
+            All combinations of this set of characters will be stripped.
+            If None then whitespaces are removed.
+
+        Returns
+        -------
+        stripped : same type as values
+        '''
+        return self.strip(to_strip, side='left')
+
+    def rstrip(self, to_strip=None):
+        '''
+        Remove trailing characters.
+
+        Strip whitespaces (including newlines) or a set of specified characters
+        from each string in the array from the right side.
+
+        Parameters
+        ----------
+        to_strip : str or None, default None
+            Specifying the set of characters to be removed.
+            All combinations of this set of characters will be stripped.
+            If None then whitespaces are removed.
+
+        Returns
+        -------
+        stripped : same type as values
+        '''
+        return self.strip(to_strip, side='right')
+
+    def wrap(self, width, **kwargs):
+        '''
+        Wrap long strings in the array to be formatted in paragraphs with
+        length less than a given width.
+
+        This method has the same keyword parameters and defaults as
+        :class:`textwrap.TextWrapper`.
+
+        Parameters
+        ----------
+        width : int
+            Maximum line-width
+        expand_tabs : bool, optional
+            If true, tab characters will be expanded to spaces (default: True)
+        replace_whitespace : bool, optional
+            If true, each whitespace character (as defined by
+            string.whitespace) remaining after tab expansion will be replaced
+            by a single space (default: True)
+        drop_whitespace : bool, optional
+            If true, whitespace that, after wrapping, happens to end up at the
+            beginning or end of a line is dropped (default: True)
+        break_long_words : bool, optional
+            If true, then words longer than width will be broken in order to
+            ensure that no lines are longer than width. If it is false, long
+            words will not be broken, and some lines may be longer than width.
+            (default: True)
+        break_on_hyphens : bool, optional
+            If true, wrapping will occur preferably on whitespace and right
+            after hyphens in compound words, as it is customary in English. If
+            false, only whitespaces will be considered as potentially good
+            places for line breaks, but you need to set break_long_words to
+            false if you want truly insecable words. (default: True)
+
+        Returns
+        -------
+        wrapped : same type as values
+        '''
+        # Forward the documented keyword options to TextWrapper; they
+        # were previously accepted but silently ignored.
+        tw = textwrap.TextWrapper(width=width, **kwargs)
+        f = lambda x: '\n'.join(tw.wrap(x))
+        return self._apply(f)
+
+    def translate(self, table):
+        '''
+        Map all characters in the string through the given mapping table.
+
+        Parameters
+        ----------
+        table : dict
+            A mapping of Unicode ordinals to Unicode ordinals, strings,
+            or None. Unmapped characters are left untouched. Characters mapped
+            to None are deleted. :meth:`str.maketrans` is a helper function for
+            making translation tables.
+
+        Returns
+        -------
+        translated : same type as values
+        '''
+        f = lambda x: x.translate(table)
+        return self._apply(f)
+
+    def repeat(self, repeats):
+        '''
+        Duplicate each string in the array.
+
+        Parameters
+        ----------
+        repeats : int
+            Number of repetitions.
+
+        Returns
+        -------
+        repeated : same type as values
+            Array of repeated string objects.
+        '''
+        f = lambda x: repeats * x
+        return self._apply(f)
+
+    def find(self, sub, start=0, end=None, side='left'):
+        '''
+        Return lowest or highest indexes in each strings in the array
+        where the substring is fully contained between [start:end].
+        Return -1 on failure.
+
+        Parameters
+        ----------
+        sub : str
+            Substring being searched
+        start : int
+            Left edge index
+        end : int
+            Right edge index
+        side : {'left', 'right'}, default 'left'
+            Starting side for search.
+
+        Returns
+        -------
+        found : array of integer values
+        '''
+        sub = self._obj.dtype.type(sub)
+
+        if side == 'left':
+            method = 'find'
+        elif side == 'right':
+            method = 'rfind'
+        else:  # pragma: no cover
+            raise ValueError('Invalid side')
+
+        if end is None:
+            f = lambda x: getattr(x, method)(sub, start)
+        else:
+            f = lambda x: getattr(x, method)(sub, start, end)
+
+        return self._apply(f, dtype=int)
+
+    def rfind(self, sub, start=0, end=None):
+        '''
+        Return highest indexes in each strings in the array
+        where the substring is fully contained between [start:end].
+        Return -1 on failure.
+
+        Parameters
+        ----------
+        sub : str
+            Substring being searched
+        start : int
+            Left edge index
+        end : int
+            Right edge index
+
+        Returns
+        -------
+        found : array of integer values
+        '''
+        return self.find(sub, start=start, end=end, side='right')
+
+    def index(self, sub, start=0, end=None, side='left'):
+        '''
+        Return lowest or highest indexes in each strings where the substring is
+        fully contained between [start:end]. This is the same as
+        ``str.find`` except instead of returning -1, it raises a ValueError
+        when the substring is not found.
+
+        Parameters
+        ----------
+        sub : str
+            Substring being searched
+        start : int
+            Left edge index
+        end : int
+            Right edge index
+        side : {'left', 'right'}, default 'left'
+            Starting side for search.
+
+        Returns
+        -------
+        found : array of integer values
+        '''
+        sub = self._obj.dtype.type(sub)
+
+        if side == 'left':
+            method = 'index'
+        elif side == 'right':
+            method = 'rindex'
+        else:  # pragma: no cover
+            raise ValueError('Invalid side')
+
+        if end is None:
+            f = lambda x: getattr(x, method)(sub, start)
+        else:
+            f = lambda x: getattr(x, method)(sub, start, end)
+
+        return self._apply(f, dtype=int)
+
+    def rindex(self, sub, start=0, end=None):
+        '''
+        Return highest indexes in each strings where the substring is
+        fully contained between [start:end]. This is the same as
+        ``str.rfind`` except instead of returning -1, it raises a ValueError
+        when the substring is not found.
+
+        Parameters
+        ----------
+        sub : str
+            Substring being searched
+        start : int
+            Left edge index
+        end : int
+            Right edge index
+
+        Returns
+        -------
+        found : array of integer values
+        '''
+        return self.index(sub, start=start, end=end, side='right')
+
+    def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
+        '''
+        Replace occurrences of pattern/regex in the array with some string.
+
+        Parameters
+        ----------
+        pat : string or compiled regex
+            String can be a character sequence or regular expression.
+
+        repl : string or callable
+            Replacement string or a callable. The callable is passed the regex
+            match object and must return a replacement string to be used.
+            See :func:`re.sub`.
+
+        n : int, default -1 (all)
+            Number of replacements to make from start
+        case : boolean, default None
+            - If True, case sensitive (the default if `pat` is a string)
+            - Set to False for case insensitive
+            - Cannot be set if `pat` is a compiled regex
+        flags : int, default 0 (no flags)
+            - re module flags, e.g. re.IGNORECASE
+            - Cannot be set if `pat` is a compiled regex
+        regex : boolean, default True
+            - If True, assumes the passed-in pattern is a regular expression.
+            - If False, treats the pattern as a literal string
+            - Cannot be set to False if `pat` is a compiled regex or `repl` is
+              a callable.
+
+        Returns
+        -------
+        replaced : same type as values
+            A copy of the object with all matching occurrences of `pat`
+            replaced by `repl`.
+        '''
+        if not (_is_str_like(repl) or callable(repl)):  # pragma: no cover
+            raise TypeError("repl must be a string or callable")
+
+        if _is_str_like(pat):
+            pat = self._obj.dtype.type(pat)
+
+        if _is_str_like(repl):
+            repl = self._obj.dtype.type(repl)
+
+        is_compiled_re = isinstance(pat, type(re.compile('')))
+        if regex:
+            if is_compiled_re:
+                if (case is not None) or (flags != 0):
+                    raise ValueError("case and flags cannot be set"
+                                     " when pat is a compiled regex")
+            else:
+                # not a compiled regex
+                # set default case
+                if case is None:
+                    case = True
+
+                # add case flag, if provided
+                if case is False:
+                    flags |= re.IGNORECASE
+            if is_compiled_re or len(pat) > 1 or flags or callable(repl):
+                # re.sub uses count=0 for "replace all", whereas
+                # str.replace uses a negative count.
+                n = n if n >= 0 else 0
+                compiled = re.compile(pat, flags=flags)
+                f = lambda x: compiled.sub(repl=repl, string=x, count=n)
+            else:
+                f = lambda x: x.replace(pat, repl, n)
+        else:
+            if is_compiled_re:
+                raise ValueError("Cannot use a compiled regex as replacement "
+                                 "pattern with regex=False")
+            if callable(repl):
+                raise ValueError("Cannot use a callable replacement when "
+                                 "regex=False")
+            f = lambda x: x.replace(pat, repl, n)
+        return self._apply(f)
+
+    def decode(self, encoding, errors='strict'):
+        '''
+        Decode character string in the array using indicated encoding.
+
+        Parameters
+        ----------
+        encoding : str
+        errors : str, optional
+
+        Returns
+        -------
+        decoded : array of str
+        '''
+        if encoding in _cpython_optimized_decoders:
+            f = lambda x: x.decode(encoding, errors)
+        else:
+            decoder = codecs.getdecoder(encoding)
+            f = lambda x: decoder(x, errors)[0]
+        return self._apply(f, dtype=np.str_)
+
+    def encode(self, encoding, errors='strict'):
+        '''
+        Encode character string in the array using indicated encoding.
+
+        Parameters
+        ----------
+        encoding : str
+        errors : str, optional
+
+        Returns
+        -------
+        encoded : array of bytes
+        '''
+        if encoding in _cpython_optimized_encoders:
+            f = lambda x: x.encode(encoding, errors)
+        else:
+            encoder = codecs.getencoder(encoding)
+            f = lambda x: encoder(x, errors)[0]
+        return self._apply(f, dtype=np.bytes_)
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 15abdaf4a92..b79c8f4dc2f 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -9,7 +9,8 @@
 from ..plot.plot import _PlotMethods
 from . import (
     computation, dtypes, groupby, indexing, ops, resample, rolling, utils)
-from .accessors import DatetimeAccessor
+from .accessor_dt import DatetimeAccessor
+from .accessor_str import StringAccessor
 from .alignment import align, reindex_like_indexers
 from .common import AbstractArray, DataWithCoords
 from .coordinates import (
@@ -162,6 +163,7 @@ class DataArray(AbstractArray, DataWithCoords):
     _resample_cls = resample.DataArrayResample
 
     dt = property(DatetimeAccessor)
+    str = property(StringAccessor)
 
     def __init__(self, data, coords=None, dims=None, name=None,
                  attrs=None, encoding=None, indexes=None, fastpath=False):
diff --git a/xarray/tests/test_accessors.py b/xarray/tests/test_accessor_dt.py
similarity index 100%
rename from xarray/tests/test_accessors.py
rename to xarray/tests/test_accessor_dt.py
diff --git a/xarray/tests/test_accessor_str.py b/xarray/tests/test_accessor_str.py
new file mode 100644
index 00000000000..26d5e385df3
--- /dev/null
+++ b/xarray/tests/test_accessor_str.py
@@ -0,0 +1,659 @@
+# Tests for the `str` accessor are derived from the original
+# pandas string accessor tests.
+
+# For reference, here is a copy of the pandas copyright notice:
+
+# (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team
+# All rights reserved.
+
+# Copyright (c) 2008-2011 AQR Capital Management, LLC
+# All rights reserved.
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+
+#     * Redistributions of source code must retain the above copyright
+#        notice, this list of conditions and the following disclaimer.
+
+#     * Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials provided
+#        with the distribution.
+
+#     * Neither the name of the copyright holder nor the names of any
+#        contributors may be used to endorse or promote products derived
+#        from this software without specific prior written permission.
+
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import re
+
+import pytest
+import numpy as np
+import xarray as xr
+
+from . import (
+    assert_array_equal, assert_equal, has_dask, raises_regex, requires_dask)
+
+
+@pytest.fixture(params=[np.str_, np.bytes_])
+def dtype(request):
+    """Parametrize dependent tests over unicode and bytes string dtypes."""
+    return request.param
+
+
+@requires_dask
+def test_dask():
+    """The accessor works on a dask-backed DataArray."""
+    import dask.array as da
+    arr = da.from_array(['a', 'b', 'c'])
+    xarr = xr.DataArray(arr)
+
+    result = xarr.str.len().compute()
+    expected = xr.DataArray([1, 1, 1])
+    assert_equal(result, expected)
+
+
+def test_count(dtype):
+    """``count`` returns the number of regex matches per element."""
+    values = xr.DataArray(['foo', 'foofoo', 'foooofooofommmfoo']).astype(dtype)
+    result = values.str.count('f[o]+')
+    expected = xr.DataArray([1, 2, 4])
+    assert_equal(result, expected)
+
+
+def test_contains(dtype):
+    """``contains`` supports case-insensitive regex and literal search."""
+    values = xr.DataArray(['Foo', 'xYz', 'fOOomMm__fOo', 'MMM_']).astype(dtype)
+    # case insensitive using regex
+    result = values.str.contains('FOO|mmm', case=False)
+    expected = xr.DataArray([True, False, True, True])
+    assert_equal(result, expected)
+    # case insensitive without regex
+    result = values.str.contains('foo', regex=False, case=False)
+    expected = xr.DataArray([True, False, True, False])
+    assert_equal(result, expected)
+
+
+def test_starts_ends_with(dtype):
+    """``startswith`` / ``endswith`` return element-wise booleans."""
+    values = xr.DataArray(
+        ['om', 'foo_nom', 'nom', 'bar_foo', 'foo']).astype(dtype)
+    result = values.str.startswith('foo')
+    expected = xr.DataArray([False, True, False, False, True])
+    assert_equal(result, expected)
+    result = values.str.endswith('foo')
+    expected = xr.DataArray([False, False, False, True, True])
+    assert_equal(result, expected)
+
+
+def test_case(dtype):
+    """Case-conversion methods keep the input dtype."""
+    da = xr.DataArray(['SOme word']).astype(dtype)
+    capitalized = xr.DataArray(['Some word']).astype(dtype)
+    lowered = xr.DataArray(['some word']).astype(dtype)
+    swapped = xr.DataArray(['soME WORD']).astype(dtype)
+    titled = xr.DataArray(['Some Word']).astype(dtype)
+    uppered = xr.DataArray(['SOME WORD']).astype(dtype)
+    assert_equal(da.str.capitalize(), capitalized)
+    assert_equal(da.str.lower(), lowered)
+    assert_equal(da.str.swapcase(), swapped)
+    assert_equal(da.str.title(), titled)
+    assert_equal(da.str.upper(), uppered)
+
+
+def test_replace(dtype):
+    """``replace`` with string patterns, ``n`` limits and ``case=False``."""
+    values = xr.DataArray(['fooBAD__barBAD']).astype(dtype)
+    result = values.str.replace('BAD[_]*', '')
+    expected = xr.DataArray(['foobar']).astype(dtype)
+    assert_equal(result, expected)
+
+    result = values.str.replace('BAD[_]*', '', n=1)
+    expected = xr.DataArray(['foobarBAD']).astype(dtype)
+    assert_equal(result, expected)
+
+    s = xr.DataArray(['A', 'B', 'C', 'Aaba', 'Baca', '',
+                      'CABA', 'dog', 'cat']).astype(dtype)
+    result = s.str.replace('A', 'YYY')
+    expected = xr.DataArray(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', 'CYYYBYYY',
+                             'dog', 'cat']).astype(dtype)
+    assert_equal(result, expected)
+
+    result = s.str.replace('A', 'YYY', case=False)
+    expected = xr.DataArray(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY',
+                             '', 'CYYYBYYY', 'dog', 'cYYYt']).astype(dtype)
+    assert_equal(result, expected)
+
+    result = s.str.replace('^.a|dog', 'XX-XX ', case=False)
+    expected = xr.DataArray(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '',
+                             'XX-XX BA', 'XX-XX ', 'XX-XX t']).astype(dtype)
+    assert_equal(result, expected)
+
+
+def test_replace_callable():
+    """``replace`` accepts a callable that receives the match object."""
+    values = xr.DataArray(['fooBAD__barBAD'])
+    # test with callable
+    repl = lambda m: m.group(0).swapcase()
+    result = values.str.replace('[a-z][A-Z]{2}', repl, n=2)
+    exp = xr.DataArray(['foObaD__baRbaD'])
+    assert_equal(result, exp)
+    # test regex named groups; the callable looks up the 'middle' group
+    values = xr.DataArray(['Foo Bar Baz'])
+    pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
+    repl = lambda m: m.group('middle').swapcase()
+    result = values.str.replace(pat, repl)
+    exp = xr.DataArray(['bAR'])
+    assert_equal(result, exp)
+
+
+def test_replace_unicode():
+    """``replace`` honours flags carried by a compiled unicode pattern."""
+    # flags + unicode
+    values = xr.DataArray([b"abcd,\xc3\xa0".decode("utf-8")])
+    expected = xr.DataArray([b"abcd, \xc3\xa0".decode("utf-8")])
+    pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
+    result = values.str.replace(pat, ", ")
+    assert_equal(result, expected)
+
+
+def test_replace_compiled_regex(dtype):
+    """``replace`` with a pre-compiled pattern of the matching dtype."""
+    values = xr.DataArray(['fooBAD__barBAD']).astype(dtype)
+    # test with compiled regex
+    pat = re.compile(dtype('BAD[_]*'))
+    result = values.str.replace(pat, '')
+    expected = xr.DataArray(['foobar']).astype(dtype)
+    assert_equal(result, expected)
+
+    result = values.str.replace(pat, '', n=1)
+    expected = xr.DataArray(['foobarBAD']).astype(dtype)
+    assert_equal(result, expected)
+
+    # case and flags cannot be combined with a compiled regex: passing
+    # either one to str.replace raises a ValueError
+    values = xr.DataArray(['fooBAD__barBAD__bad']).astype(dtype)
+    pat = re.compile(dtype('BAD[_]*'))
+
+    with pytest.raises(ValueError, match="case and flags cannot be"):
+        values.str.replace(pat, '', flags=re.IGNORECASE)
+
+    with pytest.raises(ValueError, match="case and flags cannot be"):
+        values.str.replace(pat, '', case=False)
+
+    with pytest.raises(ValueError, match="case and flags cannot be"):
+        values.str.replace(pat, '', case=True)
+
+    # test with callable
+    values = xr.DataArray(['fooBAD__barBAD']).astype(dtype)
+    repl = lambda m: m.group(0).swapcase()
+    pat = re.compile(dtype('[a-z][A-Z]{2}'))
+    result = values.str.replace(pat, repl, n=2)
+    expected = xr.DataArray(['foObaD__baRbaD']).astype(dtype)
+    assert_equal(result, expected)
+
+
+def test_replace_literal(dtype):
+    """``regex=False`` performs a literal replacement."""
+    # GH16808 literal replace (regex=False vs regex=True)
+    values = xr.DataArray(['f.o', 'foo']).astype(dtype)
+    expected = xr.DataArray(['bao', 'bao']).astype(dtype)
+    result = values.str.replace('f.', 'ba')
+    assert_equal(result, expected)
+
+    expected = xr.DataArray(['bao', 'foo']).astype(dtype)
+    result = values.str.replace('f.', 'ba', regex=False)
+    assert_equal(result, expected)
+
+    # Cannot do a literal replace if given a callable repl or compiled
+    # pattern
+    callable_repl = lambda m: m.group(0).swapcase()
+    compiled_pat = re.compile('[a-z][A-Z]{2}')
+
+    msg = "Cannot use a callable replacement when regex=False"
+    with pytest.raises(ValueError, match=msg):
+        values.str.replace('abc', callable_repl, regex=False)
+
+    msg = "Cannot use a compiled regex as replacement pattern with regex=False"
+    with pytest.raises(ValueError, match=msg):
+        values.str.replace(compiled_pat, '', regex=False)
+
+
+def test_repeat(dtype):
+    """``repeat`` duplicates every element the given number of times."""
+    values = xr.DataArray(['a', 'b', 'c', 'd']).astype(dtype)
+    result = values.str.repeat(3)
+    expected = xr.DataArray(['aaa', 'bbb', 'ccc', 'ddd']).astype(dtype)
+    assert_equal(result, expected)
+
+
+def test_match(dtype):
+    """``match`` returns booleans, with or without capture groups."""
+    # New match behavior introduced in 0.13
+    values = xr.DataArray(['fooBAD__barBAD', 'foo']).astype(dtype)
+    result = values.str.match('.*(BAD[_]+).*(BAD)')
+    expected = xr.DataArray([True, False])
+    assert_equal(result, expected)
+
+    values = xr.DataArray(['fooBAD__barBAD', 'foo']).astype(dtype)
+    result = values.str.match('.*BAD[_]+.*BAD')
+    expected = xr.DataArray([True, False])
+    assert_equal(result, expected)
+
+
+def test_empty_str_methods():
+    """Every accessor method accepts a zero-length array."""
+    empty = xr.DataArray(np.empty(shape=(0,), dtype='U'))
+    empty_str = empty
+    empty_int = xr.DataArray(np.empty(shape=(0,), dtype=int))
+    empty_bool = xr.DataArray(np.empty(shape=(0,), dtype=bool))
+    empty_bytes = xr.DataArray(np.empty(shape=(0,), dtype='S'))
+
+    assert_equal(empty_str, empty.str.title())
+    assert_equal(empty_int, empty.str.count('a'))
+    assert_equal(empty_bool, empty.str.contains('a'))
+    assert_equal(empty_bool, empty.str.startswith('a'))
+    assert_equal(empty_bool, empty.str.endswith('a'))
+    assert_equal(empty_str, empty.str.lower())
+    assert_equal(empty_str, empty.str.upper())
+    assert_equal(empty_str, empty.str.replace('a', 'b'))
+    assert_equal(empty_str, empty.str.repeat(3))
+    assert_equal(empty_bool, empty.str.match('^a'))
+    assert_equal(empty_int, empty.str.len())
+    assert_equal(empty_int, empty.str.find('a'))
+    assert_equal(empty_int, empty.str.rfind('a'))
+    assert_equal(empty_str, empty.str.pad(42))
+    assert_equal(empty_str, empty.str.center(42))
+    assert_equal(empty_str, empty.str.slice(stop=1))
+    assert_equal(empty_str, empty.str.slice(step=1))
+    assert_equal(empty_str, empty.str.strip())
+    assert_equal(empty_str, empty.str.lstrip())
+    assert_equal(empty_str, empty.str.rstrip())
+    assert_equal(empty_str, empty.str.wrap(42))
+    assert_equal(empty_str, empty.str.get(0))
+    assert_equal(empty_str, empty_bytes.str.decode('ascii'))
+    assert_equal(empty_bytes, empty.str.encode('ascii'))
+    # the is* predicates are compared with boolean arrays in
+    # test_ismethods, so use the boolean empty array here as well
+    assert_equal(empty_bool, empty.str.isalnum())
+    assert_equal(empty_bool, empty.str.isalpha())
+    assert_equal(empty_bool, empty.str.isdigit())
+    assert_equal(empty_bool, empty.str.isspace())
+    assert_equal(empty_bool, empty.str.islower())
+    assert_equal(empty_bool, empty.str.isupper())
+    assert_equal(empty_bool, empty.str.istitle())
+    assert_equal(empty_bool, empty.str.isnumeric())
+    assert_equal(empty_bool, empty.str.isdecimal())
+    assert_equal(empty_str, empty.str.capitalize())
+    assert_equal(empty_str, empty.str.swapcase())
+    table = str.maketrans('a', 'b')
+    assert_equal(empty_str, empty.str.translate(table))
+
+
+def test_ismethods(dtype):
+    """The ``is*`` predicates return element-wise booleans."""
+    values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', ' ']
+    str_s = xr.DataArray(values).astype(dtype)
+    alnum_e = [True, True, True, True, True, False, True, True, False, False]
+    alpha_e = [True, True, True, False, False, False, True, False, False,
+               False]
+    digit_e = [False, False, False, True, False, False, False, True, False,
+               False]
+    space_e = [False, False, False, False, False, False, False, False,
+               False, True]
+    lower_e = [False, True, False, False, False, False, False, False,
+               False, False]
+    upper_e = [True, False, False, False, True, False, True, False, False,
+               False]
+    title_e = [True, False, True, False, True, False, False, False, False,
+               False]
+
+    assert_equal(str_s.str.isalnum(), xr.DataArray(alnum_e))
+    assert_equal(str_s.str.isalpha(), xr.DataArray(alpha_e))
+    assert_equal(str_s.str.isdigit(), xr.DataArray(digit_e))
+    assert_equal(str_s.str.isspace(), xr.DataArray(space_e))
+    assert_equal(str_s.str.islower(), xr.DataArray(lower_e))
+    assert_equal(str_s.str.isupper(), xr.DataArray(upper_e))
+    assert_equal(str_s.str.istitle(), xr.DataArray(title_e))
+
+
+def test_isnumeric():
+    """``isnumeric`` / ``isdecimal`` on non-ASCII numeric characters."""
+    # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER
+    # 0x2605: ★ not number
+    # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
+    # 0xFF13: ３ Em 3 (written as an escape so it cannot be mistaken
+    #         for, or normalised to, the ASCII digit)
+    values = ['A', '3', '¼', '★', '፸', '\uff13', 'four']
+    s = xr.DataArray(values)
+    numeric_e = [False, True, True, False, True, True, False]
+    decimal_e = [False, True, False, False, False, True, False]
+    assert_equal(s.str.isnumeric(), xr.DataArray(numeric_e))
+    assert_equal(s.str.isdecimal(), xr.DataArray(decimal_e))
+
+
+def test_len(dtype):
+    """``len`` matches the builtin ``len`` of every element."""
+    values = ['foo', 'fooo', 'fooooo', 'fooooooo']
+    result = xr.DataArray(values).astype(dtype).str.len()
+    expected = xr.DataArray([len(x) for x in values])
+    assert_equal(result, expected)
+
+
+def test_find(dtype):
+    """``find`` / ``rfind`` agree with the builtin string methods."""
+    values = xr.DataArray(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF', 'XXX'])
+    values = values.astype(dtype)
+    result = values.str.find('EF')
+    assert_equal(result, xr.DataArray([4, 3, 1, 0, -1]))
+    expected = xr.DataArray([v.find(dtype('EF')) for v in values.values])
+    assert_equal(result, expected)
+
+    result = values.str.rfind('EF')
+    assert_equal(result, xr.DataArray([4, 5, 7, 4, -1]))
+    expected = xr.DataArray([v.rfind(dtype('EF')) for v in values.values])
+    assert_equal(result, expected)
+
+    result = values.str.find('EF', 3)
+    assert_equal(result, xr.DataArray([4, 3, 7, 4, -1]))
+    expected = xr.DataArray([v.find(dtype('EF'), 3) for v in values.values])
+    assert_equal(result, expected)
+
+    result = values.str.rfind('EF', 3)
+    assert_equal(result, xr.DataArray([4, 5, 7, 4, -1]))
+    expected = xr.DataArray([v.rfind(dtype('EF'), 3) for v in values.values])
+    assert_equal(result, expected)
+
+    result = values.str.find('EF', 3, 6)
+    assert_equal(result, xr.DataArray([4, 3, -1, 4, -1]))
+    expected = xr.DataArray([v.find(dtype('EF'), 3, 6) for v in values.values])
+    assert_equal(result, expected)
+
+    result = values.str.rfind('EF', 3, 6)
+    assert_equal(result, xr.DataArray([4, 3, -1, 4, -1]))
+    expected = xr.DataArray([v.rfind(dtype('EF'), 3, 6) for v in values.values])
+    assert_equal(result, expected)
+
+
+def test_index(dtype):
+    """``index`` / ``rindex`` raise ValueError when a substring is absent."""
+    s = xr.DataArray(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF',
+                      'EFGHEF']).astype(dtype)
+
+    result = s.str.index('EF')
+    assert_equal(result, xr.DataArray([4, 3, 1, 0]))
+
+    result = s.str.rindex('EF')
+    assert_equal(result, xr.DataArray([4, 5, 7, 4]))
+
+    result = s.str.index('EF', 3)
+    assert_equal(result, xr.DataArray([4, 3, 7, 4]))
+
+    result = s.str.rindex('EF', 3)
+    assert_equal(result, xr.DataArray([4, 5, 7, 4]))
+
+    result = s.str.index('E', 4, 8)
+    assert_equal(result, xr.DataArray([4, 5, 7, 4]))
+
+    result = s.str.rindex('E', 0, 5)
+    assert_equal(result, xr.DataArray([4, 3, 1, 4]))
+
+    # 'DE' is missing from the last element
+    with pytest.raises(ValueError):
+        s.str.index('DE')
+
+
+def test_pad(dtype):
+    """``pad`` fills with spaces up to the requested width."""
+    values = xr.DataArray(['a', 'b', 'c', 'eeeee']).astype(dtype)
+
+    result = values.str.pad(5, side='left')
+    expected = xr.DataArray(['    a', '    b', '    c', 'eeeee']).astype(dtype)
+    assert_equal(result, expected)
+
+    result = values.str.pad(5, side='right')
+    expected = xr.DataArray(['a    ', 'b    ', 'c    ', 'eeeee']).astype(dtype)
+    assert_equal(result, expected)
+
+    result = values.str.pad(5, side='both')
+    expected = xr.DataArray(['  a  ', '  b  ', '  c  ', 'eeeee']).astype(dtype)
+    assert_equal(result, expected)
+
+
+def test_pad_fillchar(dtype):
+    """``pad`` accepts a single-character ``fillchar``."""
+    values = xr.DataArray(['a', 'b', 'c', 'eeeee']).astype(dtype)
+
+    result = values.str.pad(5, side='left', fillchar='X')
+    expected = xr.DataArray(['XXXXa', 'XXXXb', 'XXXXc', 'eeeee']).astype(dtype)
+    assert_equal(result, expected)
+
+    result = values.str.pad(5, side='right', fillchar='X')
+    expected = xr.DataArray(['aXXXX', 'bXXXX', 'cXXXX', 'eeeee']).astype(dtype)
+    assert_equal(result, expected)
+
+    result = values.str.pad(5, side='both', fillchar='X')
+    expected = xr.DataArray(['XXaXX', 'XXbXX', 'XXcXX', 'eeeee']).astype(dtype)
+    assert_equal(result, expected)
+
+    msg = "fillchar must be a character, not str"
+    with pytest.raises(TypeError, match=msg):
+        values.str.pad(5, fillchar='XY')
+
+
+def test_translate():
+    """``translate`` applies a ``str.maketrans`` table element-wise."""
+    values = xr.DataArray(['abcdefg', 'abcc', 'cdddfg', 'cdefggg'])
+    table = str.maketrans('abc', 'cde')
+    result = values.str.translate(table)
+    expected = xr.DataArray(['cdedefg', 'cdee', 'edddfg', 'edefggg'])
+    assert_equal(result, expected)
+
+
+def test_center_ljust_rjust(dtype):
+    """``center`` / ``ljust`` / ``rjust`` pad with spaces by default."""
+    values = xr.DataArray(['a', 'b', 'c', 'eeeee']).astype(dtype)
+
+    result = values.str.center(5)
+    expected = xr.DataArray(['  a  ', '  b  ', '  c  ', 'eeeee']).astype(dtype)
+    assert_equal(result, expected)
+
+    result = values.str.ljust(5)
+    expected = xr.DataArray(['a    ', 'b    ', 'c    ', 'eeeee']).astype(dtype)
+    assert_equal(result, expected)
+
+    result = values.str.rjust(5)
+    expected = xr.DataArray(['    a', '    b', '    c', 'eeeee']).astype(dtype)
+    assert_equal(result, expected)
+
+
+def test_center_ljust_rjust_fillchar(dtype):
+    """``center`` / ``ljust`` / ``rjust`` with an explicit ``fillchar``."""
+    values = xr.DataArray(['a', 'bb', 'cccc', 'ddddd', 'eeeeee']).astype(dtype)
+    result = values.str.center(5, fillchar='X')
+    expected = xr.DataArray(['XXaXX', 'XXbbX', 'Xcccc', 'ddddd', 'eeeeee'])
+    assert_equal(result, expected.astype(dtype))
+
+    result = values.str.ljust(5, fillchar='X')
+    expected = xr.DataArray(['aXXXX', 'bbXXX', 'ccccX', 'ddddd', 'eeeeee'])
+    assert_equal(result, expected.astype(dtype))
+
+    result = values.str.rjust(5, fillchar='X')
+    expected = xr.DataArray(['XXXXa', 'XXXbb', 'Xcccc', 'ddddd', 'eeeeee'])
+    assert_equal(result, expected.astype(dtype))
+
+    # If fillchar is not a character, normal str raises TypeError
+    # 'aaa'.ljust(5, 'XY')
+    # TypeError: must be char, not str
+    template = "fillchar must be a character, not {dtype}"
+
+    with pytest.raises(TypeError, match=template.format(dtype="str")):
+        values.str.center(5, fillchar='XY')
+
+    with pytest.raises(TypeError, match=template.format(dtype="str")):
+        values.str.ljust(5, fillchar='XY')
+
+    with pytest.raises(TypeError, match=template.format(dtype="str")):
+        values.str.rjust(5, fillchar='XY')
+
+
+def test_zfill(dtype):
+    """``zfill`` left-pads with zeros and never truncates."""
+    values = xr.DataArray(['1', '22', 'aaa', '333', '45678']).astype(dtype)
+
+    result = values.str.zfill(5)
+    expected = xr.DataArray(['00001', '00022', '00aaa', '00333', '45678'])
+    assert_equal(result, expected.astype(dtype))
+
+    result = values.str.zfill(3)
+    expected = xr.DataArray(['001', '022', 'aaa', '333', '45678'])
+    assert_equal(result, expected.astype(dtype))
+
+
+def test_slice(dtype):
+    """``slice`` and ``str[start:stop:step]`` match builtin slicing."""
+    arr = xr.DataArray(['aafootwo', 'aabartwo', 'aabazqux']).astype(dtype)
+
+    result = arr.str.slice(2, 5)
+    exp = xr.DataArray(['foo', 'bar', 'baz']).astype(dtype)
+    assert_equal(result, exp)
+
+    for start, stop, step in [(0, 3, -1), (None, None, -1),
+                              (3, 10, 2), (3, 0, -1)]:
+        try:
+            result = arr.str[start:stop:step]
+            expected = xr.DataArray([s[start:stop:step] for s in arr.values])
+            assert_equal(result, expected.astype(dtype))
+        except IndexError:
+            # report which slice failed before re-raising
+            print('failed on %s:%s:%s' % (start, stop, step))
+            raise
+
+
+def test_slice_replace(dtype):
+    """``slice_replace`` substitutes a positional slice of each element."""
+    da = lambda x: xr.DataArray(x).astype(dtype)
+    values = da(['short', 'a bit longer', 'evenlongerthanthat', ''])
+
+    expected = da(['shrt', 'a it longer', 'evnlongerthanthat', ''])
+    result = values.str.slice_replace(2, 3)
+    assert_equal(result, expected)
+
+    expected = da(['shzrt', 'a zit longer', 'evznlongerthanthat', 'z'])
+    result = values.str.slice_replace(2, 3, 'z')
+    assert_equal(result, expected)
+
+    expected = da(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z'])
+    result = values.str.slice_replace(2, 2, 'z')
+    assert_equal(result, expected)
+
+    expected = da(['shzort', 'a zbit longer', 'evzenlongerthanthat', 'z'])
+    result = values.str.slice_replace(2, 1, 'z')
+    assert_equal(result, expected)
+
+    expected = da(['shorz', 'a bit longez', 'evenlongerthanthaz', 'z'])
+    result = values.str.slice_replace(-1, None, 'z')
+    assert_equal(result, expected)
+
+    expected = da(['zrt', 'zer', 'zat', 'z'])
+    result = values.str.slice_replace(None, -2, 'z')
+    assert_equal(result, expected)
+
+    expected = da(['shortz', 'a bit znger', 'evenlozerthanthat', 'z'])
+    result = values.str.slice_replace(6, 8, 'z')
+    assert_equal(result, expected)
+
+    expected = da(['zrt', 'a zit longer', 'evenlongzerthanthat', 'z'])
+    result = values.str.slice_replace(-10, 3, 'z')
+    assert_equal(result, expected)
+
+
+def test_strip_lstrip_rstrip(dtype):
+    """``strip`` / ``lstrip`` / ``rstrip`` remove whitespace by default."""
+    values = xr.DataArray([' aa ', ' bb \n', 'cc ']).astype(dtype)
+
+    result = values.str.strip()
+    expected = xr.DataArray(['aa', 'bb', 'cc']).astype(dtype)
+    assert_equal(result, expected)
+
+    result = values.str.lstrip()
+    expected = xr.DataArray(['aa ', 'bb \n', 'cc ']).astype(dtype)
+    assert_equal(result, expected)
+
+    result = values.str.rstrip()
+    expected = xr.DataArray([' aa', ' bb', 'cc']).astype(dtype)
+    assert_equal(result, expected)
+
+
+def test_strip_lstrip_rstrip_args(dtype):
+    """``strip`` / ``lstrip`` / ``rstrip`` with explicit characters."""
+    values = xr.DataArray(['xxABCxx', 'xx BNSD', 'LDFJH xx']).astype(dtype)
+
+    rs = values.str.strip('x')
+    xp = xr.DataArray(['ABC', ' BNSD', 'LDFJH ']).astype(dtype)
+    assert_equal(rs, xp)
+
+    rs = values.str.lstrip('x')
+    xp = xr.DataArray(['ABCxx', ' BNSD', 'LDFJH xx']).astype(dtype)
+    assert_equal(rs, xp)
+
+    rs = values.str.rstrip('x')
+    xp = xr.DataArray(['xxABC', 'xx BNSD', 'LDFJH ']).astype(dtype)
+    assert_equal(rs, xp)
+
+
+def test_wrap():
+    """``wrap`` breaks elements into newline-joined lines of a width."""
+    # test values are: two words less than width, two words equal to width,
+    # two words greater than width, one word less than width, one word
+    # equal to width, one word greater than width, multiple tokens with
+    # trailing whitespace equal to width
+    values = xr.DataArray(['hello world', 'hello world!', 'hello world!!',
+                           'abcdefabcde', 'abcdefabcdef', 'abcdefabcdefa',
+                           'ab ab ab ab ', 'ab ab ab ab a', '\t'])
+
+    # expected values
+    xp = xr.DataArray(['hello world', 'hello world!', 'hello\nworld!!',
+                       'abcdefabcde', 'abcdefabcdef', 'abcdefabcdef\na',
+                       'ab ab ab ab', 'ab ab ab ab\na', ''])
+
+    rs = values.str.wrap(12, break_long_words=True)
+    assert_equal(rs, xp)
+
+    # test with pre and post whitespace (non-unicode) and non-ascii
+    # Unicode
+    values = xr.DataArray([' pre ', '\xac\u20ac\U00008000 abadcafe'])
+    xp = xr.DataArray([' pre', '\xac\u20ac\U00008000 ab\nadcafe'])
+    rs = values.str.wrap(6)
+    assert_equal(rs, xp)
+
+
+def test_get(dtype):
+    """``str[i]`` yields an empty string when the index is out of range."""
+    values = xr.DataArray(['a_b_c', 'c_d_e', 'f_g_h']).astype(dtype)
+
+    result = values.str[2]
+    expected = xr.DataArray(['b', 'd', 'g']).astype(dtype)
+    assert_equal(result, expected)
+
+    # bounds testing
+    values = xr.DataArray(['1_2_3_4_5', '6_7_8_9_10', '11_12']).astype(dtype)
+
+    # positive index
+    result = values.str[5]
+    expected = xr.DataArray(['_', '_', '']).astype(dtype)
+    assert_equal(result, expected)
+
+    # negative index
+    result = values.str[-6]
+    expected = xr.DataArray(['_', '8', '']).astype(dtype)
+    assert_equal(result, expected)
+
+
+def test_encode_decode():
+    """``encode`` followed by ``decode`` round-trips the data."""
+    data = xr.DataArray(['a', 'b', 'a\xe4'])
+    encoded = data.str.encode('utf-8')
+    decoded = encoded.str.decode('utf-8')
+    assert_equal(data, decoded)
+
+
+def test_encode_decode_errors():
+    """Codec errors propagate unless an ``errors`` handler is given."""
+    encodeBase = xr.DataArray(['a', 'b', 'a\x9d'])
+
+    msg = (r"'charmap' codec can't encode character '\\x9d' in position 1:"
+           " character maps to <undefined>")
+    with pytest.raises(UnicodeEncodeError, match=msg):
+        encodeBase.str.encode('cp1252')
+
+    f = lambda x: x.encode('cp1252', 'ignore')
+    result = encodeBase.str.encode('cp1252', 'ignore')
+    expected = xr.DataArray([f(x) for x in encodeBase.values.tolist()])
+    assert_equal(result, expected)
+
+    decodeBase = xr.DataArray([b'a', b'b', b'a\x9d'])
+
+    msg = ("'charmap' codec can't decode byte 0x9d in position 1:"
+           " character maps to <undefined>")
+    with pytest.raises(UnicodeDecodeError, match=msg):
+        decodeBase.str.decode('cp1252')
+
+    f = lambda x: x.decode('cp1252', 'ignore')
+    result = decodeBase.str.decode('cp1252', 'ignore')
+    expected = xr.DataArray([f(x) for x in decodeBase.values.tolist()])
+    assert_equal(result, expected)