From 5135518a600e4d9c25bcef74278f80d779e67f70 Mon Sep 17 00:00:00 2001 From: Sam Levang Date: Wed, 3 Nov 2021 10:25:04 -0400 Subject: [PATCH] reimplement polyfit with apply_ufunc --- xarray/core/dataarray.py | 20 ++-- xarray/core/dataset.py | 189 +++++++++++++++------------------ xarray/tests/test_dataarray.py | 2 +- 3 files changed, 98 insertions(+), 113 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 89f916db7f4..5a0681d8a21 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3847,7 +3847,7 @@ def polyfit( self, dim: Hashable, deg: int, - skipna: bool = None, + skipna: bool = True, rcond: float = None, w: Union[Hashable, Any] = None, full: bool = False, @@ -3867,11 +3867,10 @@ def polyfit( Degree of the fitting polynomial. skipna : bool, optional If True, removes all invalid values before fitting each 1D slices of the array. - Default is True if data is stored in a dask.array or if there is any - invalid values, False otherwise. + Default is True. rcond : float, optional Relative condition number to the fit. - w : hashable or array-like, optional + w : hashable or Any, optional Weights to apply to the y-coordinate of the sample points. Can be an array-like object or the name of a coordinate in the dataset. full : bool, optional @@ -3893,11 +3892,20 @@ def polyfit( When the matrix rank is deficient, np.nan is returned. [dim]_matrix_rank The effective rank of the scaled Vandermonde coefficient matrix (only included if `full=True`) - [dim]_singular_value + The rank is computed ignoring the NaN values that might be skipped. 
+ [dim]_singular_values The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`) - polyfit_covariance + [dim]_rcond + The specified value of rcond (only included if `full=True`) + [var]_polyfit_covariance The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`) + Warns + ----- + RankWarning + The rank of the coefficient matrix in the least-squares fit is deficient. + The warning is not raised with `full=True`. + See Also -------- numpy.polyfit diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e882495dce5..32eb76761b8 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6808,7 +6808,7 @@ def polyfit( self, dim: Hashable, deg: int, - skipna: bool = None, + skipna: bool = True, rcond: float = None, w: Union[Hashable, Any] = None, full: bool = False, @@ -6828,8 +6828,7 @@ def polyfit( Degree of the fitting polynomial. skipna : bool, optional If True, removes all invalid values before fitting each 1D slices of the array. - Default is True if data is stored in a dask.array or if there is any - invalid values, False otherwise. + Default is True. rcond : float, optional Relative condition number to the fit. w : hashable or Any, optional @@ -6857,6 +6856,8 @@ def polyfit( The rank is computed ignoring the NaN values that might be skipped. [dim]_singular_values The singular values of the scaled Vandermonde coefficient matrix (only included if `full=True`) + [dim]_rcond + The specified value of rcond (only included if `full=True`) [var]_polyfit_covariance The covariance matrix of the polynomial coefficient estimates (only included if `full=False` and `cov=True`) @@ -6864,7 +6865,7 @@ def polyfit( ----- RankWarning The rank of the coefficient matrix in the least-squares fit is deficient. - The warning is not raised with in-memory (not dask) data and `full=True`. + The warning is not raised with `full=True`. 
See Also -------- @@ -6872,131 +6873,107 @@ def polyfit( numpy.polyval xarray.polyval """ - variables = {} - skipna_da = skipna - - x = get_clean_interp_index(self, dim, strict=False) - xname = "{}_".format(self[dim].name) - order = int(deg) + 1 - lhs = np.vander(x, order) - - if rcond is None: - rcond = ( - x.shape[0] * np.core.finfo(x.dtype).eps # type: ignore[attr-defined] - ) - - # Weights: if w is not None: if isinstance(w, Hashable): w = self.coords[w] w = np.asarray(w) if w.ndim != 1: raise TypeError("Expected a 1-d array for weights.") - if w.shape[0] != lhs.shape[0]: - raise TypeError("Expected w and {} to have the same length".format(dim)) - lhs *= w[:, np.newaxis] + if w.shape[0] != self[dim].shape[0]: + raise TypeError(f"Expected w and {dim} to have the same length") - # Scaling - scale = np.sqrt((lhs * lhs).sum(axis=0)) - lhs /= scale - - degree_dim = utils.get_temp_dimname(self.dims, "degree") + order = int(deg) + 1 + degrees = np.arange(deg, -1, -1) - rank = np.linalg.matrix_rank(lhs) + x = get_clean_interp_index(self, dim, strict=False) + # sizes and names for varying polyfit outputs + output_sizes = {"degree": order} if full: - rank = xr.DataArray(rank, name=xname + "matrix_rank") - variables[rank.name] = rank - sing = np.linalg.svd(lhs, compute_uv=False) - sing = xr.DataArray( - sing, - dims=(degree_dim,), - coords={degree_dim: np.arange(rank - 1, -1, -1)}, - name=xname + "singular_values", - ) - variables[sing.name] = sing + output_core_dims = [("degree",), (), (), ("degree",), ()] + output_vars = [ + "{name}polyfit_coefficients", + "{name}polyfit_residuals", + f"{dim}_matrix_rank", + f"{dim}_singular_values", + f"{dim}_rcond", + ] + elif cov and not full: + output_core_dims = [("degree",), ("cov_i", "cov_j")] + output_vars = [ + "{name}polyfit_coefficients", + "{name}polyfit_covariance", + ] + output_sizes["cov_i"] = order + output_sizes["cov_j"] = order + else: + output_core_dims = [("degree",)] + output_vars = ["{name}polyfit_coefficients"] + 
+ def _wrapper(x, y): + # Wrap np.polyfit with pointwise NaN handling + if skipna: + mask = np.all([~np.isnan(x), ~np.isnan(y)], axis=0) + x = x[mask] + y = y[mask] + if not len(y): + return tuple( + np.full(len(var) * [order], np.nan) for var in output_core_dims + ) + output = np.polyfit(x, y, deg, rcond=rcond, full=full, w=w, cov=cov) + if full and output[2] < order: + # rank-deficient fit: residuals are invalid (set to NaN) and the singular values are padded with NaN up to length `order` + output = ( + output[0], + np.nan, + output[2], + np.append((order - output[2]) * [np.nan], output[3]), + output[4], + ) + return output + result = xr.Dataset() for name, da in self.data_vars.items(): if dim not in da.dims: continue - - if is_duck_dask_array(da.data) and ( - rank != order or full or skipna is None - ): - # Current algorithm with dask and skipna=False neither supports - # deficient ranks nor does it output the "full" info (issue dask/dask#6516) - skipna_da = True - elif skipna is None: - skipna_da = bool(np.any(da.isnull())) - - dims_to_stack = [dimname for dimname in da.dims if dimname != dim] - stacked_coords: Dict[Hashable, DataArray] = {} - if dims_to_stack: - stacked_dim = utils.get_temp_dimname(dims_to_stack, "stacked") - rhs = da.transpose(dim, *dims_to_stack).stack( - {stacked_dim: dims_to_stack} - ) - stacked_coords = {stacked_dim: rhs[stacked_dim]} - scale_da = scale[:, np.newaxis] + if name is xr.core.dataarray._THIS_ARRAY: + name = "" else: - rhs = da - scale_da = scale - - if w is not None: - rhs *= w[:, np.newaxis] - + name = f"{str(name)}_" with warnings.catch_warnings(): if full: # Copy np.polyfit behavior warnings.simplefilter("ignore", np.RankWarning) else: # Raise only once per variable warnings.simplefilter("once", np.RankWarning) - - coeffs, residuals = duck_array_ops.least_squares( - lhs, rhs.data, rcond=rcond, skipna=skipna_da + output = xr.apply_ufunc( + _wrapper, + x, + da, + vectorize=True, + dask="parallelized", + input_core_dims=[[dim], [dim]], + 
output_core_dims=output_core_dims, + dask_gufunc_kwargs={"output_sizes": output_sizes}, + output_dtypes=len(output_core_dims) * (np.float64,), ) - - if isinstance(name, str): - name = "{}_".format(name) + # handle naming of varying polyfit outputs + # including case where output is only coefficients and not a tuple + if len(output_vars) == 1: + result[output_vars[0].format(name=name)] = output else: - # Thus a ReprObject => polyfit was called on a DataArray - name = "" - - coeffs = xr.DataArray( - coeffs / scale_da, - dims=[degree_dim] + list(stacked_coords.keys()), - coords={degree_dim: np.arange(order)[::-1], **stacked_coords}, - name=name + "polyfit_coefficients", - ) - if dims_to_stack: - coeffs = coeffs.unstack(stacked_dim) - variables[coeffs.name] = coeffs - - if full or (cov is True): - residuals = xr.DataArray( - residuals if dims_to_stack else residuals.squeeze(), - dims=list(stacked_coords.keys()), - coords=stacked_coords, - name=name + "polyfit_residuals", - ) - if dims_to_stack: - residuals = residuals.unstack(stacked_dim) - variables[residuals.name] = residuals - - if cov: - Vbase = np.linalg.inv(np.dot(lhs.T, lhs)) - Vbase /= np.outer(scale, scale) - if cov == "unscaled": - fac = 1 - else: - if x.shape[0] <= order: - raise ValueError( - "The number of data points must exceed order to scale the covariance matrix." - ) - fac = residuals / (x.shape[0] - order) - covariance = xr.DataArray(Vbase, dims=("cov_i", "cov_j")) * fac - variables[name + "polyfit_covariance"] = covariance + for i, var in enumerate(output_vars): + result[var.format(name=name)] = output[i] + + result = result.assign_coords({"degree": degrees}) + if cov: + result = result.assign_coords({"cov_i": degrees, "cov_j": degrees}) + # transpose so the dimension ordering matches the previous polyfit implementation ("degree" leading) + result = result.transpose("degree", "cov_i", "cov_j", ...) + else: + result = result.transpose("degree", ...) 
+ result.attrs = self.attrs.copy() - return Dataset(data_vars=variables, attrs=self.attrs.copy()) + return result def pad( self, diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 53c650046e7..90c328156fa 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3742,7 +3742,7 @@ def test_polyfit(self, use_dask, use_datetime): # Skipna + Full output out = da.polyfit("x", 2, skipna=True, full=True) assert_allclose(out.polyfit_coefficients, expected, rtol=1e-3) - assert out.x_matrix_rank == 3 + np.testing.assert_almost_equal(out.x_matrix_rank, [3, 3]) np.testing.assert_almost_equal(out.polyfit_residuals, [0, 0]) with warnings.catch_warnings():