diff --git a/impyute/imputations/cs/averaging_imputations.py b/impyute/imputations/cs/averaging_imputations.py index e9caa50..822f33a 100644 --- a/impyute/imputations/cs/averaging_imputations.py +++ b/impyute/imputations/cs/averaging_imputations.py @@ -2,9 +2,13 @@ import numpy as np from impyute.utils import find_null from impyute.utils import checks +from impyute.utils import preprocess +# pylint:disable=unused-argument +# pylint:disable=invalid-name +@preprocess @checks -def mean_imputation(data): +def mean_imputation(data, **kwargs): """ Substitute missing values with the mean of that column. Parameters @@ -25,8 +29,9 @@ def mean_imputation(data): data[x_i][y_i] = new_value return data +@preprocess @checks -def median_imputation(data): +def median_imputation(data, **kwargs): """ Substitute missing values with the median of that column(middle). Parameters @@ -51,8 +56,9 @@ def median_imputation(data): data[x_i][y_i] = medians[str(y_i)] return data +@preprocess @checks -def mode_imputation(data): +def mode_imputation(data, **kwargs): """ Substitute missing values with the mode of that column(most frequent). In the case that there is a tie (there are multiple, most frequent values) diff --git a/impyute/imputations/cs/em.py b/impyute/imputations/cs/em.py index ce9138b..e433a02 100644 --- a/impyute/imputations/cs/em.py +++ b/impyute/imputations/cs/em.py @@ -2,10 +2,14 @@ import random import numpy as np from impyute.utils import find_null +from impyute.utils import preprocess from impyute.utils import checks +# pylint:disable=invalid-name +# pylint:disable=unused-argument +@preprocess @checks -def em(data, loops=50, dtype="cont"): +def em(data, loops=50, **kwargs): """ Imputes given data using expectation maximization. E-step: Calculates the expected complete data log likelihood ratio. @@ -18,9 +22,8 @@ def em(data, loops=50, dtype="cont"): Data to impute. loops: int Number of em iterations to run before breaking. 
- dtype: ("cont","disc") - Indicates whether the possible values will come from a continuous - range or categorical range. + inplace: boolean + If True, operate on the numpy array reference Returns ------- @@ -28,28 +31,25 @@ def em(data, loops=50, dtype="cont"): Imputed data. """ - if dtype == "cont": - null_xy = find_null(data) - for x_i, y_i in null_xy: - col = data[:, int(y_i)] + null_xy = find_null(data) + for x_i, y_i in null_xy: + col = data[:, int(y_i)] + mu = col[~np.isnan(col)].mean() + std = col[~np.isnan(col)].std() + col[x_i] = random.gauss(mu, std) + previous, i = 1, 1 + for i in range(loops): + # Expectation mu = col[~np.isnan(col)].mean() std = col[~np.isnan(col)].std() + # Maximization col[x_i] = random.gauss(mu, std) - previous, i = 1, 1 - for i in range(loops): - # Expectation - mu = col[~np.isnan(col)].mean() - std = col[~np.isnan(col)].std() - # Maximization - col[x_i] = random.gauss(mu, std) - # Break out of loop if likelihood doesn't change at least 10% - # and has run at least 5 times - delta = (col[x_i]-previous)/previous - if i > 5 and delta < 0.1: - data[x_i][y_i] = col[x_i] - break + # Break out of loop if likelihood doesn't change at least 10% + # and has run at least 5 times + delta = (col[x_i]-previous)/previous + if i > 5 and delta < 0.1: data[x_i][y_i] = col[x_i] - previous = col[x_i] - return data - else: - raise Exception("Other dtypes not supported yet.") + break + data[x_i][y_i] = col[x_i] + previous = col[x_i] + return data diff --git a/impyute/imputations/cs/fast_knn.py b/impyute/imputations/cs/fast_knn.py index b170fe4..413e25b 100644 --- a/impyute/imputations/cs/fast_knn.py +++ b/impyute/imputations/cs/fast_knn.py @@ -2,12 +2,15 @@ import numpy as np from impyute.utils import find_null from impyute.utils import checks +from impyute.utils import preprocess from impyute.imputations.cs import mean_imputation from scipy.spatial import KDTree # pylint: disable=invalid-name +# pylint:disable=unused-argument +@preprocess @checks 
-def fast_knn(data, k=5):
+def fast_knn(data, k=3, **kwargs):
     """ Impute using a variant of the nearest neighbours approach
 
     Basic idea: Impute array and then use the resulting complete
@@ -33,12 +36,12 @@
     kdtree = KDTree(data_c)
 
     for x_i, y_i in null_xy:
-        distances, indices = kdtree.query(data[x_i], k=k+1)
+        distances, indices = kdtree.query(data_c[x_i], k=k+1)
         # Will always return itself in the first index. Delete it.
         distances, indices = distances[1:], indices[1:]
         weights = (np.sum(distances)-distances)/np.sum(distances)
         # Make weights sum to 1
         weights_unit = weights/np.sum(weights)
         # Assign missing value the weighted average of `k` nearest neighbours
-        data[x_i][y_i] = np.dot(weights_unit, [data[y_i][ind] for ind in indices])
+        data[x_i][y_i] = np.dot(weights_unit, [data_c[ind][y_i] for ind in indices])
     return data
diff --git a/impyute/imputations/cs/mice.py b/impyute/imputations/cs/mice.py
index e1a5139..ee7493a 100644
--- a/impyute/imputations/cs/mice.py
+++ b/impyute/imputations/cs/mice.py
@@ -3,10 +3,14 @@
 from sklearn.linear_model import LinearRegression
 
 from impyute.utils import find_null
 from impyute.utils import checks
+from impyute.utils import preprocess
 # pylint: disable=too-many-locals
+# pylint:disable=invalid-name
+# pylint:disable=unused-argument
+@preprocess
 @checks
-def mice(data):
+def mice(data, **kwargs):
     """Multivariate Imputation by Chained Equations
 
     Reference:
diff --git a/impyute/imputations/cs/random_imputation.py b/impyute/imputations/cs/random_imputation.py
index f5624cd..5e4e1ca 100644
--- a/impyute/imputations/cs/random_imputation.py
+++ b/impyute/imputations/cs/random_imputation.py
@@ -1,10 +1,14 @@
 """ impyute.imputations.cs.random_imputation"""
 import numpy as np
 from impyute.utils import find_null
+from impyute.utils import preprocess
 from impyute.utils import checks
 
+# pylint:disable=invalid-name
+# pylint:disable=unused-argument
+@preprocess
 @checks
-def random_imputation(data):
+def random_imputation(data, **kwargs):
     """ Fill missing values in with a randomly selected value from the same
     column.
 
diff --git a/impyute/imputations/ts/arima.py b/impyute/imputations/ts/arima.py
index 50fc48b..2998201 100644
--- a/impyute/imputations/ts/arima.py
+++ b/impyute/imputations/ts/arima.py
@@ -2,9 +2,10 @@
 import numpy as np
 from impyute.utils import find_null
 from impyute.utils import checks
+# pylint: disable=invalid-name
 
 @checks
-def arima(data, p, d, q):
+def arima(data, p, d, q, axis=0):
     """Autoregressive Integrated Moving Average Imputation
 
     Stationary model
@@ -14,30 +15,24 @@
     data: numpy.ndarray
         The matrix with missing values that you want to impute
     p: int
-        Number of autoregressive terms. Ex (p,d,q)=(1,0,0). 
+        Number of autoregressive terms. Ex (p,d,q)=(1,0,0).
     d: int
         Number of nonseasonal differences needed for stationarity
     q: int
         Number of lagged forecast errors in the prediction equation
+    axis: int (optional)
+        0 if time series is in row format (Ex. data[0][:] is 1st data point).
+        1 if time series is in col format (Ex. data[:][0] is 1st data point).
+ RETURNS ------- numpy.ndarray """ - def _compute_nan_endpoints(x, y): - pass + assert isinstance(p, int), "Parameter `p` must be an integer" + assert isinstance(d, int), "Parameter `d` must be an integer" + assert isinstance(q, int), "Parameter `q` must be an integer" - try: - p = int(p) - d = int(d) - q = int(q) - data = isinstance(data, np.ndarray) - except: - raise Exception - # ARIMA null_xy = find_null(data) for x, y in null_xy: print(x, y) return data - - - diff --git a/impyute/utils/__init__.py b/impyute/utils/__init__.py index 4bf1a9f..9b0f236 100644 --- a/impyute/utils/__init__.py +++ b/impyute/utils/__init__.py @@ -10,6 +10,7 @@ from .errors import BadInputError from .checks import checks from .compare import compare +from .preprocess import preprocess __all__ = ["find_null", "print_io", "describe", "count_missing", - "checks", "compare", "BadInputError"] + "checks", "compare", "BadInputError", "preprocess"] diff --git a/impyute/utils/checks.py b/impyute/utils/checks.py index 429d8e1..247be7a 100644 --- a/impyute/utils/checks.py +++ b/impyute/utils/checks.py @@ -3,6 +3,7 @@ import numpy as np from impyute.utils import find_null from impyute.utils import BadInputError +# pylint:disable=invalid-name def checks(fn): """ Main check function to ensure input is correctly formatted @@ -19,7 +20,7 @@ def checks(fn): """ @wraps(fn) - def wrapper(*args, **kwds): + def wrapper(*args, **kwargs): """ Run input checks""" data = args[0] if len(np.shape(data)) != 2: @@ -32,7 +33,7 @@ def wrapper(*args, **kwds): raise BadInputError("Data is not float.") elif not _nan_exists(data): raise BadInputError("No NaN's in given data") - return fn(*args, **kwds) + return fn(*args, **kwargs) return wrapper def _shape_2d(data): diff --git a/impyute/utils/preprocess.py b/impyute/utils/preprocess.py new file mode 100644 index 0000000..ea2c62e --- /dev/null +++ b/impyute/utils/preprocess.py @@ -0,0 +1,32 @@ +""" impyute.utils.preprocess """ +from functools import wraps +# 
pylint:disable=invalid-name
+
+def preprocess(fn):
+    """ Decorator for preprocessing shared by the imputation functions
+
+    PARAMETERS
+    ----------
+    fn: function
+        Imputation function to wrap.
+
+    RETURNS
+    -------
+    function
+        Wrapped fn; operates on a copy of the data unless inplace=True
+
+    """
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        """ Pass data through as-is if inplace, else pass a copy"""
+        if "inplace" in kwargs and kwargs['inplace']:
+            data = args[0]
+        else:
+            data = args[0].copy()
+
+        if len(args) == 1:
+            return fn(data, **kwargs)
+        return fn(data, *args[1:], **kwargs)
+    return wrapper
+