eltonlaw · eltonlaw · Apr 27, 2018 · Apr 26, 2018 · Apr 27, 2018 · Apr 27, 2018
diff --git a/impyute/imputations/cs/averaging_imputations.py b/impyute/imputations/cs/averaging_imputations.py
@@ -2,9 +2,13 @@
 import numpy as np
 from impyute.utils import find_null
 from impyute.utils import checks
+from impyute.utils import preprocess
+# pylint:disable=unused-argument
+# pylint:disable=invalid-name
 
+@preprocess
 @checks
-def mean_imputation(data):
+def mean_imputation(data, **kwargs):
     """ Substitute missing values with the mean of that column.
 
     Parameters
@@ -25,8 +29,9 @@ def mean_imputation(data):
         data[x_i][y_i] = new_value
     return data
 
+@preprocess
 @checks
-def median_imputation(data):
+def median_imputation(data, **kwargs):
     """ Substitute missing values with the median of that column(middle).
 
     Parameters
@@ -51,8 +56,9 @@ def median_imputation(data):
         data[x_i][y_i] = medians[str(y_i)]
     return data
 
+@preprocess
 @checks
-def mode_imputation(data):
+def mode_imputation(data, **kwargs):
     """ Substitute missing values with the mode of that column(most frequent).
 
     In the case that there is a tie (there are multiple, most frequent values)

diff --git a/impyute/imputations/cs/em.py b/impyute/imputations/cs/em.py
@@ -2,10 +2,14 @@
 import random
 import numpy as np
 from impyute.utils import find_null
+from impyute.utils import preprocess
 from impyute.utils import checks
+# pylint:disable=invalid-name
+# pylint:disable=unused-argument
 
+@preprocess
 @checks
-def em(data, loops=50, dtype="cont"):
+def em(data, loops=50, **kwargs):
     """ Imputes given data using expectation maximization.
 
     E-step: Calculates the expected complete data log likelihood ratio.
@@ -18,38 +22,34 @@ def em(data, loops=50, dtype="cont"):
         Data to impute.
     loops: int
         Number of em iterations to run before breaking.
-    dtype: ("cont","disc")
-        Indicates whether the possible values will come from a continuous
-        range or categorical range.
+    inplace: boolean
+        If True, impute the passed-in array itself instead of a copy
 
     Returns
     -------
     numpy.nd.array
         Imputed data.
 
     """
-    if dtype == "cont":
-        null_xy = find_null(data)
-        for x_i, y_i in null_xy:
-            col = data[:, int(y_i)]
+    null_xy = find_null(data)
+    for x_i, y_i in null_xy:
+        col = data[:, int(y_i)]
+        mu = col[~np.isnan(col)].mean()
+        std = col[~np.isnan(col)].std()
+        col[x_i] = random.gauss(mu, std)
+        previous, i = 1, 1
+        for i in range(loops):
+            # Expectation
             mu = col[~np.isnan(col)].mean()
             std = col[~np.isnan(col)].std()
+            # Maximization
             col[x_i] = random.gauss(mu, std)
-            previous, i = 1, 1
-            for i in range(loops):
-                # Expectation
-                mu = col[~np.isnan(col)].mean()
-                std = col[~np.isnan(col)].std()
-                # Maximization
-                col[x_i] = random.gauss(mu, std)
-                # Break out of loop if likelihood doesn't change at least 10%
-                # and has run at least 5 times
-                delta = (col[x_i]-previous)/previous
-                if i > 5 and delta < 0.1:
-                    data[x_i][y_i] = col[x_i]
-                    break
+            # Break out of loop once the signed relative change of the imputed
+            # value is below 10% and more than 5 iterations have completed
+            delta = (col[x_i]-previous)/previous
+            if i > 5 and delta < 0.1:
                 data[x_i][y_i] = col[x_i]
-                previous = col[x_i]
-        return data
-    else:
-        raise Exception("Other dtypes not supported yet.")
+                break
+            data[x_i][y_i] = col[x_i]
+            previous = col[x_i]
+    return data
diff --git a/impyute/imputations/cs/fast_knn.py b/impyute/imputations/cs/fast_knn.py
@@ -2,12 +2,15 @@
 import numpy as np
 from impyute.utils import find_null
 from impyute.utils import checks
+from impyute.utils import preprocess
 from impyute.imputations.cs import mean_imputation
 from scipy.spatial import KDTree
 # pylint: disable=invalid-name
+# pylint:disable=unused-argument
 
+@preprocess
 @checks
-def fast_knn(data, k=5):
+def fast_knn(data, k=3, **kwargs):
     """ Impute using a variant of the nearest neighbours approach
 
     Basic idea: Impute array and then use the resulting complete
@@ -33,12 +36,12 @@ def fast_knn(data, k=5):
     kdtree = KDTree(data_c)
 
     for x_i, y_i in null_xy:
-        distances, indices = kdtree.query(data[x_i], k=k+1)
+        distances, indices = kdtree.query(data_c[x_i], k=k+1)
         # Will always return itself in the first index. Delete it.
         distances, indices = distances[1:], indices[1:]
         weights = (np.sum(distances)-distances)/np.sum(distances)
         # Make weights sum to 1
         weights_unit = weights/np.sum(weights)
         # Assign missing value the weighted average of `k` nearest neighbours
-        data[x_i][y_i] = np.dot(weights_unit, [data[y_i][ind] for ind in indices])
+        data[x_i][y_i] = np.dot(weights_unit, [data_c[y_i][ind] for ind in indices])
     return data
diff --git a/impyute/imputations/cs/mice.py b/impyute/imputations/cs/mice.py
@@ -3,10 +3,14 @@
 from sklearn.linear_model import LinearRegression
 from impyute.utils import find_null
 from impyute.utils import checks
+from impyute.utils import preprocess
 # pylint: disable=too-many-locals
+# pylint:disable=invalid-name
+# pylint:disable=unused-argument
 
+@preprocess
 @checks
-def mice(data):
+def mice(data, **kwargs):
     """Multivariate Imputation by Chained Equations
 
     Reference:

diff --git a/impyute/imputations/cs/random_imputation.py b/impyute/imputations/cs/random_imputation.py
@@ -1,10 +1,14 @@
 """ impyute.imputations.cs.random_imputation"""
 import numpy as np
 from impyute.utils import find_null
+from impyute.utils import preprocess
 from impyute.utils import checks
+# pylint:disable=invalid-name
+# pylint:disable=unused-argument
 
+@preprocess
 @checks
-def random_imputation(data):
+def random_imputation(data, **kwargs):
     """ Fill missing values in with a randomly selected value from the same
     column.
 

diff --git a/impyute/imputations/ts/arima.py b/impyute/imputations/ts/arima.py
@@ -2,9 +2,10 @@
 import numpy as np
 from impyute.utils import find_null
 from impyute.utils import checks
+# pylint: disable=invalid-name
 
 @checks
-def arima(data, p, d, q):
+def arima(data, p, d, q, axis=0):
     """Autoregressive Integrated Moving Average Imputation
 
     Stationary model
@@ -14,30 +15,24 @@ def arima(data, p, d, q):
     data: numpy.ndarray
         The matrix with missing values that you want to impute
     p: int
-        Number of autoregressive terms. Ex (p,d,q)=(1,0,0). 
+        Number of autoregressive terms. Ex (p,d,q)=(1,0,0).
     d: int
         Number of nonseasonal differences needed for stationarity
     q: int
         Number of lagged forecast errors in the prediction equation
+    axis: int (optional)
+        0 if time series is in row format (Ex. data[0][:] is 1st data point).
+        1 if time series is in col format (Ex. data[:][0] is 1st data point).
+
     RETURNS
     -------
     numpy.ndarray
     """
-    def _compute_nan_endpoints(x, y):
-        pass
+    assert isinstance(p, int), "Parameter `p` must be an integer"
+    assert isinstance(d, int), "Parameter `d` must be an integer"
+    assert isinstance(q, int), "Parameter `q` must be an integer"
 
-    try:
-        p = int(p)
-        d = int(d)
-        q = int(q)
-        data = isinstance(data, np.ndarray)
-    except:
-        raise Exception
-    # ARIMA
     null_xy = find_null(data)
     for x, y in null_xy:
         print(x, y)
     return data
-
-
-
diff --git a/impyute/utils/__init__.py b/impyute/utils/__init__.py
@@ -10,6 +10,7 @@
 from .errors import BadInputError
 from .checks import checks
 from .compare import compare
+from .preprocess import preprocess
 
 __all__ = ["find_null", "print_io", "describe", "count_missing",
-           "checks", "compare", "BadInputError"]
+           "checks", "compare", "BadInputError", "preprocess"]
diff --git a/impyute/utils/checks.py b/impyute/utils/checks.py
@@ -3,6 +3,7 @@
 import numpy as np
 from impyute.utils import find_null
 from impyute.utils import BadInputError
+# pylint:disable=invalid-name
 
 def checks(fn):
     """ Main check function to ensure input is correctly formatted
@@ -19,7 +20,7 @@ def checks(fn):
 
     """
     @wraps(fn)
-    def wrapper(*args, **kwds):
+    def wrapper(*args, **kwargs):
         """ Run input checks"""
         data = args[0]
         if len(np.shape(data)) != 2:
@@ -32,7 +33,7 @@ def wrapper(*args, **kwds):
             raise BadInputError("Data is not float.")
         elif not _nan_exists(data):
             raise BadInputError("No NaN's in given data")
-        return fn(*args, **kwds)
+        return fn(*args, **kwargs)
     return wrapper
 
 def _shape_2d(data):

diff --git a/impyute/utils/preprocess.py b/impyute/utils/preprocess.py
@@ -0,0 +1,32 @@
+""" impyute.utils.preprocess """
+from functools import wraps
+# pylint:disable=invalid-name
+
+def preprocess(fn):
+    """ Base preprocess function for commonly used preprocessing
+
+    PARAMETERS
+    ----------
+    fn: function
+        Function to wrap; its first positional argument is the data.
+
+    RETURNS
+    -------
+    function
+        Wrapper that gives `fn` a copy of the data unless `inplace` is truthy
+
+    """
+    @wraps(fn)
+    def wrapper(*args, **kwargs):
+        """ Copy the data unless `inplace` is truthy, then call `fn`"""
+        if "inplace" in kwargs and kwargs['inplace']:
+            data = args[0]
+        else:
+            data = args[0].copy()
+
+        if len(args) == 1:
+            return fn(data, **kwargs)
+        return fn(data, *args[1:], **kwargs)
+    return wrapper
+
+