Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions impyute/imputations/cs/averaging_imputations.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
import numpy as np
from impyute.utils import find_null
from impyute.utils import checks
from impyute.utils import preprocess
# pylint:disable=unused-argument
# pylint:disable=invalid-name

@preprocess
@checks
def mean_imputation(data):
def mean_imputation(data, **kwargs):
""" Substitute missing values with the mean of that column.

Parameters
Expand All @@ -25,8 +29,9 @@ def mean_imputation(data):
data[x_i][y_i] = new_value
return data

@preprocess
@checks
def median_imputation(data):
def median_imputation(data, **kwargs):
""" Substitute missing values with the median of that column(middle).

Parameters
Expand All @@ -51,8 +56,9 @@ def median_imputation(data):
data[x_i][y_i] = medians[str(y_i)]
return data

@preprocess
@checks
def mode_imputation(data):
def mode_imputation(data, **kwargs):
""" Substitute missing values with the mode of that column(most frequent).

In the case that there is a tie (there are multiple, most frequent values)
Expand Down
50 changes: 25 additions & 25 deletions impyute/imputations/cs/em.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@
import random
import numpy as np
from impyute.utils import find_null
from impyute.utils import preprocess
from impyute.utils import checks
# pylint:disable=invalid-name
# pylint:disable=unused-argument

@preprocess
@checks
def em(data, loops=50, dtype="cont"):
def em(data, loops=50, **kwargs):
""" Imputes given data using expectation maximization.

E-step: Calculates the expected complete data log likelihood ratio.
Expand All @@ -18,38 +22,34 @@ def em(data, loops=50, dtype="cont"):
Data to impute.
loops: int
Number of em iterations to run before breaking.
dtype: ("cont","disc")
Indicates whether the possible values will come from a continuous
range or categorical range.
inplace: boolean
If True, operate on the numpy array reference

Returns
-------
numpy.ndarray
Imputed data.

"""
if dtype == "cont":
null_xy = find_null(data)
for x_i, y_i in null_xy:
col = data[:, int(y_i)]
null_xy = find_null(data)
for x_i, y_i in null_xy:
col = data[:, int(y_i)]
mu = col[~np.isnan(col)].mean()
std = col[~np.isnan(col)].std()
col[x_i] = random.gauss(mu, std)
previous, i = 1, 1
for i in range(loops):
# Expectation
mu = col[~np.isnan(col)].mean()
std = col[~np.isnan(col)].std()
# Maximization
col[x_i] = random.gauss(mu, std)
previous, i = 1, 1
for i in range(loops):
# Expectation
mu = col[~np.isnan(col)].mean()
std = col[~np.isnan(col)].std()
# Maximization
col[x_i] = random.gauss(mu, std)
# Break out of loop if likelihood doesn't change at least 10%
# and has run at least 5 times
delta = (col[x_i]-previous)/previous
if i > 5 and delta < 0.1:
data[x_i][y_i] = col[x_i]
break
# Break out of loop if likelihood doesn't change at least 10%
# and has run at least 5 times
delta = (col[x_i]-previous)/previous
if i > 5 and delta < 0.1:
data[x_i][y_i] = col[x_i]
previous = col[x_i]
return data
else:
raise Exception("Other dtypes not supported yet.")
break
data[x_i][y_i] = col[x_i]
previous = col[x_i]
return data
9 changes: 6 additions & 3 deletions impyute/imputations/cs/fast_knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
import numpy as np
from impyute.utils import find_null
from impyute.utils import checks
from impyute.utils import preprocess
from impyute.imputations.cs import mean_imputation
from scipy.spatial import KDTree
# pylint: disable=invalid-name
# pylint:disable=unused-argument

@preprocess
@checks
def fast_knn(data, k=5):
def fast_knn(data, k=3, **kwargs):
""" Impute using a variant of the nearest neighbours approach

Basic idea: Impute array and then use the resulting complete
Expand All @@ -33,12 +36,12 @@ def fast_knn(data, k=5):
kdtree = KDTree(data_c)

for x_i, y_i in null_xy:
distances, indices = kdtree.query(data[x_i], k=k+1)
distances, indices = kdtree.query(data_c[x_i], k=k+1)
# Will always return itself in the first index. Delete it.
distances, indices = distances[1:], indices[1:]
weights = (np.sum(distances)-distances)/np.sum(distances)
# Make weights sum to 1
weights_unit = weights/np.sum(weights)
# Assign missing value the weighted average of `k` nearest neighbours
data[x_i][y_i] = np.dot(weights_unit, [data[y_i][ind] for ind in indices])
data[x_i][y_i] = np.dot(weights_unit, [data_c[y_i][ind] for ind in indices])
return data
6 changes: 5 additions & 1 deletion impyute/imputations/cs/mice.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
from sklearn.linear_model import LinearRegression
from impyute.utils import find_null
from impyute.utils import checks
from impyute.utils import preprocess
# pylint: disable=too-many-locals
# pylint:disable=invalid-name
# pylint:disable=unused-argument

@preprocess
@checks
def mice(data):
def mice(data, **kwargs):
"""Multivariate Imputation by Chained Equations

Reference:
Expand Down
6 changes: 5 additions & 1 deletion impyute/imputations/cs/random_imputation.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
""" impyute.imputations.cs.random_imputation"""
import numpy as np
from impyute.utils import find_null
from impyute.utils import preprocess
from impyute.utils import checks
# pylint:disable=invalid-name
# pylint:disable=unused-argument

@preprocess
@checks
def random_imputation(data):
def random_imputation(data, **kwargs):
""" Fill missing values in with a randomly selected value from the same
column.

Expand Down
25 changes: 10 additions & 15 deletions impyute/imputations/ts/arima.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
import numpy as np
from impyute.utils import find_null
from impyute.utils import checks
# pylint: disable=invalid-name

@checks
def arima(data, p, d, q):
def arima(data, p, d, q, axis=0):
"""Autoregressive Integrated Moving Average Imputation

Stationary model
Expand All @@ -14,30 +15,24 @@ def arima(data, p, d, q):
data: numpy.ndarray
The matrix with missing values that you want to impute
p: int
Number of autoregressive terms. Ex (p,d,q)=(1,0,0).
Number of autoregressive terms. Ex (p,d,q)=(1,0,0).
d: int
Number of nonseasonal differences needed for stationarity
q: int
Number of lagged forecast errors in the prediction equation
axis: int (optional)
0 if time series is in row format (Ex. data[0][:] is 1st data point).
1 if time series is in col format (Ex. data[:][0] is 1st data point).

RETURNS
-------
numpy.ndarray
"""
def _compute_nan_endpoints(x, y):
pass
assert isinstance(p, int), "Parameter `p` must be an integer"
assert isinstance(d, int), "Parameter `d` must be an integer"
assert isinstance(q, int), "Parameter `q` must be an integer"

try:
p = int(p)
d = int(d)
q = int(q)
data = isinstance(data, np.ndarray)
except:
raise Exception
# ARIMA
null_xy = find_null(data)
for x, y in null_xy:
print(x, y)
return data



3 changes: 2 additions & 1 deletion impyute/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .errors import BadInputError
from .checks import checks
from .compare import compare
from .preprocess import preprocess

__all__ = ["find_null", "print_io", "describe", "count_missing",
"checks", "compare", "BadInputError"]
"checks", "compare", "BadInputError", "preprocess"]
5 changes: 3 additions & 2 deletions impyute/utils/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
from impyute.utils import find_null
from impyute.utils import BadInputError
# pylint:disable=invalid-name

def checks(fn):
""" Main check function to ensure input is correctly formatted
Expand All @@ -19,7 +20,7 @@ def checks(fn):

"""
@wraps(fn)
def wrapper(*args, **kwds):
def wrapper(*args, **kwargs):
""" Run input checks"""
data = args[0]
if len(np.shape(data)) != 2:
Expand All @@ -32,7 +33,7 @@ def wrapper(*args, **kwds):
raise BadInputError("Data is not float.")
elif not _nan_exists(data):
raise BadInputError("No NaN's in given data")
return fn(*args, **kwds)
return fn(*args, **kwargs)
return wrapper

def _shape_2d(data):
Expand Down
32 changes: 32 additions & 0 deletions impyute/utils/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
""" impyute.utils.preprocess """
from functools import wraps
# pylint:disable=invalid-name

def preprocess(fn):
    """ Decorator that hands the wrapped function its own copy of the data

    Unless the caller passes a truthy `inplace` keyword argument, the first
    positional argument is duplicated through its `.copy()` method before
    `fn` is called, so `fn` does not receive the caller's object.

    PARAMETERS
    ----------
    fn: callable
        Function whose first positional argument is the data to impute.
        Every keyword argument given to the wrapper (including `inplace`)
        is forwarded to `fn` unchanged, so `fn` has to accept them.

    RETURNS
    -------
    callable
        Wrapper around `fn`; it returns whatever `fn` returns.

    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        """ Copy the data unless `inplace` is truthy, then delegate to `fn`"""
        data = args[0]
        if not kwargs.get("inplace"):
            # Default path: pass a copy so `fn` never sees the caller's object.
            data = data.copy()
        # `args[1:]` is empty when only the data was supplied, so a single
        # call covers both the one-argument and the many-argument case.
        return fn(data, *args[1:], **kwargs)
    return wrapper