Fixed #277 and #278 Added Ostinato Consensus Motifs and Tutorial

refactoriel · web-flow · commit 060df7373598 · 2020-11-30T21:40:06.000-05:00
* Reproduce Fig 9 from paper Matrix Profile XV Consensus Motifs This uses the mtDNA example. Ostinato transcribed from table 2 in paper. Performance is really slow. Related to #278 * Faster implementation of Ostinato * Change from numpy.convolve to scipy.signal.convolve See #279. SciPy is faster for vectors with more than 500 elements. * add test_ostinato * naive consensus search * add ostinato function * Fix bug in naive implementation Bug had been copied from paper's pseudocode (table 1, line 8) * test_ostinato: ignore DeprecationWarning Also remove trailing whitespaces in test.sh * Notebook: add EOG data, polish mtDNA, some more explantory text * expand range of tested random seeds * Black'd and flake8'd * only test radius for now * black 20.8b1 * put back test of time series and subsequence indices * Private function _get_central_motif * Complete _get_central_motif * Ostinato: main computation in private function, add wrapper * Consistent variable naming, improved docstrings and central motif search * Implement Sean's suggestions to notebook * Retrieve clean CSVs from Zenodo, remove unnecessary imports * Edge case: same radius, same mean distance: default to first * docstring formatting * naive implementation of get_central_motif * remove nested conditional * oops, forgot black
diff --git a/docs/Tutorial_Consensus_Motif.ipynb b/docs/Tutorial_Consensus_Motif.ipynb
diff --git a/stumpy/__init__.py b/stumpy/__init__.py
@@ -20,6 +20,7 @@
 from .aampi import aampi  # noqa: F401
 from .chains import atsc, allc  # noqa: F401
 from .floss import floss, fluss, _nnmark, _iac, _cac, _rea  # noqa: F401
+from .ostinato import ostinato  # noqa: F401
 from .scrump import scrump  # noqa: F401
 from .stumpi import stumpi  # noqa: F401
 from numba import cuda
diff --git a/stumpy/core.py b/stumpy/core.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 from numba import njit, prange
+from scipy.signal import convolve
 import tempfile
 import math
 
@@ -265,7 +266,7 @@ def sliding_dot_product(Q, T):
     n = T.shape[0]
     m = Q.shape[0]
     Qr = np.flipud(Q)  # Reverse/flip Q
-    QT = np.convolve(Qr, T)
+    QT = convolve(Qr, T)
 
     return QT.real[m - 1 : n]
 
diff --git a/stumpy/ostinato.py b/stumpy/ostinato.py
@@ -0,0 +1,267 @@
+import numpy as np
+from . import core, stump
+
+
+def ostinato(tss, m):
+    """
+    Find the consensus motif of multiple time series
+
+    This is a wrapper around the vanilla version of the ostinato algorithm
+    which finds the best radius and a helper function that finds the most
+    central conserved motif.
+
+    Parameters
+    ----------
+    tss : list
+        List of time series for which to find the consensus motif
+
+    m : int
+        Window size
+
+    Returns
+    -------
+    rad : float
+        Radius of the most central consensus motif
+
+    ts_ind : int
+        Index of time series which contains the most central consensus motif
+
+    ss_ind : int
+        Start index of the most central consensus motif within the time series
+        `ts_ind` that contains it
+
+    Notes
+    -----
+    <https://www.cs.ucr.edu/~eamonn/consensus_Motif_ICDM_Long_version.pdf>
+
+    See Table 2
+
+    The ostinato algorithm proposed in the paper finds the best radius
+    in `tss`. Intuitively, the radius is the minimum distance of a
+    subsequence to encompass at least one nearest neighbor subsequence
+    from all other time series. The best radius in `tss` is the minimum
+    radius amongst all radii. Some data sets might contain multiple
+    subsequences which have the same optimal radius.
+    The greedy Ostinato algorithm only finds one of them, which might
+    not be the most central motif. The most central motif amongst the
+    subsequences with the best radius is the one with the smallest mean
+    distance to nearest neighbors in all other time series. To find this
+    central motif it is necessary to search the subsequences with the
+    best radius via `stumpy.ostinato._get_central_motif`
+    """
+    rad, ts_ind, ss_ind = _ostinato(tss, m)
+    return _get_central_motif(tss, rad, ts_ind, ss_ind, m)
+
+
+def _ostinato(tss, m):
+    """
+    Find the consensus motif of multiple time series
+
+    Parameters
+    ----------
+    tss : list
+        List of time series for which to find the consensus motif
+
+    m : int
+        Window size
+
+    Returns
+    -------
+    bsf_rad : float
+        Radius of the consensus motif
+
+    ts_ind : int
+        Index of time series which contains the consensus motif
+
+    ss_ind : int
+        Start index of consensus motif within the time series ts_ind
+        that contains it
+
+    Notes
+    -----
+    <https://www.cs.ucr.edu/~eamonn/consensus_Motif_ICDM_Long_version.pdf>
+
+    See Table 2
+
+    The ostinato algorithm proposed in the paper finds the best radius
+    in `tss`. Intuitively, the radius is the minimum distance of a
+    subsequence to encompass at least one nearest neighbor subsequence
+    from all other time series. The best radius in `tss` is the minimum
+    radius amongst all radii. Some data sets might contain multiple
+    subsequences which have the same optimal radius.
+    The greedy Ostinato algorithm only finds one of them, which might
+    not be the most central motif. The most central motif amongst the
+    subsequences with the best radius is the one with the smallest mean
+    distance to nearest neighbors in all other time series. To find this
+    central motif it is necessary to search the subsequences with the
+    best radius via `stumpy.ostinato._get_central_motif`
+    """
+    # Preprocess means and stddevs and handle np.nan/np.inf
+    Ts = [None] * len(tss)
+    M_Ts = [None] * len(tss)
+    Σ_Ts = [None] * len(tss)
+    for i, T in enumerate(tss):
+        Ts[i], M_Ts[i], Σ_Ts[i] = core.preprocess(T, m)
+
+    bsf_rad, ts_ind, ss_ind = np.inf, 0, 0
+    k = len(Ts)
+    for j in range(k):
+        if j < (k - 1):
+            h = j + 1
+        else:
+            h = 0
+
+        mp = stump(Ts[j], m, Ts[h], ignore_trivial=False)
+        si = np.argsort(mp[:, 0])
+        for q in si:
+            rad = mp[q, 0]
+            if rad >= bsf_rad:
+                break
+            for i in range(k):
+                if ~np.isin(i, [j, h]):
+                    QT = core.sliding_dot_product(Ts[j][q : q + m], Ts[i])
+                    rad = np.max(
+                        (
+                            rad,
+                            np.min(
+                                core._mass(
+                                    Ts[j][q : q + m],
+                                    Ts[i],
+                                    QT,
+                                    M_Ts[j][q],
+                                    Σ_Ts[j][q],
+                                    M_Ts[i],
+                                    Σ_Ts[i],
+                                )
+                            ),
+                        )
+                    )
+                    if rad >= bsf_rad:
+                        break
+            if rad < bsf_rad:
+                bsf_rad, ts_ind, ss_ind = rad, j, q
+
+    return bsf_rad, ts_ind, ss_ind
+
+
+def _get_central_motif(tss, rad, ts_ind, ss_ind, m):
+    """
+    Compare subsequences with the same radius and return the most central motif
+
+    Parameters
+    ----------
+    tss : list
+        List of time series for which to find the most central motif
+
+    rad : float
+        Best radius found by a consensus search algorithm
+
+    ts_ind : int
+        Index of time series in which `rad` was found first
+
+    ss_ind : int
+        Start index of subsequence in `ts_ind` that has radius `rad`
+
+    m : int
+        Window size
+
+    Returns
+    -------
+    rad : float
+        Radius of the most central consensus motif
+
+    ts_ind : int
+        Index of time series which contains the most central consensus motif
+
+    ss_ind : int
+        Start index of most central consensus motif within the time series `ts_ind`
+        that contains it
+
+    Notes
+    -----
+    <https://www.cs.ucr.edu/~eamonn/consensus_Motif_ICDM_Long_version.pdf>
+
+    See Table 2
+
+    The ostinato algorithm proposed in the paper finds the best radius
+    in `tss`. Intuitively, the radius is the minimum distance of a
+    subsequence to encompass at least one nearest neighbor subsequence
+    from all other time series. The best radius in `tss` is the minimum
+    radius amongst all radii. Some data sets might contain multiple
+    subsequences which have the same optimal radius.
+    The greedy Ostinato algorithm only finds one of them, which might
+    not be the most central motif. The most central motif amongst the
+    subsequences with the best radius is the one with the smallest mean
+    distance to nearest neighbors in all other time series. To find this
+    central motif it is necessary to search the subsequences with the
+    best radius via `stumpy.ostinato._get_central_motif`
+    """
+    k = len(tss)
+
+    # Ostinato hit: get nearest neighbors' distances and indices
+    q_ost = tss[ts_ind][ss_ind : ss_ind + m]
+    ss_ind_nn_ost, d_ost = _across_series_nearest_neighbors(q_ost, tss)
+
+    # Alternative candidates: Distance to ostinato hit equals best radius
+    ts_ind_alt = np.flatnonzero(np.isclose(d_ost, rad))
+    ss_ind_alt = ss_ind_nn_ost[ts_ind_alt]
+    d_alt = np.zeros((len(ts_ind_alt), k), dtype=float)
+    for i, (tsi, ssi) in enumerate(zip(ts_ind_alt, ss_ind_alt)):
+        q = tss[tsi][ssi : ssi + m]
+        _, d_alt[i] = _across_series_nearest_neighbors(q, tss)
+    rad_alt = np.max(d_alt, axis=1)
+    d_mean_alt = np.mean(d_alt, axis=1)
+
+    # Alternatives with same radius and lower mean distance
+    alt_better = np.logical_and(
+        np.isclose(rad_alt, rad),
+        d_mean_alt < d_ost.mean(),
+    )
+    # Alternatives with same radius and same mean distance
+    alt_same = np.logical_and(
+        np.isclose(rad_alt, rad),
+        np.isclose(d_mean_alt, d_ost.mean()),
+    )
+    if np.any(alt_better):
+        ts_ind_alt = ts_ind_alt[alt_better]
+        d_mean_alt = d_mean_alt[alt_better]
+        i_alt_best = np.argmin(d_mean_alt)
+        ts_ind = ts_ind_alt[i_alt_best]
+    elif np.any(alt_same):
+        # Default to the first match in the list of time series
+        ts_ind_alt = ts_ind_alt[alt_same]
+        i_alt_first = np.argmin(ts_ind_alt)
+        ts_ind = np.min((ts_ind, ts_ind_alt[i_alt_first]))
+    ss_ind = ss_ind_nn_ost[ts_ind]
+    return rad, ts_ind, ss_ind
+
+
+def _across_series_nearest_neighbors(q, tss):
+    """
+    For multiple time series find, per individual time series, the subsequences closest
+    to a query.
+
+    Parameters
+    ----------
+    q : ndarray
+        Query array or subsequence
+
+    tss : list
+        List of time series for which to the nearest neighbors to `q`
+
+    Returns
+    -------
+    ss_ind_nn : ndarray
+        Indices to subsequences in `tss` that are closest to `q`
+
+    d : ndarray
+        Distances to subsequences in `tss` that are closest to `q`
+    """
+    k = len(tss)
+    d = np.zeros(k, dtype=float)
+    ss_ind_nn = np.zeros(k, dtype=int)
+    for i in range(k):
+        dp = core.mass(q, tss[i])
+        ss_ind_nn[i] = np.argmin(dp)
+        d[i] = dp[ss_ind_nn[i]]
+    return ss_ind_nn, d
diff --git a/test.sh b/test.sh
@@ -13,7 +13,7 @@ if [ $# -gt 0 ]; then
     else
         echo "Using default test_mode=\"all\""
     fi
-fi 
+fi
 
 ###############
 #  Functions  #
@@ -63,11 +63,11 @@ test_unit()
     check_errs $?
     py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stump.py tests/test_mstump.py tests/test_scrump.py tests/test_stumpi.py
     check_errs $?
-    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stumped.py 
+    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stumped.py
     check_errs $?
-    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stumped_one_constant_subsequence.py 
+    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stumped_one_constant_subsequence.py
     check_errs $?
-    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stumped_two_constant_subsequences.py 
+    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stumped_two_constant_subsequences.py
     check_errs $?
     py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stumped_two_constant_subsequences_swap.py
     check_errs $?
@@ -87,7 +87,7 @@ test_unit()
     check_errs $?
     py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stumped_two_subsequences_nan_inf_A_B_join_swap.py
     check_errs $?
-    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_mstumped.py 
+    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_mstumped.py
     check_errs $?
     py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_mstumped_one_subsequence_nan_self_join.py
     check_errs $?
@@ -99,9 +99,9 @@ test_unit()
     check_errs $?
     py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamped.py
     check_errs $?
-    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamped_one_constant_subsequence.py 
+    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamped_one_constant_subsequence.py
     check_errs $?
-    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamped_two_constant_subsequences.py 
+    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamped_two_constant_subsequences.py
     check_errs $?
     py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamped_two_constant_subsequences_swap.py
     check_errs $?
@@ -121,6 +121,8 @@ test_unit()
     check_errs $?
     py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamped_two_subsequences_nan_inf_A_B_join_swap.py
     check_errs $?
+    py.test -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_ostinato.py
+    check_errs $?
 }
 
 test_coverage()
diff --git a/tests/test_ostinato.py b/tests/test_ostinato.py