|
| 1 | +import numpy as np |
| 2 | +from . import core, stump |
| 3 | + |
| 4 | + |
| 5 | +def ostinato(tss, m): |
| 6 | + """ |
| 7 | + Find the consensus motif of multiple time series |
| 8 | +
|
| 9 | + This is a wrapper around the vanilla version of the ostinato algorithm |
| 10 | + which finds the best radius and a helper function that finds the most |
| 11 | + central conserved motif. |
| 12 | +
|
| 13 | + Parameters |
| 14 | + ---------- |
| 15 | + tss : list |
| 16 | + List of time series for which to find the consensus motif |
| 17 | +
|
| 18 | + m : int |
| 19 | + Window size |
| 20 | +
|
| 21 | + Returns |
| 22 | + ------- |
| 23 | + rad : float |
| 24 | + Radius of the most central consensus motif |
| 25 | +
|
| 26 | + ts_ind : int |
| 27 | + Index of time series which contains the most central consensus motif |
| 28 | +
|
| 29 | + ss_ind : int |
| 30 | + Start index of the most central consensus motif within the time series |
| 31 | + `ts_ind` that contains it |
| 32 | +
|
| 33 | + Notes |
| 34 | + ----- |
| 35 | + <https://www.cs.ucr.edu/~eamonn/consensus_Motif_ICDM_Long_version.pdf> |
| 36 | +
|
| 37 | + See Table 2 |
| 38 | +
|
| 39 | + The ostinato algorithm proposed in the paper finds the best radius |
| 40 | + in `tss`. Intuitively, the radius is the minimum distance of a |
| 41 | + subsequence to encompass at least one nearest neighbor subsequence |
| 42 | + from all other time series. The best radius in `tss` is the minimum |
| 43 | + radius amongst all radii. Some data sets might contain multiple |
| 44 | + subsequences which have the same optimal radius. |
| 45 | + The greedy Ostinato algorithm only finds one of them, which might |
| 46 | + not be the most central motif. The most central motif amongst the |
| 47 | + subsequences with the best radius is the one with the smallest mean |
| 48 | + distance to nearest neighbors in all other time series. To find this |
| 49 | + central motif it is necessary to search the subsequences with the |
| 50 | + best radius via `stumpy.ostinato._get_central_motif` |
| 51 | + """ |
| 52 | + rad, ts_ind, ss_ind = _ostinato(tss, m) |
| 53 | + return _get_central_motif(tss, rad, ts_ind, ss_ind, m) |
| 54 | + |
| 55 | + |
| 56 | +def _ostinato(tss, m): |
| 57 | + """ |
| 58 | + Find the consensus motif of multiple time series |
| 59 | +
|
| 60 | + Parameters |
| 61 | + ---------- |
| 62 | + tss : list |
| 63 | + List of time series for which to find the consensus motif |
| 64 | +
|
| 65 | + m : int |
| 66 | + Window size |
| 67 | +
|
| 68 | + Returns |
| 69 | + ------- |
| 70 | + bsf_rad : float |
| 71 | + Radius of the consensus motif |
| 72 | +
|
| 73 | + ts_ind : int |
| 74 | + Index of time series which contains the consensus motif |
| 75 | +
|
| 76 | + ss_ind : int |
| 77 | + Start index of consensus motif within the time series ts_ind |
| 78 | + that contains it |
| 79 | +
|
| 80 | + Notes |
| 81 | + ----- |
| 82 | + <https://www.cs.ucr.edu/~eamonn/consensus_Motif_ICDM_Long_version.pdf> |
| 83 | +
|
| 84 | + See Table 2 |
| 85 | +
|
| 86 | + The ostinato algorithm proposed in the paper finds the best radius |
| 87 | + in `tss`. Intuitively, the radius is the minimum distance of a |
| 88 | + subsequence to encompass at least one nearest neighbor subsequence |
| 89 | + from all other time series. The best radius in `tss` is the minimum |
| 90 | + radius amongst all radii. Some data sets might contain multiple |
| 91 | + subsequences which have the same optimal radius. |
| 92 | + The greedy Ostinato algorithm only finds one of them, which might |
| 93 | + not be the most central motif. The most central motif amongst the |
| 94 | + subsequences with the best radius is the one with the smallest mean |
| 95 | + distance to nearest neighbors in all other time series. To find this |
| 96 | + central motif it is necessary to search the subsequences with the |
| 97 | + best radius via `stumpy.ostinato._get_central_motif` |
| 98 | + """ |
| 99 | + # Preprocess means and stddevs and handle np.nan/np.inf |
| 100 | + Ts = [None] * len(tss) |
| 101 | + M_Ts = [None] * len(tss) |
| 102 | + Σ_Ts = [None] * len(tss) |
| 103 | + for i, T in enumerate(tss): |
| 104 | + Ts[i], M_Ts[i], Σ_Ts[i] = core.preprocess(T, m) |
| 105 | + |
| 106 | + bsf_rad, ts_ind, ss_ind = np.inf, 0, 0 |
| 107 | + k = len(Ts) |
| 108 | + for j in range(k): |
| 109 | + if j < (k - 1): |
| 110 | + h = j + 1 |
| 111 | + else: |
| 112 | + h = 0 |
| 113 | + |
| 114 | + mp = stump(Ts[j], m, Ts[h], ignore_trivial=False) |
| 115 | + si = np.argsort(mp[:, 0]) |
| 116 | + for q in si: |
| 117 | + rad = mp[q, 0] |
| 118 | + if rad >= bsf_rad: |
| 119 | + break |
| 120 | + for i in range(k): |
| 121 | + if ~np.isin(i, [j, h]): |
| 122 | + QT = core.sliding_dot_product(Ts[j][q : q + m], Ts[i]) |
| 123 | + rad = np.max( |
| 124 | + ( |
| 125 | + rad, |
| 126 | + np.min( |
| 127 | + core._mass( |
| 128 | + Ts[j][q : q + m], |
| 129 | + Ts[i], |
| 130 | + QT, |
| 131 | + M_Ts[j][q], |
| 132 | + Σ_Ts[j][q], |
| 133 | + M_Ts[i], |
| 134 | + Σ_Ts[i], |
| 135 | + ) |
| 136 | + ), |
| 137 | + ) |
| 138 | + ) |
| 139 | + if rad >= bsf_rad: |
| 140 | + break |
| 141 | + if rad < bsf_rad: |
| 142 | + bsf_rad, ts_ind, ss_ind = rad, j, q |
| 143 | + |
| 144 | + return bsf_rad, ts_ind, ss_ind |
| 145 | + |
| 146 | + |
| 147 | +def _get_central_motif(tss, rad, ts_ind, ss_ind, m): |
| 148 | + """ |
| 149 | + Compare subsequences with the same radius and return the most central motif |
| 150 | +
|
| 151 | + Parameters |
| 152 | + ---------- |
| 153 | + tss : list |
| 154 | + List of time series for which to find the most central motif |
| 155 | +
|
| 156 | + rad : float |
| 157 | + Best radius found by a consensus search algorithm |
| 158 | +
|
| 159 | + ts_ind : int |
| 160 | + Index of time series in which `rad` was found first |
| 161 | +
|
| 162 | + ss_ind : int |
| 163 | + Start index of subsequence in `ts_ind` that has radius `rad` |
| 164 | +
|
| 165 | + m : int |
| 166 | + Window size |
| 167 | +
|
| 168 | + Returns |
| 169 | + ------- |
| 170 | + rad : float |
| 171 | + Radius of the most central consensus motif |
| 172 | +
|
| 173 | + ts_ind : int |
| 174 | + Index of time series which contains the most central consensus motif |
| 175 | +
|
| 176 | + ss_ind : int |
| 177 | + Start index of most central consensus motif within the time series `ts_ind` |
| 178 | + that contains it |
| 179 | +
|
| 180 | + Notes |
| 181 | + ----- |
| 182 | + <https://www.cs.ucr.edu/~eamonn/consensus_Motif_ICDM_Long_version.pdf> |
| 183 | +
|
| 184 | + See Table 2 |
| 185 | +
|
| 186 | + The ostinato algorithm proposed in the paper finds the best radius |
| 187 | + in `tss`. Intuitively, the radius is the minimum distance of a |
| 188 | + subsequence to encompass at least one nearest neighbor subsequence |
| 189 | + from all other time series. The best radius in `tss` is the minimum |
| 190 | + radius amongst all radii. Some data sets might contain multiple |
| 191 | + subsequences which have the same optimal radius. |
| 192 | + The greedy Ostinato algorithm only finds one of them, which might |
| 193 | + not be the most central motif. The most central motif amongst the |
| 194 | + subsequences with the best radius is the one with the smallest mean |
| 195 | + distance to nearest neighbors in all other time series. To find this |
| 196 | + central motif it is necessary to search the subsequences with the |
| 197 | + best radius via `stumpy.ostinato._get_central_motif` |
| 198 | + """ |
| 199 | + k = len(tss) |
| 200 | + |
| 201 | + # Ostinato hit: get nearest neighbors' distances and indices |
| 202 | + q_ost = tss[ts_ind][ss_ind : ss_ind + m] |
| 203 | + ss_ind_nn_ost, d_ost = _across_series_nearest_neighbors(q_ost, tss) |
| 204 | + |
| 205 | + # Alternative candidates: Distance to ostinato hit equals best radius |
| 206 | + ts_ind_alt = np.flatnonzero(np.isclose(d_ost, rad)) |
| 207 | + ss_ind_alt = ss_ind_nn_ost[ts_ind_alt] |
| 208 | + d_alt = np.zeros((len(ts_ind_alt), k), dtype=float) |
| 209 | + for i, (tsi, ssi) in enumerate(zip(ts_ind_alt, ss_ind_alt)): |
| 210 | + q = tss[tsi][ssi : ssi + m] |
| 211 | + _, d_alt[i] = _across_series_nearest_neighbors(q, tss) |
| 212 | + rad_alt = np.max(d_alt, axis=1) |
| 213 | + d_mean_alt = np.mean(d_alt, axis=1) |
| 214 | + |
| 215 | + # Alternatives with same radius and lower mean distance |
| 216 | + alt_better = np.logical_and( |
| 217 | + np.isclose(rad_alt, rad), |
| 218 | + d_mean_alt < d_ost.mean(), |
| 219 | + ) |
| 220 | + # Alternatives with same radius and same mean distance |
| 221 | + alt_same = np.logical_and( |
| 222 | + np.isclose(rad_alt, rad), |
| 223 | + np.isclose(d_mean_alt, d_ost.mean()), |
| 224 | + ) |
| 225 | + if np.any(alt_better): |
| 226 | + ts_ind_alt = ts_ind_alt[alt_better] |
| 227 | + d_mean_alt = d_mean_alt[alt_better] |
| 228 | + i_alt_best = np.argmin(d_mean_alt) |
| 229 | + ts_ind = ts_ind_alt[i_alt_best] |
| 230 | + elif np.any(alt_same): |
| 231 | + # Default to the first match in the list of time series |
| 232 | + ts_ind_alt = ts_ind_alt[alt_same] |
| 233 | + i_alt_first = np.argmin(ts_ind_alt) |
| 234 | + ts_ind = np.min((ts_ind, ts_ind_alt[i_alt_first])) |
| 235 | + ss_ind = ss_ind_nn_ost[ts_ind] |
| 236 | + return rad, ts_ind, ss_ind |
| 237 | + |
| 238 | + |
| 239 | +def _across_series_nearest_neighbors(q, tss): |
| 240 | + """ |
| 241 | + For multiple time series find, per individual time series, the subsequences closest |
| 242 | + to a query. |
| 243 | +
|
| 244 | + Parameters |
| 245 | + ---------- |
| 246 | + q : ndarray |
| 247 | + Query array or subsequence |
| 248 | +
|
| 249 | + tss : list |
| 250 | + List of time series for which to the nearest neighbors to `q` |
| 251 | +
|
| 252 | + Returns |
| 253 | + ------- |
| 254 | + ss_ind_nn : ndarray |
| 255 | + Indices to subsequences in `tss` that are closest to `q` |
| 256 | +
|
| 257 | + d : ndarray |
| 258 | + Distances to subsequences in `tss` that are closest to `q` |
| 259 | + """ |
| 260 | + k = len(tss) |
| 261 | + d = np.zeros(k, dtype=float) |
| 262 | + ss_ind_nn = np.zeros(k, dtype=int) |
| 263 | + for i in range(k): |
| 264 | + dp = core.mass(q, tss[i]) |
| 265 | + ss_ind_nn[i] = np.argmin(dp) |
| 266 | + d[i] = dp[ss_ind_nn[i]] |
| 267 | + return ss_ind_nn, d |
0 commit comments