diff --git a/stumpy/config.py b/stumpy/config.py index dad017ec3..63fe602fc 100644 --- a/stumpy/config.py +++ b/stumpy/config.py @@ -8,9 +8,11 @@ STUMPY_MEAN_STD_NUM_CHUNKS = 1 STUMPY_MEAN_STD_MAX_ITER = 10 STUMPY_DENOM_THRESHOLD = 1e-14 -STUMPY_STDDEV_THRESHOLD = 1e-7 +STUMPY_STDDEV_THRESHOLD = 1e-100 # 1e-7 +STUMPY_VAR_THRESHOLD = 1e-5 STUMPY_P_NORM_THRESHOLD = 1e-14 STUMPY_TEST_PRECISION = 5 STUMPY_MAX_P_NORM_DISTANCE = np.finfo(np.float64).max STUMPY_MAX_DISTANCE = np.sqrt(STUMPY_MAX_P_NORM_DISTANCE) STUMPY_EXCL_ZONE_DENOM = 4 +STUMPY_CORRELATION_THRESHOLD = 0.99999999 # 1 - 1e-08 diff --git a/stumpy/core.py b/stumpy/core.py index ed0935f4c..d9035d67e 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -577,6 +577,8 @@ def _welford_nanvar(a, w, a_subseq_isfinite): * (a[last_idx] - curr_mean + a[prev_start_idx] - prev_mean) / w ) + if curr_var < config.STUMPY_VAR_THRESHOLD: + curr_var = np.nanvar(a[start_idx:stop_idx]) all_variances[start_idx] = curr_var @@ -1753,8 +1755,10 @@ def preprocess_diagonal(T, m): """ T, T_subseq_isfinite = preprocess_non_normalized(T, m) M_T, Σ_T = compute_mean_std(T, m) - T_subseq_isconstant = Σ_T < config.STUMPY_STDDEV_THRESHOLD - Σ_T[T_subseq_isconstant] = 1.0 # Avoid divide by zero in next inversion step + T_sliding_min = _rolling_nanmin_1d(T, m) + T_subseq_isconstant = T_sliding_min == M_T + Σ_T[T_subseq_isfinite & T_subseq_isconstant] = 0.0 + Σ_T[Σ_T <= config.STUMPY_STDDEV_THRESHOLD] = 1.0 Σ_T_inverse = 1.0 / Σ_T M_T_m_1, _ = compute_mean_std(T, m - 1) diff --git a/stumpy/stump.py b/stumpy/stump.py index fbf518045..ad9027744 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -182,12 +182,27 @@ def _compute_diagonal( if T_B_subseq_isfinite[i + k] and T_A_subseq_isfinite[i]: # Neither subsequence contains NaNs - if T_B_subseq_isconstant[i + k] or T_A_subseq_isconstant[i]: + if T_B_subseq_isconstant[i + k] and T_A_subseq_isconstant[i]: + pearson = 1.0 + elif T_B_subseq_isconstant[i + k] or T_A_subseq_isconstant[i]: pearson = 0.5 else: pearson = cov * Σ_T_inverse[i + k] * σ_Q_inverse[i] - - if T_B_subseq_isconstant[i + k] and T_A_subseq_isconstant[i]: + if config.STUMPY_CORRELATION_THRESHOLD <= pearson < 1.0: + if pearson > ρ[thread_idx, i, 0] or ( + ignore_trivial and pearson > ρ[thread_idx, i + k, 0] + ): + cov = ( + np.dot( + (T_B[i + k : i + k + m] - M_T[i + k]), + (T_A[i : i + m] - μ_Q[i]), + ) + * m_inverse + ) + + pearson = cov * Σ_T_inverse[i + k] * σ_Q_inverse[i] + + if pearson > 1.0: pearson = 1.0 if pearson > ρ[thread_idx, i, 0]: diff --git a/tests/naive.py b/tests/naive.py index 3f68fcda2..1cc8a7f5d 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -6,7 +6,7 @@ from stumpy import core, config -def z_norm(a, axis=0, threshold=1e-7): +def z_norm(a, axis=0, threshold=config.STUMPY_STDDEV_THRESHOLD): std = np.std(a, axis, keepdims=True) std[np.less(std, threshold, where=~np.isnan(std))] = 1.0 diff --git a/tests/test_stump.py b/tests/test_stump.py index 6feaf1598..b49961f8d 100644 --- a/tests/test_stump.py +++ b/tests/test_stump.py @@ -240,3 +240,111 @@ def test_stump_nan_zero_mean_self_join(): naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +def test_stump_identical_subsequence_self_join_rare_cases(): + # This test function is designed to capture the errors that migtht be raised + # due the imprecision in the calculation of pearson values in the edge case + # where two subsequences are identical. + m = 3 + zone = int(np.ceil(m / 4)) + + seed_values = [27343, 84451] + for seed in seed_values: + np.random.seed(seed) + + identical = np.random.rand(8) + T_A = np.random.rand(20) + T_A[1 : 1 + identical.shape[0]] = identical + T_A[11 : 11 + identical.shape[0]] = identical + + ref_mp = naive.stump(T_A, m, exclusion_zone=zone, row_wise=True) + comp_mp = stump(T_A, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + +def test_stump_identical_subsequence_self_join_rare_cases_2(): + m = 3 + zone = int(np.ceil(m / 4)) + + seed_values = [27343, 84451] + for seed in seed_values: + np.random.seed(seed) + + identical = np.random.rand(8) + T_A = np.random.rand(20) + T_A[1 : 1 + identical.shape[0]] = identical * 0.001 + T_A[11 : 11 + identical.shape[0]] = identical * 1000 + + ref_mp = naive.stump(T_A, m, exclusion_zone=zone, row_wise=True) + comp_mp = stump(T_A, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + +def test_stump_identical_subsequence_self_join_rare_cases_3(): + m = 3 + zone = int(np.ceil(m / 4)) + + seed_values = [27343, 84451] + for seed in seed_values: + np.random.seed(seed) + + identical = np.random.rand(8) + T_A = np.random.rand(20) + T_A[1 : 1 + identical.shape[0]] = identical * 0.00001 + T_A[11 : 11 + identical.shape[0]] = identical * 100000 + + ref_mp = naive.stump(T_A, m, exclusion_zone=zone, row_wise=True) + comp_mp = stump(T_A, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + comp_mp = stump(pd.Series(T_A), m, ignore_trivial=True) + naive.replace_inf(comp_mp) + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices + + +def test_stump_volatile(): + m = 3 + zone = int(np.ceil(m / 4)) + + seed_values = [1] + for seed in seed_values: + np.random.seed(seed) + T = np.random.rand(64) + scale = np.random.choice(np.array([0.001, 0, 1000]), len(T), replace=True) + T[:] = T * scale + + ref_mp = naive.stump(T, m, exclusion_zone=zone, row_wise=True) + comp_mp = stump(T, m, ignore_trivial=True) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + + npt.assert_almost_equal( + ref_mp[:, 0], comp_mp[:, 0], decimal=config.STUMPY_TEST_PRECISION + ) # ignore indices