[SPARK-53077][CORE][TESTS][FOLLOWUP] Reduce insertion count in SparkBloomFilterSuite

ishnagy · dongjoon-hyun · commit d72e02897617 · 2025-08-05T07:58:33.000-07:00
## Reduce insertion count in SparkBloomFilterSuite to mitigate long running time

### What changes were proposed in this pull request?

This change reduces the insertion count in the `SparkBloomFilterSuite` test suite to the bare minimum that's necessary to demonstrate the int truncation bug in the V1 version of `BloomFilterImpl`.

### Why are the changes needed?

#50933 introduced a new `SparkBloomFilterSuite` test suite which increased the test running time of the common/sketch module from about 7s to a whopping 12 minutes. This change is a workaround to decrease the test running time, until we can devise a way to trigger these long-running tests when (and only when) there are actual changes in `common/sketch`.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

The minimum insertion count was selected based on the following measurements with the V1 version of the `BloomFilterImpl`:

```
100M
testAccuracyRandomDistribution: acceptableFpp(3.000000 %) < actualFpp (3.050257 %) [00m18s] T: ~9.6%
testAccuracyEvenOdd: acceptableFpp(3.000000 %) < actualFpp (3.053887 %) [00m09s] T: ~9.3%
150M
testAccuracyRandomDistribution: acceptableFpp(3.000000 %) < actualFpp (3.080157 %) [00m28s] T: ~15.0%
testAccuracyEvenOdd: acceptableFpp(3.000000 %) < actualFpp (3.079987 %) [00m15s] T: ~15.4%
200M
testAccuracyRandomDistribution: acceptableFpp(3.000000 %) < actualFpp (3.861257 %) [00m37s] T: ~19.8%
testAccuracyEvenOdd: acceptableFpp(3.000000 %) < actualFpp (3.860424 %) [00m20s] T: ~20.6%
250M
testAccuracyRandomDistribution: acceptableFpp(3.000000 %) < actualFpp (3.676172 %) [00m47s] T: ~25.1%
testAccuracyEvenOdd: acceptableFpp(3.000000 %) < actualFpp (3.675387 %) [00m25s] T: ~25.8%
300M
testAccuracyRandomDistribution: acceptableFpp(3.000000 %) < actualFpp (3.210548 %) [00m57s] T: ~30.5%
testAccuracyEvenOdd: acceptableFpp(3.000000 %) < actualFpp (3.209847 %) [00m30s] T: ~30.1%
350M
testAccuracyRandomDistribution: acceptableFpp(3.000000 %) < actualFpp (5.377388 %) [01m07s] T: ~35.8%
testAccuracyEvenOdd: acceptableFpp(3.000000 %) < actualFpp (5.377483 %) [00m36s] T: ~37.1%
400M
testAccuracyRandomDistribution: acceptableFpp(3.000000 %) < actualFpp (8.170380 %) [01m17s] T: ~41.2%
testAccuracyEvenOdd: acceptableFpp(3.000000 %) < actualFpp (8.170716 %) [00m40s] T: ~41.2%
500M
testAccuracyRandomDistribution: acceptableFpp(3.000000 %) < actualFpp (15.392861 %) [01m36s] T: ~51.3%
testAccuracyEvenOdd: acceptableFpp(3.000000 %) < actualFpp (15.391692 %) [00m50s] T: ~51.5%
1G
testAccuracyRandomDistribution: acceptableFpp(3.000000 %) < actualFpp (59.890330 %) [03m07s] T: 100.0%
testAccuracyEvenOdd: acceptableFpp(3.000000 %) < actualFpp (59.888499 %) [01m37s] T: 100.0%
```

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #51845 from ishnagy/SPARK-53077_reenable_SparkBloomFilterSuite.

Authored-by: Ish Nagy <ish@ishnagy.eu>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
diff --git a/common/sketch/src/test/java/org/apache/spark/util/sketch/SparkBloomFilterSuite.java b/common/sketch/src/test/java/org/apache/spark/util/sketch/SparkBloomFilterSuite.java
@@ -40,8 +40,7 @@ public class SparkBloomFilterSuite {
 
   // the implemented fpp limit is only approximating the hard boundary,
   // so we'll need an error threshold for the assertion
-  final double FPP_EVEN_ODD_ERROR_FACTOR = 0.10;
-  final double FPP_RANDOM_ERROR_FACTOR = 0.10;
+  final double FPP_ACCEPTABLE_ERROR_FACTOR = 0.10;
 
   final long ONE_GB = 1024L * 1024L * 1024L;
   final long REQUIRED_HEAP_UPPER_BOUND_IN_BYTES = 4 * ONE_GB;
@@ -106,7 +105,7 @@ private static Stream<Arguments> dataPointProvider() {
     //   to reduce running time to acceptable levels, we test only one case,
     //   with the default FPP and the default seed only.
     return Stream.of(
-      Arguments.of(1_000_000_000L, 0.03, BloomFilterImplV2.DEFAULT_SEED)
+      Arguments.of(350_000_000L, 0.03, BloomFilterImplV2.DEFAULT_SEED)
     );
     // preferable minimum parameter space for tests:
     //   {1_000_000L, 1_000_000_000L}           for: long numItems
@@ -201,7 +200,7 @@ public void testAccuracyEvenOdd(
     );
 
     double actualFpp = mightContainOdd.doubleValue() / numItems;
-    double acceptableFpp = expectedFpp * (1 + FPP_EVEN_ODD_ERROR_FACTOR);
+    double acceptableFpp = expectedFpp * (1 + FPP_ACCEPTABLE_ERROR_FACTOR);
 
     testOut.printf("expectedFpp:   %f %%\n", 100 * expectedFpp);
     testOut.printf("acceptableFpp: %f %%\n", 100 * acceptableFpp);
@@ -279,6 +278,7 @@ public void testAccuracyRandomDistribution(
         deterministicSeed
       );
 
+    // V1 ignores custom seed values, so the control filter must be at least V2
     BloomFilter bloomFilterSecondary =
       BloomFilter.create(
         BloomFilter.Version.V2,
@@ -354,7 +354,7 @@ public void testAccuracyRandomDistribution(
 
     double actualFpp =
       mightContainOddIndexed.doubleValue() / confirmedAsNotInserted.doubleValue();
-    double acceptableFpp = expectedFpp * (1 + FPP_RANDOM_ERROR_FACTOR);
+    double acceptableFpp = expectedFpp * (1 + FPP_ACCEPTABLE_ERROR_FACTOR);
 
     testOut.printf("mightContainOddIndexed: %10d\n", mightContainOddIndexed.longValue());
     testOut.printf("confirmedAsNotInserted: %10d\n", confirmedAsNotInserted.longValue());