Commit fd9904a

Merge pull request #6655 from gchq/6618-disable-datafusion-readahead

6618: Disable DataFusion readahead store by default

2 parents c107537 + d03a14b, commit fd9904a

File tree: 7 files changed (+7, -7 lines changed)

docs/usage/properties/instance/user/table_property_defaults.md

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ The following instance properties relate to default values used by table propert
 | sleeper.default.table.parquet.dictionary.encoding.value.fields | Whether dictionary encoding should be used for value columns in the Parquet files. | false | false |
 | sleeper.default.table.parquet.columnindex.truncate.length | Used to set parquet.columnindex.truncate.length, see documentation here:<br>https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md<br>The length in bytes to truncate binary values in a column index. | 128 | false |
 | sleeper.default.table.parquet.statistics.truncate.length | Used to set parquet.statistics.truncate.length, see documentation here:<br>https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md<br>The length in bytes to truncate the min/max binary values in row groups. | 2147483647 | false |
-| sleeper.default.table.datafusion.s3.readahead.enabled | Enables a cache of data when reading from S3 with the DataFusion data engine, to hold data in larger blocks than are requested by DataFusion. | true | false |
+| sleeper.default.table.datafusion.s3.readahead.enabled | Enables a cache of data when reading from S3 with the DataFusion data engine, to hold data in larger blocks than are requested by DataFusion. | false | false |
 | sleeper.default.table.parquet.writer.version | Used to set parquet.writer.version, see documentation here:<br>https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md<br>Can be either v1 or v2. The v2 pages store levels uncompressed while v1 pages compress levels with the data. | v2 | false |
 | sleeper.default.table.parquet.rowgroup.rows.max | Maximum number of rows to write in a Parquet row group. | 100000 | false |
 | sleeper.default.table.statestore.transactionlog.add.transaction.max.attempts | The number of attempts to make when applying a transaction to the state store. This default can be overridden by a table property. | 10 | false |

docs/usage/properties/table/data_storage.md

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ The following table properties relate to the storage of data inside a table.
 | sleeper.table.parquet.dictionary.encoding.value.fields | Whether dictionary encoding should be used for value columns in the Parquet files. | false |
 | sleeper.table.parquet.columnindex.truncate.length | Used to set parquet.columnindex.truncate.length, see documentation here:<br>https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md<br>The length in bytes to truncate binary values in a column index. | 128 |
 | sleeper.table.parquet.statistics.truncate.length | Used to set parquet.statistics.truncate.length, see documentation here:<br>https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md<br>The length in bytes to truncate the min/max binary values in row groups. | 2147483647 |
-| sleeper.table.datafusion.s3.readahead.enabled | Enables a cache of data when reading from S3 with the DataFusion data engine, to hold data in larger blocks than are requested by DataFusion. | true |
+| sleeper.table.datafusion.s3.readahead.enabled | Enables a cache of data when reading from S3 with the DataFusion data engine, to hold data in larger blocks than are requested by DataFusion. | false |
 | sleeper.table.parquet.writer.version | Used to set parquet.writer.version, see documentation here:<br>https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md<br>Can be either v1 or v2. The v2 pages store levels uncompressed while v1 pages compress levels with the data. | v2 |
 | sleeper.table.parquet.query.column.index.enabled | Used during Sleeper queries to determine whether the column/offset indexes (also known as page indexes) are read from Parquet files. For some queries, e.g. single/few row lookups this can improve performance by enabling more aggressive pruning. On range queries, especially on large tables this can harm performance, since readers will read the extra index data before returning results, but with little benefit from pruning. | false |
 | sleeper.table.parquet.rowgroup.rows.max | Maximum number of rows to write in a Parquet row group. | 100000 |

example/full/instance.properties

Lines changed: 1 addition & 1 deletion

@@ -1821,7 +1821,7 @@ sleeper.logging.root.level=INFO
 # Enables a cache of data when reading from S3 with the DataFusion data engine, to hold data in larger
 # blocks than are requested by DataFusion.
 # (default value shown below, uncomment to set a value)
-# sleeper.default.table.datafusion.s3.readahead.enabled=true
+# sleeper.default.table.datafusion.s3.readahead.enabled=false

 # Used to set parquet.writer.version, see documentation here:
 # https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md

example/full/table.properties

Lines changed: 1 addition & 1 deletion

@@ -165,7 +165,7 @@ sleeper.table.statestore.classname=DynamoDBTransactionLogStateStore
 # Enables a cache of data when reading from S3 with the DataFusion data engine, to hold data in larger
 # blocks than are requested by DataFusion.
 # (default value shown below, uncomment to set a value)
-# sleeper.table.datafusion.s3.readahead.enabled=true
+# sleeper.table.datafusion.s3.readahead.enabled=false

 # Used to set parquet.writer.version, see documentation here:
 # https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md
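Taken together, these files show the new out-of-the-box behaviour: the DataFusion S3 readahead cache is now disabled unless explicitly enabled. Based only on the properties appearing in this diff, opting back in would mean uncommenting the relevant line and setting it to true, either per table or instance-wide:

```properties
# Per-table override in table.properties: re-enable the DataFusion S3 readahead cache
sleeper.table.datafusion.s3.readahead.enabled=true

# Or restore the old behaviour instance-wide in instance.properties
sleeper.default.table.datafusion.s3.readahead.enabled=true
```

The per-table property takes effect for that table only, while the `sleeper.default.*` form changes the default that tables inherit.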

java/core/src/main/java/sleeper/core/properties/instance/TableDefaultProperty.java

Lines changed: 1 addition & 1 deletion

@@ -89,7 +89,7 @@ public interface TableDefaultProperty {
 UserDefinedInstanceProperty DEFAULT_DATAFUSION_S3_READAHEAD_ENABLED = Index.propertyBuilder("sleeper.default.table.datafusion.s3.readahead.enabled")
         .description("Enables a cache of data when reading from S3 with the DataFusion data engine, to hold data " +
                 "in larger blocks than are requested by DataFusion.")
-        .defaultValue("true")
+        .defaultValue("false")
         .validationPredicate(SleeperPropertyValueUtils::isTrueOrFalse)
         .propertyGroup(InstancePropertyGroup.TABLE_PROPERTY_DEFAULT).build();
 UserDefinedInstanceProperty DEFAULT_PARQUET_WRITER_VERSION = Index.propertyBuilder("sleeper.default.table.parquet.writer.version")
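The Java change is the single source of truth here: only the string passed to `.defaultValue(...)` flips, and the documentation and templates in the other files are regenerated from it. As a rough illustration of how a builder like this keeps such a change safe, here is a hypothetical, much-simplified stand-in (the names `BoolProperty` and `Builder` are invented for this sketch; Sleeper's real `UserDefinedInstanceProperty` API is richer): a predicate equivalent to `SleeperPropertyValueUtils::isTrueOrFalse` rejects any default that is not exactly "true" or "false".

```java
import java.util.function.Predicate;

// Hypothetical, simplified sketch of the property-builder pattern seen in
// TableDefaultProperty.java above. Not Sleeper's actual implementation.
final class BoolProperty {
    private final String name;
    private final String defaultValue;

    private BoolProperty(String name, String defaultValue) {
        this.name = name;
        this.defaultValue = defaultValue;
    }

    static Builder builder(String name) {
        return new Builder(name);
    }

    String name() {
        return name;
    }

    String defaultValue() {
        return defaultValue;
    }

    static final class Builder {
        private final String name;
        private String defaultValue;
        // Stand-in for SleeperPropertyValueUtils::isTrueOrFalse
        private final Predicate<String> isTrueOrFalse =
                v -> "true".equals(v) || "false".equals(v);

        private Builder(String name) {
            this.name = name;
        }

        Builder defaultValue(String value) {
            // Validate eagerly so an invalid default fails at class load,
            // not at read time.
            if (!isTrueOrFalse.test(value)) {
                throw new IllegalArgumentException("not a boolean value: " + value);
            }
            this.defaultValue = value;
            return this;
        }

        BoolProperty build() {
            return new BoolProperty(name, defaultValue);
        }
    }
}
```

Under this sketch, changing `"true"` to `"false"` in one place is all the PR needs, and a typo such as `"flase"` would be caught by the validation predicate rather than silently disabling the feature.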

scripts/templates/instanceproperties.template

Lines changed: 1 addition & 1 deletion

@@ -1822,7 +1822,7 @@ sleeper.subnets=set-automatically
 # Enables a cache of data when reading from S3 with the DataFusion data engine, to hold data in larger
 # blocks than are requested by DataFusion.
 # (default value shown below, uncomment to set a value)
-# sleeper.default.table.datafusion.s3.readahead.enabled=true
+# sleeper.default.table.datafusion.s3.readahead.enabled=false

 # Used to set parquet.writer.version, see documentation here:
 # https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md

scripts/templates/tableproperties.template

Lines changed: 1 addition & 1 deletion

@@ -149,7 +149,7 @@ sleeper.table.name=changeme
 # Enables a cache of data when reading from S3 with the DataFusion data engine, to hold data in larger
 # blocks than are requested by DataFusion.
 # (default value shown below, uncomment to set a value)
-# sleeper.table.datafusion.s3.readahead.enabled=true
+# sleeper.table.datafusion.s3.readahead.enabled=false

 # Used to set parquet.writer.version, see documentation here:
 # https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md
