fixes

clintropolis · clintropolis · commit f12e2cfe6d95 · 2026-06-16T13:56:39.000-07:00
diff --git a/processing/src/main/java/org/apache/druid/data/input/impl/ClusteredValueGroupsBaseTableProjectionSpec.java b/processing/src/main/java/org/apache/druid/data/input/impl/ClusteredValueGroupsBaseTableProjectionSpec.java
@@ -32,6 +32,8 @@
 import org.apache.druid.query.aggregation.AggregatorFactory;
 import org.apache.druid.segment.VirtualColumns;
 import org.apache.druid.segment.column.ColumnHolder;
+import org.apache.druid.segment.column.ColumnType;
+import org.apache.druid.segment.column.ValueType;
 import org.apache.druid.utils.CollectionUtils;
 
 import javax.annotation.Nullable;
@@ -49,18 +51,19 @@
  * <p>
  * The operator declares a single ordered {@link #columns} list — the full set of columns in segment order, plus a
  * {@link #clusteringColumns} list of NAMES designating the leading prefix of {@link #columns} that rows are
- * clustered by. The time position is an explicit positional entry in {@link #columns} (named {@code __time}, or the
- * query-granularity column {@link Granularities#GRANULARITY_VIRTUAL_COLUMN_NAME}); clustering by the time column is not
- * yet supported, so the time marker must be a non-clustering column. A clustered base table is never rollup and has
- * no metric columns.
+ * clustered by. The time position is an explicit positional entry in {@link #columns} named {@code __time}; clustering
+ * by the time column is not yet supported, so {@code __time} must be a non-clustering column. A clustered base table
+ * is never rollup and has no metric columns.
  * <p>
  * {@link #getDimensionsSpec()} returns the unified spec built from {@link #columns} in declared order with
  * {@code forceSegmentSortByTime=false}; {@link #getOrdering()} is computed as every column of {@link #columns}
  * ascending, in list order.
  * <p>
- * Query granularity, when wanted, is just another entry in {@link #getVirtualColumns()} named
- * {@link Granularities#GRANULARITY_VIRTUAL_COLUMN_NAME}; absent that virtual column the query granularity is
- * {@code NONE}. Segment granularity and intervals live on the top-level
+ * Query granularity, when wanted, is a virtual column in {@link #getVirtualColumns()} named
+ * {@link Granularities#GRANULARITY_VIRTUAL_COLUMN_NAME}. It is a granularity <em>carrier</em>: it supplies the
+ * granularity that floors the stored {@code __time} column, and is NOT itself a stored column, so it never appears in
+ * {@link #columns} (declare {@code __time} there as the time column). Absent that virtual column the query granularity
+ * is {@code NONE}. Segment granularity and intervals live on the top-level
  * {@link org.apache.druid.indexer.granularity.SegmentGranularitySpec}, not here.
  */
 @JsonTypeName(ClusteredValueGroupsBaseTableProjectionSpec.TYPE_NAME)
@@ -180,9 +183,19 @@ private static void validate(List<DimensionSchema> columns, List<String> cluster
       throw clusteringPrefixException(columns, clusteringColumns);
     }
     for (int i = 0; i < clusteringColumns.size(); i++) {
-      if (!columns.get(i).getName().equals(clusteringColumns.get(i))) {
+      final DimensionSchema clusteringColumn = columns.get(i);
+      if (!clusteringColumn.getName().equals(clusteringColumns.get(i))) {
         throw clusteringPrefixException(columns, clusteringColumns);
       }
+      // Clustering values are dictionary-encoded into per-type dictionaries on the write side, which supports only
+      // these scalar types; reject anything else up front rather than failing later at ingest.
+      if (!isSupportedClusteringType(clusteringColumn.getColumnType())) {
+        throw InvalidInput.exception(
+            "clustering column [%s] has unsupported type [%s]; clustering columns must be STRING, LONG, DOUBLE, or FLOAT",
+            clusteringColumn.getName(),
+            clusteringColumn.getColumnType()
+        );
+      }
     }
 
     final Set<String> seen = Sets.newHashSetWithExpectedSize(columns.size());
@@ -193,40 +206,45 @@ private static void validate(List<DimensionSchema> columns, List<String> cluster
     }
 
     int timeIndex = -1;
-    boolean bothPresent = false;
     for (int i = 0; i < columns.size(); i++) {
       final String name = columns.get(i).getName();
-      if (ColumnHolder.TIME_COLUMN_NAME.equals(name) || Granularities.GRANULARITY_VIRTUAL_COLUMN_NAME.equals(name)) {
-        if (timeIndex >= 0) {
-          bothPresent = true;
-        }
+      // The query-granularity virtual column is a granularity carrier in virtualColumns (it floors the stored __time
+      // column); it is not itself a stored column, so it must not be declared in 'columns'.
+      if (Granularities.GRANULARITY_VIRTUAL_COLUMN_NAME.equals(name)) {
+        throw InvalidInput.exception(
+            "[%s] is the query-granularity virtual column, not a stored column; declare it in 'virtualColumns' and use"
+            + " [%s] as the time column in 'columns'",
+            Granularities.GRANULARITY_VIRTUAL_COLUMN_NAME,
+            ColumnHolder.TIME_COLUMN_NAME
+        );
+      }
+      if (ColumnHolder.TIME_COLUMN_NAME.equals(name)) {
         timeIndex = i;
       }
     }
     if (timeIndex < 0) {
       throw InvalidInput.exception(
-          "clustered base table must include %s (or the query-granularity column [%s]) in 'columns' to define the"
-          + " time position",
-          ColumnHolder.TIME_COLUMN_NAME,
-          Granularities.GRANULARITY_VIRTUAL_COLUMN_NAME
-      );
-    }
-    if (bothPresent) {
-      throw InvalidInput.exception(
-          "clustered base table must include exactly one of %s / %s in 'columns' to define the time position",
-          ColumnHolder.TIME_COLUMN_NAME,
-          Granularities.GRANULARITY_VIRTUAL_COLUMN_NAME
+          "clustered base table must include [%s] in 'columns' to define the time position",
+          ColumnHolder.TIME_COLUMN_NAME
       );
     }
     if (timeIndex < clusteringColumns.size()) {
       throw InvalidInput.exception(
-          "clustering by %s / %s is not yet supported; the time column must be a non-clustering column",
-          ColumnHolder.TIME_COLUMN_NAME,
-          Granularities.GRANULARITY_VIRTUAL_COLUMN_NAME
+          "clustering by [%s] is not yet supported; the time column must be a non-clustering column",
+          ColumnHolder.TIME_COLUMN_NAME
       );
     }
   }
 
+  private static boolean isSupportedClusteringType(ColumnType type)
+  {
+    return type != null
+           && (type.is(ValueType.STRING)
+               || type.is(ValueType.LONG)
+               || type.is(ValueType.DOUBLE)
+               || type.is(ValueType.FLOAT));
+  }
+
   private static DruidException clusteringPrefixException(
       List<DimensionSchema> columns,
       List<String> clusteringColumns
diff --git a/processing/src/main/java/org/apache/druid/segment/incremental/OnHeapClusterGroup.java b/processing/src/main/java/org/apache/druid/segment/incremental/OnHeapClusterGroup.java
@@ -74,6 +74,7 @@ public final class OnHeapClusterGroup implements IncrementalIndexRowSelector
   private final AtomicInteger numEntries = new AtomicInteger(0);
   private final int groupTimePosition;
 
+  private final VirtualColumns virtualColumns;
   private final ColumnSelectorFactory virtualSelectorFactory;
 
   OnHeapClusterGroup(
@@ -114,6 +115,7 @@ public final class OnHeapClusterGroup implements IncrementalIndexRowSelector
       this.factsHolder = new OnheapIncrementalIndex.PlainNonTimeOrderedFactsHolder(rowComparator);
     }
 
+    this.virtualColumns = virtualColumns;
     this.virtualSelectorFactory = new OnheapIncrementalIndex.CachingColumnSelectorFactory(
         IncrementalIndex.makeColumnSelectorFactory(virtualColumns, inputRowHolder, null)
     );
@@ -301,10 +303,16 @@ boolean addToFacts(
     long dimsKeySize = 0L;
     for (int i = 0; i < dimensions.size(); i++) {
       final IncrementalIndex.DimensionDesc desc = dimensions.get(i);
+      final String name = desc.getName();
+      // A column declared as a virtual-column output is computed through the (VC-aware) selector factory; a plain
+      // column is read straight from the raw row.
+      final Object dimValue = virtualColumns.exists(name)
+                              ? virtualSelectorFactory.makeColumnValueSelector(name).getObject()
+                              : row.getRaw(name);
       try {
         @SuppressWarnings({"unchecked", "rawtypes"})
         final EncodedKeyComponent<?> k = ((DimensionIndexer) desc.getIndexer())
-            .processRowValsToUnsortedEncodedKeyComponent(row.getRaw(desc.getName()), true);
+            .processRowValsToUnsortedEncodedKeyComponent(dimValue, true);
         groupDims[i] = k.getComponent();
         dimsKeySize += k.getEffectiveSizeBytes();
       }
diff --git a/processing/src/main/java/org/apache/druid/segment/incremental/OnheapIncrementalIndex.java b/processing/src/main/java/org/apache/druid/segment/incremental/OnheapIncrementalIndex.java
@@ -677,8 +677,7 @@ public void close()
 
   /**
    * Caches references to selector objects for each column instead of creating a new object each time in order to save
-   * heap space. In general the selectorFactory need not to thread-safe. If required, set concurrentEventAdd to true to
-   * use concurrent hash map instead of vanilla hash map for thread-safe operations.
+   * heap space.
    */
   static class CachingColumnSelectorFactory implements ColumnSelectorFactory
   {
diff --git a/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexCursorFactoryClusteredTest.java b/processing/src/test/java/org/apache/druid/segment/incremental/IncrementalIndexCursorFactoryClusteredTest.java
@@ -27,17 +27,20 @@
 import org.apache.druid.java.util.common.DateTimes;
 import org.apache.druid.java.util.common.granularity.Granularities;
 import org.apache.druid.query.dimension.DefaultDimensionSpec;
+import org.apache.druid.query.expression.TestExprMacroTable;
 import org.apache.druid.query.filter.EqualityFilter;
 import org.apache.druid.query.filter.Filter;
 import org.apache.druid.query.filter.TypedInFilter;
 import org.apache.druid.segment.Cursor;
 import org.apache.druid.segment.CursorBuildSpec;
 import org.apache.druid.segment.CursorHolder;
 import org.apache.druid.segment.DimensionSelector;
+import org.apache.druid.segment.VirtualColumns;
 import org.apache.druid.segment.column.ColumnHolder;
 import org.apache.druid.segment.column.ColumnType;
 import org.apache.druid.segment.column.RowSignature;
 import org.apache.druid.segment.filter.AndFilter;
+import org.apache.druid.segment.virtual.ExpressionVirtualColumn;
 import org.apache.druid.testing.InitializedNullHandlingTest;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
@@ -114,6 +117,65 @@ private static List<List<String>> scanTenantRegion(CursorHolder holder)
     return out;
   }
 
+  @Test
+  void testNonClusteringVirtualColumnDimensionIsMaterialized()
+  {
+    // A non-clustering column declared as a virtual-column output (region_upper = upper(region)) is computed at
+    // ingest through the VC-aware selector and stored like any other column — VCs aren't limited to clustering
+    // columns. region_upper is never in the input row, so a null here would mean the VC was not applied.
+    final ClusteredValueGroupsBaseTableProjectionSpec spec = ClusteredValueGroupsBaseTableProjectionSpec.builder()
+        .virtualColumns(VirtualColumns.create(
+            new ExpressionVirtualColumn("region_upper", "upper(region)", ColumnType.STRING, TestExprMacroTable.INSTANCE)
+        ))
+        .columns(
+            new StringDimensionSchema("tenant"),
+            new StringDimensionSchema("region"),
+            new StringDimensionSchema("region_upper"),
+            new LongDimensionSchema("__time")
+        )
+        .clusteringColumns("tenant")
+        .build();
+    final IncrementalIndexSchema schema = IncrementalIndexSchema.builder()
+        .withMinTimestamp(T0)
+        .withTimestampSpec(TIMESTAMP_SPEC)
+        .withQueryGranularity(Granularities.NONE)
+        .withDimensionsSpec(spec.getDimensionsSpec())
+        .withRollup(false)
+        .withClusterSpec(spec)
+        .build();
+    try (OnheapIncrementalIndex index = (OnheapIncrementalIndex) new OnheapIncrementalIndex.Builder()
+        .setIndexSchema(schema)
+        .setMaxRowCount(10_000)
+        .build()) {
+      index.add(row(T0, "acme", "us-east-1"));
+      index.add(row(T0 + 1, "acme", "us-west-2"));
+
+      final IncrementalIndexCursorFactory factory = new IncrementalIndexCursorFactory(index);
+      try (CursorHolder holder = factory.makeCursorHolder(CursorBuildSpec.FULL_SCAN)) {
+        final Cursor cursor = holder.asCursor();
+        final DimensionSelector regionSel =
+            cursor.getColumnSelectorFactory().makeDimensionSelector(DefaultDimensionSpec.of("region"));
+        final DimensionSelector upperSel =
+            cursor.getColumnSelectorFactory().makeDimensionSelector(DefaultDimensionSpec.of("region_upper"));
+        final List<List<String>> out = new ArrayList<>();
+        while (!cursor.isDone()) {
+          out.add(Arrays.asList(
+              regionSel.lookupName(regionSel.getRow().get(0)),
+              upperSel.lookupName(upperSel.getRow().get(0))
+          ));
+          cursor.advance();
+        }
+        Assertions.assertEquals(
+            List.of(
+                List.of("us-east-1", "US-EAST-1"),
+                List.of("us-west-2", "US-WEST-2")
+            ),
+            out
+        );
+      }
+    }
+  }
+
   @Test
   void testRowSignatureExposesClusteringAndNonClusteringColumns()
   {
diff --git a/processing/src/test/java/org/apache/druid/segment/incremental/OnHeapClusteredBaseTableTest.java b/processing/src/test/java/org/apache/druid/segment/incremental/OnHeapClusteredBaseTableTest.java
@@ -27,10 +27,12 @@
 import org.apache.druid.data.input.impl.LongDimensionSchema;
 import org.apache.druid.data.input.impl.StringDimensionSchema;
 import org.apache.druid.data.input.impl.TimestampSpec;
+import org.apache.druid.error.DruidException;
 import org.apache.druid.java.util.common.DateTimes;
 import org.apache.druid.java.util.common.Intervals;
 import org.apache.druid.java.util.common.granularity.Granularities;
 import org.apache.druid.query.expression.TestExprMacroTable;
+import org.apache.druid.segment.AutoTypeColumnSchema;
 import org.apache.druid.segment.IndexableAdapter;
 import org.apache.druid.segment.Metadata;
 import org.apache.druid.segment.VirtualColumns;
@@ -145,6 +147,61 @@ void testNewClusterGroupsGrowHeapEstimate()
     }
   }
 
+  @Test
+  void testQueryGranularityColumnRejectedInColumns()
+  {
+    // The query-granularity virtual column is a granularity carrier (it floors the stored __time column), not a stored
+    // column, so it must not be declared in 'columns' — only __time defines the time position.
+    final DruidException e = Assertions.assertThrows(
+        DruidException.class,
+        () -> ClusteredValueGroupsBaseTableProjectionSpec.builder()
+            .columns(
+                new StringDimensionSchema("tenant"),
+                new LongDimensionSchema(Granularities.GRANULARITY_VIRTUAL_COLUMN_NAME)
+            )
+            .clusteringColumns("tenant")
+            .build()
+    );
+    Assertions.assertTrue(
+        e.getMessage().contains(Granularities.GRANULARITY_VIRTUAL_COLUMN_NAME) && e.getMessage().contains("virtualColumns"),
+        e.getMessage()
+    );
+  }
+
+  @Test
+  void testUnsupportedClusteringColumnTypeRejected()
+  {
+    // Clustering values are dictionary-encoded by scalar type on the write side, so a non-scalar (here array) type is
+    // rejected at spec-validation time rather than failing later at ingest.
+    final DruidException e = Assertions.assertThrows(
+        DruidException.class,
+        () -> ClusteredValueGroupsBaseTableProjectionSpec.builder()
+            .columns(
+                new AutoTypeColumnSchema("tags", ColumnType.STRING_ARRAY, null),
+                new LongDimensionSchema("__time")
+            )
+            .clusteringColumns("tags")
+            .build()
+    );
+    Assertions.assertTrue(
+        e.getMessage().contains("tags") && e.getMessage().contains("STRING, LONG, DOUBLE"),
+        e.getMessage()
+    );
+  }
+
+  @Test
+  void testMissingTimeColumnRejected()
+  {
+    final DruidException e = Assertions.assertThrows(
+        DruidException.class,
+        () -> ClusteredValueGroupsBaseTableProjectionSpec.builder()
+            .columns(new StringDimensionSchema("tenant"), new StringDimensionSchema("region"))
+            .clusteringColumns("tenant")
+            .build()
+    );
+    Assertions.assertTrue(e.getMessage().contains("__time"), e.getMessage());
+  }
+
   @Test
   void testTwoColumnClusteringDistinguishesTuples()
   {

Original file line number	Diff line number	Diff line change
`@@ -677,8 +677,7 @@ public void close()`
`677`	`677`
`678`	`678`	`/**`
`679`	`679`	`* Caches references to selector objects for each column instead of creating a new object each time in order to save`
`680`		`- * heap space. In general the selectorFactory need not to thread-safe. If required, set concurrentEventAdd to true to`
`681`		`- * use concurrent hash map instead of vanilla hash map for thread-safe operations.`
	`680`	`+ * heap space.`
`682`	`681`	`*/`
`683`	`682`	`static class CachingColumnSelectorFactory implements ColumnSelectorFactory`
`684`	`683`	`{`