Commit b879ee1

[GPU][Codegen] Expand iteration space based on new expand_dims attribute (#22342)
This patch introduces iteration space expansion for reductions in the VectorDistribute path. Specifically, we:

1. Add a new attribute, `expand_dims`, for reductions.
2. Introduce a new pass, `GPUExpandDimensions`, which uses `expand_dims` to expand the iteration space of the relevant dimensions.
3. Refactor common functionality shared between `GPUExpandDimensions` and `BlockDynamicDimensions` into reusable utilities.
4. Refactor encoding helpers from `EncodingAttrs.cpp` into reusable utilities.

This change also enables [chain FMA](#21855) in matvec codegen as we iterate along the K reduction dimension.

---

**Performance Summary**

**IREE benchmark module**

* Expansion only: ~4% improvement
* Expansion + chain FMA: ~11% improvement

**rocprof**

* Expansion only: ~13% worse
* Expansion + chain FMA: ~9% better

**Register usage**

* 10% reduction (60 → 54 registers for matvec dispatches)

**Instruction latency (post-reduction loop epilogue)**

* 3.5% improvement (340 → 328 total mean latency)

---

**Notes**

* As a follow-up, we can explore applying iteration space expansion to the reduction in attention.
* Right now, we only expand one dimension into two, although the implementation supports expansion to N dimensions.
* Note that this PR changes the reduction order, so expect some minor changes in numerics.
* This does not improve performance by itself and can cause regressions without chain FMA (#21855).

Traces for matvec dispatches are attached for all variations (original, expansion only, and expansion + chain FMA):

* [115_expansion_and_chain.tar.gz](https://github.com/user-attachments/files/23268046/115_expansion_and_chain.tar.gz)
* [115_nothing.tar.gz](https://github.com/user-attachments/files/23268047/115_nothing.tar.gz)
* [115_only_expansion.tar.gz](https://github.com/user-attachments/files/23268048/115_only_expansion.tar.gz)

Fixes: #22153

ci-extra: test_torch

Signed-off-by: Eric Feng <[email protected]>
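For illustration, a minimal sketch of what the pass materializes around an annotated operand, mirroring the example in the `GPUExpandDimensions.cpp` comments below (the shapes and value names are illustrative, not taken from a real dispatch):

```mlir
// Reassociation [[0], [1, 2]] splits the 128-sized reduction dimension
// of %operand into 16x8.
%expanded = tensor.expand_shape %operand [[0], [1, 2]]
    : tensor<4x128xf32> into tensor<4x16x8xf32>
// The barrier keeps the inverse reshape pair from folding away before
// reshape propagation has run.
%barrier = util.optimization_barrier %expanded
%collapsed = tensor.collapse_shape %barrier [[0], [1, 2]]
    : tensor<4x16x8xf32> into tensor<4x128xf32>
// The annotated op now consumes %collapsed; bubbling the expand_shape
// upward then rewrites the op itself over the expanded iteration space.
```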
1 parent 8dadc59 commit b879ee1

27 files changed (+902 −203 lines)

compiler/src/iree/compiler/Codegen/Common/BlockDynamicDimensions.cpp

Lines changed: 1 addition & 16 deletions
```diff
@@ -36,17 +36,6 @@ using TensorDivisibilityInfo =
 
 namespace {
 
-struct RemoveOptimizationBarrier final
-    : public OpRewritePattern<IREE::Util::OptimizationBarrierOp> {
-  using Base::Base;
-
-  LogicalResult matchAndRewrite(IREE::Util::OptimizationBarrierOp barrierOp,
-                                PatternRewriter &rewriter) const override {
-    rewriter.replaceOp(barrierOp, barrierOp.getOperands());
-    return success();
-  }
-};
-
 /// This pass is used to materialize information about dynamic dimensions of
 /// `tensor` operands of an operation in the IR. If a dynamic dimension is
 /// known to be a multiple of a compile-time constant value, this pass
@@ -110,10 +99,6 @@ getTensorDivisibilityInfo(const TensorDynamicDimAnalysis &dynamicDimAnalysis,
 /// inverses of each other. The `util.optimization.barrier` avoid these from
 /// getting folded away during reshape propagation. Return the result of the
 /// `tensor.collapse_shape` generated.
-struct ReshapeOps {
-  tensor::ExpandShapeOp expandShapeOp;
-  tensor::CollapseShapeOp collapseShapeOp;
-};
 static std::optional<ReshapeOps>
 blockDynamicDimensionsOfValue(RewriterBase &rewriter,
                               const TensorDivisibilityInfo &divisibilityInfo,
@@ -413,7 +398,7 @@ void BlockDynamicDimensionsPass::runOnOperation() {
   // Delete the optimization barrier and run some further cleanup.
   {
     RewritePatternSet removeBarrierOpsPatterns(context);
-    removeBarrierOpsPatterns.insert<RemoveOptimizationBarrier>(context);
+    populateRemoveOptimizationBarrierPatterns(removeBarrierOpsPatterns);
     tensor::ExpandShapeOp::getCanonicalizationPatterns(removeBarrierOpsPatterns,
                                                        context);
     tensor::CollapseShapeOp::getCanonicalizationPatterns(
```
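The `RemoveOptimizationBarrier` pattern deleted above replaces each `util.optimization_barrier` with its operands; the same rewrite is presumably now registered through the shared `populateRemoveOptimizationBarrierPatterns` utility from the refactor mentioned in the commit message. In IR terms, the cleanup it unlocks looks like this (illustrative values and shapes):

```mlir
// Before cleanup: the barrier pins the inverse reshape pair in place.
%e = tensor.expand_shape %v [[0], [1, 2]]
    : tensor<4x128xf32> into tensor<4x16x8xf32>
%b = util.optimization_barrier %e
%c = tensor.collapse_shape %b [[0], [1, 2]]
    : tensor<4x16x8xf32> into tensor<4x128xf32>
// After: %b is replaced by %e, and any expand/collapse pair that survived
// reshape propagation folds away as inverses, leaving direct uses of %v.
```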

compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel

Lines changed: 2 additions & 0 deletions
```diff
@@ -74,6 +74,7 @@ iree_compiler_cc_library(
         "GPUDistributeScfFor.cpp",
         "GPUDistributeSharedMemoryCopy.cpp",
         "GPUDistributionPatterns.cpp",
+        "GPUExpandDimensions.cpp",
         "GPUFuseAndHoistParallelLoops.cpp",
         "GPUGeneralizeNamedOps.cpp",
         "GPUGreedilyDistributeToThreads.cpp",
@@ -125,6 +126,7 @@ iree_compiler_cc_library(
         "//compiler/src/iree/compiler/Dialect/LinalgExt/Transforms",
         "//compiler/src/iree/compiler/Dialect/LinalgExt/Utils",
         "//compiler/src/iree/compiler/Dialect/TensorExt/IR",
+        "//compiler/src/iree/compiler/Dialect/Util/IR",
        "//compiler/src/iree/compiler/Utils",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AMDGPUDialect",
```
compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
```diff
@@ -67,6 +67,7 @@ iree_cc_library(
     "GPUDistributeScfFor.cpp"
     "GPUDistributeSharedMemoryCopy.cpp"
     "GPUDistributionPatterns.cpp"
+    "GPUExpandDimensions.cpp"
     "GPUFuseAndHoistParallelLoops.cpp"
     "GPUGeneralizeNamedOps.cpp"
     "GPUGreedilyDistributeToThreads.cpp"
@@ -159,6 +160,7 @@ iree_cc_library(
     iree::compiler::Dialect::LinalgExt::Transforms
     iree::compiler::Dialect::LinalgExt::Utils
     iree::compiler::Dialect::TensorExt::IR
+    iree::compiler::Dialect::Util::IR
     iree::compiler::Utils
   PUBLIC
 )
```
compiler/src/iree/compiler/Codegen/Common/GPU/GPUExpandDimensions.cpp

Lines changed: 290 additions & 0 deletions

```cpp
// Copyright 2025 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/Common/Transforms.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Dialect/LinalgExt/Transforms/Transforms.h"
#include "iree/compiler/Dialect/Util/IR/UtilDialect.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/Support/DebugLog.h"
#include "llvm/Support/LogicalResult.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/BuiltinTypeInterfaces.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

#define DEBUG_TYPE "iree-codegen-gpu-expand-dimensions"

namespace mlir::iree_compiler {

#define GEN_PASS_DEF_GPUEXPANDDIMENSIONSPASS
#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"

namespace {

struct GPUExpandDimensionsPass final
    : impl::GPUExpandDimensionsPassBase<GPUExpandDimensionsPass> {
  using Base::Base;
  void runOnOperation() override;
};
} // namespace

// Compute the expanded shape for a reassociation group. Requires the original
// dimension to be static and evenly divisible by the product of static factors
// in the target shape.
static FailureOr<SmallVector<OpFoldResult>> computeExpandedGroupShape(
    RewriterBase &rewriter, Location loc, OpFoldResult origDimSize,
    ArrayRef<int64_t> groupTargetShape, unsigned iteratorDim) {
  if (groupTargetShape.size() == 1) {
    return SmallVector<OpFoldResult>{origDimSize};
  }

  std::optional<int64_t> staticOrigDim = getConstantIntValue(origDimSize);
  if (!staticOrigDim) {
    return rewriter.notifyMatchFailure(
        loc, "dimension " + Twine(iteratorDim) +
                 " is dynamic, but expand_dims requires static dimensions");
  }

  int64_t staticFactor = llvm::product_of(
      llvm::make_filter_range(groupTargetShape, ShapedType::isStatic));

  if (staticFactor < 1) {
    return rewriter.notifyMatchFailure(
        loc, "invalid expansion factor " + Twine(staticFactor) +
                 " for iterator dimension " + Twine(iteratorDim));
  }

  if (staticOrigDim.value() % staticFactor != 0) {
    return rewriter.notifyMatchFailure(
        loc, "dimension " + Twine(iteratorDim) +
                 " (size=" + Twine(staticOrigDim.value()) +
                 ") not divisible by expansion factor " + Twine(staticFactor));
  }

  return llvm::map_to_vector(
      groupTargetShape, [&](int64_t size) -> OpFoldResult {
        if (ShapedType::isStatic(size)) {
          return rewriter.getIndexAttr(size);
        }
        AffineExpr s0 = rewriter.getAffineSymbolExpr(0);
        return affine::makeComposedFoldedAffineApply(
            rewriter, loc, s0.floorDiv(staticFactor), {origDimSize});
      });
}

// For an operation annotated with the `expand_dims` attribute, replace
// relevant operands with tensor.expand_shape/tensor.collapse_shape pair to
// materialize dimension expansion according to the reassociation and
// output_shape defined in the attribute.
//
// Example:
//
// ```mlir
// %0 = <some_op>(..., %0, ...) {
//   lowering_config = #iree_gpu.lowering_config<{
//     expand_dims = #iree_gpu.expand_dims<
//         [[0], [1, 2]], output_shape = [?, ?, 8]>
//   }>
// } : ... -> tensor<4x128xf32>
// ```
//
// becomes:
//
// ```mlir
// %expanded = tensor.expand_shape %0 [[0], [1, 2]]
//     : tensor<4x128xf32> into tensor<4x16x8xf32>
// %barrier = util.optimization_barrier %expanded
// %collapsed = tensor.collapse_shape %barrier [[0], [1, 2]]
//     : tensor<4x16x8xf32> into tensor<4x128xf32>
// %1 = <some_op>(..., %collapsed, ...) : ... -> tensor<4x128xf32>
// ```
static std::optional<ReshapeOps>
createDimensionExpansionOps(RewriterBase &rewriter,
                            IREE::GPU::DimensionExpansionAttr config, Value v,
                            AffineMap indexingMap, linalg::LinalgOp op) {
  auto tensorType = dyn_cast<RankedTensorType>(v.getType());
  if (!tensorType) {
    return std::nullopt;
  }

  Location loc = v.getLoc();
  MLIRContext *ctx = op.getContext();
  int64_t tensorRank = tensorType.getRank();
  ArrayRef<int64_t> outputShape = config.getOutputShape().asArrayRef();
  SmallVector<OpFoldResult> origShape = tensor::getMixedSizes(rewriter, loc, v);

  // Map each tensor dimension to its expanded shape components.
  SmallVector<SmallVector<OpFoldResult>> expandedShapes(tensorRank);
  for (auto [iterDim, reassocIndices] :
       llvm::enumerate(config.getReassociationIndices())) {
    std::optional<unsigned> tensorDim =
        indexingMap.getResultPosition(getAffineDimExpr(iterDim, ctx));
    if (!tensorDim.has_value()) {
      continue;
    }

    auto groupOutputShape = llvm::map_to_vector(
        reassocIndices, [&](int64_t i) { return outputShape[i]; });

    FailureOr<SmallVector<OpFoldResult>> groupShape = computeExpandedGroupShape(
        rewriter, loc, origShape[tensorDim.value()], groupOutputShape, iterDim);
    if (failed(groupShape)) {
      return std::nullopt;
    }

    expandedShapes[tensorDim.value()] = std::move(groupShape.value());
  }

  // Build reassociation indices and expanded shape in tensor dimension order.
  SmallVector<ReassociationIndices> reassociation;
  SmallVector<OpFoldResult> expandedShape;
  for (auto [tensorDim, expanded] : llvm::enumerate(expandedShapes)) {
    ReassociationIndices &indices = reassociation.emplace_back();
    auto addDim = [&](OpFoldResult dim) {
      indices.push_back(expandedShape.size());
      expandedShape.push_back(dim);
    };
    if (expanded.empty()) {
      addDim(origShape[tensorDim]);
    } else {
      llvm::for_each(expanded, addDim);
    }
  }

  // If no expansion is needed, return early.
  if (llvm::equal(origShape, expandedShape)) {
    return std::nullopt;
  }

  auto staticShape = llvm::map_to_vector(expandedShape, [](OpFoldResult ofr) {
    return getConstantIntValue(ofr).value();
  });

  auto expandedType = RankedTensorType::get(
      staticShape, tensorType.getElementType(), tensorType.getEncoding());

  auto expandOp = tensor::ExpandShapeOp::create(rewriter, loc, expandedType, v,
                                                reassociation, expandedShape);
  Value barrier = IREE::Util::OptimizationBarrierOp::create(
                      rewriter, loc, expandOp.getResult())
                      .getResult(0);
  auto collapseOp = tensor::CollapseShapeOp::create(rewriter, loc, tensorType,
                                                    barrier, reassociation);

  return ReshapeOps{expandOp, collapseOp};
}

static LogicalResult expandIterationSpace(RewriterBase &rewriter,
                                          linalg::LinalgOp op) {
  auto loweringConfig = getLoweringConfig<IREE::GPU::LoweringConfigAttr>(op);
  if (!loweringConfig) {
    return success();
  }
  auto config = IREE::GPU::getDimensionExpansion(loweringConfig);
  if (!config) {
    return success();
  }

  LDBG() << "Expanding dimensions for op: " << *op;

  for (OpOperand &operand : op->getOpOperands()) {
    AffineMap indexingMap = op.getMatchingIndexingMap(&operand);
    std::optional<ReshapeOps> reshapes = createDimensionExpansionOps(
        rewriter, config, operand.get(), indexingMap, op);
    if (reshapes.has_value()) {
      rewriter.modifyOpInPlace(
          op, [&]() { operand.set(reshapes.value().collapseShapeOp); });
    }
  }

  return success();
}

void GPUExpandDimensionsPass::runOnOperation() {
  Operation *operation = getOperation();
  MLIRContext *context = &getContext();
  IRRewriter rewriter(context);

  SmallVector<linalg::LinalgOp> worklist;
  operation->walk([&](linalg::LinalgOp op) {
    if (auto cfg = getLoweringConfig<IREE::GPU::LoweringConfigAttr>(op)) {
      if (IREE::GPU::getDimensionExpansion(cfg)) {
        worklist.push_back(op);
      }
    }
  });

  for (linalg::LinalgOp op : worklist) {
    rewriter.setInsertionPoint(op);
    if (failed(expandIterationSpace(rewriter, op))) {
      return signalPassFailure();
    }
  }

  LDBG() << "After expanding dimensions: " << *operation;

  ConfigTrackingListener listener;
  GreedyRewriteConfig config;
  config.setListener(&listener);

  {
    RewritePatternSet bubbleExpandShapePatterns(context);
    linalg::ControlFusionFn controlFn = [](OpOperand *opOperand) {
      return !isa_and_nonnull<linalg::FillOp, tensor::EmptyOp>(
          opOperand->get().getDefiningOp());
    };
    linalg::populateFoldReshapeOpsByExpansionPatterns(bubbleExpandShapePatterns,
                                                      controlFn);
    IREE::LinalgExt::populateFoldReshapeOpsByExpansionPatterns(
        bubbleExpandShapePatterns, controlFn);
    tensor::populateFoldTensorEmptyPatterns(bubbleExpandShapePatterns);
    tensor::populateBubbleUpExpandShapePatterns(bubbleExpandShapePatterns);
    linalg::FillOp::getCanonicalizationPatterns(
        bubbleExpandShapePatterns, bubbleExpandShapePatterns.getContext());
    memref::populateResolveRankedShapedTypeResultDimsPatterns(
        bubbleExpandShapePatterns);
    if (failed(applyPatternsGreedily(
            operation, std::move(bubbleExpandShapePatterns), config))) {
      operation->emitOpError(
          "failed in application of bubble up expand shape patterns");
      return signalPassFailure();
    }
  }

  LDBG() << "After reshape propagation: " << *operation;

  {
    RewritePatternSet removeBarrierOpsPatterns(context);
    populateRemoveOptimizationBarrierPatterns(removeBarrierOpsPatterns);
    tensor::ExpandShapeOp::getCanonicalizationPatterns(removeBarrierOpsPatterns,
                                                       context);
    tensor::CollapseShapeOp::getCanonicalizationPatterns(
        removeBarrierOpsPatterns, context);
    tensor::populateFoldTensorEmptyPatterns(removeBarrierOpsPatterns);
    linalg::FillOp::getCanonicalizationPatterns(removeBarrierOpsPatterns,
                                                context);
    memref::populateResolveRankedShapedTypeResultDimsPatterns(
        removeBarrierOpsPatterns);
    if (failed(applyPatternsGreedily(operation,
                                     std::move(removeBarrierOpsPatterns)))) {
      operation->emitOpError("failed in cleanup patterns");
      return signalPassFailure();
    }
  }

  return;
}

} // namespace mlir::iree_compiler
```
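To make the intended end state concrete, here is a hypothetical before/after for a matvec-style reduction once expansion and the reshape-propagation patterns above have run. The op, maps, values (`%A`, `%x`, `#mapA`, etc.), and shapes are all illustrative, not taken from the patch:

```mlir
// Before: a single reduction dimension K = 128.
#mapA = affine_map<(m, k) -> (m, k)>
#mapX = affine_map<(m, k) -> (k)>
#mapY = affine_map<(m, k) -> (m)>
%y = linalg.generic
    {indexing_maps = [#mapA, #mapX, #mapY],
     iterator_types = ["parallel", "reduction"]}
    ins(%A, %x : tensor<4x128xf32>, tensor<128xf32>)
    outs(%init : tensor<4xf32>) {
^bb0(%a: f32, %b: f32, %acc: f32):
  %p = arith.mulf %a, %b : f32
  %s = arith.addf %acc, %p : f32
  linalg.yield %s : f32
} -> tensor<4xf32>

// After expansion with reassociation [[0], [1, 2]]: K is iterated as
// 16x8, and the serial inner 8-wide reduction is what feeds the
// chained-FMA lowering referenced in the commit message.
#mapA2 = affine_map<(m, k0, k1) -> (m, k0, k1)>
#mapX2 = affine_map<(m, k0, k1) -> (k0, k1)>
#mapY2 = affine_map<(m, k0, k1) -> (m)>
%y2 = linalg.generic
    {indexing_maps = [#mapA2, #mapX2, #mapY2],
     iterator_types = ["parallel", "reduction", "reduction"]}
    ins(%A_exp, %x_exp : tensor<4x16x8xf32>, tensor<16x8xf32>)
    outs(%init : tensor<4xf32>) {
^bb0(%a: f32, %b: f32, %acc: f32):
  %p = arith.mulf %a, %b : f32
  %s = arith.addf %acc, %p : f32
  linalg.yield %s : f32
} -> tensor<4xf32>
```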

compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td

Lines changed: 8 additions & 0 deletions
```diff
@@ -383,6 +383,14 @@ def GPUApplyPaddingLevelPass :
   ];
 }
 
+def GPUExpandDimensionsPass :
+    InterfacePass<"iree-codegen-gpu-expand-dimensions", "mlir::FunctionOpInterface"> {
+  let summary = "Pass to expand tensor op dims based on `expand_dims` lowering_config";
+  let dependentDialects = [
+    "::mlir::iree_compiler::IREE::Util::UtilDialect"
+  ];
+}
+
 def GPUTensorTileToSerialLoopsPass :
     InterfacePass<"iree-codegen-gpu-tensor-tile-to-serial-loops", "mlir::FunctionOpInterface"> {
   let summary = "Pass to tile reduction dimensions for certain GPU ops";
```
compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel

Lines changed: 1 addition & 0 deletions
```diff
@@ -36,6 +36,7 @@ iree_lit_test_suite(
         "gpu_distribute_forall.mlir",
         "gpu_distribute_scf_for.mlir",
         "gpu_distribute_shared_memory.mlir",
+        "gpu_expand_dimensions.mlir",
         "gpu_fuse_and_hoist_forall.mlir",
         "gpu_generalize_named_ops.mlir",
         "gpu_greedily_distribute_to_threads.mlir",
```

compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -32,6 +32,7 @@ iree_lit_test_suite(
     "gpu_distribute_forall.mlir"
     "gpu_distribute_scf_for.mlir"
     "gpu_distribute_shared_memory.mlir"
+    "gpu_expand_dimensions.mlir"
     "gpu_fuse_and_hoist_forall.mlir"
     "gpu_generalize_named_ops.mlir"
     "gpu_greedily_distribute_to_threads.mlir"
```
