Skip to content

Commit 7c8f3a2

Browse files
committed
[Codegen] add FoldExtractSliceOfFillThroughBlockArg pattern to TileAndDistributeToWorkgroups
Signed-off-by: Bangtian Liu <[email protected]>
1 parent 7f7e190 commit 7c8f3a2

File tree

4 files changed

+153
-0
lines changed

4 files changed

+153
-0
lines changed

compiler/src/iree/compiler/Codegen/Common/TileDispatchUsingForall.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,7 @@ void TileAndDistributeToWorkgroupsUsingForallOpPass::runOnOperation() {
308308
// TODO(Max191): Replace populateSwapExtractWithExpandPattern with upstream
309309
// MLIR version once it is available (llvm-project/pull/126898).
310310
populateSwapExtractWithExpandPattern(cleanupPatterns);
311+
populateFoldExtractSliceOfFillThroughBlockArgPattern(cleanupPatterns);
311312
// When fusing pads we do not want to generate zeroSliceGuards when doing
312313
// workgroup tiling. In `GPUApplyTilingLevelPass` we do have an option called
313314
// `allowZeroSlices` that can control this but we do not want these
@@ -412,6 +413,7 @@ void TileAndDistributeToWorkgroupsUsingForallOpPass::runOnOperation() {
412413
{
413414
RewritePatternSet patterns(context);
414415
populateSwapExtractWithCollapsePattern(patterns);
416+
populateFoldExtractSliceOfFillThroughBlockArgPattern(patterns);
415417
linalg::populateLinalgTilingCanonicalizationPatterns(patterns);
416418
tensor::populateFoldTensorEmptyPatterns(patterns);
417419
context->getOrLoadDialect<tensor::TensorDialect>()

compiler/src/iree/compiler/Codegen/Common/Transforms.cpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "llvm/ADT/ScopeExit.h"
1111
#include "mlir/Analysis/SliceAnalysis.h"
1212
#include "mlir/Dialect/Affine/IR/AffineOps.h"
13+
#include "mlir/Dialect/Linalg/IR/Linalg.h"
1314
#include "mlir/Dialect/Utils/StaticValueUtils.h"
1415

1516
#define DEBUG_TYPE "iree-codegen-common-transforms"
@@ -354,6 +355,89 @@ void populateSwapExtractWithExpandPattern(RewritePatternSet &patterns) {
354355
patterns.add<SwapExpandShapeWithSlicePattern>(patterns.getContext());
355356
}
356357

358+
namespace {
359+
/// Pattern to fold extract_slice of a fill through a forall's block argument.
360+
/// When extracting a slice from a block argument where the init value is a
361+
/// linalg.fill, we update the forall's shared_outs to use the fill's
362+
/// destination (the empty tensor), and then create a fill on the extracted
363+
/// slice inside the loop body.
364+
///
365+
/// Example:
366+
/// %empty = tensor.empty() : tensor<4x1xf16>
367+
/// %fill = linalg.fill ins(%cst) outs(%empty) -> tensor<4x1xf16>
368+
/// scf.forall ... shared_outs(%arg = %fill) {
369+
/// %slice = tensor.extract_slice %arg[%i, 0] [1, 1] -> tensor<1x1xf16>
370+
/// ...
371+
/// }
372+
/// ->
373+
/// %empty = tensor.empty() : tensor<4x1xf16>
374+
/// scf.forall ... shared_outs(%arg = %empty) { // Updated to use %empty
375+
/// %extracted = tensor.extract_slice %arg[%i, 0] [1, 1] -> tensor<1x1xf16>
376+
/// %slice = linalg.fill ins(%cst) outs(%extracted) -> tensor<1x1xf16>
377+
/// ...
378+
/// }
379+
struct FoldExtractSliceOfFillThroughBlockArg final
380+
: OpRewritePattern<tensor::ExtractSliceOp> {
381+
using OpRewritePattern::OpRewritePattern;
382+
383+
LogicalResult matchAndRewrite(tensor::ExtractSliceOp extractOp,
384+
PatternRewriter &rewriter) const override {
385+
auto blockArg = dyn_cast<BlockArgument>(extractOp.getSource());
386+
if (!blockArg) {
387+
return rewriter.notifyMatchFailure(extractOp,
388+
"source is not a block argument");
389+
}
390+
391+
auto forallOp = dyn_cast<scf::ForallOp>(blockArg.getOwner()->getParentOp());
392+
if (!forallOp) {
393+
return rewriter.notifyMatchFailure(
394+
extractOp, "block argument is not from an scf.forall");
395+
}
396+
397+
unsigned argNum = blockArg.getArgNumber();
398+
unsigned numIVs = forallOp.getInductionVars().size();
399+
if (argNum < numIVs) {
400+
return rewriter.notifyMatchFailure(
401+
extractOp, "block argument is an induction variable, not shared_out");
402+
}
403+
404+
unsigned outputIdx = argNum - numIVs;
405+
if (outputIdx >= forallOp.getOutputs().size()) {
406+
return rewriter.notifyMatchFailure(extractOp,
407+
"invalid output index for block arg");
408+
}
409+
410+
Value initValue = forallOp.getOutputs()[outputIdx];
411+
412+
auto fillOp = initValue.getDefiningOp<linalg::FillOp>();
413+
if (!fillOp) {
414+
return rewriter.notifyMatchFailure(
415+
extractOp, "init value is not a linalg.fill operation");
416+
}
417+
418+
Value fillValue = fillOp.getInputs()[0];
419+
Value fillDest = fillOp.getOutputs()[0];
420+
rewriter.modifyOpInPlace(forallOp, [&]() {
421+
forallOp.getOutputsMutable()[outputIdx].set(fillDest);
422+
});
423+
424+
rewriter.setInsertionPointAfter(extractOp);
425+
Location loc = extractOp.getLoc();
426+
auto newFillOp =
427+
linalg::FillOp::create(rewriter, loc, fillValue, extractOp.getResult());
428+
rewriter.replaceAllUsesExcept(extractOp.getResult(), newFillOp.getResult(0),
429+
newFillOp);
430+
return success();
431+
}
432+
};
433+
434+
} // namespace
435+
436+
void populateFoldExtractSliceOfFillThroughBlockArgPattern(
437+
RewritePatternSet &patterns) {
438+
patterns.add<FoldExtractSliceOfFillThroughBlockArg>(patterns.getContext());
439+
}
440+
357441
/// Note the following pattern is adapted from the upstream pattern
358442
/// `BubbleUpCollapseShapeThroughExtractSlice` by allowing some special cases.
359443
///

compiler/src/iree/compiler/Codegen/Common/Transforms.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,12 @@ void populateReplaceSlowMinMaxOpsPatterns(RewritePatternSet &patterns);
193193
/// `tensor.expand_shape(tensor.extract_slice)`.
194194
void populateSwapExtractWithExpandPattern(RewritePatternSet &patterns);
195195

196+
/// Populate pattern to fold `tensor.extract_slice` of a `linalg.fill` through
197+
/// an `scf.forall` shared_out block argument: the forall's init is redirected
198+
/// to the fill's destination and a `linalg.fill` is created on the extracted
/// slice inside the loop body.
199+
void populateFoldExtractSliceOfFillThroughBlockArgPattern(
200+
RewritePatternSet &patterns);
201+
196202
/// Populate pattern to convert `tensor.extract_slice(tensor.collapse_shape)` to
197203
/// `tensor.collapse_shape(tensor.extract_slice)`.
198204
void populateSwapExtractWithCollapsePattern(RewritePatternSet &patterns);

compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_workgroups_using_forall.mlir

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1364,3 +1364,64 @@ attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPU
13641364
// CHECK: scf.forall.in_parallel
13651365
// CHECK: tensor.parallel_insert_slice %[[RES]] into %[[OUT0]][%[[OFFSET0]], 0, %[[OFFSET1]]]
13661366
// CHECK: {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
1367+
1368+
// -----
1369+
1370+
// Test for FoldExtractSliceOfFillThroughBlockArgPattern:
1371+
// When a forall's shared_out init is a linalg.fill, and we extract a slice
1372+
// from the block argument, the pattern should:
1373+
// 1. Change the forall's init to use the fill's destination (empty tensor)
1374+
// 2. Create a new fill on the extracted slice inside the loop
1375+
1376+
#config_fill_fold = #iree_codegen.lowering_config<tile_sizes = [[1, 8]]>
1377+
1378+
// A shared_out initialized by a linalg.fill, read only through an
// extract_slice and written back via parallel_insert_slice: the pattern may
// fold the fill onto the extracted tile.
func.func @fold_fill_through_block_arg(%arg0 : tensor<4x16x128xf16>) -> (tensor<4x16xf16>, tensor<4x16xi32>) {
  %cst = arith.constant 0xFC00 : f16
  %c0_i32 = arith.constant 0 : i32
  %c0 = arith.constant 0 : index
  %empty_f16 = tensor.empty() : tensor<4x16xf16>
  %empty_i32 = tensor.empty() : tensor<4x16xi32>
  %fill_f16 = linalg.fill {lowering_config = #config_fill_fold}
      ins(%cst : f16) outs(%empty_f16 : tensor<4x16xf16>) -> tensor<4x16xf16>
  %fill_i32 = linalg.fill {lowering_config = #config_fill_fold}
      ins(%c0_i32 : i32) outs(%empty_i32 : tensor<4x16xi32>) -> tensor<4x16xi32>
  %result:2 = scf.forall (%iv0, %iv1) = (0, 0) to (4, 16) step (1, 8)
      shared_outs(%out_f16 = %fill_f16, %out_i32 = %fill_i32) -> (tensor<4x16xf16>, tensor<4x16xi32>) {
    %in_slice = tensor.extract_slice %arg0[%iv0, %iv1, 0] [1, 8, 128] [1, 1, 1]
        : tensor<4x16x128xf16> to tensor<1x8x128xf16>
    %slice_f16 = tensor.extract_slice %out_f16[%iv0, %iv1] [1, 8] [1, 1]
        : tensor<4x16xf16> to tensor<1x8xf16>
    %slice_i32 = tensor.extract_slice %out_i32[%iv0, %iv1] [1, 8] [1, 1]
        : tensor<4x16xi32> to tensor<1x8xi32>
    %compare:2 = iree_linalg_ext.arg_compare {lowering_config = #config_fill_fold}
        dimension(2) ins(%in_slice : tensor<1x8x128xf16>)
        outs(%slice_f16, %slice_i32 : tensor<1x8xf16>, tensor<1x8xi32>)
        index_base(%c0 : index) {
    ^bb0(%lhs: f16, %rhs: f16):
      %cmp = arith.cmpf ogt, %lhs, %rhs : f16
      iree_linalg_ext.yield %cmp : i1
    } -> tensor<1x8xf16>, tensor<1x8xi32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %compare#0 into %out_f16[%iv0, %iv1] [1, 8] [1, 1]
          : tensor<1x8xf16> into tensor<4x16xf16>
      tensor.parallel_insert_slice %compare#1 into %out_i32[%iv0, %iv1] [1, 8] [1, 1]
          : tensor<1x8xi32> into tensor<4x16xi32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return %result#0, %result#1 : tensor<4x16xf16>, tensor<4x16xi32>
}
1413+
1414+
// CHECK-LABEL: func.func @fold_fill_through_block_arg
1415+
// CHECK-DAG: %[[CST_F16:.+]] = arith.constant 0xFC00 : f16
1416+
// CHECK-DAG: %[[CST_I32:.+]] = arith.constant 0 : i32
1417+
// CHECK-DAG: %[[EMPTY_F16:.+]] = tensor.empty() : tensor<4x16xf16>
1418+
// CHECK-DAG: %[[EMPTY_I32:.+]] = tensor.empty() : tensor<4x16xi32>
1419+
// CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (4, 16) step (1, 8)
1420+
// CHECK-SAME: shared_outs(%[[OUT_F16:.+]] = %[[EMPTY_F16]], %[[OUT_I32:.+]] = %[[EMPTY_I32]])
1421+
// CHECK: %[[SLICE_F16:.+]] = tensor.extract_slice %[[OUT_F16]][%[[IV0]], %[[IV1]]] [1, 8] [1, 1]
1422+
// CHECK: %[[FILLED_F16:.+]] = linalg.fill ins(%[[CST_F16]] : f16) outs(%[[SLICE_F16]] : tensor<1x8xf16>)
1423+
// CHECK: %[[SLICE_I32:.+]] = tensor.extract_slice %[[OUT_I32]][%[[IV0]], %[[IV1]]] [1, 8] [1, 1]
1424+
// CHECK: %[[FILLED_I32:.+]] = linalg.fill ins(%[[CST_I32]] : i32) outs(%[[SLICE_I32]] : tensor<1x8xi32>)
1425+
// CHECK: scf.forall
1426+
// CHECK-SAME: shared_outs({{.*}} = %[[FILLED_F16]], {{.*}} = %[[FILLED_I32]])
1427+
// CHECK: iree_linalg_ext.arg_compare

0 commit comments

Comments
 (0)