[CPU] Tile all the ops to target vector sizes before vectorization. (iree-org#21900)

hanhanW · web-flow · commit 71c2b3c25810 · 2025-09-09T18:48:31.000Z
This revision introduces a pass that iterates over all the compute ops and
tiles them to the target vector sizes if any dimension is not yet tiled to
that size.

It uses the ValueBounds analysis to infer the tiling sizes. If a size cannot
be inferred, the pass assumes that the op is already within the target vector
size, because this usually implies that the op is fused with some tiling
config and the size computation is too complicated. E.g., it can happen on
linalg.unpack ops.

The revision prevents failures caused by huge vectors, and provides a
reasonable fallback.

The `linalg.fill` op is excluded because it usually goes with the
corresponding reduction op and there may be issues in lowering config
propagation. It is a fair stopgap in practice.

---------

Signed-off-by: hanhanW <hanhan0912@gmail.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
@@ -71,6 +71,7 @@ iree_compiler_cc_library(
         "LLVMCPUSynchronizeSymbolVisibility.cpp",
         "LLVMCPUTile.cpp",
         "LLVMCPUTileAndFuseProducerConsumer.cpp",
+        "LLVMCPUTileToVectorSize.cpp",
         "LLVMCPUUnfuseFMAOps.cpp",
         "LLVMCPUVectorShapeCastLowering.cpp",
         "LLVMCPUVectorTransposeLowering.cpp",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -67,6 +67,7 @@ iree_cc_library(
     "LLVMCPUSynchronizeSymbolVisibility.cpp"
     "LLVMCPUTile.cpp"
     "LLVMCPUTileAndFuseProducerConsumer.cpp"
+    "LLVMCPUTileToVectorSize.cpp"
     "LLVMCPUUnfuseFMAOps.cpp"
     "LLVMCPUVectorShapeCastLowering.cpp"
     "LLVMCPUVectorTransposeLowering.cpp"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileToVectorSize.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileToVectorSize.cpp
@@ -0,0 +1,192 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/LLVMCPU/Passes.h"
+#include "iree/compiler/Codegen/LLVMCPU/Utils.h"
+#include "llvm/Support/DebugLog.h"
+#include "llvm/Support/InterleavedRange.h"
+#include "llvm/Support/LogicalResult.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
+#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
+#include "mlir/Dialect/SCF/Transforms/Transforms.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "mlir/IR/Iterators.h"
+#include "mlir/Interfaces/TilingInterface.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#define DEBUG_TYPE "iree-llvmcpu-tile-to-vector-size"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_LLVMCPUTILETOVECTORSIZEPASS
+#include "iree/compiler/Codegen/LLVMCPU/Passes.h.inc"
+
+namespace {
+
+/// Pass that tiles compute ops carrying a lowering config down to their
+/// configured vector sizes, so that later vectorization does not have to deal
+/// with oversized vectors.
+struct LLVMCPUTileToVectorSizePass final
+    : impl::LLVMCPUTileToVectorSizePassBase<LLVMCPUTileToVectorSizePass> {
+  using impl::LLVMCPUTileToVectorSizePassBase<
+      LLVMCPUTileToVectorSizePass>::LLVMCPUTileToVectorSizePassBase;
+
+  /// Declares the dialects that must be loaded before the pass runs.
+  ///
+  /// The tensor dialect is listed because `runOnOperation` looks up the loaded
+  /// tensor dialect to collect its canonicalization patterns; that lookup
+  /// yields a null pointer when the dialect has not been loaded.
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<arith::ArithDialect, scf::SCFDialect,
+                    tensor::TensorDialect>();
+  }
+
+  /// Tiles the candidate ops and then cleans up the tiling artifacts.
+  void runOnOperation() override;
+};
+
+/// Computes, for every loop dimension of `op`, the tile size needed to bring
+/// that dimension down to its configured vector size.
+///
+/// The result has one entry per loop: the configured vector size for
+/// dimensions that still need tiling, and 0 for dimensions that are left
+/// untouched. Returns std::nullopt when `op` has no lowering config, when any
+/// vector size is scalable, when the number of configured vector sizes does
+/// not match the number of loops, or when a loop dimension does not map to any
+/// operand dimension.
+static std::optional<SmallVector<int64_t>>
+getTileSizesForEachDims(linalg::LinalgOp op) {
+  IREE::Codegen::LoweringConfigAttrInterface loweringConfig =
+      getLoweringConfig(op);
+  // Do not rely on the caller having filtered out ops without a config.
+  if (!loweringConfig) {
+    return std::nullopt;
+  }
+  SmallVector<bool> scalableFlags = loweringConfig.getVectorScalableFlags();
+  if (llvm::is_contained(scalableFlags, true)) {
+    return std::nullopt;
+  }
+
+  unsigned numLoops = op.getNumLoops();
+  std::optional<SmallVector<int64_t>> vectorSizes =
+      loweringConfig.getVectorSizes();
+  if (!vectorSizes || vectorSizes->size() != numLoops) {
+    return std::nullopt;
+  }
+  LDBG() << "configured vector sizes: "
+         << llvm::interleaved_array(vectorSizes.value());
+
+  SmallVector<int64_t> result(numLoops, 0);
+  for (unsigned dim = 0; dim < numLoops; ++dim) {
+    SmallVector<std::pair<Value, unsigned>> operandDimPairs;
+    op.mapIterationSpaceDimToAllOperandDims(dim, operandDimPairs);
+    if (operandDimPairs.empty()) {
+      return std::nullopt;
+    }
+
+    Value firstOperand = operandDimPairs[0].first;
+    unsigned firstOperandDim = operandDimPairs[0].second;
+
+    // Trivial case: `dim` size is available in the operand type. The decision
+    // is made on the static size alone, so a dimension that already fits in
+    // the vector size is never tiled, whatever produces the operand.
+    int64_t dimSize = llvm::cast<ShapedType>(firstOperand.getType())
+                          .getShape()[firstOperandDim];
+    int64_t vectorDimSize = vectorSizes.value()[dim];
+    if (ShapedType::isStatic(dimSize)) {
+      if (dimSize > vectorDimSize) {
+        LDBG() << "set dim #" << dim << " size (" << dimSize
+               << ") with vector size: " << vectorDimSize;
+        result[dim] = vectorDimSize;
+      } else {
+        LDBG() << "dim #" << dim << " static size (" << dimSize
+               << ") is within vector size (" << vectorDimSize << ")";
+      }
+      continue;
+    }
+
+    // If a `tensor.extract_slice` op can not be found, the operand is not tiled
+    // at all. It implies that the dimension is not yet tiled. `tensor.empty` is
+    // part of tiling artifacts that can be used to infer tiling sizes.
+    if (!isa_and_present<tensor::EmptyOp, tensor::ExtractSliceOp>(
+            firstOperand.getDefiningOp())) {
+      LDBG() << "set dim #" << dim
+             << " size (untiled) with vector size: " << vectorDimSize;
+      result[dim] = vectorDimSize;
+      continue;
+    }
+
+    // Use ValueBounds analysis to infer `dim` size upper bound. The first
+    // operand dimension for which a constant bound can be computed wins.
+    std::optional<int64_t> maybeDimSize;
+    for (auto [operand, operandDim] : operandDimPairs) {
+      FailureOr<int64_t> maybeDimBoundSize =
+          ValueBoundsConstraintSet::computeConstantBound(
+              presburger::BoundType::UB, {operand, operandDim},
+              /*stopCondition=*/nullptr, /*closedUB=*/true);
+      if (succeeded(maybeDimBoundSize)) {
+        maybeDimSize = maybeDimBoundSize.value();
+        break;
+      }
+    }
+    // Assume that the unknown dimension size implies the dimension is already
+    // tiled. It means that the dimension is definitely tiled, but it is hard to
+    // infer the tile size. It usually happens in fusion case, so the pass
+    // assumes that it is not needed.
+    if (maybeDimSize && maybeDimSize.value() > vectorDimSize) {
+      LDBG() << "set dim #" << dim << " size (" << maybeDimSize.value()
+             << ") with vector size: " << vectorDimSize;
+      result[dim] = vectorDimSize;
+    } else {
+      LDBG() << "dim #" << dim << " either is tiled to vector size ("
+             << vectorDimSize << ") or has complex size computation";
+    }
+  }
+
+  return result;
+}
+
+// Tiles every eligible linalg op in the function to its configured vector
+// sizes, then runs cleanup patterns over the whole function. The pass fails if
+// tiling an op or applying the cleanup patterns fails.
+void LLVMCPUTileToVectorSizePass::runOnOperation() {
+  FunctionOpInterface funcOp = getOperation();
+  MLIRContext *ctx = &getContext();
+
+  // Gather the ops up front; the tiling below rewrites the IR.
+  SmallVector<linalg::LinalgOp> worklist;
+  funcOp.walk([&](linalg::LinalgOp linalgOp) {
+    // XXX(hanchung): linalg.fill usually follow the reduction consumer ops, so
+    // the additional tiling is not needed. Otherwise, it results in an
+    // additional loops before converting it to a vector. We may need to fix the
+    // lowering config issue, but it is a fair stopgap in practice.
+    if (isa<linalg::FillOp>(linalgOp)) {
+      return;
+    }
+    // Only ops that carry a lowering config with vector sizes are candidates.
+    IREE::Codegen::LoweringConfigAttrInterface config =
+        getLoweringConfig(linalgOp);
+    if (config && config.getVectorSizes().has_value()) {
+      worklist.push_back(linalgOp);
+    }
+  });
+
+  IRRewriter rewriter(ctx);
+  for (linalg::LinalgOp linalgOp : worklist) {
+    LDBG() << "candidate: " << linalgOp;
+    std::optional<SmallVector<int64_t>> sizes =
+        getTileSizesForEachDims(linalgOp);
+    if (!sizes.has_value()) {
+      LDBG() << "all the dimensions are either tiled or target scalable tile "
+                "sizes";
+      continue;
+    }
+    // A tile size of zero leaves the dimension alone; nothing to do when no
+    // dimension asks for tiling.
+    if (llvm::none_of(*sizes, [](int64_t size) { return size != 0; })) {
+      LDBG() << "skip the op because tile sizes are all zeros";
+      continue;
+    }
+    LDBG() << "tileSizes: " << llvm::interleaved_array(*sizes);
+
+    auto tilingOp = cast<TilingInterface>(linalgOp.getOperation());
+    scf::SCFTilingOptions tilingOptions;
+    setSCFTileSizes(tilingOptions, tilingOp, std::move(*sizes),
+                    /*tileScalableFlags=*/{});
+    FailureOr<scf::SCFTilingResult> tiled =
+        scf::tileUsingSCF(rewriter, tilingOp, tilingOptions);
+    if (failed(tiled)) {
+      LDBG() << "failed to tile the op";
+      signalPassFailure();
+      return;
+    }
+    rewriter.replaceOp(linalgOp, tiled->replacements);
+  }
+
+  // Clean up the artifacts left behind by tiling.
+  RewritePatternSet cleanupPatterns =
+      linalg::getLinalgTilingCanonicalizationPatterns(ctx);
+  scf::populateSCFForLoopCanonicalizationPatterns(cleanupPatterns);
+  tensor::populateFoldTensorEmptyPatterns(cleanupPatterns);
+  memref::populateResolveRankedShapedTypeResultDimsPatterns(cleanupPatterns);
+  tensor::TensorDialect *tensorDialect =
+      ctx->getLoadedDialect<tensor::TensorDialect>();
+  tensorDialect->getCanonicalizationPatterns(cleanupPatterns);
+  if (succeeded(applyPatternsGreedily(funcOp, std::move(cleanupPatterns)))) {
+    return;
+  }
+  LDBG() << "----- cleanup failed -----";
+  signalPassFailure();
+}
+} // namespace
+} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -293,6 +293,7 @@ void addMultiTilingExpertPassPipeline(
       funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
       funcPassManager.addPass(createCSEPass());
     }
+    funcPassManager.addPass(createLLVMCPUTileToVectorSizePass());
 
     GenericVectorizationPassOptions options;
     options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes;
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.td
@@ -144,6 +144,19 @@ def LLVMCPUTilePass :
   ];
 }
 
+def LLVMCPUTileToVectorSizePass :
+    InterfacePass<"iree-llvmcpu-tile-to-vector-size", "mlir::FunctionOpInterface"> {
+  let summary = "Tile TilingInterface operations to target vector size.";
+  let description = [{
+    Walks through all the TilingInterface operations and tiles the dimensions
+    to the target vector sizes, if the lowering config is present and the
+    dimension is known to be greater than the vector size.
+
+    It is intended to be used before vectorization, to avoid big vectors and
+    stack buffers.
+  }];
+}
+
 def LLVMCPUTileAndFuseProducerConsumerPass
     : InterfacePass<"iree-llvmcpu-tile-and-fuse-producer-consumer",
                     "mlir::FunctionOpInterface"> {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/BUILD.bazel
@@ -61,6 +61,7 @@ iree_lit_test_suite(
             "tile.mlir",
             "tile_and_fuse_producer_consumer_anchoring_last_op.mlir",
             "tile_and_fuse_producer_consumer_anchoring_root_op.mlir",
+            "tile_to_vector_size.mlir",
             "unfused_fma.mlir",
             "vector_contract_to_arm_asm.mlir",
             "vector_contract_to_arm_intrinsics.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -56,6 +56,7 @@ iree_lit_test_suite(
     "tile.mlir"
     "tile_and_fuse_producer_consumer_anchoring_last_op.mlir"
     "tile_and_fuse_producer_consumer_anchoring_root_op.mlir"
+    "tile_to_vector_size.mlir"
     "unfused_fma.mlir"
     "vector_contract_to_arm_asm.mlir"
     "vector_contract_to_arm_intrinsics.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile_to_vector_size.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile_to_vector_size.mlir
@@ -0,0 +1,86 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-llvmcpu-tile-to-vector-size))" --split-input-file %s | FileCheck %s
+
+// All matmul dimensions are dynamic and the operands are plain function
+// arguments (not produced by tensor.extract_slice or tensor.empty), so every
+// dimension counts as untiled: three nested loops are expected around the
+// matmul.
+#config = #iree_cpu.lowering_config<vector_common_parallel = [10, 20, 0], vector_reduction = [0, 0, 30]>
+func.func @matmul_all_dims_untiled(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = linalg.matmul {lowering_config = #config}
+      ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+// CHECK-LABEL: func.func @matmul_all_dims_untiled(
+// CHECK:         scf.for
+// CHECK:           scf.for
+// CHECK:             scf.for
+// CHECK:               linalg.matmul
+
+// -----
+
+// The lowering config lists four vector sizes, which does not match the number
+// of loops of the matmul. The op is expected to be left untouched, i.e. no
+// loop is generated.
+#config = #iree_cpu.lowering_config<vector_common_parallel = [10, 20, 0, 0], vector_reduction = [0, 0, 30, 30]>
+func.func @invalid_matmul_vector_config(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %0 = linalg.matmul {lowering_config = #config}
+      ins(%arg0, %arg1 : tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%arg2 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+// CHECK-LABEL: func.func @invalid_matmul_vector_config(
+// CHECK-NOT:     scf.for
+// CHECK:         linalg.matmul
+
+// -----
+
+// The static operand shapes (M = 10, K = 20, N = 30) are equal to the
+// configured vector sizes, so no loop is expected in the output.
+#config = #iree_cpu.lowering_config<vector_common_parallel = [10, 30, 0], vector_reduction = [0, 0, 20]>
+func.func @static_matmul_with_vector_size(%arg0 : tensor<10x20xf32>, %arg1 : tensor<20x30xf32>, %arg2 : tensor<10x30xf32>) -> tensor<10x30xf32> {
+  %0 = linalg.matmul {lowering_config = #config}
+      ins(%arg0, %arg1 : tensor<10x20xf32>, tensor<20x30xf32>)
+      outs(%arg2 : tensor<10x30xf32>) -> tensor<10x30xf32>
+  return %0 : tensor<10x30xf32>
+}
+// CHECK-LABEL: func.func @static_matmul_with_vector_size(
+// CHECK-NOT:     scf.for
+// CHECK:         linalg.matmul
+
+// -----
+
+// M and N already match their vector sizes, but K is 40 while the configured
+// reduction vector size is 20. Exactly one loop, stepping by 20, is expected.
+#config = #iree_cpu.lowering_config<vector_common_parallel = [10, 30, 0], vector_reduction = [0, 0, 20]>
+func.func @static_matmul_with_untiled_K_dim(%arg0 : tensor<10x40xf32>, %arg1 : tensor<40x30xf32>, %arg2 : tensor<10x30xf32>) -> tensor<10x30xf32> {
+  %0 = linalg.matmul {lowering_config = #config}
+      ins(%arg0, %arg1 : tensor<10x40xf32>, tensor<40x30xf32>)
+      outs(%arg2 : tensor<10x30xf32>) -> tensor<10x30xf32>
+  return %0 : tensor<10x30xf32>
+}
+// CHECK-LABEL: func.func @static_matmul_with_untiled_K_dim(
+// CHECK:         %[[C20:.+]] = arith.constant 20 : index
+// CHECK:         scf.for
+// CHECK-SAME:      step %[[C20]]
+// CHECK-NOT:     scf.for
+// CHECK:           linalg.matmul
+
+// -----
+
+// The operands are tensor.extract_slice results whose dynamic sizes come from
+// affine.min ops bounded by 10 (M), 20 (N) and 60 (K). With vector sizes
+// [10, 20, 30], only K can exceed its vector size, so a single loop with step
+// 30 is expected.
+#map = affine_map<(d0)[s0] -> (-d0 + s0, 10)>
+#map1 = affine_map<(d0)[s0] -> (-d0 + s0, 20)>
+#map2 = affine_map<(d0)[s0] -> (-d0 + s0, 60)>
+#config = #iree_cpu.lowering_config<vector_common_parallel = [10, 20, 0], vector_reduction = [0, 0, 30]>
+func.func @matmul_tiled_MxNxK_to_10x20x60(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>, %arg2 : tensor<?x?xf32>) -> tensor<?x?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %M = tensor.dim %arg0, %c0 : tensor<?x?xf32>
+  %N = tensor.dim %arg1, %c1 : tensor<?x?xf32>
+  %K = tensor.dim %arg0, %c1 : tensor<?x?xf32>
+  %mSize = affine.min #map(%c0)[%M]
+  %nSize = affine.min #map1(%c0)[%N]
+  %kSize = affine.min #map2(%c0)[%K]
+  %lhs = tensor.extract_slice %arg0 [0, 0][%mSize, %kSize][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  %rhs = tensor.extract_slice %arg1 [0, 0][%kSize, %nSize][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  %acc = tensor.extract_slice %arg2 [0, 0][%mSize, %nSize][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  %0 = linalg.matmul {lowering_config = #config}
+      ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%acc : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+// CHECK-LABEL: func.func @matmul_tiled_MxNxK_to_10x20x60(
+// CHECK:         %[[C30:.+]] = arith.constant 30 : index
+// CHECK:         scf.for
+// CHECK-SAME:      step %[[C30]]
+// CHECK-NOT:     scf.for
+// CHECK:           linalg.matmul

Original file line number	Diff line number	Diff line change
`@@ -293,6 +293,7 @@ void addMultiTilingExpertPassPipeline(`
`293`	`293`	`funcPassManager.addPass(createConfigTrackingCanonicalizerPass());`
`294`	`294`	`funcPassManager.addPass(createCSEPass());`
`295`	`295`	`}`
	`296`	`+ funcPassManager.addPass(createLLVMCPUTileToVectorSizePass());`
`296`	`297`
`297`	`298`	`GenericVectorizationPassOptions options;`
`298`	`299`	`options.useConfiguredVectorSizes = pipelineOpt.useConfiguredVectorSizes;`