
[mlir][vector] Canonicalize gathers/scatters with trivial offsets #117939


Merged: 7 commits merged into llvm:main from gather-canon on Jan 24, 2025

Conversation

Hardcode84 (Contributor):

Canonicalize gathers/scatters with contiguous (i.e. [0, 1, 2, ...]) offsets into vector masked load/store ops.

llvmbot (Member) commented Nov 27, 2024:

@llvm/pr-subscribers-mlir-vector

@llvm/pr-subscribers-mlir

Author: Ivan Butygin (Hardcode84)

Changes

Canonicalize gathers/scatters with contiguous (i.e. [0, 1, 2, ...]) offsets into vector masked load/store ops.
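For illustration, a minimal before/after sketch of the rewrite (a 4-lane variant of the tests in the diff below; %base, %c0, %mask and %passthru are assumed to be defined):

// Before: a gather whose offsets are the trivial sequence [0, 1, 2, 3].
%indices = arith.constant dense<[0, 1, 2, 3]> : vector<4xi32>
%v = vector.gather %base[%c0][%indices], %mask, %passthru :
  memref<?xf32>, vector<4xi32>, vector<4xi1>, vector<4xf32> into vector<4xf32>

// After: contiguous offsets starting at 0 are exactly a masked load.
%v = vector.maskedload %base[%c0], %mask, %passthru :
  memref<?xf32>, vector<4xi1>, vector<4xf32> into vector<4xf32>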


Full diff: https://github.com/llvm/llvm-project/pull/117939.diff

2 Files Affected:

  • (modified) mlir/lib/Dialect/Vector/IR/VectorOps.cpp (+44-2)
  • (modified) mlir/test/Dialect/Vector/canonicalize.mlir (+31)
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 0c0a7bc98d8b5e..21e62085be5a49 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -5181,6 +5181,19 @@ std::optional<SmallVector<int64_t, 4>> GatherOp::getShapeForUnroll() {
   return llvm::to_vector<4>(getVectorType().getShape());
 }
 
+static LogicalResult isContiguousIndices(Value val) {
+  auto vecType = dyn_cast<VectorType>(val.getType());
+  if (!vecType || vecType.getRank() != 1 || vecType.isScalable())
+    return failure();
+
+  DenseIntElementsAttr elements;
+  if (!matchPattern(val, m_Constant(&elements)))
+    return failure();
+
+  return success(
+      llvm::equal(elements, llvm::seq<int64_t>(0, vecType.getNumElements())));
+}
+
 namespace {
 class GatherFolder final : public OpRewritePattern<GatherOp> {
 public:
@@ -5199,11 +5212,26 @@ class GatherFolder final : public OpRewritePattern<GatherOp> {
     llvm_unreachable("Unexpected 1DMaskFormat on GatherFolder");
   }
 };
+
+class GatherTrivialIndices final : public OpRewritePattern<GatherOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(GatherOp op,
+                                PatternRewriter &rewriter) const override {
+    if (failed(isContiguousIndices(op.getIndexVec())))
+      return failure();
+
+    rewriter.replaceOpWithNewOp<MaskedLoadOp>(op, op.getType(), op.getBase(),
+                                              op.getIndices(), op.getMask(),
+                                              op.getPassThru());
+    return success();
+  }
+};
 } // namespace
 
 void GatherOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                            MLIRContext *context) {
-  results.add<GatherFolder>(context);
+  results.add<GatherFolder, GatherTrivialIndices>(context);
 }
 
 //===----------------------------------------------------------------------===//
@@ -5245,11 +5273,25 @@ class ScatterFolder final : public OpRewritePattern<ScatterOp> {
     llvm_unreachable("Unexpected 1DMaskFormat on ScatterFolder");
   }
 };
+
+class ScatterTrivialIndices final : public OpRewritePattern<ScatterOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(ScatterOp op,
+                                PatternRewriter &rewriter) const override {
+    if (failed(isContiguousIndices(op.getIndexVec())))
+      return failure();
+
+    rewriter.replaceOpWithNewOp<MaskedStoreOp>(
+        op, op.getBase(), op.getIndices(), op.getMask(), op.getValueToStore());
+    return success();
+  }
+};
 } // namespace
 
 void ScatterOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                             MLIRContext *context) {
-  results.add<ScatterFolder>(context);
+  results.add<ScatterFolder, ScatterTrivialIndices>(context);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index 5ae769090dac66..b4f9d98e729771 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -2826,3 +2826,34 @@ func.func @contiguous_extract_strided_slices_to_extract_failure_non_full_inner_s
   %1 = vector.extract_strided_slice %arg0 {offsets = [0, 0, 0, 0, 0, 0], sizes = [1, 1, 2, 1, 1, 1], strides = [1, 1, 1, 1, 1, 1]} : vector<8x1x2x1x1x4xi32> to vector<1x1x2x1x1x1xi32>
   return %1 : vector<1x1x2x1x1x1xi32>
 }
+
+// -----
+
+// CHECK-LABEL: @contiguous_gather
+//  CHECK-SAME:   (%[[BASE:.*]]: memref<?xf32>, %[[MASK:.*]]: vector<16xi1>, %[[PASSTHRU:.*]]: vector<16xf32>)
+//       CHECK:   %[[C0:.*]] = arith.constant 0 : index
+//       CHECK:   %[[R:.*]] = vector.maskedload %[[BASE]][%[[C0]]], %[[MASK]], %[[PASSTHRU]] : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+//       CHECK:   return %[[R]]
+func.func @contiguous_gather(%base: memref<?xf32>,
+                             %mask: vector<16xi1>, %passthru: vector<16xf32>) -> vector<16xf32> {
+  %c0 = arith.constant 0 : index
+  %indices = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xi32>
+  %1 = vector.gather %base[%c0][%indices], %mask, %passthru :
+    memref<?xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+  return %1 : vector<16xf32>
+}
+
+// -----
+
+// CHECK-LABEL: @contiguous_scatter
+//  CHECK-SAME:   (%[[BASE:.*]]: memref<?xf32>, %[[MASK:.*]]: vector<16xi1>, %[[VALUE:.*]]: vector<16xf32>)
+//       CHECK:   %[[C0:.*]] = arith.constant 0 : index
+//       CHECK:   vector.maskedstore %[[BASE]][%[[C0]]], %[[MASK]], %[[VALUE]] : memref<?xf32>, vector<16xi1>, vector<16xf32>
+func.func @contiguous_scatter(%base: memref<?xf32>,
+                              %mask: vector<16xi1>, %value: vector<16xf32>){
+  %c0 = arith.constant 0 : index
+  %indices = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xi32>
+  vector.scatter %base[%c0][%indices], %mask, %value :
+    memref<?xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
+  return
+}

dcaballe (Contributor) left a comment:

Awesome, thanks! I took a quick look and left some comments for now.

@@ -5181,6 +5181,19 @@ std::optional<SmallVector<int64_t, 4>> GatherOp::getShapeForUnroll() {
return llvm::to_vector<4>(getVectorType().getShape());
}

static LogicalResult isContiguousIndices(Value val) {
Contributor:

Could you add documentation about the currently supported cases and limitations?

nit: val -> indices, indexVec ... ?

return failure();

DenseIntElementsAttr elements;
if (!matchPattern(val, m_Constant(&elements)))
Contributor:

We may want to do something for ConstantMaskOp
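(For reference, a minimal sketch of the op in question:)

// vector.constant_mask sets a leading prefix of lanes, here the first 8 of 16.
%mask = vector.constant_mask [8] : vector<16xi1>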

return failure();

return success(
llvm::equal(elements, llvm::seq<int64_t>(0, vecType.getNumElements())));
Contributor:

What about contiguous indices with a different start number?
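For illustration, a hypothetical index vector that is contiguous but starts at 4, which the check above currently rejects:

// Contiguous with start 4; in principle this could still fold into a masked
// load/store by adding the start offset to the base index.
%indices = arith.constant dense<[4, 5, 6, 7]> : vector<4xi32>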


return success(
llvm::equal(elements, llvm::seq<int64_t>(0, vecType.getNumElements())));
}
Contributor:

@banach-space, is there a common utility that we can use here and for the extract op in the Linalg vectorizer?

banach-space (Contributor):

Not yet - the vectorizer looks at the scalar indices before vectorization. However, this patch makes me think that we could do better 🤔 Let me look into this!

Contributor:

@Hardcode84 Would you have some linalg examples that vectorize into these contiguous gathers? That would be helpful, but no worries if Vector is your actual starting point here.

func.func @contiguous_gather(%base: memref<?xf32>,
%mask: vector<16xi1>, %passthru: vector<16xf32>) -> vector<16xf32> {
%c0 = arith.constant 0 : index
%indices = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xi32>
Contributor:

Could you add tests for:

  • Start index != 0
  • ConstantMaskOp
  • constant indices that describe a broadcast (e.g., [3, 3, 3, 3, 3, 3... 3])

Hardcode84 (Author):

  • We don't need any special handling for constant masks, as that is already handled by the existing masked -> non-masked canonicalizations; added a couple of tests.
  • I can add support for non-zero start, but broadcast is more involved:
    • For scatters, duplicated indices are undefined per the current spec.
    • For gathers, we need reduce(mask) + a 1-element vector.maskedload + extract + splat, and I would rather not do this as part of this PR (a sketch follows this list).
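A rough sketch of the gather-of-broadcast expansion described in the last bullet, assuming a 16-lane gather with all indices equal to 3 (all names are hypothetical):

// Load the one element only if at least one lane is active...
%any = vector.reduction <or>, %mask : vector<16xi1> into i1
%m1 = vector.broadcast %any : i1 to vector<1xi1>
%pt1 = arith.constant dense<0.0> : vector<1xf32>
%c3 = arith.constant 3 : index
%ld = vector.maskedload %base[%c3], %m1, %pt1 :
  memref<?xf32>, vector<1xi1>, vector<1xf32> into vector<1xf32>
// ...then splat it and re-select the original pass-through lanes.
%elem = vector.extract %ld[0] : f32 from vector<1xf32>
%splat = vector.splat %elem : vector<16xf32>
%res = arith.select %mask, %splat, %passthru : vector<16xi1>, vector<16xf32>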

Contributor:

broadcast is more involved

Could you add a test and a TODO? The test would be "negative" (i.e. the folder would leave the code unchanged). For the "scatter" case, we would only need to make sure that invalid.mlir contains a relevant test. Could you check that?

I can add support for non-zero start

Could you add a negative test to exercise this case? And a TODO to extend the pattern :)

Hardcode84 (Author):

Invalid scatter indices (and invalid dynamic indices in general) should not fail verification (see https://mlir.llvm.org/getting_started/DeveloperGuide/#ir-verifier), so there is nothing to add to invalid.mlir.

banach-space (Contributor) left a comment:

Great, thank you for working on this!

Two high level requests:

  • add support for vector.step
  • either add support for multi-dim gathers/scatters, or document the limitations.

Hardcode84 (Author) commented Nov 29, 2024:

add support for vector.step

Maybe, instead of adding a special case for vector.step, just fold all non-scalable vector.step ops into arith.constant? It would probably help other canonicalizations as well.

UPD: Found the other thread, #113655, but either way the situation is not ideal: we now have semantically identical code represented in two different forms, exactly the situation folders/canonicalizers were intended to avoid. (IMO, we should always represent this as constants, and the constant -> step optimization should happen much later, probably at the LLVM/SPIR-V level, but I'm not willing to die on this hill.)
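For fixed-size vectors, the two forms in question are semantically identical, e.g.:

// A step op and its dense-constant equivalent.
%s = vector.step : vector<4xindex>
%c = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>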

dcaballe (Contributor):

Maybe, instead of adding a special case for vector.step, just fold all non-scalable vector.step ops into arith.constant? It would probably help other canonicalizations as well.

I would say the other way around. We may want vector.step to be the canonical form for vector constants that fall into that category, as vector.step holds valuable information that otherwise has to be inferred by inspecting the values of a constant op.

Hardcode84 (Author) commented Dec 27, 2024:

Added vector.step support and rebased, but I don't want to handle non-zero start offsets or broadcasts as part of this PR.

I would say the other way around. We may want vector.step to be the canonical form for vector constants that fall into that category, as vector.step holds valuable information that otherwise has to be inferred by inspecting the values of a constant op.

If we really want vector.step to be the canonical representation, we will need to always canonicalize [0, 1, 2, ...] constants to it (and that has to be part of the canonicalize pass, not just some random set of patterns); otherwise, relevant patterns like this one will always need to check both forms. I'm still not convinced vector.step (for non-scalable vectors) is useful, as checking for the relevant constant pattern is trivial and actually gives you more freedom (e.g., you can check for the non-zero start offsets mentioned earlier), but I will leave this fight to someone else.

Hardcode84 (Author):

Actually, I think a better approach may be a special StepElementsAttr, analogous to the existing SplatElementsAttr, which would be "canonicalized" on DenseElementsAttr construction just like SplatElementsAttr is. This way we don't need any special ops, and users that don't know or don't care about StepElementsAttr can still work with it as a normal dense constant.


banach-space (Contributor):

Actually, I think a better approach may be a special StepElementsAttr, analogous to the existing SplatElementsAttr, which would be "canonicalized" on DenseElementsAttr construction just like SplatElementsAttr is. This way we don't need any special ops, and users that don't know or don't care about StepElementsAttr can still work with it as a normal dense constant.

Supporting vector.step is important - that's the only option for scalable vectors. In fact, since you added support for vector.step, your changes should also work for scalable vectors.
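For illustration, a hypothetical scalable-vector variant, where vector.step is the only way to express the [0, 1, 2, ...] indices:

func.func @scalable_gather(%base: memref<?xf32>, %mask: vector<[16]xi1>,
                           %passthru: vector<[16]xf32>) -> vector<[16]xf32> {
  %c0 = arith.constant 0 : index
  // The runtime vector length is unknown, so the indices cannot be a
  // dense constant; vector.step produces [0, 1, ..., vscale * 16 - 1].
  %indices = vector.step : vector<[16]xindex>
  %0 = vector.gather %base[%c0][%indices], %mask, %passthru :
    memref<?xf32>, vector<[16]xindex>, vector<[16]xi1>, vector<[16]xf32> into vector<[16]xf32>
  return %0 : vector<[16]xf32>
}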

Hardcode84 (Author):

@Hardcode84 Would you have some linalg examples that vectorize into these contiguous gathers? That would be helpful, but no worries if Vector is your actual starting point here.

In my specific case (https://github.com/iree-org/iree-turbine/blob/main/iree/turbine/kernel/wave/codegen.py#L808), I generate gathers directly, without going through linalg. As a side note, the indices are constructed from user-provided sympy expressions, so we don't know beforehand whether they form a step pattern or not.

Hardcode84 (Author):

updated

banach-space (Contributor) commented Jan 21, 2025:

@Hardcode84 Is this ready for another round of reviews? If yes, could you follow LLVM's code-review guidelines and ping reviewers?

Ping the patch. If it is urgent, provide reasons why it is important to you to get this patch landed and ping it every couple of days. If it is not urgent, the common courtesy ping rate is one week. Remember that you’re asking for valuable time from other professional developers.

Note, reviewers get a notification every time a PR is updated (and there are a lot of PRs). "ping" is the usual LLVM way to let people know that a PR is ready for another round :) And, specifically, things like:

updated

tend to be interpreted as noise (it's not clear to me whether "updated" means "I've addressed all PR comments" or just a casual "I've made some changes, but might do some more sometime soon").

Thanks :)
-Andrzej

Hardcode84 (Author):

@banach-space yes, PTAL.

I understand everyone is busy, but this PR was intended as a trivial improvement; I never planned for it to take multiple months or to support broadcasts/step/scalable vectors/non-1D vectors (all of which have zero benefit for my specific use case). At this point I would prefer to either merge it in its current form or just drop it and move on.

banach-space (Contributor) left a comment:

LGTM, thanks!

@dcaballe, it looks like the tests in canonicalize.mlir cover all the TODOs (apart from scalable vectors, but I can handle that). WDYT?

Hardcode84 (Author):

Thanks, sorry for the rant

Hardcode84 merged commit 88136f9 into llvm:main on Jan 24, 2025 (8 checks passed).
Hardcode84 deleted the gather-canon branch on Jan 24, 2025, 11:14.