Merged

Commits (43)
777a403 add utils (chencha3, May 12, 2025)
af01c99 add skeleton (chencha3, May 12, 2025)
e8b43fb add filter (chencha3, May 13, 2025)
3f73fda clean up (chencha3, May 13, 2025)
ab448a3 add scf type conversion util (chencha3, May 13, 2025)
7b5e8f1 partial working (chencha3, May 13, 2025)
e2eb9e6 refactor pack and unpack (chencha3, May 15, 2025)
6ec3604 cleanup layout attr (chencha3, May 15, 2025)
bc69a8d check in elemwise support (chencha3, May 16, 2025)
4fc7540 check in unit test (chencha3, May 16, 2025)
132f15e fix format (chencha3, May 16, 2025)
aa4ba9c roll back pass name (chencha3, May 16, 2025)
061b6e0 add 1d and 2d elemwise test (chencha3, May 16, 2025)
387ac93 refactor (chencha3, May 16, 2025)
ebd78ae fix naming issue (chencha3, May 16, 2025)
bbf4796 fix format (chencha3, May 16, 2025)
3807eea fix overflow (chencha3, May 19, 2025)
c6695d9 add comments (chencha3, May 19, 2025)
50e33ff add dbg log (chencha3, May 20, 2025)
ae22f27 fix format (chencha3, May 20, 2025)
9776850 cleanup (chencha3, May 20, 2025)
6cffa44 refactor (chencha3, May 20, 2025)
d1584fc Merge branch 'main' into xegpu_blocking_pass (chencha3, May 21, 2025)
e023c1a add a corner unit test (chencha3, May 22, 2025)
ee912c2 Merge branch 'main' into xegpu_blocking_pass (chencha3, May 22, 2025)
3967810 fix comments (chencha3, May 23, 2025)
aebc327 remove unnecessary reference for lambda (chencha3, May 27, 2025)
90e7563 rename (chencha3, May 27, 2025)
f5bfc2f address comments (chencha3, May 27, 2025)
598fbce fix format (chencha3, May 27, 2025)
ff11a05 add comments (chencha3, May 27, 2025)
9f7f715 add comments (chencha3, May 27, 2025)
b164d7b address comments (chencha3, May 27, 2025)
554f4b4 refactor (chencha3, May 27, 2025)
d9f2e81 refactor getTileShape with template (chencha3, May 27, 2025)
18e49f6 add qualifiers (chencha3, May 27, 2025)
1f218f4 add qualifiers (chencha3, May 27, 2025)
f869b13 refactor setLayoutAttrs (chencha3, May 27, 2025)
de75855 cleanup unnecessary reference symbols (chencha3, May 27, 2025)
beacf8a update naming (chencha3, May 28, 2025)
c4c7abd refactor (chencha3, May 28, 2025)
70e84c4 refine comments (chencha3, Jun 2, 2025)
7dd05fa Update mlir/test/Dialect/XeGPU/xegpu-blocking.mlir (chencha3, Jun 2, 2025)
4 changes: 4 additions & 0 deletions mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -295,11 +295,15 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout"> {
    }

    LayoutAttr dropSgLayoutAndData() {
      if (!getInstData() && !getLaneLayout())
        return nullptr;
      return LayoutAttr::get(getContext(), nullptr, nullptr, getInstData(),
                             getLaneLayout(), getLaneData(), getOrder());
    }

    LayoutAttr dropInstData() {
      if (!getSgLayout() && !getLaneLayout())
        return nullptr;
      return LayoutAttr::get(getContext(), getSgLayout(), getSgData(), nullptr,
                             getLaneLayout(), getLaneData(), getOrder());
    }
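The two early returns added above make the drop helpers collapse to a null attribute when nothing meaningful would remain. A minimal sketch of that behavior (the wrapper function below is invented for illustration; only LayoutAttr and its methods come from the patch):

```cpp
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"

// Returns true when stripping the sg-level fields leaves nothing behind,
// e.g. for a layout such as #xegpu.layout<sg_layout = [4, 4], sg_data = [16, 16]>
// that carries neither inst_data nor lane_layout.
static bool collapsesToNull(mlir::xegpu::LayoutAttr layout) {
  return !layout.dropSgLayoutAndData();
}
```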
12 changes: 12 additions & 0 deletions mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -38,4 +38,16 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> {
  ];
}

def XeGPUBlocking : Pass<"xegpu-blocking"> {
  let summary = "Block XeGPU ops into instruction-level ops";
  let description = [{
    The pass unrolls XeGPU ops that work on large shapes into equivalent ops
    that work on smaller shapes, as specified by the inst_data field of the
    layout attribute, so that each resulting op can be dispatched to a single
    hardware instruction.
  }];
  let dependentDialects = [
    "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect"
  ];
}

#endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
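A minimal sketch of how the new pass might be scheduled from C++. The factory name xegpu::createXeGPUBlocking() is assumed to be the one TableGen generates for this definition; from the command line the pass is reachable as -xegpu-blocking:

```cpp
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

// Append the blocking pass to an existing pipeline. After it runs, XeGPU and
// vector ops whose layouts carry an inst_data tile shape are unrolled to that
// shape.
static void addXeGPUBlocking(mlir::PassManager &pm) {
  pm.addPass(mlir::xegpu::createXeGPUBlocking());
}
```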
51 changes: 51 additions & 0 deletions mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -13,6 +13,11 @@
namespace mlir {

class VectorType;
class OpOperand;
class OpResult;
class OpBuilder;
class ValueRange;

namespace xegpu {
class LayoutAttr;
class TensorDescType;
@@ -50,6 +55,52 @@ FailureOr<VectorType> getDistributedVectorType(xegpu::TensorDescType tdescTy);
FailureOr<VectorType> getDistributedVectorType(VectorType originalType,
                                               LayoutAttr layout);

/// Returns the attribute name used to attach a LayoutAttr to the given
/// OpOperand.
std::string getLayoutName(OpOperand &opr);

/// Returns the attribute name used to attach a LayoutAttr to the given
/// OpResult.
std::string getLayoutName(OpResult res);

/// Retrieves the LayoutAttr associated with a given Value. For TensorDescType
/// values, the LayoutAttr is extracted from the TensorDescType itself. For
/// other values, it is obtained from the attributes of the defining operation.
/// Returns nullptr if no LayoutAttr is found.
LayoutAttr getLayoutAttr(Value value);

/// Retrieves the LayoutAttr associated with a given OpOperand. It first
/// checks the operand_layout_{id} attribute of the owner operation; if not
/// found, it falls back to the operand's value and its defining op.
LayoutAttr getLayoutAttr(OpOperand &opr);

/// Sets the LayoutAttr for a given OpOperand by attaching it to the owner op.
void setLayoutAttr(OpOperand &opr, LayoutAttr layout);

/// Sets the LayoutAttr for the given OpResult by attaching it to the defining
/// op.
void setLayoutAttr(OpResult result, LayoutAttr layout);

/// Sets the LayoutAttr for each OpOperand and OpResult of the given operation.
/// If the operation contains regions, the attributes are also applied
/// recursively to the nested operations.
void setLayoutAttrs(Operation *op,
                    function_ref<LayoutAttr(Value)> getLayoutImpl);

/// Extracts a set of small vectors of the given shape from a value using
/// vector.extract_strided_slice.
SmallVector<Value> extractVectorsWithShapeFromValue(OpBuilder &builder,
                                                    Location loc, Value value,
                                                    ArrayRef<int64_t> shape);

/// Creates a vector of the given shape from a set of values using
/// vector.insert_strided_slice.
Value createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
                                      ValueRange values,
                                      ArrayRef<int64_t> shape);

/// Performs type conversion for SCF structural ops, e.g., scf.for. Since
/// VectorType values cannot carry the layout attribute, they are first
/// converted to RankedTensorType and then converted back to VectorType in a
/// second round.
void doSCFStructuralTypeConversionWithTensorType(Operation *op);

} // namespace xegpu

} // namespace mlir
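As a rough illustration of how the two vector helpers above are meant to compose (the wrapper function and the 16x32 / 8x16 shapes are invented for this sketch; only the two xegpu:: utilities come from the header):

```cpp
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/IR/Builders.h"

using namespace mlir;

// Split a vector<16x32xf16> value into 8x16 tiles via
// vector.extract_strided_slice, then stitch the tiles back together via
// vector.insert_strided_slice; the rebuilt value has the original shape.
static Value roundTripThroughTiles(OpBuilder &builder, Location loc,
                                   Value bigVec) {
  SmallVector<Value> tiles =
      xegpu::extractVectorsWithShapeFromValue(builder, loc, bigVec, {8, 16});
  return xegpu::createVectorWithShapeFromValues(builder, loc, tiles, {16, 32});
}
```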
1 change: 1 addition & 0 deletions mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt
@@ -1,4 +1,5 @@
add_mlir_dialect_library(MLIRXeGPUTransforms
  XeGPUBlocking.cpp
  XeGPUFoldAliasOps.cpp
  XeGPUSubgroupDistribute.cpp
  XeGPUUnroll.cpp
265 changes: 265 additions & 0 deletions mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -0,0 +1,265 @@
//===---- XeGPUBlocking.cpp ---- XeGPU Blocking Pass ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/XeGPU/Transforms/Passes.h"

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

namespace mlir {
namespace xegpu {
#define GEN_PASS_DEF_XEGPUBLOCKING
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
} // namespace xegpu
} // namespace mlir

#define DEBUG_TYPE "xegpu-blocking"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

using namespace mlir;

namespace {

// Resolves an UnrealizedConversionCastOp (left over from the SCF structural
// type conversion) by rebuilding the cast with vector pack/unpack sequences.
void resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
  ValueRange inputs = castOp.getInputs();
  ValueRange outputs = castOp.getOutputs();

  // Trivial 1:1 cast: forward the input and drop the cast.
  if (inputs.size() == 1 && outputs.size() == 1) {
    castOp->replaceAllUsesWith(inputs);
    castOp->erase();
    return;
  }

  VectorType inputTy = dyn_cast<VectorType>(inputs[0].getType());
  VectorType outputTy = dyn_cast<VectorType>(outputs[0].getType());
  if (inputTy && outputTy) {
    OpBuilder builder(castOp);
    // unpack: N small vectors -> 1 large vector.
    if (inputs.size() > 1 && outputs.size() == 1) {
      ArrayRef<int64_t> shape = outputTy.getShape();
      Value result = xegpu::createVectorWithShapeFromValues(
          builder, castOp.getLoc(), inputs, shape);
      castOp->replaceAllUsesWith(ValueRange(result));
      castOp->erase();
    }

    // pack: 1 large vector -> N small vectors.
    if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) {
      ArrayRef<int64_t> tileShape = outputTy.getShape();
      SmallVector<Value> results = xegpu::extractVectorsWithShapeFromValue(
          builder, castOp.getLoc(), inputs[0], tileShape);
      castOp->replaceAllUsesWith(results);
      castOp->erase();
    }
  }
}

/// Unroll XeGPU ops to their instruction-level representation.
class XeGPUBlockingPass final
    : public xegpu::impl::XeGPUBlockingBase<XeGPUBlockingPass> {
public:
  void runOnOperation() override;

private:
  // Get the tile shape for a given value. If the value has a layout
  // attribute and it is an SG layout, return the inst_data as the tile shape
  // if inst_data is available; otherwise, return the original shape of the
  // value. If the value does not have an SG layout, return std::nullopt.
  std::optional<SmallVector<int64_t>>
  getTileShape(TypedValue<ShapedType> value) const;

  std::optional<SmallVector<int64_t>> getTileShape(OpOperand &operand) const;

  std::optional<SmallVector<int64_t>> getTileShape(OpResult result) const;

  // Get the tile shape for a given operation.
  std::optional<SmallVector<int64_t>> getTileShape(Operation *op) const;

  // Determine if the operation requires unrolling. Return false if all
  // operands and results have tile shapes identical to their original types.
  // Otherwise, return true.
  bool needsUnroll(Operation *op) const;
};
} // namespace

std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(TypedValue<ShapedType> value) const {
  assert(value && "value must be non-null");
  xegpu::LayoutAttr layout = xegpu::getLayoutAttr(value);
Contributor:
For these two similar getTileShape functions, I think we can use a separate function for the common code and call it from both.

Contributor Author (chencha3):
Fixed, thanks.
  if (layout && layout.isSgLayout()) {
    if (auto inst_data = layout.getInstData())
      return llvm::to_vector_of<int64_t>(inst_data.asArrayRef());
    return llvm::to_vector(value.getType().getShape());
  }
  return std::nullopt;
}

std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(OpOperand &operand) const {
  xegpu::LayoutAttr layout = xegpu::getLayoutAttr(operand);
  if (layout && layout.isSgLayout()) {
    if (auto inst_data = layout.getInstData())
      return llvm::to_vector_of<int64_t>(inst_data.asArrayRef());

    if (auto type = dyn_cast<ShapedType>(operand.get().getType()))
      return llvm::to_vector(type.getShape());
  }
  return std::nullopt;
}

std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(OpResult result) const {
  xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result);
  if (layout && layout.isSgLayout()) {
    if (auto inst_data = layout.getInstData())
      return llvm::to_vector_of<int64_t>(inst_data.asArrayRef());

    if (auto type = dyn_cast<ShapedType>(result.getType()))
      return llvm::to_vector(type.getShape());
  }
  return std::nullopt;
}

std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
  if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp>(op))
    return getTileShape(op->getOpResult(0));
  if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp>(op))
    return getTileShape(op->getOpOperand(0));
  if (isa<xegpu::StoreNdOp>(op))
    return getTileShape(op->getOpOperand(1));

  if (isa<xegpu::DpasOp>(op)) {
    std::optional<SmallVector<int64_t>> aTile =
        getTileShape(op->getOpOperand(0));
    std::optional<SmallVector<int64_t>> bTile =
        getTileShape(op->getOpOperand(1));

    if (!aTile || aTile->size() != 2 || !bTile || bTile->size() != 2)
      return std::nullopt;

    // semantic check for A and B
    if ((*aTile)[1] != (*bTile)[0])
      return std::nullopt;

    // semantic check for C
    if (op->getNumOperands() == 3) {
      std::optional<SmallVector<int64_t>> cTile =
          getTileShape(op->getOpOperand(2));
      int64_t expectedCTile[2] = {(*aTile)[0], (*bTile)[1]};
      if (!cTile || !llvm::equal(*cTile, expectedCTile))
        return std::nullopt;
    }

    return SmallVector<int64_t>({(*aTile)[0], (*aTile)[1], (*bTile)[1]});
  }

  if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)
    return getTileShape(op->getOpResult(0));

  return std::nullopt;
}
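
// Worked example for the xegpu.dpas case above (shapes are hypothetical): with
// an A tile of {8, 16} ({M, K}) and a B tile of {16, 16} ({K, N}), the checks
// require K to match (16 == 16) and, when a C operand is present, its tile to
// equal {M, N} = {8, 16}; the combined tile shape handed back to the unroller
// is then {M, K, N} = {8, 16, 16}.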

bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
  if (isa<LoopLikeOpInterface>(op))
    return false;

  for (auto &opr : op->getOpOperands()) {
    std::optional<SmallVector<int64_t>> tileShape = getTileShape(opr);
    auto shapedType = dyn_cast<ShapedType>(opr.get().getType());
    if (!shapedType)
      continue;

    if (tileShape && !llvm::equal(*tileShape, shapedType.getShape()))
      return true;
  }

  for (auto result : op->getOpResults()) {
    std::optional<SmallVector<int64_t>> tileShape = getTileShape(result);
    auto shapedType = dyn_cast<ShapedType>(result.getType());
    if (!shapedType)
      continue;

    if (tileShape && !llvm::equal(*tileShape, shapedType.getShape()))
      return true;
  }
  return false;
}

void XeGPUBlockingPass::runOnOperation() {
  MLIRContext *ctx = &getContext();
  Operation *mod = getOperation();

  // Preserve the LayoutAttr of each operand and result in the owner op's
  // attribute dictionary. This ensures that the LayoutAttr remains accessible
  // even if the defining operation is replaced.
  xegpu::setLayoutAttrs(mod, [&](Value v) { return xegpu::getLayoutAttr(v); });

  // Perform type conversion for SCF control flow ops.
  xegpu::doSCFStructuralTypeConversionWithTensorType(mod);

  xegpu::UnrollOptions options;
  options.setFilterConstraint([&](Operation *op) -> LogicalResult {
    return needsUnroll(op) ? success() : failure();
  });

  options.setNativeShapeFn([&](Operation *op) { return getTileShape(op); });

  options.setUnrolledTypesFn([&](ShapedType type, ArrayRef<int64_t> tileShape) {
    Type elemTy = type.getElementType();
    Type newTy;

    if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(type))
      newTy = xegpu::TensorDescType::get(
          ctx, tileShape, elemTy, tdescTy.getEncoding(),
          tdescTy.getLayoutAttr().dropInstData());
    else
      newTy = type.clone(tileShape, elemTy);

    std::optional<SmallVector<int64_t>> ratio =
        computeShapeRatio(type.getShape(), tileShape);
    assert(ratio && "The shape of the type must be a multiple of tileShape.");
    return SmallVector<Type>(computeProduct(*ratio), newTy);
  });

  RewritePatternSet patterns(ctx);

  vector::UnrollVectorOptions vectorOptions;
  vectorOptions.setNativeShapeFn(options.nativeShape);

  populateXeGPUUnrollPatterns(patterns, options);
  vector::populateVectorUnrollPatterns(patterns, vectorOptions);

  (void)applyPatternsGreedily(mod, std::move(patterns));

  mod->walk([&](Operation *op) {
    if (auto castOp = dyn_cast<UnrealizedConversionCastOp>(op))
      resolveUnrealizedConversionCastOp(castOp);

    // The temporary layout attributes attached at the start of the pass are
    // no longer needed; remove them from operands...
    for (OpOperand &opr : op->getOpOperands()) {
      std::string name = xegpu::getLayoutName(opr);
      if (auto layout = op->getAttrOfType<xegpu::LayoutAttr>(name))
        op->removeAttr(name);
    }

    // ...and from results, re-attaching the layout with inst_data dropped
    // where the result still needs one.
    for (OpResult result : op->getOpResults()) {
      std::string name = xegpu::getLayoutName(result);
      if (auto layout = op->getAttrOfType<xegpu::LayoutAttr>(name)) {
        op->removeAttr(name);
        if (!isa<LoopLikeOpInterface>(op))
          xegpu::setLayoutAttr(result, layout.dropInstData());
      }
    }
  });
}
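As a small standalone illustration of the ratio computation the unrolled-types callback relies on (the 16x32 shape and the 8x16 tile are hypothetical; computeShapeRatio and computeProduct are the utilities from mlir/Dialect/Utils/IndexingUtils.h):

```cpp
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "llvm/Support/raw_ostream.h"

#include <cstdint>
#include <optional>

int main() {
  using namespace mlir;
  int64_t shape[] = {16, 32}; // original vector / tensor_desc shape
  int64_t tile[] = {8, 16};   // inst_data tile shape
  // ratio is {2, 2}: two tiles along each dimension.
  std::optional<SmallVector<int64_t>> ratio = computeShapeRatio(shape, tile);
  // The callback would therefore return computeProduct(*ratio) == 4 copies of
  // the tile type, e.g. four vector<8x16xf16> for one vector<16x32xf16>.
  llvm::outs() << "number of tiles: " << computeProduct(*ratio) << "\n";
  return 0;
}
```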