libxsmm · rolfmorel · Jun 12, 2025 · Jun 6, 2025 · Jun 11, 2025 · Jun 11, 2025
diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt
@@ -1 +1 @@
-faf5d747f174cc9d714839f0d3bce1a783eac2ac
+d698ede748e66f5519cb8481abc2df89a994a059
diff --git a/lib/TPP/Dialect/Check/BufferizableOpInterfaceImpl.cpp b/lib/TPP/Dialect/Check/BufferizableOpInterfaceImpl.cpp
@@ -48,11 +48,12 @@ struct ExpectTrueLayoutInterface
   }
 
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
-                          const BufferizationOptions &options) const {
+                          const BufferizationOptions &options,
+                          BufferizationState &state) const {
     check::ExpectTrueOp expectTrueOp = cast<check::ExpectTrueOp>(op);
 
     FailureOr<Value> maybeSrcBuffer =
-        getBuffer(rewriter, expectTrueOp.getOperand(), options);
+        getBuffer(rewriter, expectTrueOp.getOperand(), options, state);
     if (failed(maybeSrcBuffer))
       return failure();
     Value srcBuffer = *maybeSrcBuffer;
@@ -91,16 +92,17 @@ struct ExpectAlmostEqLayoutInterface
   }
 
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
-                          const BufferizationOptions &options) const {
+                          const BufferizationOptions &options,
+                          BufferizationState &state) const {
     check::ExpectAlmostEqOp almostEqOp = cast<check::ExpectAlmostEqOp>(op);
     FailureOr<Value> maybeFirstBuffer =
-        getBuffer(rewriter, almostEqOp.getLhs(), options);
+        getBuffer(rewriter, almostEqOp.getLhs(), options, state);
     if (failed(maybeFirstBuffer))
       return failure();
     Value firstBuffer = *maybeFirstBuffer;
 
     FailureOr<Value> maybeSecondBuffer =
-        getBuffer(rewriter, almostEqOp.getRhs(), options);
+        getBuffer(rewriter, almostEqOp.getRhs(), options, state);
     if (failed(maybeSecondBuffer))
       return failure();
     Value secondBuffer = *maybeSecondBuffer;
@@ -142,10 +144,11 @@ struct ExpectSaneLayoutInterface
   }
 
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
-                          const BufferizationOptions &options) const {
+                          const BufferizationOptions &options,
+                          BufferizationState &state) const {
     check::ExpectSaneOp saneOp = cast<check::ExpectSaneOp>(op);
     FailureOr<Value> maybeBuffer =
-        getBuffer(rewriter, saneOp.getOperand(), options);
+        getBuffer(rewriter, saneOp.getOperand(), options, state);
     if (failed(maybeBuffer)) {
       return failure();
     }

diff --git a/lib/TPP/Dialect/Perf/BufferizableOpInterfaceImpl.cpp b/lib/TPP/Dialect/Perf/BufferizableOpInterfaceImpl.cpp
@@ -55,10 +55,12 @@ struct SinkLayoutInterface
   }
 
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
-                          const BufferizationOptions &options) const {
+                          const BufferizationOptions &options,
+                          BufferizationState &state) const {
     auto sink = cast<perf::SinkOp>(op);
 
-    FailureOr<Value> srcBuffer = getBuffer(rewriter, sink.getInput(), options);
+    FailureOr<Value> srcBuffer =
+        getBuffer(rewriter, sink.getInput(), options, state);
     if (failed(srcBuffer))
       return failure();
 

diff --git a/lib/TPP/Transforms/LowerPacksAndUnpacks.cpp b/lib/TPP/Transforms/LowerPacksAndUnpacks.cpp
@@ -112,7 +112,7 @@ static void fuseOrTilePacks(RewriterBase &rewriter, FunctionOpInterface func) {
             forLoops);
     if (!fusedProducer)
       continue;
-    rewriter.replaceOp(consumerPackOp, tilingResult->mergeResult.replacements);
+    rewriter.replaceOp(consumerPackOp, tilingResult->replacements);
   }
 
   // Tile packs.
@@ -124,7 +124,7 @@ static void fuseOrTilePacks(RewriterBase &rewriter, FunctionOpInterface func) {
         rewriter, cast<TilingInterface>(packOp.getOperation()), tileSizes);
     if (failed(tilingResult))
       continue;
-    rewriter.replaceOp(packOp, tilingResult->mergeResult.replacements);
+    rewriter.replaceOp(packOp, tilingResult->replacements);
   }
 
   // Tile unpacks.
@@ -136,7 +136,7 @@ static void fuseOrTilePacks(RewriterBase &rewriter, FunctionOpInterface func) {
         rewriter, cast<TilingInterface>(unPackOp.getOperation()), tileSizes);
     if (failed(tilingResult))
       continue;
-    rewriter.replaceOp(unPackOp, tilingResult->mergeResult.replacements);
+    rewriter.replaceOp(unPackOp, tilingResult->replacements);
   }
 }
 
@@ -215,7 +215,7 @@ class LowerPacksAndUnPacks
             unpackTilingOptions);
         if (failed(tilingResult))
           return signalPassFailure();
-        rewriter.replaceOp(unPackOp, tilingResult->mergeResult.replacements);
+        rewriter.replaceOp(unPackOp, tilingResult->replacements);
       });
       getOperation()->walk([&](linalg::PackOp packOp) {
         SmallVector<int64_t> tiles(packOp.getSourceType().getRank(), 1);
@@ -226,7 +226,7 @@ class LowerPacksAndUnPacks
             packTilingOptions);
         if (failed(tilingResult))
           return signalPassFailure();
-        rewriter.replaceOp(packOp, tilingResult->mergeResult.replacements);
+        rewriter.replaceOp(packOp, tilingResult->replacements);
       });
       RewritePatternSet patterns(&getContext());
       patterns.add<linalg::DecomposeOuterUnitDimsUnPackOpPattern,

diff --git a/lib/TPP/Transforms/RewriteBatchMatmulToMatmul.cpp b/lib/TPP/Transforms/RewriteBatchMatmulToMatmul.cpp
@@ -111,7 +111,7 @@ struct RewriteBatchMatmulToMatmul
           tilingOpts);
       if (failed(tilingResult))
         return signalPassFailure();
-      rewriter.replaceOp(batchMatmulOp, tilingResult->mergeResult.replacements);
+      rewriter.replaceOp(batchMatmulOp, tilingResult->replacements);
     });
 
     // Step2:

diff --git a/lib/TPP/Transforms/SplitReductionDim.cpp b/lib/TPP/Transforms/SplitReductionDim.cpp
@@ -81,7 +81,7 @@ struct SplitContractionReduction
       return rewriter.notifyMatchFailure(linalgOp,
                                          "failed to tile contraction");
 
-    rewriter.replaceOp(linalgOp, tilingResult->mergeResult.replacements);
+    rewriter.replaceOp(linalgOp, tilingResult->replacements);
 
     return success();
   }

diff --git a/lib/TPP/Transforms/VectorContractToAMX.cpp b/lib/TPP/Transforms/VectorContractToAMX.cpp
@@ -344,8 +344,8 @@ struct VectorContractToAMXPattern
       return rewriter.notifyMatchFailure(
           op, "Accumulator defined by TransferReadOp");
 
-    if (!llvm::all_of(lhsDefiningOp.getIndices(), isZeroIndex) ||
-        !llvm::all_of(rhsDefiningOp.getIndices(), isZeroIndex))
+    if (!llvm::all_of(lhsDefiningOp.getIndices(), isZeroInteger) ||
+        !llvm::all_of(rhsDefiningOp.getIndices(), isZeroInteger))
       return rewriter.notifyMatchFailure(
           op, "Inputs are not whole tensor or subview");
 

diff --git a/lib/TPP/Transforms/VectorContractToFMA.cpp b/lib/TPP/Transforms/VectorContractToFMA.cpp
@@ -174,8 +174,8 @@ struct VectorContractToFMAPattern
       return failure();
 
     // Make sure the inputs being read are whole tensor or subview.
-    if (!llvm::all_of(lhsDefiningOp.getIndices(), isZeroIndex) ||
-        !llvm::all_of(rhsDefiningOp.getIndices(), isZeroIndex)) {
+    if (!llvm::all_of(lhsDefiningOp.getIndices(), isZeroInteger) ||
+        !llvm::all_of(rhsDefiningOp.getIndices(), isZeroInteger)) {
       return failure();
     }
 

diff --git a/lib/TPP/Transforms/VectorContractToOuterproduct.cpp b/lib/TPP/Transforms/VectorContractToOuterproduct.cpp
@@ -133,8 +133,8 @@ struct VectorContractToOuterproductPattern
       return failure();
 
     // Make sure the inputs being read are whole tensor or subview.
-    if (!llvm::all_of(lhsDefiningOp.getIndices(), isZeroIndex) ||
-        !llvm::all_of(rhsDefiningOp.getIndices(), isZeroIndex)) {
+    if (!llvm::all_of(lhsDefiningOp.getIndices(), isZeroInteger) ||
+        !llvm::all_of(rhsDefiningOp.getIndices(), isZeroInteger)) {
       return failure();
     }
 

diff --git a/python/mlir/tpp/sched/bundles.py b/python/mlir/tpp/sched/bundles.py
@@ -1,5 +1,6 @@
 from typing import Optional, Sequence
 
+from mlir import ir
 from mlir.dialects import transform
 from .common import apply_registered_pass, match
 from .utils import GpuBackend, PipelineInterrupt
@@ -67,7 +68,7 @@ def linalg_lowering(mod, /, *, skip_operations: Sequence[str] = (), **_config):
     func = apply_registered_pass(
         func,
         "convert-linalg-to-xsmm",
-        options="skip-operations=" + ",".join(skip_operations),
+        options={"skip-operations": ",".join(skip_operations)},
     )
     func = apply_registered_pass(func, "combine-xsmm-op-optimization")
     func = apply_registered_pass(func, "fold-xsmm-flags")
@@ -130,7 +131,7 @@ def low_level_parallel(
     # Run cleanup after LICM to allow CSE to eliminate common operations now
     # that they are hoisted out of loops.
     mod = cleanup(mod)
-    options = "parallel-loop-tile-sizes=" + ",".join(map(str, parallel_task_grid))
+    options = {"parallel-loop-tile-sizes": ",".join(map(str, parallel_task_grid))}
     mod = apply_registered_pass(mod, "scf-parallel-loop-tiling", options=options)
     return mod
 
@@ -228,7 +229,7 @@ def default_tpp_passes(
         mod = linalg_lowering(mod, skip_operations=skip_ops, **config)
         if linalg_to_vector or force_linalg_to_vector:
             func = match(mod, ops={"func.func"})
-            options = "registerTileShape=" + ",".join(map(str, register_blocking))
+            options = {"registerTileShape": ",".join(map(str, register_blocking))}
             func = apply_registered_pass(func, "brgemm-linalg-tiling", options=options)
             func = apply_registered_pass(func, "loop-invariant-code-motion")
             apply_registered_pass(func, "vectorization-pass")
@@ -315,7 +316,7 @@ def default_pipeline(
     #     #if defined(__x86_64__)
     #     options.x86Vector = true;
     #     #endif
-    options = f"enable-amx={int(xsmm_utils.has_amx())}"
+    options = {"enable-amx": int(xsmm_utils.has_amx())}
     mod = apply_registered_pass(mod, "convert-vector-to-llvm", options=options)
     mod = apply_registered_pass(mod, "finalize-memref-to-llvm")
     mod = apply_registered_pass(mod, "convert-scf-to-cf")
@@ -327,9 +328,8 @@ def default_pipeline(
         # gpu-to-llvm cannot be invoked from transform-interpreter as it
         # tries to load ... something while multi-threaded PassManager is running.
         mod = apply_registered_pass(mod, "gpu-to-llvm")
-        mod = apply_registered_pass(
-            mod, "gpu-module-to-binary", options="compilation-target=fatbin"
-        )
+        options = {"compilation-target": "fatbin"}
+        mod = apply_registered_pass(mod, "gpu-module-to-binary", options=options)
     mod = apply_registered_pass(mod, "convert-math-to-llvm")
     if gpu_backend:
         mod = apply_registered_pass(mod, "async-to-async-runtime")

diff --git a/python/mlir/tpp/sched/common.py b/python/mlir/tpp/sched/common.py
@@ -4,7 +4,7 @@
 
 # Wrapper to addresss verbosity.
 def apply_registered_pass(*args, **kwargs):
-    return transform.ApplyRegisteredPassOp(transform.AnyOpType.get(), *args, **kwargs)
+    return transform.apply_registered_pass(transform.AnyOpType.get(), *args, **kwargs)
 
 
 # Wrapper to addresss verbosity.

diff --git a/test/BF16/Integration/avx512bf16/vector-contract-to-amx-gemm.mlir b/test/BF16/Integration/avx512bf16/vector-contract-to-amx-gemm.mlir
diff --git a/test/BF16/Integration/avx512bf16/vector-contract-to-amx-mlp.mlir b/test/BF16/Integration/avx512bf16/vector-contract-to-amx-mlp.mlir
diff --git a/test/Integration/tpp-run-splat-mlp.mlir b/test/Integration/tpp-run-splat-mlp.mlir
@@ -53,7 +53,7 @@ func.func @entry(%arg0: tensor<8x8xf32>, %output: tensor<8x8xf32>) -> tensor<8x8
 // CHECK-DAG: memref.global "private" constant @__constant_1x1x8x8xf32 : memref<1x1x8x8xf32>
 // CHECK-DAG: memref.global "private" constant @__constant_1x1x8x8xf32_0 : memref<1x1x8x8xf32>
 // CHECK-DAG: memref.global "private" constant @__constant_8xf32 : memref<8xf32>
-// CHECK-DAG: memref.global "private" constant @__constant_8xf32_0 : memref<8xf32>
+// CHECK-DAG: memref.global "private" constant @__constant_8xf32_1 : memref<8xf32>
 
 // Randomized input.
 // CHECK-DAG: memref.global "private" @__wrapper_0 : memref<8x8xf32>

diff --git a/test/Passes/DefaultPipeline/amx-initialization.mlir b/test/Passes/DefaultPipeline/amx-initialization.mlir
@@ -5,9 +5,10 @@
 
 
 // CHECK-AMX-BF16-LABEL:   llvm.func @entry
-// CHECK-AMX-BF16:         amx.tileloadd64
-// CHECK-AMX-BF16:         amx.tdpbf16ps
-// CHECK-AMX-BF16:         amx.tilestored64
+// CHECK-AMX-BF16:         llvm.call_intrinsic "llvm.x86.tileloadd64.internal"{{.*}} -> !llvm.x86_amx
+// CHECK-AMX-BF16:         llvm.call_intrinsic "llvm.x86.tileloadd64.internal"{{.*}} -> !llvm.x86_amx
+// CHECK-AMX-BF16:         llvm.call_intrinsic "llvm.x86.tilezero.internal"{{.*}} -> !llvm.x86_amx
+// CHECK-AMX-BF16:         llvm.call_intrinsic "llvm.x86.tdpbf16ps.internal"{{.*}} -> !llvm.x86_amx
 func.func @entry(%arg0: memref<16x32xbf16>,
              %arg1: memref<16x32xbf16>,
              %arg2: memref<16x16xf32>) {
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-faf5d747f174cc9d714839f0d3bce1a783eac2ac
		+d698ede748e66f5519cb8481abc2df89a994a059