
Commit 5eeeb4f

[compiler] Add mtrt-scf-float-strength-reduce pass to preprocessing pipeline
In the Stablehlo preprocessing pipeline, enable the `mtrt-scf-float-strength-reduce` pass in order to convert while-style loops to for-style loops where possible. For this to work on some common JAX use cases, loops also need to be detensorized more aggressively in the `convert-stablehlo-to-scf` pass.

GitOrigin-RevId: 2614ebf00b5c2d29de32b09a78e7b1a2f42c13cc
1 parent 440b3af commit 5eeeb4f
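To illustrate the intent, here is a minimal, hypothetical example of the kind of loop this targets (not taken from this change; the function name, constants, and lowering outcome are assumptions). It is a `stablehlo.while` that counts with a 0-d float tensor: once `convert-stablehlo-to-scf` detensorizes the scalar carried value, the new `mtrt-scf-float-strength-reduce` pass can rewrite such while-style loops into for-style loops where possible.

// Hypothetical sketch, not from this commit: a float-counted while loop
// whose induction variable is a 0-d float tensor.
func.func @float_counted_loop(%arg0: tensor<f32>) -> tensor<f32> {
  %limit = stablehlo.constant dense<1.000000e+01> : tensor<f32>
  %step = stablehlo.constant dense<1.000000e+00> : tensor<f32>
  %0 = stablehlo.while(%iterArg = %arg0) : tensor<f32>
  cond {
    // Keep looping while the float counter is below the limit.
    %1 = stablehlo.compare LT, %iterArg, %limit, FLOAT : (tensor<f32>, tensor<f32>) -> tensor<i1>
    stablehlo.return %1 : tensor<i1>
  } do {
    // Advance the counter by a constant step.
    %1 = stablehlo.add %iterArg, %step : tensor<f32>
    stablehlo.return %1 : tensor<f32>
  }
  return %0 : tensor<f32>
}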

File tree

4 files changed, +37 -120 lines changed


mlir-tensorrt/compiler/lib/Compiler/StablehloToExecutable/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ add_mlir_tensorrt_library(MLIRTensorRTCompilerStableHloToExecutable
   MLIRTensorRTTargetLua
   MLIRTensorRTTensorRTBackend
   MLIRTensorRTTensorRTToTensorRTRuntime
+  MLIRTensorRTTransformsSCFFloatStrengthReduce
   MLIRTensorRTTransformsUnrollForLoops
   StablehloLinalgTransforms
   MLIR_LIBS PUBLIC

mlir-tensorrt/compiler/lib/Compiler/StablehloToExecutable/StableHloInputPipeline.cpp

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@ void mtrt::compiler::buildStablehloPreProcessingPipeline(
   // `convert-stablehlo-to-scf`:
   if (opts.legalizeControlFlowToSCF) {
     pm.addNestedPass<func::FuncOp>(mlir::createConvertStablehloToScfPass());
+    pm.addNestedPass<func::FuncOp>(mtrt::createSCFFloatStrengthReducePass());
     pm.addNestedPass<func::FuncOp>(mtrt::createSCFUnrollPass(
         mtrt::SCFUnrollPassOptions{opts.unrollThreshold}));
   }

mlir-tensorrt/compiler/lib/Conversion/StablehloToScf/StablehloToScf.cpp

Lines changed: 19 additions & 88 deletions
@@ -357,91 +357,24 @@ struct ScalarizeWhileConditionProducers
 };
 } // namespace
 
-/// Check if the add op is a valid induction variable increment.
-static bool matchInductionVariableIncrement(stablehlo::AddOp op,
-                                            scf::WhileOp parentWhile) {
-  Value lhs = op.getLhs();
-  Value rhs = op.getRhs();
-  if (matchPattern(lhs, m_Constant()) || matchPattern(rhs, m_Constant()))
-    return true;
-  Region *whileRegion = parentWhile->getParentRegion();
-  return lhs.getParentRegion()->isAncestor(whileRegion) ||
-         rhs.getParentRegion()->isAncestor(whileRegion);
-}
-
 namespace {
 /// Scalarize any `stablehlo.add` operations in the 'after' region of
 /// a scf.while op.
-struct ScalarizeStablehloAddOp : public OpRewritePattern<stablehlo::AddOp> {
-  using OpRewritePattern<stablehlo::AddOp>::OpRewritePattern;
-  LogicalResult matchAndRewrite(stablehlo::AddOp op,
+struct ScalarizeStablehloAddOp : public OpRewritePattern<tensor::ExtractOp> {
+  using OpRewritePattern<tensor::ExtractOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(tensor::ExtractOp op,
                                 PatternRewriter &rewriter) const override {
-    if (!op->hasOneUse())
-      return rewriter.notifyMatchFailure(
-          op, "op has more than one use, cannot scalarize");
-    auto extractUser = dyn_cast<tensor::ExtractOp>(*op->user_begin());
-    if (!extractUser || !extractUser->hasOneUse() ||
-        !isa<scf::YieldOp>(*extractUser->user_begin()))
-      return rewriter.notifyMatchFailure(
-          op, "op result is not extracted and yielded from region");
-
-    auto scfWhile = extractUser->getParentOfType<scf::WhileOp>();
-    if (!scfWhile || scfWhile.getAfter() != op->getParentRegion())
-      return rewriter.notifyMatchFailure(
-          op, "op is not in the after region of a scf.while op");
-
-    // One operand must be a constant or defined above in order to be
-    // considered as the loop step.
-    if (!matchInductionVariableIncrement(op, scfWhile))
-      return rewriter.notifyMatchFailure(
-          op, "op is not a valid induction variable increment");
-
-    // Find a block argument that has been scalarized.
-    auto findBlockArgument = [](Value v) -> BlockArgument {
-      Value source{};
-      if (matchPattern(v,
-                       m_Op<tensor::FromElementsOp>(matchers::m_Any(&source))))
-        return dyn_cast<BlockArgument>(source);
-      return {};
-    };
-    BlockArgument arg = findBlockArgument(op.getLhs());
-    if (!arg)
-      arg = findBlockArgument(op.getRhs());
-    if (!arg || arg.getParentRegion() != scfWhile.getAfter())
-      return rewriter.notifyMatchFailure(
-          op, "could not find block argument in after region");
-
-    // Check that the corresponding block argument in the `before` region feeds
-    // into a comparison.
-    Region &before = scfWhile.getBefore();
-    if (arg.getArgNumber() >= before.getNumArguments() ||
-        before.getArgument(arg.getArgNumber()).getType() != arg.getType())
-      return rewriter.notifyMatchFailure(
-          op, "could not find block argument in before region");
-    auto beforeArg = before.getArgument(arg.getArgNumber());
-    if (!llvm::all_of(beforeArg.getUsers(),
-                      llvm::IsaPred<scf::ConditionOp, arith::CmpIOp>))
-      return rewriter.notifyMatchFailure(
-          op, "block argument is not consumed by a comparison op");
-
-    // Check that the before region has a block argument in the same position
-    // and is consumed by a comparison op.
-    RankedTensorType rtt = op.getType();
-    Type elementType = rtt.getElementType();
-    if (!rtt.hasStaticShape() || rtt.getNumElements() != 1 ||
-        !elementType.isSignlessIntOrIndex())
-      return rewriter.notifyMatchFailure(op, "op is not a scalar add op");
-
-    auto scalarOperands = llvm::map_to_vector(op.getOperands(), [&](Value v) {
-      return extractScalarFromTensorValue(rewriter, v);
-    });
-
-    auto scalarAdd =
-        stablehlo::StablehloOpToStdScalarOp::mapOp<stablehlo::AddOp>(
-            op, elementType, scalarOperands, &rewriter);
-    auto fromElements =
-        rewriter.create<tensor::FromElementsOp>(op.getLoc(), rtt, scalarAdd);
-    rewriter.replaceOp(op, fromElements);
+    auto addOp = op.getTensor().getDefiningOp<stablehlo::AddOp>();
+    if (!addOp || !addOp.getType().hasStaticShape() ||
+        addOp.getType().getNumElements() != 1)
+      return failure();
+    rewriter.setInsertionPoint(addOp);
+    SmallVector<Value> scalarOperands;
+    for (Value operand : addOp.getOperands())
+      scalarOperands.push_back(extractScalarFromTensorValue(rewriter, operand));
+    auto scalarAdd = stablehlo::StablehloOpToStdScalarOp::mapOp(
+        addOp, addOp.getType().getElementType(), scalarOperands, &rewriter);
+    rewriter.replaceOp(op, scalarAdd);
     return success();
   }
 };
@@ -453,9 +386,7 @@ struct ScalarizeStablehloAddOp : public OpRewritePattern<stablehlo::AddOp> {
 /// for loop. It will have a user like `stablehlo.compare` or `tensor.extract`.
 static bool shouldScalarizeWhileBeforeArg(BlockArgument arg, Value initOperand,
                                           Value yieldOperand) {
-  return cast<RankedTensorType>(arg.getType())
-             .getElementType()
-             .isSignlessIntOrIndex() &&
+  return cast<RankedTensorType>(arg.getType()).getElementType() &&
          llvm::count_if(arg.getUsers(),
                         llvm::IsaPred<stablehlo::CompareOp, arith::CmpIOp,
                                       tensor::ExtractOp>) >= 1;
@@ -473,17 +404,17 @@ static bool shouldScalarizeWhileAfterArg(BlockArgument arg, Value condOperand,
   if (before.getNumArguments() <= arg.getArgNumber() ||
       before.getArgument(arg.getArgNumber()).getType() !=
           rtt.getElementType() ||
-      !llvm::all_of(before.getArgument(arg.getArgNumber()).getUsers(),
-                    llvm::IsaPred<arith::CmpIOp, tensor::FromElementsOp>))
+      !llvm::all_of(
+          before.getArgument(arg.getArgNumber()).getUsers(),
+          llvm::IsaPred<arith::CmpIOp, arith::CmpFOp, tensor::FromElementsOp>))
     return false;
 
   auto condProducer = condOperand.getDefiningOp<tensor::FromElementsOp>();
   if (!condProducer || condProducer.getElements().size() != 1 ||
       !isa<BlockArgument>(condProducer.getElements().front()))
     return false;
 
-  return rtt.getElementType().isSignlessIntOrIndex() &&
-         llvm::count_if(arg.getUsers(),
+  return llvm::count_if(arg.getUsers(),
                         llvm::IsaPred<stablehlo::AddOp, arith::AddIOp,
                                       tensor::ExtractOp>) >= 1;
 }
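
The rewritten pattern above now keys on the `tensor.extract` consumer rather than on the `stablehlo.add` itself, scalarizing any single-element add whose result gets extracted. A rough before/after sketch of the IR shape it targets (hypothetical, not taken from this commit's tests; value names and the `arith.addi` lowering are assumptions based on the scalar mapping for integer adds):

// Before (hypothetical): a single-element tensor add feeding a tensor.extract.
%sum = stablehlo.add %a, %b : tensor<i32>
%s = tensor.extract %sum[] : tensor<i32>

// After (hypothetical): the extract is rewritten to a scalar add on the
// extracted operands; the tensor-typed add is left behind as dead code.
%a0 = tensor.extract %a[] : tensor<i32>
%b0 = tensor.extract %b[] : tensor<i32>
%s = arith.addi %a0, %b0 : i32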

mlir-tensorrt/compiler/test/Conversion/StablehloToScf/stablehlo-to-scf.mlir

Lines changed: 16 additions & 32 deletions
@@ -70,18 +70,18 @@ func.func @stablehlo_while_to_scf_while(%arg0: tensor<i64>, %arg1: tensor<i64>)
 
 func.func private @some_compute(tensor<f32>) -> tensor<1xf32>
 
-func.func @stablehlo_while_regression(%arg0: tensor<1xf32>, %arg1: tensor<f32>) -> tensor<1xf32> {
-  %c_33 = stablehlo.constant dense<0> : tensor<i32>
+func.func @stablehlo_while_single_iteration(%arg0: tensor<1xf32>, %arg1: tensor<f32>) -> tensor<1xf32> {
+  %c0 = stablehlo.constant dense<0> : tensor<i32>
   %cst = stablehlo.constant dense<0.000000e+00> : tensor<1xf32>
-  %c_31 = stablehlo.constant dense<1> : tensor<i32>
+  %c1 = stablehlo.constant dense<1> : tensor<i32>
   %cst_0 = stablehlo.constant dense<1.000000e+00> : tensor<f32>
-  %5:2 = stablehlo.while(%iterArg = %c_33, %iterArg_34 = %cst) : tensor<i32>, tensor<1xf32>
+  %5:2 = stablehlo.while(%iterArg = %c0, %iterArg_34 = %cst) : tensor<i32>, tensor<1xf32>
   cond {
-    %6 = stablehlo.compare LT, %iterArg, %c_31, SIGNED : (tensor<i32>, tensor<i32>) -> tensor<i1>
+    %6 = stablehlo.compare LT, %iterArg, %c1, SIGNED : (tensor<i32>, tensor<i32>) -> tensor<i1>
     stablehlo.return %6 : tensor<i1>
   } do {
-    %6 = stablehlo.compare LT, %iterArg, %c_33, SIGNED : (tensor<i32>, tensor<i32>) -> tensor<i1>
-    %7 = stablehlo.add %iterArg, %c_31 : tensor<i32>
+    %6 = stablehlo.compare LT, %iterArg, %c0, SIGNED : (tensor<i32>, tensor<i32>) -> tensor<i1>
+    %7 = stablehlo.add %iterArg, %c1 : tensor<i32>
     %8 = stablehlo.select %6, %7, %iterArg : tensor<i1>, tensor<i32>
     %10 = stablehlo.dynamic_slice %arg0, %8, sizes = [1] : (tensor<1xf32>, tensor<i32>) -> tensor<1xf32>
     %11 = stablehlo.reshape %10 : (tensor<1xf32>) -> tensor<f32>
@@ -94,30 +94,14 @@ func.func @stablehlo_while_regression(%arg0: tensor<1xf32>, %arg1: tensor<f32>)
   return %5#1 : tensor<1xf32>
 }
 
-// CHECK-LABEL: func.func @stablehlo_while_regression
-// CHECK: scf.while
-
-// -----
-
-
-func.func @dont_scalarize_while(%arg0: tensor<f32>) -> tensor<f32> {
-  %0 = stablehlo.while(%iterArg = %arg0) : tensor<f32>
-  cond {
-    %c0 = stablehlo.constant dense<0.0> : tensor<f32>
-    %1 = stablehlo.compare LT, %iterArg, %c0, SIGNED : (tensor<f32>, tensor<f32>) -> tensor<i1>
-    stablehlo.return %1 : tensor<i1>
-  } do {
-    %c1 = stablehlo.constant dense<1.0> : tensor<f32>
-    %2 = stablehlo.subtract %iterArg, %c1 : tensor<f32>
-    stablehlo.return %2 : tensor<f32>
-  }
-  return %0 : tensor<f32>
-}
-
-// CHECK-LABEL: @dont_scalarize_while
-// CHECK: scf.while {{.*}} (tensor<f32>) -> tensor<f32>
-// CHECK: scf.condition{{.*}} : tensor<f32>
-// CHECK: scf.yield{{.*}} : tensor<f32>
+// CHECK-LABEL: func.func @stablehlo_while_single_iteration
+// CHECK-NOT: scf.while
+// CHECK-NOT: scf.for
+// CHECK: stablehlo.compare
+// CHECK: stablehlo.add
+// CHECK: stablehlo.dynamic_slice
+// CHECK: call @some_compute
+// CHECK: return
 
 // -----
 
@@ -254,4 +238,4 @@ func.func @case_three_branches(
 // CHECK-DAG: %[[v5:.+]] = stablehlo.multiply %[[v4]], %[[arg2]] : tensor<2xi64>
 // CHECK-DAG: scf.yield %[[v5]] : tensor<2xi64>
 // CHECK: scf.yield %[[v3]] : tensor<2xi64>
-// CHECK: return %[[v1]] : tensor<2xi64>
+// CHECK: return %[[v1]] : tensor<2xi64>
