[compiler][conversion] Fix an issue in stablehlo.slice conversion

shelkesagar29 · shelkesagar29 · commit 8f87f1468ee8 · 2024-11-22T22:48:18.000Z
This PR fixes an issue in the `stablehlo.slice` conversion that
showed up when the result type is dynamic.
In the converter, we use a `tensorrt.slice` op builder that
infers the result type from `static_size` (on the TensorRT side the output
shape is the same as `size`), and `size` was taken from the shape of the
stablehlo op output.
This caused two issues:
- `truncateI64ToI32` failed for dynamic outputs because a dynamic dim can't be
truncated to i32 without loss.
- A new builder was needed that can set the result to a dynamic
shape even when `size` is static.

With this change,
- `size` is computed as `ceil((limit - start) / stride)` (these are
stablehlo slice op attributes) and NOT as the shape of the output.
- A new builder is added.

An MLIR test is added.
diff --git a/mlir-tensorrt/compiler/lib/Conversion/StablehloToTensorRT/StablehloToTensorRT.cpp b/mlir-tensorrt/compiler/lib/Conversion/StablehloToTensorRT/StablehloToTensorRT.cpp
@@ -44,6 +44,8 @@
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "stablehlo/dialect/StablehloOps.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Debug.h"
+#include <cmath>
 #include <functional>
 #include <numeric>
 #include <regex>
@@ -1947,19 +1949,25 @@ struct HloSliceConverter : public ConvertHloOpToTensorRTPattern<HloOpType> {
     if (failed(startIndices))
       return rewriter.notifyMatchFailure(
           op, "could not convert i64 offsets to i32");
+    // Limit indices are needed (together with start indices and strides) to
+    // compute the static slice size below.
+    FailureOr<SmallVector<int32_t>> limitIndices =
+        truncateI64ToI32(loc, op.getLimitIndices());
+    if (failed(limitIndices))
+      return rewriter.notifyMatchFailure(
+          op, "could not convert i64 limit indices to i32");
     FailureOr<SmallVector<int32_t>> strides =
         truncateI64ToI32(loc, op.getStrides());
     if (failed(strides))
       return rewriter.notifyMatchFailure(op,
                                          "could not convert i64 stride to i32");
-    FailureOr<SmallVector<int32_t>> i32Shape =
-        truncateI64ToI32(loc, op.getType().getShape());
-    if (failed(i32Shape))
-      return rewriter.notifyMatchFailure(op,
-                                         "could not convert i64 shape to i32");
+
+    // Per-dimension slice size is ceil((limit - start) / stride). This is
+    // computed with exact integer arithmetic: a `float` only has a 24-bit
+    // significand, so dividing through `float` rounds extents above 2^24 and
+    // can yield a wrong size.
+    SmallVector<int32_t> i32Shape(limitIndices->size());
+    for (size_t i = 0; i < limitIndices->size(); i++) {
+      // Widen to 64 bits so that `limit - start` cannot overflow.
+      const int64_t extent = static_cast<int64_t>((*limitIndices)[i]) -
+                             static_cast<int64_t>((*startIndices)[i]);
+      const int64_t stride = (*strides)[i];
+      // The ceil-division below is only valid for positive strides, and a
+      // zero stride would divide by zero.
+      if (stride <= 0)
+        return rewriter.notifyMatchFailure(
+            op, "expected positive slice strides");
+      // C++ division truncates toward zero, which already equals the ceiling
+      // unless the remainder is positive.
+      i32Shape[i] = static_cast<int32_t>(extent / stride +
+                                         (extent % stride > 0 ? 1 : 0));
+    }
     auto sliceOp = trtRewriter.checkAndCreate<mlir::tensorrt::SliceOp>(
-        op.getLoc(), targetTrtMajorVersion, adaptor.getOperand(), *startIndices,
-        *i32Shape, *strides);
+        op.getLoc(), targetTrtMajorVersion, op.getType(), adaptor.getOperand(),
+        *startIndices, i32Shape, *strides);
     if (!sliceOp)
       return failure();
 
diff --git a/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/TensorRT/IR/TensorRTOps.td b/mlir-tensorrt/tensorrt/include/mlir-tensorrt-dialect/TensorRT/IR/TensorRTOps.td
@@ -1628,6 +1628,11 @@ def TensorRT_SliceOp : TensorRT_Op<"slice", [
                    "ArrayRef<int32_t>":$size, "ArrayRef<int32_t>":$stride,
                    CArg<"SliceMode", "SliceMode::kDEFAULT">:$sliceMode,
                    CArg<"Value", "Value()">:$fill)>,
+    // Builder using static arrays for start/size/stride that takes the result
+    // type explicitly instead of inferring it, so the result may keep a dynamic
+    // shape even when `size` is static.
+    OpBuilder<(ins "Type":$result, "Value":$input, "ArrayRef<int32_t>":$start,
+                   "ArrayRef<int32_t>":$size, "ArrayRef<int32_t>":$stride,
+                   CArg<"SliceMode", "SliceMode::kDEFAULT">:$sliceMode,
+                   CArg<"Value", "Value()">:$fill)>,
     // Builder using static array for start/stride and Value for size.
     OpBuilder<(ins "Value":$input, "ArrayRef<int32_t>":$start,
                    "Value":$size, "ArrayRef<int32_t>":$stride,
diff --git a/mlir-tensorrt/tensorrt/lib/TensorRT/IR/TensorRT.cpp b/mlir-tensorrt/tensorrt/lib/TensorRT/IR/TensorRT.cpp
@@ -1136,6 +1136,17 @@ void tensorrt::SliceOp::build(OpBuilder &odsBuilder, OperationState &odsState,
                  toArrayAttr(size), toArrayAttr(stride), sliceMode, fill);
 }
 
+/// Builds a slice op from static `start`/`size`/`stride` arrays using the
+/// caller-supplied `result` type. Each array is wrapped in a
+/// `DenseI32ArrayAttr` (as an `OpFoldResult`) and forwarded, together with
+/// `sliceMode` and `fill`, to another `SliceOp::build` overload.
+void tensorrt::SliceOp::build(OpBuilder &odsBuilder, OperationState &odsState,
+                              Type result, Value input, ArrayRef<int32_t> start,
+                              ArrayRef<int32_t> size, ArrayRef<int32_t> stride,
+                              SliceMode sliceMode, Value fill) {
+  auto *ctx = odsBuilder.getContext();
+  OpFoldResult startAttr(DenseI32ArrayAttr::get(ctx, start));
+  OpFoldResult sizeAttr(DenseI32ArrayAttr::get(ctx, size));
+  OpFoldResult strideAttr(DenseI32ArrayAttr::get(ctx, stride));
+  SliceOp::build(odsBuilder, odsState, result, input, startAttr, sizeAttr,
+                 strideAttr, sliceMode, fill);
+}
+
 void tensorrt::SliceOp::build(OpBuilder &odsBuilder, OperationState &odsState,
                               Value input, ArrayRef<int32_t> start, Value size,
                               ArrayRef<int32_t> stride, SliceMode sliceMode,
diff --git a/mlir-tensorrt/test/Conversion/StablehloToTensorRT/stablehlo-to-tensorrt.mlir b/mlir-tensorrt/test/Conversion/StablehloToTensorRT/stablehlo-to-tensorrt.mlir
@@ -1927,4 +1927,16 @@ func.func @jnp_cumsum_2d_f16(%arg0: tensor<1x134xf16>) -> tensor<1x134xf16> {
 //       CHECK-SAME: post_padding = array<i64: 0, 0>
 //       CHECK-SAME: pre_padding = array<i64: 0, 133>
 //       CHECK-SAME: in(%[[v1]] : tensor<1x1x1x134xf16>) kernel(%[[v2]] : tensor<1x1x1x134xf16>) -> tensor<1x1x1x134xf16>
-//       CHECK:  %[[v4:.+]] = tensorrt.reshape %[[v3]] : tensor<1x1x1x134xf16> to tensor<1x134xf16>
+//       CHECK:  %[[v4:.+]] = tensorrt.reshape %[[v3]] : tensor<1x1x1x134xf16> to tensor<1x134xf16>
+
+// -----
+
+// Static slice bounds applied to an input with a dynamic dimension: the
+// result type stays dynamic while the slice size comes from the
+// limit/start/stride attributes.
+func.func @slice_conversion_dynamic(%arg0: tensor<1x?x256xf16>) -> tensor<1x?x256xf16> {
+  %sliced = "stablehlo.slice"(%arg0) <{limit_indices = array<i64: 1, 6, 256>, start_indices = array<i64: 0, 2, 0>, strides = array<i64: 1, 1, 1>}> : (tensor<1x?x256xf16>) -> tensor<1x?x256xf16>
+  return %sliced : tensor<1x?x256xf16>
+}
+
+// CHECK-LABEL: @slice_conversion_dynamic
+//  CHECK-SAME: (%[[arg0:.+]]: tensor<1x?x256xf16>) -> tensor<1x?x256xf16>
+//  CHECK-NEXT: %[[v0:.+]] = tensorrt.slice %[[arg0]][0, 2, 0][1, 4, 256][1, 1, 1] : tensor<1x?x256xf16> to tensor<1x?x256xf16>
+//  CHECK-NEXT: return %[[v0]] : tensor<1x?x256xf16>