Skip to content

assume-nonnull operand bundles should optimize away more #151791

@scottmcm

Description

@scottmcm

Today in rust we emit certain conversions using assume(icmp ne %p, null): https://rust.godbolt.org/z/WTGj3Ks9q https://github.com/rust-lang/rust/blob/63f6845e570305a92eaf855897768617366164d6/tests/codegen-llvm/intrinsics/transmute.rs#L380-L388

define { ptr, i64 } @check_pair_to_dst_ref(i64 noundef %x.0, i64 noundef %x.1) unnamed_addr {
start:
  %_0.0 = getelementptr i8, ptr null, i64 %x.0
  %0 = icmp ne ptr %_0.0, null
  call void @llvm.assume(i1 %0)
  %1 = insertvalue { ptr, i64 } poison, ptr %_0.0, 0
  %2 = insertvalue { ptr, i64 } %1, i64 %x.1, 1
  ret { ptr, i64 } %2
}

But since I hear extra uses from such icmps can sometimes make optimization worse, I wanted to move to assume operand bundles instead. So I made that change, and now I get what I think is the correct output from it:

define { ptr, i64 } @check_pair_to_dst_ref(i64 noundef %x.0, i64 noundef %x.1) unnamed_addr #0 {
start:
  %_0.0 = getelementptr i8, ptr null, i64 %x.0
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %_0.0) ]
  %0 = insertvalue { ptr, i64 } poison, ptr %_0.0, 0
  %1 = insertvalue { ptr, i64 } %0, i64 %x.1, 1
  ret { ptr, i64 } %1
}

But that turned out to give bad consequences. For example, what used to be a quite-good https://rust.godbolt.org/z/vhY16Kavc

define void @long_integer_map(ptr dead_on_unwind noalias nocapture noundef writable writeonly sret([2048 x i8]) align 4 dereferenceable(2048) %_0, ptr noalias nocapture noundef readonly align 4 dereferenceable(2048) %x) unnamed_addr personality ptr @rust_eh_personality {
start:
  %array.i.i.i.i = alloca [2048 x i8], align 4
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %start ], [ %index.next, %vector.body ]
  %offset.idx = shl i64 %index, 2
  %next.gep = getelementptr i8, ptr %x, i64 %offset.idx
  %0 = getelementptr i8, ptr %next.gep, i64 16
  %wide.load = load <4 x i32>, ptr %next.gep, align 4
  %wide.load1 = load <4 x i32>, ptr %0, align 4
  %1 = mul <4 x i32> %wide.load, splat (i32 13)
  %2 = mul <4 x i32> %wide.load1, splat (i32 13)
  %3 = add <4 x i32> %1, splat (i32 7)
  %4 = add <4 x i32> %2, splat (i32 7)
  %5 = getelementptr inbounds nuw i32, ptr %array.i.i.i.i, i64 %index
  %6 = getelementptr inbounds nuw i8, ptr %5, i64 16
  store <4 x i32> %3, ptr %5, align 4
  store <4 x i32> %4, ptr %6, align 4
  %index.next = add nuw i64 %index, 8
  %7 = icmp eq i64 %index.next, 512
  br i1 %7, label %core::array::drain::drain_array_with::h75d8f8b0fda7bb41.exit, label %vector.body

core::array::drain::drain_array_with::h75d8f8b0fda7bb41.exit:
  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(2048) %_0, ptr noundef nonnull align 4 dereferenceable(2048) %array.i.i.i.i, i64 2048, i1 false)
  ret void
}

Never removes any of the superfluous-after-inlining assumes, giving this obviously-silly IR:

; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: write) uwtable
define void @long_integer_map(ptr dead_on_unwind noalias nocapture noundef writable writeonly sret([2048 x i8]) align 4 dereferenceable(2048) %_0, ptr noalias nocapture noundef readonly align 4 dereferenceable(2048) %x) unnamed_addr #1 personality ptr @__CxxFrameHandler3 {
start:
  %array.i.i.i.i = alloca [2048 x i8], align 4
  %array1.i = alloca [2048 x i8], align 4
  call void @llvm.lifetime.start.p0(i64 2048, ptr nonnull %array1.i), !noalias !7
  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(2048) %array1.i, ptr noundef nonnull readonly align 4 dereferenceable(2048) %x, i64 2048, i1 false), !noalias !11
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %array1.i) ]
  %0 = getelementptr inbounds nuw i8, ptr %array1.i, i64 2048
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %array1.i) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
  %1 = getelementptr inbounds nuw i8, ptr %array1.i, i64 2048
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %start
  %index = phi i64 [ 0, %start ], [ %index.next, %vector.body ]
  %offset.idx = shl i64 %index, 2
  %2 = or disjoint i64 %offset.idx, 4
  %3 = or disjoint i64 %offset.idx, 8
  %4 = or disjoint i64 %offset.idx, 12
  %5 = or disjoint i64 %offset.idx, 16
  %6 = or disjoint i64 %offset.idx, 20
  %7 = or disjoint i64 %offset.idx, 24
  %8 = or disjoint i64 %offset.idx, 28
  %next.gep = getelementptr i8, ptr %array1.i, i64 %offset.idx
  %next.gep1 = getelementptr i8, ptr %array1.i, i64 %2
  %next.gep2 = getelementptr i8, ptr %array1.i, i64 %3
  %next.gep3 = getelementptr i8, ptr %array1.i, i64 %4
  %next.gep4 = getelementptr i8, ptr %array1.i, i64 %5
  %next.gep5 = getelementptr i8, ptr %array1.i, i64 %6
  %next.gep6 = getelementptr i8, ptr %array1.i, i64 %7
  %next.gep7 = getelementptr i8, ptr %array1.i, i64 %8
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep1) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep2) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep3) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep4) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep5) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep6) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep7) ]
  %9 = getelementptr inbounds nuw i8, ptr %next.gep, i64 4
  %10 = getelementptr inbounds nuw i8, ptr %next.gep1, i64 4
  %11 = getelementptr inbounds nuw i8, ptr %next.gep2, i64 4
  %12 = getelementptr inbounds nuw i8, ptr %next.gep3, i64 4
  %13 = getelementptr inbounds nuw i8, ptr %next.gep4, i64 4
  %14 = getelementptr inbounds nuw i8, ptr %next.gep5, i64 4
  %15 = getelementptr inbounds nuw i8, ptr %next.gep6, i64 4
  %16 = getelementptr inbounds nuw i8, ptr %next.gep7, i64 4
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %9) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %10) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %11) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %12) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %13) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %14) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %15) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %16) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep1) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep2) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep3) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep4) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep5) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep6) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep7) ]
  %17 = getelementptr i8, ptr %next.gep, i64 16
  %wide.load = load <4 x i32>, ptr %next.gep, align 4, !noalias !12
  %wide.load8 = load <4 x i32>, ptr %17, align 4, !noalias !12
  %18 = mul <4 x i32> %wide.load, splat (i32 13)
  %19 = mul <4 x i32> %wide.load8, splat (i32 13)
  %20 = add <4 x i32> %18, splat (i32 7)
  %21 = add <4 x i32> %19, splat (i32 7)
  %22 = getelementptr inbounds nuw i32, ptr %array.i.i.i.i, i64 %index
  %23 = getelementptr inbounds nuw i8, ptr %22, i64 16
  store <4 x i32> %20, ptr %22, align 4
  store <4 x i32> %21, ptr %23, align 4
  %index.next = add nuw i64 %index, 8
  %24 = icmp eq i64 %index.next, 512
  br i1 %24, label %_ZN4core5array5drain16drain_array_with17hdab83ed713860683E.exit, label %vector.body, !llvm.loop !27

_ZN4core5array5drain16drain_array_with17hdab83ed713860683E.exit: ; preds = %vector.body
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %1) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %1) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
  call void @llvm.lifetime.end.p0(i64 2048, ptr nonnull %array1.i), !noalias !7
  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(2048) %_0, ptr noundef nonnull align 4 dereferenceable(2048) %array.i.i.i.i, i64 2048, i1 false)
  ret void
}

Trunk can clean that up a bit, but it's still full of unnecessary assumes: https://llvm.godbolt.org/z/b8oM1vxvT

At a minimum this at least ought to be treated idempotently, since there's no need for

  call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
  call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]

repeated in a row like that. But it'd also be nice to optimize out all the ones that came from GEP nuw, for example.


(Or if this is the wrong way to do this, that'd be good to know and reflect in the langref too.)

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions