-
Notifications
You must be signed in to change notification settings - Fork 14.6k
Description
Today in Rust we emit certain conversions using `assume(icmp ne %p, null)`: https://rust.godbolt.org/z/WTGj3Ks9q https://github.com/rust-lang/rust/blob/63f6845e570305a92eaf855897768617366164d6/tests/codegen-llvm/intrinsics/transmute.rs#L380-L388
define { ptr, i64 } @check_pair_to_dst_ref(i64 noundef %x.0, i64 noundef %x.1) unnamed_addr {
start:
%_0.0 = getelementptr i8, ptr null, i64 %x.0
%0 = icmp ne ptr %_0.0, null
call void @llvm.assume(i1 %0)
%1 = insertvalue { ptr, i64 } poison, ptr %_0.0, 0
%2 = insertvalue { ptr, i64 } %1, i64 %x.1, 1
ret { ptr, i64 } %2
}
But since I hear that the extra uses from such `icmp`s can sometimes make optimization worse, I wanted to move to assume operand bundles instead. I made that change and now get what I think is the correct output:
define { ptr, i64 } @check_pair_to_dst_ref(i64 noundef %x.0, i64 noundef %x.1) unnamed_addr #0 {
start:
%_0.0 = getelementptr i8, ptr null, i64 %x.0
call void @llvm.assume(i1 true) [ "nonnull"(ptr %_0.0) ]
%0 = insertvalue { ptr, i64 } poison, ptr %_0.0, 0
%1 = insertvalue { ptr, i64 } %0, i64 %x.1, 1
ret { ptr, i64 } %1
}
But that turned out to have bad consequences. For example, this used to produce quite good IR (https://rust.godbolt.org/z/vhY16Kavc):
define void @long_integer_map(ptr dead_on_unwind noalias nocapture noundef writable writeonly sret([2048 x i8]) align 4 dereferenceable(2048) %_0, ptr noalias nocapture noundef readonly align 4 dereferenceable(2048) %x) unnamed_addr personality ptr @rust_eh_personality {
start:
%array.i.i.i.i = alloca [2048 x i8], align 4
br label %vector.body
vector.body:
%index = phi i64 [ 0, %start ], [ %index.next, %vector.body ]
%offset.idx = shl i64 %index, 2
%next.gep = getelementptr i8, ptr %x, i64 %offset.idx
%0 = getelementptr i8, ptr %next.gep, i64 16
%wide.load = load <4 x i32>, ptr %next.gep, align 4
%wide.load1 = load <4 x i32>, ptr %0, align 4
%1 = mul <4 x i32> %wide.load, splat (i32 13)
%2 = mul <4 x i32> %wide.load1, splat (i32 13)
%3 = add <4 x i32> %1, splat (i32 7)
%4 = add <4 x i32> %2, splat (i32 7)
%5 = getelementptr inbounds nuw i32, ptr %array.i.i.i.i, i64 %index
%6 = getelementptr inbounds nuw i8, ptr %5, i64 16
store <4 x i32> %3, ptr %5, align 4
store <4 x i32> %4, ptr %6, align 4
%index.next = add nuw i64 %index, 8
%7 = icmp eq i64 %index.next, 512
br i1 %7, label %core::array::drain::drain_array_with::h75d8f8b0fda7bb41.exit, label %vector.body
core::array::drain::drain_array_with::h75d8f8b0fda7bb41.exit:
call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(2048) %_0, ptr noundef nonnull align 4 dereferenceable(2048) %array.i.i.i.i, i64 2048, i1 false)
ret void
}
With the operand-bundle form, the optimizer never removes any of the superfluous-after-inlining `assume`s, giving this obviously silly IR:
; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite, inaccessiblemem: write) uwtable
define void @long_integer_map(ptr dead_on_unwind noalias nocapture noundef writable writeonly sret([2048 x i8]) align 4 dereferenceable(2048) %_0, ptr noalias nocapture noundef readonly align 4 dereferenceable(2048) %x) unnamed_addr #1 personality ptr @__CxxFrameHandler3 {
start:
%array.i.i.i.i = alloca [2048 x i8], align 4
%array1.i = alloca [2048 x i8], align 4
call void @llvm.lifetime.start.p0(i64 2048, ptr nonnull %array1.i), !noalias !7
call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(2048) %array1.i, ptr noundef nonnull readonly align 4 dereferenceable(2048) %x, i64 2048, i1 false), !noalias !11
call void @llvm.assume(i1 true) [ "nonnull"(ptr %array1.i) ]
%0 = getelementptr inbounds nuw i8, ptr %array1.i, i64 2048
call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %array1.i) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
%1 = getelementptr inbounds nuw i8, ptr %array1.i, i64 2048
br label %vector.body
vector.body: ; preds = %vector.body, %start
%index = phi i64 [ 0, %start ], [ %index.next, %vector.body ]
%offset.idx = shl i64 %index, 2
%2 = or disjoint i64 %offset.idx, 4
%3 = or disjoint i64 %offset.idx, 8
%4 = or disjoint i64 %offset.idx, 12
%5 = or disjoint i64 %offset.idx, 16
%6 = or disjoint i64 %offset.idx, 20
%7 = or disjoint i64 %offset.idx, 24
%8 = or disjoint i64 %offset.idx, 28
%next.gep = getelementptr i8, ptr %array1.i, i64 %offset.idx
%next.gep1 = getelementptr i8, ptr %array1.i, i64 %2
%next.gep2 = getelementptr i8, ptr %array1.i, i64 %3
%next.gep3 = getelementptr i8, ptr %array1.i, i64 %4
%next.gep4 = getelementptr i8, ptr %array1.i, i64 %5
%next.gep5 = getelementptr i8, ptr %array1.i, i64 %6
%next.gep6 = getelementptr i8, ptr %array1.i, i64 %7
%next.gep7 = getelementptr i8, ptr %array1.i, i64 %8
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep1) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep2) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep3) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep4) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep5) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep6) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep7) ]
%9 = getelementptr inbounds nuw i8, ptr %next.gep, i64 4
%10 = getelementptr inbounds nuw i8, ptr %next.gep1, i64 4
%11 = getelementptr inbounds nuw i8, ptr %next.gep2, i64 4
%12 = getelementptr inbounds nuw i8, ptr %next.gep3, i64 4
%13 = getelementptr inbounds nuw i8, ptr %next.gep4, i64 4
%14 = getelementptr inbounds nuw i8, ptr %next.gep5, i64 4
%15 = getelementptr inbounds nuw i8, ptr %next.gep6, i64 4
%16 = getelementptr inbounds nuw i8, ptr %next.gep7, i64 4
call void @llvm.assume(i1 true) [ "nonnull"(ptr %9) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %10) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %11) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %12) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %13) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %14) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %15) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %16) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep1) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep2) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep3) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep4) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep5) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep6) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %next.gep7) ]
%17 = getelementptr i8, ptr %next.gep, i64 16
%wide.load = load <4 x i32>, ptr %next.gep, align 4, !noalias !12
%wide.load8 = load <4 x i32>, ptr %17, align 4, !noalias !12
%18 = mul <4 x i32> %wide.load, splat (i32 13)
%19 = mul <4 x i32> %wide.load8, splat (i32 13)
%20 = add <4 x i32> %18, splat (i32 7)
%21 = add <4 x i32> %19, splat (i32 7)
%22 = getelementptr inbounds nuw i32, ptr %array.i.i.i.i, i64 %index
%23 = getelementptr inbounds nuw i8, ptr %22, i64 16
store <4 x i32> %20, ptr %22, align 4
store <4 x i32> %21, ptr %23, align 4
%index.next = add nuw i64 %index, 8
%24 = icmp eq i64 %index.next, 512
br i1 %24, label %_ZN4core5array5drain16drain_array_with17hdab83ed713860683E.exit, label %vector.body, !llvm.loop !27
_ZN4core5array5drain16drain_array_with17hdab83ed713860683E.exit: ; preds = %vector.body
call void @llvm.assume(i1 true) [ "nonnull"(ptr %1) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %1) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
call void @llvm.lifetime.end.p0(i64 2048, ptr nonnull %array1.i), !noalias !7
call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(2048) %_0, ptr noundef nonnull align 4 dereferenceable(2048) %array.i.i.i.i, i64 2048, i1 false)
ret void
}
Trunk can clean that up a bit, but it's still full of unnecessary `assume`s: https://llvm.godbolt.org/z/b8oM1vxvT
At a minimum, this ought to be treated idempotently, since there's no need for
call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
call void @llvm.assume(i1 true) [ "nonnull"(ptr %0) ]
repeated in a row like that. But it'd also be nice to optimize out all the ones that came from `GEP nuw`, for example.
(Or if this is the wrong way to do this, that'd be good to know and reflect in the langref too.)