diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 1c4ec6aa08b43..2b0e221f341e0 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -107,6 +107,9 @@ struct MemsetRange {
 
 } // end anonymous namespace
 
+static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy,
+                                  MemIntrinsic *MemSrc, BatchAAResults &BAA);
+
 bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
   // If we found more than 4 stores to merge or 16 bytes, use memset.
   if (TheStores.size() >= 4 || End - Start >= 16)
@@ -1129,14 +1132,29 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
     MForwardOffset = *Offset;
   }
 
-  // The length of the memcpy's must be the same, or the preceding one
-  // must be larger than the following one.
-  if (MForwardOffset != 0 || MDep->getLength() != M->getLength()) {
+  Value *CopyLength = M->getLength();
+
+  // The lengths of the memcpys must be the same, or the preceding one must be
+  // larger than the following one, or the contents of the overread must be
+  // undef bytes of a statically known size.
+  if (MForwardOffset != 0 || MDep->getLength() != CopyLength) {
     auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
-    auto *MLen = dyn_cast<ConstantInt>(M->getLength());
-    if (!MDepLen || !MLen ||
-        MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset)
+    auto *MLen = dyn_cast<ConstantInt>(CopyLength);
+    // This could be converted to a runtime test (%CopyLength =
+    // min(max(0, MDepLen - MForwardOffset), MLen)), but it is unclear
+    // whether that would be useful.
+    if (!MDepLen || !MLen)
       return false;
+    if (MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset) {
+      if (!overreadUndefContents(MSSA, M, MDep, BAA))
+        return false;
+      if (MDepLen->getZExtValue() <= (uint64_t)MForwardOffset)
+        return false; // Should not reach here (M would obviously not alias
+                      // MDep at all), so just bail in case the analysis had
+                      // incomplete info somehow.
+      CopyLength = ConstantInt::get(CopyLength->getType(),
+                                    MDepLen->getZExtValue() - MForwardOffset);
+    }
   }
 
   IRBuilder<> Builder(M);
@@ -1152,9 +1170,13 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
       eraseInstruction(NewCopySource);
   });
   MaybeAlign CopySourceAlign = MDep->getSourceAlign();
-  // We just need to calculate the actual size of the copy.
-  auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize(
-      MemoryLocation::getForSource(M).Size);
+  auto MCopyLoc = MemoryLocation::getForSource(MDep);
+  // Truncate the size of the MDep access to just the bytes actually read.
+  if (MDep->getLength() != CopyLength) {
+    auto *ConstLength = cast<ConstantInt>(CopyLength);
+    MCopyLoc = MCopyLoc.getWithNewSize(
+        LocationSize::precise(ConstLength->getZExtValue()));
+  }
   // When the forwarding offset is greater than 0, we transform
   //    memcpy(d1 <- s1)
   //    memcpy(d2 <- d1+o)
@@ -1223,20 +1245,18 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
   // example we could be moving from movaps -> movq on x86.
   Instruction *NewM;
   if (UseMemMove)
-    NewM =
-        Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource,
-                              CopySourceAlign, M->getLength(), M->isVolatile());
+    NewM = Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource,
+                                 CopySourceAlign, CopyLength, M->isVolatile());
   else if (M->isForceInlined())
     // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
     // never allowed since that would allow the latter to be lowered as a call
     // to an external function.
     NewM = Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(),
-                                      CopySource, CopySourceAlign,
-                                      M->getLength(), M->isVolatile());
+                                      CopySource, CopySourceAlign, CopyLength,
+                                      M->isVolatile());
   else
     NewM = Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), CopySource,
-                                CopySourceAlign, M->getLength(),
-                                M->isVolatile());
+                                CopySourceAlign, CopyLength, M->isVolatile());
 
   NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID);
 
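As an illustration (not part of the patch), here is a minimal IR sketch of the new case handled above, with hypothetical value names; it mirrors the dynalloca_test added below. The second memcpy overreads one byte past what its dependency wrote, but the overread byte is undef alloca contents, so the copy can be forwarded with a shrunk length:

  ; Before: %tmp has 31 defined bytes; the second copy reads 32.
  %tmp = alloca i8, i64 %n
  %dst = alloca i8, i64 %n
  call void @llvm.memcpy.p0.p0.i64(ptr %tmp, ptr %src, i64 31, i1 false) ; MDep
  call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %tmp, i64 32, i1 false) ; M

  ; After: overreadUndefContents proves byte 32 is undef, so CopyLength
  ; becomes MDepLen - MForwardOffset = 31 - 0 = 31 and M reads from %src
  ; directly; the trailing undef byte of %dst is simply left unwritten.
  call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 31, i1 false)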
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
index da654438d7bd6..7dc579aad02f2 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll
@@ -134,14 +134,15 @@ define void @forward_offset_memcpy_inline(ptr %src, ptr %dest) {
   ret void
 }
 
-; We cannot forward `memcpy` because it exceeds the size of `memcpy` it depends on.
-define void @do_not_forward_oversize_offset(ptr %src, ptr %dest) {
-; CHECK-LABEL: define void @do_not_forward_oversize_offset(
+; We can forward `memcpy` by shrinking it to the size of the `memcpy` it depends on.
+define void @forward_oversize_offset(ptr %src, ptr %dest) {
+; CHECK-LABEL: define void @forward_oversize_offset(
 ; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) {
-; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 6, i1 false)
-; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP_OFFSET]], i64 6, i1 false)
+; CHECK-NEXT: [[CPY_TMP:%.*]] = alloca [9 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[CPY_TMP]], ptr align 1 [[SRC]], i64 6, i1 false)
+; CHECK-NEXT: [[CPY_TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[CPY_TMP]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 5, i1 false)
 ; CHECK-NEXT: ret void
 ;
   %cpy_tmp = alloca %buf, align 1
@@ -214,6 +215,24 @@ define void @pr98675(ptr noalias %p1, ptr noalias %p2) {
   ret void
 }
 
+define void @over_offset_cpy(ptr %src) {
+; CHECK-LABEL: define void @over_offset_cpy(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[TMP:%.*]] = alloca [2 x i8], align 1
+; CHECK-NEXT: [[DST:%.*]] = alloca i8, align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP]], ptr align 8 [[SRC]], i64 1, i1 false)
+; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1
+; CHECK-NEXT: ret void
+;
+  %tmp = alloca [2 x i8]
+  %dst = alloca i8
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false)
+  %tmp_offset = getelementptr inbounds i8, ptr %tmp, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp_offset, i64 1, i1 false)
+
+  ret void
+}
+
 declare void @use(ptr)
 
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1)
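A worked instance of the length arithmetic with a nonzero forwarding offset, from forward_oversize_offset above: MDepLen = 6, MLen = 6, MForwardOffset = 1. Since MDepLen < MLen + MForwardOffset, the overread path applies and the forwarded copy is emitted with CopyLength = MDepLen - MForwardOffset = 6 - 1 = 5 bytes starting at %src + 1, which is exactly the memmove the updated CHECK lines expect (a memmove because %dest may alias the forwarded source %src).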
diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll
index 4f6b734ec057d..95402a8ea686c 100644
--- a/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memcpy-memcpy.ll
@@ -18,7 +18,42 @@ define void @test(ptr %src, i64 %size) {
   ret void
 }
 
-; Differing sizes, so left as it is.
+define void @dynalloca_test(ptr %src, i64 %size1) {
+; CHECK-LABEL: @dynalloca_test(
+; CHECK-NEXT: [[TMP:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1
+; CHECK-NEXT: [[DST:%.*]] = alloca i8, i64 [[SIZE1]], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP]], ptr align 8 [[SRC:%.*]], i64 31, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST]], ptr align 8 [[SRC]], i64 31, i1 false)
+; CHECK-NEXT: ret void
+;
+  %tmp = alloca i8, i64 %size1
+  %dst = alloca i8, i64 %size1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 31, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 32, i1 false)
+
+  ret void
+}
+
+define void @dynalloca_offset_test(ptr %src, i64 %size1) {
+; CHECK-LABEL: @dynalloca_offset_test(
+; CHECK-NEXT: [[TMP:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1
+; CHECK-NEXT: [[DST:%.*]] = alloca i8, i64 [[SIZE1]], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP]], ptr align 8 [[SRC:%.*]], i64 31, i1 false)
+; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST]], ptr align 1 [[TMP1]], i64 30, i1 false)
+; CHECK-NEXT: ret void
+;
+  %tmp = alloca i8, i64 %size1
+  %dst = alloca i8, i64 %size1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 31, i1 false)
+  %tmp_offset = getelementptr inbounds i8, ptr %tmp, i64 1
+  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp_offset, i64 31, i1 false)
+
+  ret void
+}
+
+; Dynamic sizes, so left as it is.
 define void @negative_test(ptr %src, i64 %size1, i64 %size2) {
 ; CHECK-LABEL: @negative_test(
 ; CHECK-NEXT: [[TMP:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1
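For completeness, the over_offset_cpy test added earlier exercises the new defensive bail-out: there MDepLen = 1 and MForwardOffset = 1, so MDepLen <= MForwardOffset and forwarding is refused. The second memcpy then reads only bytes its dependency never wrote, i.e. wholly undef memory, which is presumably why the CHECK lines show it deleted outright rather than forwarded.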