diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 9c8cd53d56b56..64c57c6d368b8 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1367,8 +1367,9 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, return true; } -/// Determine whether the instruction has undefined content for the given Size, -/// either because it was freshly alloca'd or started its lifetime. +/// Determine whether the pointer V had only undefined content (due to Def) up +/// to the given Size, either because it was freshly alloca'd or started its +/// lifetime. static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V, MemoryDef *Def, Value *Size) { if (MSSA->isLiveOnEntryDef(Def)) return true; @@ -1403,6 +1404,24 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V, return false; } +// If the memcpy is larger than the previous, but the memory was undef prior to +// that, we can just ignore the tail. Technically we're only interested in the +// bytes from 0..MemSrcOffset and MemSrcLength+MemSrcOffset..CopySize here, but +// as we can't easily represent this location (hasUndefContents uses mustAlias +// which cannot deal with offsets), we use the full 0..CopySize range. +static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy, + MemIntrinsic *MemSrc, BatchAAResults &BAA) { + Value *CopySize = MemCpy->getLength(); + MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy); + MemoryUseOrDef *MemSrcAccess = MSSA->getMemoryAccess(MemSrc); + MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( + MemSrcAccess->getDefiningAccess(), MemCpyLoc, BAA); + if (auto *MD = dyn_cast<MemoryDef>(Clobber)) + if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize)) + return true; + return false; +} + /// Transform memcpy to memset when its source was just memset. 
/// In other words, turn: /// \code @@ -1418,19 +1437,25 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V, bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet, BatchAAResults &BAA) { - // Make sure that memcpy(..., memset(...), ...), that is we are memsetting and - // memcpying from the same address. Otherwise it is hard to reason about. - if (!BAA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource())) - return false; - Value *MemSetSize = MemSet->getLength(); Value *CopySize = MemCpy->getLength(); - if (MemSetSize != CopySize) { - // Make sure the memcpy doesn't read any more than what the memset wrote. - // Don't worry about sizes larger than i64. + int64_t MOffset = 0; + const DataLayout &DL = MemCpy->getModule()->getDataLayout(); + // We can only transforms memcpy's where the dest of one is the source of the + // other, or the memory transfer has a known offset from the memset. + if (MemCpy->getSource() != MemSet->getDest()) { + std::optional<int64_t> Offset = + MemCpy->getSource()->getPointerOffsetFrom(MemSet->getDest(), DL); + if (!Offset || *Offset < 0) + return false; + MOffset = *Offset; + } - // A known memset size is required. + if (MOffset != 0 || MemSetSize != CopySize) { + // Make sure the memcpy doesn't read any more than what the memset wrote, + // other than undef. Don't worry about sizes larger than i64. A known memset + // size is required. auto *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize); if (!CMemSetSize) return false; @@ -1439,23 +1464,18 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, auto *CCopySize = dyn_cast<ConstantInt>(CopySize); if (!CCopySize) return false; - if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) { - // If the memcpy is larger than the memset, but the memory was undef prior - // to the memset, we can just ignore the tail. 
Technically we're only - // interested in the bytes from MemSetSize..CopySize here, but as we can't - // easily represent this location, we use the full 0..CopySize range. - MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy); - bool CanReduceSize = false; - MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet); - MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( - MemSetAccess->getDefiningAccess(), MemCpyLoc, BAA); - if (auto *MD = dyn_cast<MemoryDef>(Clobber)) - if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize)) - CanReduceSize = true; - - if (!CanReduceSize) + if (CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) { + if (!overreadUndefContents(MSSA, MemCpy, MemSet, BAA)) return false; - CopySize = MemSetSize; + // Clip the memcpy to the bounds of the memset + if (MOffset == 0) + CopySize = MemSetSize; + else + CopySize = + ConstantInt::get(CopySize->getType(), + CMemSetSize->getZExtValue() <= (uint64_t)MOffset + ? 0 + : CMemSetSize->getZExtValue() - MOffset); } } diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll index 1c3896407e950..0c16f34590fc7 100644 --- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll +++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll @@ -187,6 +187,53 @@ define void @test_write_before_memset_in_both_regions(ptr %result) { ret void } +define void @test_negative_offset_memset(ptr %result) { +; CHECK-LABEL: @test_negative_offset_memset( +; CHECK-NEXT: [[A1:%.*]] = alloca [16 x i8], align 8 +; CHECK-NEXT: [[A:%.*]] = getelementptr i8, ptr [[A1]], i32 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[RESULT:%.*]], ptr align 8 [[A1]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + %a = alloca [ 16 x i8 ], align 8 + %b = getelementptr i8, ptr %a, i32 4 + call void @llvm.memset.p0.i64(ptr 
align 8 %b, i8 0, i64 12, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 12, i1 false) + ret void +} + +define void @test_offset_memsetcpy(ptr %result) { +; CHECK-LABEL: @test_offset_memsetcpy( +; CHECK-NEXT: [[A1:%.*]] = alloca [16 x i8], align 8 +; CHECK-NEXT: [[A:%.*]] = getelementptr i8, ptr [[A1]], i32 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[A1]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[RESULT:%.*]], i8 0, i64 8, i1 false) +; CHECK-NEXT: ret void +; + %a = alloca [ 16 x i8 ], align 8 + %b = getelementptr i8, ptr %a, i32 4 + call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %b, i64 12, i1 false) + ret void +} + +define void @test_two_memset(ptr %result) { +; CHECK-LABEL: @test_two_memset( +; CHECK-NEXT: [[A:%.*]] = alloca [16 x i8], align 8 +; CHECK-NEXT: [[B:%.*]] = getelementptr i8, ptr [[A]], i32 12 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[B]], i8 1, i64 4, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[RESULT:%.*]], ptr align 8 [[A]], i64 16, i1 false) +; CHECK-NEXT: ret void +; + %a = alloca [ 16 x i8 ], align 8 + %b = getelementptr i8, ptr %a, i32 12 + call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false) + call void @llvm.memset.p0.i64(ptr align 8 %b, i8 1, i64 4, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 16, i1 false) + ret void +} + declare ptr @malloc(i64) declare void @free(ptr) diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll index 47474e8dac051..18488f03a2d88 100644 --- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll +++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll @@ -73,11 +73,10 @@ define 
void @test_different_source_gep(ptr %dst1, ptr %dst2, i8 %c) { ; CHECK-LABEL: @test_different_source_gep( ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false) ; CHECK-NEXT: [[P:%.*]] = getelementptr i8, ptr [[DST1]], i64 64 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST2:%.*]], ptr [[P]], i64 64, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[DST2:%.*]], i8 [[C]], i64 64, i1 false) ; CHECK-NEXT: ret void ; call void @llvm.memset.p0.i64(ptr %dst1, i8 %c, i64 128, i1 false) - ; FIXME: We could optimize this as well. %p = getelementptr i8, ptr %dst1, i64 64 call void @llvm.memcpy.p0.p0.i64(ptr %dst2, ptr %p, i64 64, i1 false) ret void diff --git a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll index 5e13432746bf7..0e312bc42d463 100644 --- a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll +++ b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll @@ -19,7 +19,7 @@ define i32 @foo(i1 %z) { ; CHECK: for.body3.lr.ph: ; CHECK-NEXT: br label [[FOR_INC7_1]] ; CHECK: for.inc7.1: -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A]], ptr align 4 [[SCEVGEP]], i64 4, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[A]], i8 0, i64 4, i1 false) ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: ret i32 [[TMP2]] ; diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll index a834d2465dfa5..d5b1ab9b2f299 100644 --- a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll @@ -18,7 +18,7 @@ define void @test(ptr %src, i8 %c, i64 %size) { ret void } -; Differing sizes, so left as it is. 
+; Differing sizes, but would be UB if size1 < size2 since the memcpy would reference outside of the first alloca define void @negative_test(ptr %src, i8 %c, i64 %size1, i64 %size2) { ; CHECK-LABEL: @negative_test( ; CHECK-NEXT: [[DST1:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1