diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c index e1e2220f94d6d..fcd4314249ff8 100644 --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c @@ -62,10 +62,7 @@ fixed_bool_t from_svbool_t(svbool_t type) { // CHECK-LABEL: @lax_cast( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <16 x i32>, align 64 -// CHECK-NEXT: [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) -// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[SAVED_VALUE]], align 64, !tbaa [[TBAA6:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[SAVED_VALUE]], align 64, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[TYPE_COERCE:%.*]] to // CHECK-NEXT: ret [[TMP0]] // svint64_t lax_cast(fixed_int32_t type) { @@ -74,9 +71,9 @@ svint64_t lax_cast(fixed_int32_t type) { // CHECK-LABEL: @to_svint32_t__from_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA6]] -// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) -// CHECK-NEXT: ret [[CASTSCALABLESVE]] +// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE]] // svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) { return type; @@ -84,8 +81,8 @@ svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) { // CHECK-LABEL: @from_svint32_t__to_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE:%.*]], i64 0) -// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE:%.*]], i64 0) +// CHECK-NEXT: store <16 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]] // CHECK-NEXT: ret void // gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) { @@ -94,9 +91,9 @@ gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) { // CHECK-LABEL: @to_fixed_int32_t__from_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA6]] -// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) -// CHECK-NEXT: ret [[CASTSCALABLESVE]] +// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE]] // fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) { return type; @@ -105,7 +102,7 @@ fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) { // CHECK-LABEL: @from_fixed_int32_t__to_gnu_int32_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) -// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]] // CHECK-NEXT: ret void // gnu_int32_t 
from_fixed_int32_t__to_gnu_int32_t(fixed_int32_t type) { diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index c2510ea75544a..f24d03635731e 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -1053,6 +1053,10 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node { /// defined. void setAlignment(MaybeAlign Align) { GlobalObject::setAlignment(Align); } + /// Return the value for vscale based on the vscale_range attribute or 0 when + /// unknown. + unsigned getVScaleValue() const; + private: void allocHungoffUselist(); template void setHungoffOperand(Constant *C); diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 63665d837c398..493dec72d45af 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1165,6 +1165,18 @@ bool Function::nullPointerIsDefined() const { return hasFnAttribute(Attribute::NullPointerIsValid); } +unsigned Function::getVScaleValue() const { + Attribute Attr = getFnAttribute(Attribute::VScaleRange); + if (!Attr.isValid()) + return 0; + + unsigned VScale = Attr.getVScaleRangeMin(); + if (VScale && VScale == Attr.getVScaleRangeMax()) + return VScale; + + return 0; +} + bool llvm::NullPointerIsDefined(const Function *F, unsigned AS) { if (F && F->nullPointerIsDefined()) return true; diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index a4e373d395b90..42d1d9a437bb2 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1120,8 +1120,13 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { return PI.setEscapedReadOnly(&LI); TypeSize Size = DL.getTypeStoreSize(LI.getType()); - if (Size.isScalable()) - return PI.setAborted(&LI); + if (Size.isScalable()) { + unsigned VScale = LI.getFunction()->getVScaleValue(); + if (!VScale) + return PI.setAborted(&LI); + + Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale); + } return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(), LI.isVolatile()); @@ -1135,8 +1140,13 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { return PI.setAborted(&SI); TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType()); - if (StoreSize.isScalable()) - return PI.setAborted(&SI); + if (StoreSize.isScalable()) { + unsigned VScale = SI.getFunction()->getVScaleValue(); + if (!VScale) + return PI.setAborted(&SI); + + StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale); + } uint64_t Size = StoreSize.getFixedValue(); @@ -1927,7 +1937,8 @@ static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) { /// ensure that we only try to convert viable values. The strategy is that we /// will peel off single element struct and array wrappings to get to an /// underlying value, and convert that value. -static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { +static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy, + unsigned VScale = 0) { if (OldTy == NewTy) return true; @@ -1941,8 +1952,35 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { return false; } - if (DL.getTypeSizeInBits(NewTy).getFixedValue() != - DL.getTypeSizeInBits(OldTy).getFixedValue()) + TypeSize NewSize = DL.getTypeSizeInBits(NewTy); + TypeSize OldSize = DL.getTypeSizeInBits(OldTy); + + if ((isa(NewTy) && isa(OldTy)) || + (isa(OldTy) && isa(NewTy))) { + // Conversion is only possible when the size of scalable vectors is known. 
+ if (!VScale) + return false; + + // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within + // a single domain (either fixed or scalable). Any additional conversion + // between fixed and scalable types is handled through integer types. + auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy; + auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy; + + if (isa(NewTy)) { + if (!VectorType::getWithSizeAndScalar(cast(NewVTy), OldVTy)) + return false; + + NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale); + } else { + if (!VectorType::getWithSizeAndScalar(cast(OldVTy), NewVTy)) + return false; + + OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale); + } + } + + if (NewSize != OldSize) return false; if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) return false; @@ -1992,7 +2030,14 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, Type *NewTy) { Type *OldTy = V->getType(); - assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type"); + +#ifndef NDEBUG + BasicBlock *BB = IRB.GetInsertBlock(); + assert(BB && BB->getParent() && "VScale unknown!"); + unsigned VScale = BB->getParent()->getVScaleValue(); + assert(canConvertValue(DL, OldTy, NewTy, VScale) && + "Value not convertable to type"); +#endif if (OldTy == NewTy) return V; @@ -2000,13 +2045,41 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, assert(!(isa(OldTy) && isa(NewTy)) && "Integer types must be the exact same to convert."); + // A variant of bitcast that supports a mixture of fixed and scalable types + // that are know to have the same size. + auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * { + Type *InTy = In->getType(); + if (InTy == Ty) + return In; + + if (isa(InTy) && isa(Ty)) { + // For vscale_range(2) expand <4 x i32> to --> + // <4 x i32> to to + auto *VTy = VectorType::getWithSizeAndScalar(cast(Ty), InTy); + return IRB.CreateBitCast(IRB.CreateInsertVector(VTy, + PoisonValue::get(VTy), In, + IRB.getInt64(0)), + Ty); + } + + if (isa(InTy) && isa(Ty)) { + // For vscale_range(2) expand to <4 x i32> --> + // to to <4 x i32> + auto *VTy = VectorType::getWithSizeAndScalar(cast(InTy), Ty); + return IRB.CreateExtractVector(Ty, IRB.CreateBitCast(In, VTy), + IRB.getInt64(0)); + } + + return IRB.CreateBitCast(In, Ty); + }; + // See if we need inttoptr for this type pair. May require additional bitcast. 
if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) { // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*> // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*> // Directly handle i64 to i8* - return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), + return IRB.CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)), NewTy); } @@ -2016,7 +2089,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32> // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32> // Expand i8* to i64 --> i8* to i64 to i64 - return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + return CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), NewTy); } @@ -2031,12 +2104,14 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, // size. if (OldAS != NewAS) { assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS)); - return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), - NewTy); + return IRB.CreateIntToPtr( + CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + DL.getIntPtrType(NewTy)), + NewTy); } } - return IRB.CreateBitCast(V, NewTy); + return CreateBitCastLike(V, NewTy); } /// Test whether the given slice use can be promoted to a vector. @@ -2046,7 +2121,8 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, VectorType *Ty, uint64_t ElementSize, - const DataLayout &DL) { + const DataLayout &DL, + unsigned VScale) { // First validate the slice offsets. uint64_t BeginOffset = std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset(); @@ -2090,7 +2166,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, assert(LTy->isIntegerTy()); LTy = SplitIntTy; } - if (!canConvertValue(DL, SliceTy, LTy)) + if (!canConvertValue(DL, SliceTy, LTy, VScale)) return false; } else if (StoreInst *SI = dyn_cast(U->getUser())) { if (SI->isVolatile()) @@ -2103,7 +2179,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, assert(STy->isIntegerTy()); STy = SplitIntTy; } - if (!canConvertValue(DL, STy, SliceTy)) + if (!canConvertValue(DL, STy, SliceTy, VScale)) return false; } else { return false; @@ -2118,7 +2194,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, /// (and thus isVectorPromotionViable) over all slices of the alloca for the /// given VectorType. 
static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, - const DataLayout &DL) { + const DataLayout &DL, unsigned VScale) { uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue(); @@ -2131,11 +2207,11 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, ElementSize /= 8; for (const Slice &S : P) - if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL)) + if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale)) return false; for (const Slice *S : P.splitSliceTails()) - if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL)) + if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale)) return false; return true; @@ -2150,7 +2226,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, SmallVectorImpl &CandidateTys, bool HaveCommonEltTy, Type *CommonEltTy, bool HaveVecPtrTy, bool HaveCommonVecPtrTy, - VectorType *CommonVecPtrTy) { + VectorType *CommonVecPtrTy, unsigned VScale) { // If we didn't find a vector type, nothing to do here. if (CandidateTys.empty()) return nullptr; @@ -2226,7 +2302,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, }); for (VectorType *VTy : CandidateTys) - if (checkVectorTypeForPromotion(P, VTy, DL)) + if (checkVectorTypeForPromotion(P, VTy, DL, VScale)) return VTy; return nullptr; @@ -2237,7 +2313,7 @@ static VectorType *createAndCheckVectorTypesForPromotion( function_ref CheckCandidateType, Partition &P, const DataLayout &DL, SmallVectorImpl &CandidateTys, bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy, - bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy) { + bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) { [[maybe_unused]] VectorType *OriginalElt = CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr; // Consider additional vector types where the element type size is a @@ -2262,9 +2338,9 @@ static VectorType *createAndCheckVectorTypesForPromotion( } } - return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy, - CommonEltTy, HaveVecPtrTy, - HaveCommonVecPtrTy, CommonVecPtrTy); + return checkVectorTypesForPromotion( + P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, + HaveCommonVecPtrTy, CommonVecPtrTy, VScale); } /// Test whether the given alloca partitioning and range of slices can be @@ -2276,7 +2352,8 @@ static VectorType *createAndCheckVectorTypesForPromotion( /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. -static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { +static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL, + unsigned VScale) { // Collect the candidate types for vector-based promotion. Also track whether // we have different element types. SmallVector CandidateTys; @@ -2288,7 +2365,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { bool HaveCommonEltTy = true; bool HaveCommonVecPtrTy = true; auto CheckCandidateType = [&](Type *Ty) { - if (auto *VTy = dyn_cast(Ty)) { + if (auto *VTy = dyn_cast(Ty)) { // Return if bitcast to vectors is different for total size in bits. 
if (!CandidateTys.empty()) { VectorType *V = CandidateTys[0]; @@ -2343,14 +2420,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { if (auto *VTy = createAndCheckVectorTypesForPromotion( LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, - HaveCommonVecPtrTy, CommonVecPtrTy)) + HaveCommonVecPtrTy, CommonVecPtrTy, VScale)) return VTy; CandidateTys.clear(); return createAndCheckVectorTypesForPromotion( DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy, - CommonVecPtrTy); + CommonVecPtrTy, VScale); } /// Test whether a slice of an alloca is valid for integer widening. @@ -2387,7 +2464,8 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (LI->isVolatile()) return false; // We can't handle loads that extend past the allocated memory. - if (DL.getTypeStoreSize(LI->getType()).getFixedValue() > Size) + TypeSize LoadSize = DL.getTypeStoreSize(LI->getType()); + if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size) return false; // So far, AllocaSliceRewriter does not support widening split slice tails // in rewriteIntegerLoad. @@ -2412,7 +2490,8 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (SI->isVolatile()) return false; // We can't handle stores that extend past the allocated memory. - if (DL.getTypeStoreSize(ValueTy).getFixedValue() > Size) + TypeSize StoreSize = DL.getTypeStoreSize(ValueTy); + if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size) return false; // So far, AllocaSliceRewriter does not support widening split slice tails // in rewriteIntegerStore. @@ -2885,8 +2964,6 @@ class AllocaSliceRewriter : public InstVisitor { Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8) : LI.getType(); - const bool IsLoadPastEnd = - DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize; bool IsPtrAdjusted = false; Value *V; if (VecTy) { @@ -2896,8 +2973,9 @@ class AllocaSliceRewriter : public InstVisitor { } else if (NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset && (canConvertValue(DL, NewAllocaTy, TargetTy) || - (IsLoadPastEnd && NewAllocaTy->isIntegerTy() && - TargetTy->isIntegerTy() && !LI.isVolatile()))) { + (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() && + DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize && + !LI.isVolatile()))) { Value *NewPtr = getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile()); LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr, @@ -3070,7 +3148,8 @@ class AllocaSliceRewriter : public InstVisitor { if (AllocaInst *AI = dyn_cast(V->stripInBoundsOffsets())) Pass.PostPromotionWorklist.insert(AI); - if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedValue()) { + TypeSize StoreSize = DL.getTypeStoreSize(V->getType()); + if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) { assert(!SI.isVolatile()); assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); @@ -4846,14 +4925,18 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Type *SliceTy = nullptr; VectorType *SliceVecTy = nullptr; const DataLayout &DL = AI.getDataLayout(); + unsigned VScale = AI.getFunction()->getVScaleValue(); + std::pair CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()); // Do all uses operate on the same type? 
- if (CommonUseTy.first) - if (DL.getTypeAllocSize(CommonUseTy.first).getFixedValue() >= P.size()) { + if (CommonUseTy.first) { + TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first); + if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) { SliceTy = CommonUseTy.first; SliceVecTy = dyn_cast(SliceTy); } + } // If not, can we find an appropriate subtype in the original allocated type? if (!SliceTy) if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), @@ -4874,12 +4957,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // If the common use types are not viable for promotion then attempt to find // another type that is viable. - if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL)) + if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale)) if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) { VectorType *TypePartitionVecTy = dyn_cast(TypePartitionTy); if (TypePartitionVecTy && - checkVectorTypeForPromotion(P, TypePartitionVecTy, DL)) + checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale)) SliceTy = TypePartitionTy; } @@ -4890,7 +4973,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); VectorType *VecTy = - IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL); + IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale); if (VecTy) SliceTy = VecTy; diff --git a/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll b/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll new file mode 100644 index 0000000000000..85715e406e065 --- /dev/null +++ b/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll @@ -0,0 +1,349 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG +; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + +; This test checks that SROA runs mem2reg on scalable vectors. + +define @alloca_nxv16i1( %pg) vscale_range(1) { +; CHECK-LABEL: @alloca_nxv16i1( +; CHECK-NEXT: ret [[PG:%.*]] +; + %pg.addr = alloca + store %pg, ptr %pg.addr + %1 = load , ptr %pg.addr + ret %1 +} + +define @alloca_nxv16i8( %vec) vscale_range(1) { +; CHECK-LABEL: @alloca_nxv16i8( +; CHECK-NEXT: ret [[VEC:%.*]] +; + %vec.addr = alloca + store %vec, ptr %vec.addr + %1 = load , ptr %vec.addr + ret %1 +} + +; Test scalable alloca that can't be promoted. Mem2Reg only considers +; non-volatile loads and stores for promotion. +define @unpromotable_alloca( %vec) vscale_range(1) { +; CHECK-LABEL: @unpromotable_alloca( +; CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca , align 16 +; CHECK-NEXT: store volatile [[VEC:%.*]], ptr [[VEC_ADDR]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load volatile , ptr [[VEC_ADDR]], align 16 +; CHECK-NEXT: ret [[TMP1]] +; + %vec.addr = alloca + store volatile %vec, ptr %vec.addr + %1 = load volatile , ptr %vec.addr + ret %1 +} + +; Test we bail out when using an alloca of a fixed-length vector (VLS) that was +; bitcasted to a scalable vector. 
+define @cast_alloca_to_svint32_t( %type.coerce) vscale_range(1) { +; CHECK-LABEL: @cast_alloca_to_svint32_t( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) +; CHECK-NEXT: [[TYPE_0_VEC_EXPAND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TYPE_0_VECBLEND:%.*]] = select <16 x i1> , <16 x i32> [[TYPE_0_VEC_EXPAND]], <16 x i32> undef +; CHECK-NEXT: [[TYPE_ADDR_0_VEC_EXTRACT:%.*]] = shufflevector <16 x i32> [[TYPE_0_VECBLEND]], <16 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> [[TYPE_ADDR_0_VEC_EXTRACT]], i64 0) +; CHECK-NEXT: ret [[TMP2]] +; + %type = alloca <16 x i32> + %type.addr = alloca <16 x i32> + store %type.coerce, ptr %type + %type1 = load <16 x i32>, ptr %type + store <16 x i32> %type1, ptr %type.addr + %1 = load <16 x i32>, ptr %type.addr + %2 = load , ptr %type.addr + ret %2 +} + +; When casting from VLA to VLS via memory check we bail out when producing a +; GEP where the element type is a scalable vector. +define @cast_alloca_from_svint32_t() vscale_range(1) { +; CHECK-LABEL: @cast_alloca_from_svint32_t( +; CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[RETVAL_COERCE]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 16 +; CHECK-NEXT: ret [[TMP1]] +; + %retval = alloca <16 x i32> + store <16 x i32> zeroinitializer, ptr %retval + %retval.coerce = alloca + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval.coerce, ptr align 16 %retval, i64 64, i1 false) + %1 = load , ptr %retval.coerce + ret %1 +} + +; Test we bail out when using an alloca of a fixed-length vector (VLS) that was +; bitcasted to a scalable vector. 
+define void @select_load_alloca_to_svdouble_t() vscale_range(1) { +; CHECK-LABEL: @select_load_alloca_to_svdouble_t( +; CHECK-NEXT: [[Z:%.*]] = alloca <16 x half>, align 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 0, 0 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], ptr [[Z]], ptr null +; CHECK-NEXT: [[VAL:%.*]] = load , ptr [[COND]], align 16 +; CHECK-NEXT: ret void +; + %z = alloca <16 x half> + %cmp = icmp eq i32 0, 0 + %cond = select i1 %cmp, ptr %z, ptr null + %val = load , ptr %cond, align 16 + ret void +} + +define void @select_store_alloca_to_svdouble_t( %val) vscale_range(1) { +; CHECK-LABEL: @select_store_alloca_to_svdouble_t( +; CHECK-NEXT: [[Z:%.*]] = alloca <16 x half>, align 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 0, 0 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], ptr [[Z]], ptr null +; CHECK-NEXT: store [[VAL:%.*]], ptr [[COND]], align 16 +; CHECK-NEXT: ret void +; + %z = alloca <16 x half> + %cmp = icmp eq i32 0, 0 + %cond = select i1 %cmp, ptr %z, ptr null + store %val, ptr %cond, align 16 + ret void +} + +define <4 x i32> @fixed_alloca_fixed_from_scalable( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32( [[A:%.*]], i64 0) +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define <2 x i8> @fixed_alloca_fixed_from_scalable_requires_bitcast( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_requires_bitcast( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [[A:%.*]] to +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8( [[TMP1]], i64 0) +; CHECK-NEXT: ret <2 x i8> [[TMP2]] +; + %tmp = alloca <2 x i8> + store %a, ptr %tmp + %cast = load <2 x i8>, ptr %tmp + ret <2 x i8> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_inttoptr( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_inttoptr( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [[A:%.*]] to +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64( [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <2 x i64> [[TMP3]] to <2 x ptr> +; CHECK-NEXT: ret <2 x ptr> [[TMP2]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define <4 x i32> @fixed_alloca_fixed_from_scalable_ptrtoint( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoint( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint [[A:%.*]] to +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[TMP1]] to +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32( [[TMP2]], i64 0) +; CHECK-NEXT: ret <4 x i32> [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_ptrtoptr( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoptr( +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.nxv2p0( [[A:%.*]], i64 0) +; CHECK-NEXT: ret <2 x ptr> [[TMP_0_CAST]] +; + %tmp = alloca <2 x ptr> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_ptrtoptr_different_addrspace( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoptr_different_addrspace( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint [[A:%.*]] to +; CHECK-NEXT: 
[[TMP2:%.*]] = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64( [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x ptr> +; CHECK-NEXT: ret <2 x ptr> [[TMP3]] +; + %tmp = alloca <2 x ptr> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define @fixed_alloca_scalable_from_fixed(<4 x i32> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> [[A:%.*]], i64 0) +; CHECK-NEXT: ret [[TMP1]] +; + %tmp = alloca <4 x i32> + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_requires_bitcast(<2 x i8> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_requires_bitcast( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv2i8.v2i8( poison, <2 x i8> [[A:%.*]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[TMP1]] to +; CHECK-NEXT: ret [[TMP2]] +; + %tmp = alloca <2 x i8> + store <2 x i8> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_inttoptr(<4 x i32> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_inttoptr( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> [[A:%.*]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[TMP1]] to +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = inttoptr [[TMP2]] to +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoint(<2 x ptr> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoint( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <2 x ptr> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = bitcast [[TMP2]] to +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoptr(<2 x ptr> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoptr( +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = call @llvm.vector.insert.nxv2p0.v2p0( poison, <2 x ptr> [[A:%.*]], i64 0) +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <2 x ptr> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoptr_different_addrspace(<2 x ptr> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoptr_different_addrspace( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <2 x ptr> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr [[TMP2]] to +; CHECK-NEXT: ret [[TMP3]] +; + %tmp = alloca <2 x ptr> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define <4 x i32> @scalable_alloca_fixed_from_scalable( %a) vscale_range(1) { +; CHECK-LABEL: @scalable_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <4 x i32> [[CAST]] +; + %tmp = alloca + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define @scalable_alloca_scalable_from_fixed(<4 x i32> %a) 
vscale_range(1) { +; CHECK-LABEL: @scalable_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[CAST]] +; + %tmp = alloca + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define i16 @scalar_alloca_scalar_from_scalable( %a) vscale_range(1) { +; CHECK-LABEL: @scalar_alloca_scalar_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load i16, ptr [[TMP]], align 2 +; CHECK-NEXT: ret i16 [[TMP_0_CAST]] +; + %tmp = alloca i16 + store %a, ptr %tmp + %cast = load i16, ptr %tmp + ret i16 %cast +} + +define @scalar_alloca_scalable_from_scalar(i16 %a) vscale_range(1) { +; CHECK-LABEL: @scalar_alloca_scalable_from_scalar( +; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +; CHECK-NEXT: store i16 [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 2 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca i16 + store i16 %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define { <2 x i32>, <2 x i32> } @fixed_struct_alloca_fixed_from_scalable( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_struct_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x i32>, <2 x i32> }, align 8 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 8 +; CHECK-NEXT: [[TMP_0_CAST_FCA_0_LOAD:%.*]] = load <2 x i32>, ptr [[TMP]], align 8 +; CHECK-NEXT: [[CAST_FCA_0_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP_0_CAST_FCA_0_LOAD]], 0 +; CHECK-NEXT: [[TMP_8_CAST_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 8 +; CHECK-NEXT: [[TMP_8_CAST_FCA_1_LOAD:%.*]] = load <2 x i32>, ptr [[TMP_8_CAST_FCA_1_GEP_SROA_IDX]], align 8 +; CHECK-NEXT: [[CAST_FCA_1_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[CAST_FCA_0_INSERT]], <2 x i32> [[TMP_8_CAST_FCA_1_LOAD]], 1 +; CHECK-NEXT: ret { <2 x i32>, <2 x i32> } [[CAST_FCA_1_INSERT]] +; + %tmp = alloca { <2 x i32>, <2 x i32> } + store %a, ptr %tmp + %cast = load { <2 x i32>, <2 x i32> }, ptr %tmp + ret { <2 x i32>, <2 x i32> } %cast +} + +define @fixed_struct_alloca_scalable_from_fixed({ <2 x ptr>, <2 x ptr> } %a) vscale_range(1) { +; CHECK-LABEL: @fixed_struct_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x ptr>, <2 x ptr> }, align 16 +; CHECK-NEXT: [[A_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A:%.*]], 0 +; CHECK-NEXT: store <2 x ptr> [[A_FCA_0_EXTRACT]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[A_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A]], 1 +; CHECK-NEXT: [[TMP_16_A_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 16 +; CHECK-NEXT: store <2 x ptr> [[A_FCA_1_EXTRACT]], ptr [[TMP_16_A_FCA_1_GEP_SROA_IDX]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca { <2 x ptr>, <2 x ptr> } + store { <2 x ptr>, <2 x ptr> } %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-MODIFY-CFG: {{.*}} +; CHECK-PRESERVE-CFG: {{.*}} diff --git a/llvm/test/Transforms/SROA/scalable-vectors.ll b/llvm/test/Transforms/SROA/scalable-vectors.ll index d892883ce9dc3..346814d9f630e 100644 --- a/llvm/test/Transforms/SROA/scalable-vectors.ll +++ b/llvm/test/Transforms/SROA/scalable-vectors.ll @@ -2,6 +2,8 @@ ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + ; This test checks that SROA runs mem2reg on scalable vectors. define @alloca_nxv16i1( %pg) { @@ -67,11 +69,12 @@ define @cast_alloca_to_svint32_t( %type.coe define @cast_alloca_from_svint32_t() { ; CHECK-LABEL: @cast_alloca_from_svint32_t( ; CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 -; CHECK-NEXT: store <16 x i32> undef, ptr [[RETVAL_COERCE]], align 16 +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[RETVAL_COERCE]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 16 ; CHECK-NEXT: ret [[TMP1]] ; %retval = alloca <16 x i32> + store <16 x i32> zeroinitializer, ptr %retval %retval.coerce = alloca call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval.coerce, ptr align 16 %retval, i64 64, i1 false) %1 = load , ptr %retval.coerce @@ -110,6 +113,224 @@ define void @select_store_alloca_to_svdouble_t( %val) { ret void } +define <4 x i32> @fixed_alloca_fixed_from_scalable( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define <2 x i8> @fixed_alloca_fixed_from_scalable_requires_bitcast( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_requires_bitcast( +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x i8>, align 2 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP]], align 2 +; CHECK-NEXT: ret <2 x i8> [[TMP2]] +; + %tmp = alloca <2 x i8> + store %a, ptr %tmp + %cast = load <2 x i8>, ptr %tmp + ret <2 x i8> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_inttoptr( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_inttoptr( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x ptr>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <2 x ptr> [[TMP2]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define <4 x i32> @fixed_alloca_fixed_from_scalable_ptrtoint( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoint( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <4 x i32> [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_ptrtoptr( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoptr( +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x ptr>, 
align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load <2 x ptr>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <2 x ptr> [[CAST]] +; + %tmp = alloca <2 x ptr> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define @fixed_alloca_scalable_from_fixed(<4 x i32> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[TMP1]] +; + %tmp = alloca <4 x i32> + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_requires_bitcast(<2 x i8> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_requires_bitcast( +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x i8>, align 2 +; CHECK-NEXT: store <2 x i8> [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[TMP]], align 2 +; CHECK-NEXT: ret [[TMP2]] +; + %tmp = alloca <2 x i8> + store <2 x i8> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_inttoptr(<4 x i32> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_inttoptr( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoint(<2 x ptr> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoint( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store <2 x ptr> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoptr(<2 x ptr> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoptr( +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x ptr>, align 16 +; CHECK-NEXT: store <2 x ptr> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[CAST]] +; + %tmp = alloca <2 x ptr> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define <4 x i32> @scalable_alloca_fixed_from_scalable( %a) { +; CHECK-LABEL: @scalable_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <4 x i32> [[CAST]] +; + %tmp = alloca + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define @scalable_alloca_scalable_from_fixed(<4 x i32> %a) { +; CHECK-LABEL: @scalable_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[CAST]] +; + %tmp = alloca + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define i16 @scalar_alloca_scalar_from_scalable( %a) { +; CHECK-LABEL: @scalar_alloca_scalar_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] 
= load i16, ptr [[TMP]], align 2 +; CHECK-NEXT: ret i16 [[TMP_0_CAST]] +; + %tmp = alloca i16 + store %a, ptr %tmp + %cast = load i16, ptr %tmp + ret i16 %cast +} + +define @scalar_alloca_scalable_from_scalar(i16 %a) { +; CHECK-LABEL: @scalar_alloca_scalable_from_scalar( +; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +; CHECK-NEXT: store i16 [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 2 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca i16 + store i16 %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define { <2 x i32>, <2 x i32> } @fixed_struct_alloca_fixed_from_scalable( %a) { +; CHECK-LABEL: @fixed_struct_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x i32>, <2 x i32> }, align 8 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST_FCA_0_GEP:%.*]] = getelementptr inbounds { <2 x i32>, <2 x i32> }, ptr [[TMP]], i32 0, i32 0 +; CHECK-NEXT: [[TMP_0_CAST_FCA_0_LOAD:%.*]] = load <2 x i32>, ptr [[CAST_FCA_0_GEP]], align 8 +; CHECK-NEXT: [[CAST_FCA_0_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP_0_CAST_FCA_0_LOAD]], 0 +; CHECK-NEXT: [[TMP_8_CAST_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds { <2 x i32>, <2 x i32> }, ptr [[TMP]], i32 0, i32 1 +; CHECK-NEXT: [[TMP_8_CAST_FCA_1_LOAD:%.*]] = load <2 x i32>, ptr [[TMP_8_CAST_FCA_1_GEP_SROA_IDX]], align 8 +; CHECK-NEXT: [[CAST_FCA_1_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[CAST_FCA_0_INSERT]], <2 x i32> [[TMP_8_CAST_FCA_1_LOAD]], 1 +; CHECK-NEXT: ret { <2 x i32>, <2 x i32> } [[CAST_FCA_1_INSERT]] +; + %tmp = alloca { <2 x i32>, <2 x i32> } + store %a, ptr %tmp + %cast = load { <2 x i32>, <2 x i32> }, ptr %tmp + ret { <2 x i32>, <2 x i32> } %cast +} + +define @fixed_struct_alloca_scalable_from_fixed({ <2 x ptr>, <2 x ptr> } %a) { +; CHECK-LABEL: @fixed_struct_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x ptr>, <2 x ptr> }, align 16 +; CHECK-NEXT: [[A_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A:%.*]], 0 +; CHECK-NEXT: [[A_FCA_0_GEP:%.*]] = getelementptr inbounds { <2 x ptr>, <2 x ptr> }, ptr [[TMP]], i32 0, i32 0 +; CHECK-NEXT: store <2 x ptr> [[A_FCA_0_EXTRACT]], ptr [[A_FCA_0_GEP]], align 16 +; CHECK-NEXT: [[A_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A]], 1 +; CHECK-NEXT: [[TMP_16_A_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds { <2 x ptr>, <2 x ptr> }, ptr [[TMP]], i32 0, i32 1 +; CHECK-NEXT: store <2 x ptr> [[A_FCA_1_EXTRACT]], ptr [[TMP_16_A_FCA_1_GEP_SROA_IDX]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 32 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca { <2 x ptr>, <2 x ptr> } + store { <2 x ptr>, <2 x ptr> } %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-MODIFY-CFG: {{.*}}
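
The new `Function::getVScaleValue()` helper introduced above only returns a non-zero value when the `vscale_range` attribute pins vscale to a single value (min == max), and every SROA change in this patch keys off that. As a reading aid, here is a minimal sketch of the pattern the patch applies in `SliceBuilder::visitLoadInst`/`visitStoreInst`; the helper name `resolveStoreSize` and the `std::optional` return are illustrative only, not part of the patch:

```cpp
#include <cstdint>
#include <optional>
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Resolve a possibly scalable store size to a fixed byte count, mirroring the
// SliceBuilder changes above: give up (nullopt) unless vscale_range(N, N)
// pins vscale, otherwise scale the known-minimum size by vscale.
static std::optional<uint64_t> resolveStoreSize(const DataLayout &DL,
                                                const StoreInst &SI) {
  TypeSize Size = DL.getTypeStoreSize(SI.getValueOperand()->getType());
  if (!Size.isScalable())
    return Size.getFixedValue();
  unsigned VScale = SI.getFunction()->getVScaleValue(); // 0 when unknown
  if (!VScale)
    return std::nullopt;
  return Size.getKnownMinValue() * VScale;
}
```

The same scaling is applied to load sizes, and `canConvertValue` uses the identical vscale value when comparing the bit-widths of fixed and scalable types.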
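
The `CreateBitCastLike` lambda added to `convertValue` is where the mixed fixed/scalable "bitcasts" are synthesized. When the element types already match (as in the `fixed_alloca_fixed_from_scalable` tests), it degenerates to a single `llvm.vector.insert` or `llvm.vector.extract` at index 0. A stand-alone sketch of that degenerate case, assuming `vscale_range(1,1)` so `<4 x i32>` and `<vscale x 4 x i32>` have equal store sizes; the function names are illustrative and the IRBuilder calls are the same ones the patch uses:

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// <4 x i32> -> <vscale x 4 x i32>: insert the fixed value at offset 0 of a
// poison scalable vector (lowers to @llvm.vector.insert.nxv4i32.v4i32).
static Value *fixedToScalable(IRBuilder<> &IRB, Value *Fixed) {
  auto *ScalableTy = ScalableVectorType::get(IRB.getInt32Ty(), 4);
  return IRB.CreateInsertVector(ScalableTy, PoisonValue::get(ScalableTy),
                                Fixed, IRB.getInt64(0));
}

// <vscale x 4 x i32> -> <4 x i32>: extract the low fixed-length part
// (lowers to @llvm.vector.extract.v4i32.nxv4i32).
static Value *scalableToFixed(IRBuilder<> &IRB, Value *Scalable) {
  auto *FixedTy = FixedVectorType::get(IRB.getInt32Ty(), 4);
  return IRB.CreateExtractVector(FixedTy, Scalable, IRB.getInt64(0));
}
```

When the element types differ (for example the `<2 x i8>` cases above), the lambda additionally bitcasts through an intermediate scalable vector of matching total size, which is what the `VectorType::getWithSizeAndScalar` calls in the patch compute.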