diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 77537df1ae053..a710261941aaa 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -18,6 +18,7 @@
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
@@ -3572,6 +3573,45 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
     Cost *= 4;
     return Cost;
   } else {
+    // If the information about the individual scalars being vectorized is
+    // available, this yields a better cost estimate.
+    if (auto *VTy = dyn_cast<FixedVectorType>(Ty); VTy && !Args.empty()) {
+      assert(Args.size() % 2 == 0 && "Args size should be even");
+      InstructionCost InsertExtractCost =
+          ST->getVectorInsertExtractBaseCost();
+      // Here the cost of a single sdiv is queried through the cost model.
+      // FIXME: remove the isa checks once PR 122236 lands.
+      if (Args.size() == 2 &&
+          !(isa<ConstantVector>(Args[1]) ||
+            isa<ConstantDataVector>(Args[1]) ||
+            isa<ConstantExpr>(Args[1])) &&
+          none_of(Args, IsaPred<UndefValue, PoisonValue>)) {
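+        // Scalarized division, per lane: extract the two scalar operands,
+        // divide, and insert the result back, i.e. one scalar divide plus
+        // three insert/extract operations per element.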
+        unsigned NElts = VTy->getNumElements();
+        // Compute the per-element cost.
+        Cost = getArithmeticInstrCost(Opcode, VTy->getScalarType(),
+                                      CostKind, Op1Info.getNoProps(),
+                                      Op2Info.getNoProps());
+        Cost += 3 * InsertExtractCost;
+        Cost *= NElts;
+        return Cost;
+      } else if (Args.size() > 2) // vectorization cost is queried
+      {
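+        // Charge the insert/extract overhead for all lanes up front, then
+        // add each lane's scalar cost based on its actual operand pair.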
+        Cost = (3 * InsertExtractCost) * VTy->getNumElements();
+        for (int i = 0, Sz = Args.size(); i < Sz; i += 2) {
+          Cost +=
+              getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind,
+                                     TTI::getOperandInfo(Args[i]),
+                                     TTI::getOperandInfo(Args[i + 1]));
+        }
+        return Cost;
+      }
+    }
+
     // If one of the operands is a uniform constant then the cost for each
     // element is Cost for insertion, extraction and division.
     // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fc6bba6d2b8b3..082cd501ab336 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11650,9 +11650,22 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
       TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
       TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
-      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
-                                         Op2Info, {}, nullptr, TLI) +
-             CommonCost;
+      SmallVector<Value *> Operands;
+      if (all_of(E->Scalars, [ShuffleOrOp](Value *V) {
+            return !IsaPred<UndefValue, PoisonValue>(V) &&
+                   cast<Instruction>(V)->getOpcode() == ShuffleOrOp;
+          })) {
+        for (auto *Scalar : E->Scalars) {
+          Instruction *I = cast<Instruction>(Scalar);
+          auto IOperands = I->operand_values();
+          Operands.insert(Operands.end(), IOperands.begin(), IOperands.end());
+        }
+      }
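+      // Operands now holds each scalar's operand pair back-to-back, so the
+      // target cost model can price every vectorized lane individually.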
+      return CommonCost +
+             TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
+                                         Op2Info, Operands, nullptr, TLI);
     };
     return GetCostDiff(GetScalarCost, GetVectorCost);
   }
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
index 29bd81998cdb2..412cfb5e569f3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
@@ -553,35 +553,13 @@ define <4 x i32> @slp_v4i32_Op1_unknown_Op2_const_pow2(<4 x i32> %a)
 }
 
 define <2 x i32> @sdiv_v2i32_unknown_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
-; NO-SVE-LABEL: define <2 x i32> @sdiv_v2i32_unknown_divisor(
-; NO-SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; NO-SVE-NEXT:    [[A0:%.*]] = extractelement <2 x i32> [[A]], i64 0
-; NO-SVE-NEXT:    [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1
-; NO-SVE-NEXT:    [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0
-; NO-SVE-NEXT:    [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1
-; NO-SVE-NEXT:    [[TMP1:%.*]] = sdiv i32 [[A0]], [[X0]]
-; NO-SVE-NEXT:    [[TMP2:%.*]] = sdiv i32 [[A1]], [[X1]]
-; NO-SVE-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]]
-; NO-SVE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]]
-; NO-SVE-NEXT:    [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0
-; NO-SVE-NEXT:    [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1
-; NO-SVE-NEXT:    [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]]
-; NO-SVE-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]]
-; NO-SVE-NEXT:    [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0
-; NO-SVE-NEXT:    [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1
-; NO-SVE-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP5]], [[Z0]]
-; NO-SVE-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]]
-; NO-SVE-NEXT:    [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; NO-SVE-NEXT:    [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1
-; NO-SVE-NEXT:    ret <2 x i32> [[RES1]]
-;
-; SVE-LABEL: define <2 x i32> @sdiv_v2i32_unknown_divisor(
-; SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; SVE-NEXT:    [[TMP2:%.*]] = sdiv <2 x i32> [[A]], [[X]]
-; SVE-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[X]]
-; SVE-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], [[Y]]
-; SVE-NEXT:    [[TMP5:%.*]] = mul <2 x i32> [[TMP4]], [[Z]]
-; SVE-NEXT:    ret <2 x i32> [[TMP5]]
+; CHECK-LABEL: define <2 x i32> @sdiv_v2i32_unknown_divisor(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <2 x i32> [[A]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]]
+; CHECK-NEXT:    ret <2 x i32> [[TMP4]]
 ;
 {
   %a0 = extractelement <2 x i32> %a, i64 0
@@ -607,35 +585,13 @@ define <2 x i32> @sdiv_v2i32_unknown_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i3
 ; computes (a/const + x - y) * z
 
 define <2 x i32> @sdiv_v2i32_const_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
-; NO-SVE-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor(
-; NO-SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; NO-SVE-NEXT:    [[A0:%.*]] = extractelement <2 x i32> [[A]], i64 0
-; NO-SVE-NEXT:    [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1
-; NO-SVE-NEXT:    [[TMP1:%.*]] = sdiv i32 [[A0]], 2
-; NO-SVE-NEXT:    [[TMP2:%.*]] = sdiv i32 [[A1]], 4
-; NO-SVE-NEXT:    [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0
-; NO-SVE-NEXT:    [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1
-; NO-SVE-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]]
-; NO-SVE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]]
-; NO-SVE-NEXT:    [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0
-; NO-SVE-NEXT:    [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1
-; NO-SVE-NEXT:    [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]]
-; NO-SVE-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]]
-; NO-SVE-NEXT:    [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0
-; NO-SVE-NEXT:    [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1
-; NO-SVE-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP5]], [[Z0]]
-; NO-SVE-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]]
-; NO-SVE-NEXT:    [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; NO-SVE-NEXT:    [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1
-; NO-SVE-NEXT:    ret <2 x i32> [[RES1]]
-;
-; SVE-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor(
-; SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; SVE-NEXT:    [[TMP1:%.*]] = sdiv <2 x i32> [[A]], <i32 2, i32 4>
-; SVE-NEXT:    [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]]
-; SVE-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]]
-; SVE-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]]
-; SVE-NEXT:    ret <2 x i32> [[TMP4]]
+; CHECK-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = sdiv <2 x i32> [[A]], <i32 2, i32 4>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]]
+; CHECK-NEXT:    ret <2 x i32> [[TMP4]]
 ;
 {
   %a0 = extractelement <2 x i32> %a, i64 0
@@ -660,36 +616,14 @@ define <2 x i32> @sdiv_v2i32_const_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i32>
 }
 
 define <2 x i32> @sdiv_v2i32_Op1_unknown_Op2_const(<2 x i32> %a, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
-; NO-SVE-LABEL: define <2 x i32> @sdiv_v2i32_Op1_unknown_Op2_const(
-; NO-SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; NO-SVE-NEXT:    [[A0:%.*]] = extractelement <2 x i32> [[A]], i64 0
-; NO-SVE-NEXT:    [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1
-; NO-SVE-NEXT:    [[TMP1:%.*]] = sdiv i32 [[A0]], [[A0]]
-; NO-SVE-NEXT:    [[TMP2:%.*]] = sdiv i32 [[A1]], 4
-; NO-SVE-NEXT:    [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0
-; NO-SVE-NEXT:    [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1
-; NO-SVE-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]]
-; NO-SVE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]]
-; NO-SVE-NEXT:    [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0
-; NO-SVE-NEXT:    [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1
-; NO-SVE-NEXT:    [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]]
-; NO-SVE-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]]
-; NO-SVE-NEXT:    [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0
-; NO-SVE-NEXT:    [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1
-; NO-SVE-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP5]], [[Z0]]
-; NO-SVE-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]]
-; NO-SVE-NEXT:    [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; NO-SVE-NEXT:    [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1
-; NO-SVE-NEXT:    ret <2 x i32> [[RES1]]
-;
-; SVE-LABEL: define <2 x i32> @sdiv_v2i32_Op1_unknown_Op2_const(
-; SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
-; SVE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> <i32 poison, i32 4>, <2 x i32> <i32 0, i32 3>
-; SVE-NEXT:    [[TMP2:%.*]] = sdiv <2 x i32> [[A]], [[TMP1]]
-; SVE-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[X]]
-; SVE-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], [[Y]]
-; SVE-NEXT:    [[TMP5:%.*]] = mul <2 x i32> [[TMP4]], [[Z]]
-; SVE-NEXT:    ret <2 x i32> [[TMP5]]
+; CHECK-LABEL: define <2 x i32> @sdiv_v2i32_Op1_unknown_Op2_const(
+; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> <i32 poison, i32 4>, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = sdiv <2 x i32> [[A]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[X]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], [[Y]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <2 x i32> [[TMP4]], [[Z]]
+; CHECK-NEXT:    ret <2 x i32> [[TMP5]]
 ;
 {
   %a0 = extractelement <2 x i32> %a, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
index 4556df3b31687..b582a0db488be 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s --check-prefixes=CHECK,NO-ZVFHMIN-ZVFBFMIN
-; RUN: opt -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux-gnu -mattr=+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN-ZVFBFMIN
+; RUN: opt -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux-gnu -mattr=+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s
 
 
 define void @test(ptr %p, ptr noalias %s) {
@@ -312,76 +312,65 @@ entry:
 
 
 define void @test_bf16(ptr %p, ptr noalias %s) {
-; NO-ZVFHMIN-ZVFBFMIN-LABEL: @test_bf16(
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:  entry:
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P:%.*]], i64 0, i64 0
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I:%.*]] = load bfloat, ptr [[ARRAYIDX]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 30
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I1:%.*]] = load bfloat, ptr [[ARRAYIDX1]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD:%.*]] = fsub fast bfloat [[I1]], [[I]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds bfloat, ptr [[S:%.*]], i64 0
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD]], ptr [[ARRAYIDX2]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I2:%.*]] = load bfloat, ptr [[ARRAYIDX4]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 26
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I3:%.*]] = load bfloat, ptr [[ARRAYIDX6]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD7:%.*]] = fsub fast bfloat [[I3]], [[I2]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 1
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD7]], ptr [[ARRAYIDX9]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 8
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I4:%.*]] = load bfloat, ptr [[ARRAYIDX11]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 22
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I5:%.*]] = load bfloat, ptr [[ARRAYIDX13]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD14:%.*]] = fsub fast bfloat [[I5]], [[I4]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 2
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD14]], ptr [[ARRAYIDX16]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 12
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I6:%.*]] = load bfloat, ptr [[ARRAYIDX18]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 18
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I7:%.*]] = load bfloat, ptr [[ARRAYIDX20]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD21:%.*]] = fsub fast bfloat [[I7]], [[I6]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 3
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD21]], ptr [[ARRAYIDX23]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 16
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I8:%.*]] = load bfloat, ptr [[ARRAYIDX25]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 14
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I9:%.*]] = load bfloat, ptr [[ARRAYIDX27]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD28:%.*]] = fsub fast bfloat [[I9]], [[I8]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD28]], ptr [[ARRAYIDX30]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 20
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I10:%.*]] = load bfloat, ptr [[ARRAYIDX32]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 10
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I11:%.*]] = load bfloat, ptr [[ARRAYIDX34]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD35:%.*]] = fsub fast bfloat [[I11]], [[I10]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 5
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD35]], ptr [[ARRAYIDX37]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 24
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I12:%.*]] = load bfloat, ptr [[ARRAYIDX39]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 6
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I13:%.*]] = load bfloat, ptr [[ARRAYIDX41]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD42:%.*]] = fsub fast bfloat [[I13]], [[I12]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 6
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD42]], ptr [[ARRAYIDX44]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 28
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I14:%.*]] = load bfloat, ptr [[ARRAYIDX46]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 2
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I15:%.*]] = load bfloat, ptr [[ARRAYIDX48]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD49:%.*]] = fsub fast bfloat [[I15]], [[I14]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 7
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store bfloat [[ADD49]], ptr [[ARRAYIDX51]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    ret void
-;
-; ZVFHMIN-ZVFBFMIN-LABEL: @test_bf16(
-; ZVFHMIN-ZVFBFMIN-NEXT:  entry:
-; ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P:%.*]], i64 0, i64 0
-; ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 30
-; ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds bfloat, ptr [[S:%.*]], i64 0
-; ZVFHMIN-ZVFBFMIN-NEXT:    [[TMP15:%.*]] = call <8 x bfloat> @llvm.experimental.vp.strided.load.v8bf16.p0.i64(ptr align 4 [[ARRAYIDX]], i64 8, <8 x i1> splat (i1 true), i32 8)
-; ZVFHMIN-ZVFBFMIN-NEXT:    [[TMP7:%.*]] = call <8 x bfloat> @llvm.experimental.vp.strided.load.v8bf16.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -8, <8 x i1> splat (i1 true), i32 8)
-; ZVFHMIN-ZVFBFMIN-NEXT:    [[TMP16:%.*]] = fsub fast <8 x bfloat> [[TMP7]], [[TMP15]]
-; ZVFHMIN-ZVFBFMIN-NEXT:    store <8 x bfloat> [[TMP16]], ptr [[ARRAYIDX2]], align 4
-; ZVFHMIN-ZVFBFMIN-NEXT:    ret void
+; CHECK-LABEL: @test_bf16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[I:%.*]] = load bfloat, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 30
+; CHECK-NEXT:    [[I1:%.*]] = load bfloat, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fsub fast bfloat [[I1]], [[I]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds bfloat, ptr [[S:%.*]], i64 0
+; CHECK-NEXT:    store bfloat [[ADD]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 4
+; CHECK-NEXT:    [[I2:%.*]] = load bfloat, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 26
+; CHECK-NEXT:    [[I3:%.*]] = load bfloat, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = fsub fast bfloat [[I3]], [[I2]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 1
+; CHECK-NEXT:    store bfloat [[ADD7]], ptr [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 8
+; CHECK-NEXT:    [[I4:%.*]] = load bfloat, ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 22
+; CHECK-NEXT:    [[I5:%.*]] = load bfloat, ptr [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[ADD14:%.*]] = fsub fast bfloat [[I5]], [[I4]]
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 2
+; CHECK-NEXT:    store bfloat [[ADD14]], ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 12
+; CHECK-NEXT:    [[I6:%.*]] = load bfloat, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 18
+; CHECK-NEXT:    [[I7:%.*]] = load bfloat, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT:    [[ADD21:%.*]] = fsub fast bfloat [[I7]], [[I6]]
+; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 3
+; CHECK-NEXT:    store bfloat [[ADD21]], ptr [[ARRAYIDX23]], align 4
+; CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 16
+; CHECK-NEXT:    [[I8:%.*]] = load bfloat, ptr [[ARRAYIDX25]], align 4
+; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 14
+; CHECK-NEXT:    [[I9:%.*]] = load bfloat, ptr [[ARRAYIDX27]], align 4
+; CHECK-NEXT:    [[ADD28:%.*]] = fsub fast bfloat [[I9]], [[I8]]
+; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 4
+; CHECK-NEXT:    store bfloat [[ADD28]], ptr [[ARRAYIDX30]], align 4
+; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 20
+; CHECK-NEXT:    [[I10:%.*]] = load bfloat, ptr [[ARRAYIDX32]], align 4
+; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 10
+; CHECK-NEXT:    [[I11:%.*]] = load bfloat, ptr [[ARRAYIDX34]], align 4
+; CHECK-NEXT:    [[ADD35:%.*]] = fsub fast bfloat [[I11]], [[I10]]
+; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 5
+; CHECK-NEXT:    store bfloat [[ADD35]], ptr [[ARRAYIDX37]], align 4
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 24
+; CHECK-NEXT:    [[I12:%.*]] = load bfloat, ptr [[ARRAYIDX39]], align 4
+; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 6
+; CHECK-NEXT:    [[I13:%.*]] = load bfloat, ptr [[ARRAYIDX41]], align 4
+; CHECK-NEXT:    [[ADD42:%.*]] = fsub fast bfloat [[I13]], [[I12]]
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 6
+; CHECK-NEXT:    store bfloat [[ADD42]], ptr [[ARRAYIDX44]], align 4
+; CHECK-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 28
+; CHECK-NEXT:    [[I14:%.*]] = load bfloat, ptr [[ARRAYIDX46]], align 4
+; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 2
+; CHECK-NEXT:    [[I15:%.*]] = load bfloat, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT:    [[ADD49:%.*]] = fsub fast bfloat [[I15]], [[I14]]
+; CHECK-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 7
+; CHECK-NEXT:    store bfloat [[ADD49]], ptr [[ARRAYIDX51]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %arrayidx = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 0
@@ -444,76 +433,65 @@ entry:
 }
 
 define void @test_f16(ptr %p, ptr noalias %s) {
-; NO-ZVFHMIN-ZVFBFMIN-LABEL: @test_f16(
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:  entry:
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x half], ptr [[P:%.*]], i64 0, i64 0
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I:%.*]] = load half, ptr [[ARRAYIDX]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 30
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I1:%.*]] = load half, ptr [[ARRAYIDX1]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD:%.*]] = fsub fast half [[I1]], [[I]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[S:%.*]], i64 0
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD]], ptr [[ARRAYIDX2]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I2:%.*]] = load half, ptr [[ARRAYIDX4]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 26
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I3:%.*]] = load half, ptr [[ARRAYIDX6]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD7:%.*]] = fsub fast half [[I3]], [[I2]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds half, ptr [[S]], i64 1
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD7]], ptr [[ARRAYIDX9]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 8
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I4:%.*]] = load half, ptr [[ARRAYIDX11]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 22
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I5:%.*]] = load half, ptr [[ARRAYIDX13]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD14:%.*]] = fsub fast half [[I5]], [[I4]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds half, ptr [[S]], i64 2
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD14]], ptr [[ARRAYIDX16]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 12
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I6:%.*]] = load half, ptr [[ARRAYIDX18]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 18
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I7:%.*]] = load half, ptr [[ARRAYIDX20]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD21:%.*]] = fsub fast half [[I7]], [[I6]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds half, ptr [[S]], i64 3
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD21]], ptr [[ARRAYIDX23]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 16
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I8:%.*]] = load half, ptr [[ARRAYIDX25]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 14
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I9:%.*]] = load half, ptr [[ARRAYIDX27]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD28:%.*]] = fsub fast half [[I9]], [[I8]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds half, ptr [[S]], i64 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD28]], ptr [[ARRAYIDX30]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 20
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I10:%.*]] = load half, ptr [[ARRAYIDX32]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 10
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I11:%.*]] = load half, ptr [[ARRAYIDX34]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD35:%.*]] = fsub fast half [[I11]], [[I10]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds half, ptr [[S]], i64 5
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD35]], ptr [[ARRAYIDX37]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 24
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I12:%.*]] = load half, ptr [[ARRAYIDX39]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 6
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I13:%.*]] = load half, ptr [[ARRAYIDX41]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD42:%.*]] = fsub fast half [[I13]], [[I12]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds half, ptr [[S]], i64 6
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD42]], ptr [[ARRAYIDX44]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 28
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I14:%.*]] = load half, ptr [[ARRAYIDX46]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 2
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[I15:%.*]] = load half, ptr [[ARRAYIDX48]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ADD49:%.*]] = fsub fast half [[I15]], [[I14]]
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds half, ptr [[S]], i64 7
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    store half [[ADD49]], ptr [[ARRAYIDX51]], align 4
-; NO-ZVFHMIN-ZVFBFMIN-NEXT:    ret void
-;
-; ZVFHMIN-ZVFBFMIN-LABEL: @test_f16(
-; ZVFHMIN-ZVFBFMIN-NEXT:  entry:
-; ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x half], ptr [[P:%.*]], i64 0, i64 0
-; ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 30
-; ZVFHMIN-ZVFBFMIN-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[S:%.*]], i64 0
-; ZVFHMIN-ZVFBFMIN-NEXT:    [[TMP15:%.*]] = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i64(ptr align 4 [[ARRAYIDX]], i64 8, <8 x i1> splat (i1 true), i32 8)
-; ZVFHMIN-ZVFBFMIN-NEXT:    [[TMP7:%.*]] = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -8, <8 x i1> splat (i1 true), i32 8)
-; ZVFHMIN-ZVFBFMIN-NEXT:    [[TMP16:%.*]] = fsub fast <8 x half> [[TMP7]], [[TMP15]]
-; ZVFHMIN-ZVFBFMIN-NEXT:    store <8 x half> [[TMP16]], ptr [[ARRAYIDX2]], align 4
-; ZVFHMIN-ZVFBFMIN-NEXT:    ret void
+; CHECK-LABEL: @test_f16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x half], ptr [[P:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[I:%.*]] = load half, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 30
+; CHECK-NEXT:    [[I1:%.*]] = load half, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fsub fast half [[I1]], [[I]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[S:%.*]], i64 0
+; CHECK-NEXT:    store half [[ADD]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 4
+; CHECK-NEXT:    [[I2:%.*]] = load half, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 26
+; CHECK-NEXT:    [[I3:%.*]] = load half, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[ADD7:%.*]] = fsub fast half [[I3]], [[I2]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds half, ptr [[S]], i64 1
+; CHECK-NEXT:    store half [[ADD7]], ptr [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 8
+; CHECK-NEXT:    [[I4:%.*]] = load half, ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 22
+; CHECK-NEXT:    [[I5:%.*]] = load half, ptr [[ARRAYIDX13]], align 4
+; CHECK-NEXT:    [[ADD14:%.*]] = fsub fast half [[I5]], [[I4]]
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds half, ptr [[S]], i64 2
+; CHECK-NEXT:    store half [[ADD14]], ptr [[ARRAYIDX16]], align 4
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 12
+; CHECK-NEXT:    [[I6:%.*]] = load half, ptr [[ARRAYIDX18]], align 4
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 18
+; CHECK-NEXT:    [[I7:%.*]] = load half, ptr [[ARRAYIDX20]], align 4
+; CHECK-NEXT:    [[ADD21:%.*]] = fsub fast half [[I7]], [[I6]]
+; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds half, ptr [[S]], i64 3
+; CHECK-NEXT:    store half [[ADD21]], ptr [[ARRAYIDX23]], align 4
+; CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 16
+; CHECK-NEXT:    [[I8:%.*]] = load half, ptr [[ARRAYIDX25]], align 4
+; CHECK-NEXT:    [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 14
+; CHECK-NEXT:    [[I9:%.*]] = load half, ptr [[ARRAYIDX27]], align 4
+; CHECK-NEXT:    [[ADD28:%.*]] = fsub fast half [[I9]], [[I8]]
+; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds half, ptr [[S]], i64 4
+; CHECK-NEXT:    store half [[ADD28]], ptr [[ARRAYIDX30]], align 4
+; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 20
+; CHECK-NEXT:    [[I10:%.*]] = load half, ptr [[ARRAYIDX32]], align 4
+; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 10
+; CHECK-NEXT:    [[I11:%.*]] = load half, ptr [[ARRAYIDX34]], align 4
+; CHECK-NEXT:    [[ADD35:%.*]] = fsub fast half [[I11]], [[I10]]
+; CHECK-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds half, ptr [[S]], i64 5
+; CHECK-NEXT:    store half [[ADD35]], ptr [[ARRAYIDX37]], align 4
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 24
+; CHECK-NEXT:    [[I12:%.*]] = load half, ptr [[ARRAYIDX39]], align 4
+; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 6
+; CHECK-NEXT:    [[I13:%.*]] = load half, ptr [[ARRAYIDX41]], align 4
+; CHECK-NEXT:    [[ADD42:%.*]] = fsub fast half [[I13]], [[I12]]
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds half, ptr [[S]], i64 6
+; CHECK-NEXT:    store half [[ADD42]], ptr [[ARRAYIDX44]], align 4
+; CHECK-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 28
+; CHECK-NEXT:    [[I14:%.*]] = load half, ptr [[ARRAYIDX46]], align 4
+; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 2
+; CHECK-NEXT:    [[I15:%.*]] = load half, ptr [[ARRAYIDX48]], align 4
+; CHECK-NEXT:    [[ADD49:%.*]] = fsub fast half [[I15]], [[I14]]
+; CHECK-NEXT:    [[ARRAYIDX51:%.*]] = getelementptr inbounds half, ptr [[S]], i64 7
+; CHECK-NEXT:    store half [[ADD49]], ptr [[ARRAYIDX51]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %arrayidx = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 0