Skip to content

Commit 84be954

Browse files
authored
[RISCV][CostModel] Refine Arithmetic reduction costs (#79103)
This patch is split off from #77342.
- Correct the CodeSize cost, which previously omitted one instruction: the cost of 3 comes from {VMV.S, ReductionOp, VMV.X}.
- Add SplitCost: unordered reductions chain a series of VADD/VFADD/... instructions, whose cost scales with LMUL; ordered reductions chain a series of VFREDOSUMs.
- Use the MVT to estimate VL.
1 parent f59eef6 commit 84be954

File tree

9 files changed

+361
-307
lines changed

9 files changed

+361
-307
lines changed

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Lines changed: 66 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,10 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
8686
}
8787
case RISCV::VMV_X_S:
8888
case RISCV::VMV_S_X:
89+
case RISCV::VFMV_F_S:
90+
case RISCV::VFMV_S_F:
91+
case RISCV::VMNAND_MM:
92+
case RISCV::VCPOP_M:
8993
Cost += 1;
9094
break;
9195
default:
@@ -966,20 +970,70 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
966970
return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
967971

968972
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
969-
if (Ty->getElementType()->isIntegerTy(1))
970-
// vcpop sequences, see vreduction-mask.ll
971-
return (LT.first - 1) + (ISD == ISD::AND ? 3 : 2);
973+
SmallVector<unsigned, 3> Opcodes;
974+
Type *ElementTy = Ty->getElementType();
975+
if (ElementTy->isIntegerTy(1)) {
976+
if (ISD == ISD::AND) {
977+
// Example sequences:
978+
// vsetvli a0, zero, e8, mf8, ta, ma
979+
// vmnot.m v8, v0
980+
// vcpop.m a0, v8
981+
// seqz a0, a0
982+
Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
983+
return (LT.first - 1) +
984+
getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
985+
getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
986+
CmpInst::ICMP_EQ, CostKind);
987+
} else {
988+
// Example sequences:
989+
// vsetvli a0, zero, e8, mf8, ta, ma
990+
// vcpop.m a0, v0
991+
// snez a0, a0
992+
Opcodes = {RISCV::VCPOP_M};
993+
return (LT.first - 1) +
994+
getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
995+
getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
996+
CmpInst::ICMP_NE, CostKind);
997+
}
998+
}
972999

9731000
// IR Reduction is composed by two vmv and one rvv reduction instruction.
974-
InstructionCost BaseCost = 2;
975-
976-
if (CostKind == TTI::TCK_CodeSize)
977-
return (LT.first - 1) + BaseCost;
978-
979-
unsigned VL = getEstimatedVLFor(Ty);
980-
if (TTI::requiresOrderedReduction(FMF))
981-
return (LT.first - 1) + BaseCost + VL;
982-
return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
1001+
if (TTI::requiresOrderedReduction(FMF)) {
1002+
Opcodes.push_back(RISCV::VFMV_S_F);
1003+
for (unsigned i = 0; i < LT.first.getValue(); i++)
1004+
Opcodes.push_back(RISCV::VFREDOSUM_VS);
1005+
Opcodes.push_back(RISCV::VFMV_F_S);
1006+
return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1007+
}
1008+
unsigned SplitOp;
1009+
switch (ISD) {
1010+
case ISD::ADD:
1011+
SplitOp = RISCV::VADD_VV;
1012+
Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1013+
break;
1014+
case ISD::OR:
1015+
SplitOp = RISCV::VOR_VV;
1016+
Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
1017+
break;
1018+
case ISD::XOR:
1019+
SplitOp = RISCV::VXOR_VV;
1020+
Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1021+
break;
1022+
case ISD::AND:
1023+
SplitOp = RISCV::VAND_VV;
1024+
Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
1025+
break;
1026+
case ISD::FADD:
1027+
SplitOp = RISCV::VFADD_VV;
1028+
Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1029+
break;
1030+
}
1031+
// Add a cost for data larger than LMUL8
1032+
InstructionCost SplitCost =
1033+
(LT.first > 1) ? (LT.first - 1) *
1034+
getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1035+
: 0;
1036+
return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
9831037
}
9841038

9851039
InstructionCost RISCVTTIImpl::getExtendedReductionCost(

llvm/test/Analysis/CostModel/RISCV/reduce-add.ll

Lines changed: 35 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -51,14 +51,14 @@ define i32 @reduce_i8(i32 %arg) {
5151
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
5252
;
5353
; SIZE-LABEL: 'reduce_i8'
54-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
55-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
56-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
57-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
58-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
59-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
60-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
61-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
54+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
55+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
56+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
57+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
58+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
59+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
60+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
61+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
6262
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
6363
;
6464
%V1 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
@@ -85,14 +85,14 @@ define i32 @reduce_i16(i32 %arg) {
8585
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
8686
;
8787
; SIZE-LABEL: 'reduce_i16'
88-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
89-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
90-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
91-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
92-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
93-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
94-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
95-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
88+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
89+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
90+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
91+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
92+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
93+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
94+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
95+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
9696
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
9797
;
9898
%V1 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
@@ -115,18 +115,18 @@ define i32 @reduce_i32(i32 %arg) {
115115
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
116116
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
117117
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
118-
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
118+
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
119119
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
120120
;
121121
; SIZE-LABEL: 'reduce_i32'
122-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
123-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
124-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
125-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
126-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
127-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
128-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
129-
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
122+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
123+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
124+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
125+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
126+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
127+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
128+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
129+
; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
130130
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
131131
;
132132
%V1 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef)
@@ -148,19 +148,19 @@ define i32 @reduce_i64(i32 %arg) {
148148
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
149149
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
150150
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
151-
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
152-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
151+
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
152+
; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
153153
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
154154
;
155155
; SIZE-LABEL: 'reduce_i64'
156-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
157-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
158-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
159-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
160-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
161-
; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
162-
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
163-
; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
156+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)
157+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
158+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
159+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
160+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
161+
; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
162+
; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
163+
; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V128 = call i64 @llvm.vector.reduce.add.v128i64(<128 x i64> undef)
164164
; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
165165
;
166166
%V1 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef)

0 commit comments

Comments
 (0)