@@ -514,12 +514,13 @@ let OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue
514
514
defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
515
515
} // End OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue16BitInsts
516
516
517
- class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
517
+ class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2, bit op1IsRight = 0 > : PatFrag<
518
518
(ops node:$x, node:$y, node:$z),
519
519
// When the inner operation is used multiple times, selecting 3-op
520
520
// instructions may still be beneficial -- if the other users can be
521
521
// combined similarly. Let's be conservative for now.
522
- (op2 (HasOneUseBinOp<op1> node:$x, node:$y), node:$z),
522
+ !if(op1IsRight, (op2 node:$z, (HasOneUseBinOp<op1> node:$x, node:$y)),
523
+ (op2 (HasOneUseBinOp<op1> node:$x, node:$y), node:$z)),
523
524
[{
524
525
// Only use VALU ops when the result is divergent.
525
526
if (!N->isDivergent())
@@ -546,7 +547,10 @@ class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
546
547
let PredicateCodeUsesOperands = 1;
547
548
}
548
549
549
- class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : ThreeOpFragSDAG<op1, op2> {
550
+ // Matches (op2 (op1 x, y), z) if op1IsRight = 0 and
551
+ // matches (op2 z, (op1 x, y)) if op1IsRight = 1.
552
+ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2,
553
+ bit op1IsRight = 0> : ThreeOpFragSDAG<op1, op2, op1IsRight> {
550
554
// The divergence predicate is irrelevant in GlobalISel, as we have
551
555
// proper register bank checks. We just need to verify the constant
552
556
// bus restriction when all the sources are considered.
@@ -836,12 +840,19 @@ def : GCNPat<
836
840
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
837
841
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
838
842
839
- let SubtargetPredicate = HasLshlAddU64Inst in
843
+ let SubtargetPredicate = HasLshlAddU64Inst in {
840
844
def : GCNPat<
841
845
(ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
842
846
(V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
843
847
>;
844
848
849
+ def : GCNPat <
850
+ // (ptradd z, (shl x, y)) -> ((x << y) + z)
851
+ (ThreeOpFrag<shl_0_to_4, ptradd, /*op1IsRight=*/1> i64:$src0, i32:$src1, i64:$src2),
852
+ (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
853
+ >;
854
+ } // End SubtargetPredicate = HasLshlAddU64Inst
855
+
845
856
def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
846
857
def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>;
847
858
@@ -910,19 +921,24 @@ multiclass IMAD32_Pats <VOP3_Pseudo inst> {
910
921
911
922
// Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul.
912
923
// We need to separate this because otherwise OtherPredicates would be overridden.
913
- class IMAD32_Mul24_Pat<VOP3_Pseudo inst>: GCNPat <
914
- (i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
915
- (inst $src0, $src1, $src2, 0 /* clamp */)
916
- >;
924
+ class IMAD32_Mul24_Pats_Impl<VOP3_Pseudo inst, SDPatternOperator AddOp, bit mulIsRight = 0> : GCNPat <
925
+ !if(mulIsRight, (i64 (AddOp i64:$src2, (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)))),
926
+ (i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2))),
927
+ (inst $src0, $src1, $src2, 0 /* clamp */)>;
928
+
929
+ multiclass IMAD32_Mul24_Pats<VOP3_Pseudo inst> {
930
+ def : IMAD32_Mul24_Pats_Impl<inst, add>;
931
+ def : IMAD32_Mul24_Pats_Impl<inst, ptradd, /*mulIsRight=*/1>;
932
+ }
917
933
918
934
// exclude pre-GFX9 where it was slow
919
935
let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in {
920
936
defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
921
- def : IMAD32_Mul24_Pat <V_MAD_U64_U32_e64>;
937
+ defm : IMAD32_Mul24_Pats <V_MAD_U64_U32_e64>;
922
938
}
923
939
let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in {
924
940
defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
925
- def : IMAD32_Mul24_Pat <V_MAD_U64_U32_gfx11_e64>;
941
+ defm : IMAD32_Mul24_Pats <V_MAD_U64_U32_gfx11_e64>;
926
942
}
927
943
928
944
def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
0 commit comments