@@ -46,59 +46,101 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
46
46
; CHECKPTX71-LABEL: test(
47
47
; CHECKPTX71: {
48
48
; CHECKPTX71-NEXT: .reg .pred %p<5>;
49
- ; CHECKPTX71-NEXT: .reg .b16 %rs<26>;
50
- ; CHECKPTX71-NEXT: .reg .b32 %r<4>;
49
+ ; CHECKPTX71-NEXT: .reg .b16 %rs<14>;
50
+ ; CHECKPTX71-NEXT: .reg .b32 %r<58>;
51
51
; CHECKPTX71-EMPTY:
52
52
; CHECKPTX71-NEXT: // %bb.0:
53
- ; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3];
54
- ; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2];
55
- ; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1];
56
- ; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0];
57
- ; CHECKPTX71-NEXT: ld.b16 %rs22, [%r1];
58
- ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start14
53
+ ; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
54
+ ; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2];
55
+ ; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1];
56
+ ; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0];
57
+ ; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4;
58
+ ; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3;
59
+ ; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3;
60
+ ; CHECKPTX71-NEXT: mov.b32 %r26, 65535;
61
+ ; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
62
+ ; CHECKPTX71-NEXT: not.b32 %r3, %r27;
63
+ ; CHECKPTX71-NEXT: ld.u32 %r54, [%r1];
64
+ ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45
59
65
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
60
- ; CHECKPTX71-NEXT: mov.b16 %rs14, 0x3F80;
61
- ; CHECKPTX71-NEXT: fma.rn.bf16 %rs15, %rs22, %rs14, %rs13;
62
- ; CHECKPTX71-NEXT: atom.cas.b16 %rs3, [%r1], %rs22, %rs15;
63
- ; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs3, %rs22;
64
- ; CHECKPTX71-NEXT: mov.u16 %rs22, %rs3;
66
+ ; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2;
67
+ ; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28;
68
+ ; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80;
69
+ ; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
70
+ ; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4;
71
+ ; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
72
+ ; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
73
+ ; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
74
+ ; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
75
+ ; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
76
+ ; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
65
77
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
66
- ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end13
67
- ; CHECKPTX71-NEXT: ld.b16 %rs23, [%r1];
68
- ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start8
78
+ ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44
79
+ ; CHECKPTX71-NEXT: ld.u32 %r55, [%r1];
80
+ ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27
69
81
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
70
- ; CHECKPTX71-NEXT: mov.b16 %rs16, 0x3F80;
71
- ; CHECKPTX71-NEXT: fma.rn.bf16 %rs17, %rs23, %rs16, %rs16;
72
- ; CHECKPTX71-NEXT: atom.cas.b16 %rs6, [%r1], %rs23, %rs17;
73
- ; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs6, %rs23;
74
- ; CHECKPTX71-NEXT: mov.u16 %rs23, %rs6;
82
+ ; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
83
+ ; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r33;
84
+ ; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80;
85
+ ; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
86
+ ; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs7;
87
+ ; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
88
+ ; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
89
+ ; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
90
+ ; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
91
+ ; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
92
+ ; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
75
93
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
76
- ; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end7
77
- ; CHECKPTX71-NEXT: ld.global.b16 %rs24, [%r2];
78
- ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start2
94
+ ; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26
95
+ ; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
96
+ ; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3;
97
+ ; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
98
+ ; CHECKPTX71-NEXT: mov.b32 %r39, 65535;
99
+ ; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11;
100
+ ; CHECKPTX71-NEXT: not.b32 %r12, %r40;
101
+ ; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10];
102
+ ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9
79
103
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
80
- ; CHECKPTX71-NEXT: mov.b16 %rs18, 0x3F80;
81
- ; CHECKPTX71-NEXT: fma.rn.bf16 %rs19, %rs24, %rs18, %rs13;
82
- ; CHECKPTX71-NEXT: atom.global.cas.b16 %rs9, [%r2], %rs24, %rs19;
83
- ; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs9, %rs24;
84
- ; CHECKPTX71-NEXT: mov.u16 %rs24, %rs9;
104
+ ; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
105
+ ; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r41;
106
+ ; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80;
107
+ ; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
108
+ ; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs10;
109
+ ; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
110
+ ; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
111
+ ; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
112
+ ; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
113
+ ; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
114
+ ; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
85
115
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
86
- ; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end1
87
- ; CHECKPTX71-NEXT: ld.shared.b16 %rs25, [%r3];
116
+ ; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8
117
+ ; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
118
+ ; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3;
119
+ ; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24;
120
+ ; CHECKPTX71-NEXT: mov.b32 %r47, 65535;
121
+ ; CHECKPTX71-NEXT: shl.b32 %r48, %r47, %r17;
122
+ ; CHECKPTX71-NEXT: not.b32 %r18, %r48;
123
+ ; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16];
88
124
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
89
125
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
90
- ; CHECKPTX71-NEXT: mov.b16 %rs20, 0x3F80;
91
- ; CHECKPTX71-NEXT: fma.rn.bf16 %rs21, %rs25, %rs20, %rs13;
92
- ; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs12, [%r3], %rs25, %rs21;
93
- ; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs12, %rs25;
94
- ; CHECKPTX71-NEXT: mov.u16 %rs25, %rs12;
126
+ ; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
127
+ ; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r49;
128
+ ; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80;
129
+ ; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
130
+ ; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs13;
131
+ ; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
132
+ ; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
133
+ ; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
134
+ ; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
135
+ ; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
136
+ ; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
95
137
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
96
138
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end
97
139
; CHECKPTX71-NEXT: ret;
98
- %r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst
99
- %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst
100
- %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst
101
- %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val seq_cst
140
+ %r1 = atomicrmw fadd ptr %dp0, bfloat %val monotonic
141
+ %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 monotonic
142
+ %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val monotonic
143
+ %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val monotonic
102
144
ret void
103
145
}
104
146
0 commit comments