Skip to content

Commit 22bbaf7

Browse files
[NVPTX] Stop using 16-bit CAS instructions from PTX
1 parent 8f45452 commit 22bbaf7

File tree

5 files changed

+153
-83
lines changed

5 files changed

+153
-83
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -995,7 +995,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
995995
// actions
996996
computeRegisterProperties(STI.getRegisterInfo());
997997

998-
setMinCmpXchgSizeInBits(STI.hasAtomCas16() ? 16 : 32);
998+
setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
999999
setMaxAtomicSizeInBitsSupported(64);
10001000
setMaxDivRemBitWidthSupported(64);
10011001
}

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -128,6 +128,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
128128
// set of equivalent memory operations with a scalar data-type, executed in
129129
// an unspecified order on the elements in the vector.
130130
unsigned getMaxRequiredAlignment() const { return 8; }
131+
// Emulated loops using 32-bit/64-bit CAS generate better SASS than 16-bit CAS, so 32 bits is the minimum cmpxchg width.
132+
unsigned getMinCmpXchgSizeInBits() const { return 32; }
131133

132134
unsigned getPTXVersion() const { return PTXVersion; }
133135

llvm/test/CodeGen/NVPTX/atomics-sm70.ll

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -134,10 +134,10 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
134134
; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7;
135135
; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end
136136
; CHECKPTX62-NEXT: ret;
137-
%r1 = atomicrmw fadd ptr %dp0, half %val seq_cst
138-
%r2 = atomicrmw fadd ptr %dp0, half 1.0 seq_cst
139-
%r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val seq_cst
140-
%r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val seq_cst
137+
%r1 = atomicrmw fadd ptr %dp0, half %val monotonic
138+
%r2 = atomicrmw fadd ptr %dp0, half 1.0 monotonic
139+
%r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val monotonic
140+
%r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val monotonic
141141
ret void
142142
}
143143

llvm/test/CodeGen/NVPTX/atomics-sm90.ll

Lines changed: 82 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -46,59 +46,101 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
4646
; CHECKPTX71-LABEL: test(
4747
; CHECKPTX71: {
4848
; CHECKPTX71-NEXT: .reg .pred %p<5>;
49-
; CHECKPTX71-NEXT: .reg .b16 %rs<26>;
50-
; CHECKPTX71-NEXT: .reg .b32 %r<4>;
49+
; CHECKPTX71-NEXT: .reg .b16 %rs<14>;
50+
; CHECKPTX71-NEXT: .reg .b32 %r<58>;
5151
; CHECKPTX71-EMPTY:
5252
; CHECKPTX71-NEXT: // %bb.0:
53-
; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3];
54-
; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2];
55-
; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1];
56-
; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0];
57-
; CHECKPTX71-NEXT: ld.b16 %rs22, [%r1];
58-
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start14
53+
; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
54+
; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2];
55+
; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1];
56+
; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0];
57+
; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4;
58+
; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3;
59+
; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3;
60+
; CHECKPTX71-NEXT: mov.b32 %r26, 65535;
61+
; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
62+
; CHECKPTX71-NEXT: not.b32 %r3, %r27;
63+
; CHECKPTX71-NEXT: ld.u32 %r54, [%r1];
64+
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45
5965
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
60-
; CHECKPTX71-NEXT: mov.b16 %rs14, 0x3F80;
61-
; CHECKPTX71-NEXT: fma.rn.bf16 %rs15, %rs22, %rs14, %rs13;
62-
; CHECKPTX71-NEXT: atom.cas.b16 %rs3, [%r1], %rs22, %rs15;
63-
; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs3, %rs22;
64-
; CHECKPTX71-NEXT: mov.u16 %rs22, %rs3;
66+
; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2;
67+
; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28;
68+
; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80;
69+
; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
70+
; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4;
71+
; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
72+
; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
73+
; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
74+
; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
75+
; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
76+
; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
6577
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
66-
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end13
67-
; CHECKPTX71-NEXT: ld.b16 %rs23, [%r1];
68-
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start8
78+
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44
79+
; CHECKPTX71-NEXT: ld.u32 %r55, [%r1];
80+
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27
6981
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
70-
; CHECKPTX71-NEXT: mov.b16 %rs16, 0x3F80;
71-
; CHECKPTX71-NEXT: fma.rn.bf16 %rs17, %rs23, %rs16, %rs16;
72-
; CHECKPTX71-NEXT: atom.cas.b16 %rs6, [%r1], %rs23, %rs17;
73-
; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs6, %rs23;
74-
; CHECKPTX71-NEXT: mov.u16 %rs23, %rs6;
82+
; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
83+
; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r33;
84+
; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80;
85+
; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
86+
; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs7;
87+
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
88+
; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
89+
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
90+
; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
91+
; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
92+
; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
7593
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
76-
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end7
77-
; CHECKPTX71-NEXT: ld.global.b16 %rs24, [%r2];
78-
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start2
94+
; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26
95+
; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
96+
; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3;
97+
; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
98+
; CHECKPTX71-NEXT: mov.b32 %r39, 65535;
99+
; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11;
100+
; CHECKPTX71-NEXT: not.b32 %r12, %r40;
101+
; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10];
102+
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9
79103
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
80-
; CHECKPTX71-NEXT: mov.b16 %rs18, 0x3F80;
81-
; CHECKPTX71-NEXT: fma.rn.bf16 %rs19, %rs24, %rs18, %rs13;
82-
; CHECKPTX71-NEXT: atom.global.cas.b16 %rs9, [%r2], %rs24, %rs19;
83-
; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs9, %rs24;
84-
; CHECKPTX71-NEXT: mov.u16 %rs24, %rs9;
104+
; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
105+
; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r41;
106+
; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80;
107+
; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
108+
; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs10;
109+
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
110+
; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
111+
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
112+
; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
113+
; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
114+
; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
85115
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
86-
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end1
87-
; CHECKPTX71-NEXT: ld.shared.b16 %rs25, [%r3];
116+
; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8
117+
; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
118+
; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3;
119+
; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24;
120+
; CHECKPTX71-NEXT: mov.b32 %r47, 65535;
121+
; CHECKPTX71-NEXT: shl.b32 %r48, %r47, %r17;
122+
; CHECKPTX71-NEXT: not.b32 %r18, %r48;
123+
; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16];
88124
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
89125
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
90-
; CHECKPTX71-NEXT: mov.b16 %rs20, 0x3F80;
91-
; CHECKPTX71-NEXT: fma.rn.bf16 %rs21, %rs25, %rs20, %rs13;
92-
; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs12, [%r3], %rs25, %rs21;
93-
; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs12, %rs25;
94-
; CHECKPTX71-NEXT: mov.u16 %rs25, %rs12;
126+
; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
127+
; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r49;
128+
; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80;
129+
; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
130+
; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs13;
131+
; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
132+
; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
133+
; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
134+
; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
135+
; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
136+
; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
95137
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
96138
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end
97139
; CHECKPTX71-NEXT: ret;
98-
%r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst
99-
%r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst
100-
%r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst
101-
%r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val seq_cst
140+
%r1 = atomicrmw fadd ptr %dp0, bfloat %val monotonic
141+
%r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 monotonic
142+
%r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val monotonic
143+
%r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val monotonic
102144
ret void
103145
}
104146

llvm/test/CodeGen/NVPTX/cmpxchg.ll

Lines changed: 64 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -53,43 +53,44 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
5353
; SM70-LABEL: relaxed_sys_i8(
5454
; SM70: {
5555
; SM70-NEXT: .reg .pred %p<3>;
56-
; SM70-NEXT: .reg .b16 %rs<17>;
57-
; SM70-NEXT: .reg .b32 %r<3>;
58-
; SM70-NEXT: .reg .b64 %rd<5>;
56+
; SM70-NEXT: .reg .b16 %rs<2>;
57+
; SM70-NEXT: .reg .b32 %r<21>;
58+
; SM70-NEXT: .reg .b64 %rd<3>;
5959
; SM70-EMPTY:
6060
; SM70-NEXT: // %bb.0:
61-
; SM70-NEXT: ld.param.u8 %rs9, [relaxed_sys_i8_param_2];
61+
; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
6262
; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
63-
; SM70-NEXT: and.b64 %rd1, %rd2, -2;
64-
; SM70-NEXT: ld.param.u8 %rs10, [relaxed_sys_i8_param_1];
65-
; SM70-NEXT: and.b64 %rd3, %rd2, 1;
66-
; SM70-NEXT: shl.b64 %rd4, %rd3, 3;
67-
; SM70-NEXT: cvt.u32.u64 %r1, %rd4;
68-
; SM70-NEXT: mov.b16 %rs11, 255;
69-
; SM70-NEXT: shl.b16 %rs12, %rs11, %r1;
70-
; SM70-NEXT: not.b16 %rs2, %rs12;
71-
; SM70-NEXT: shl.b16 %rs3, %rs9, %r1;
72-
; SM70-NEXT: shl.b16 %rs4, %rs10, %r1;
73-
; SM70-NEXT: ld.u16 %rs13, [%rd1];
74-
; SM70-NEXT: and.b16 %rs16, %rs13, %rs2;
63+
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
64+
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
65+
; SM70-NEXT: and.b32 %r10, %r9, 3;
66+
; SM70-NEXT: shl.b32 %r1, %r10, 3;
67+
; SM70-NEXT: mov.b32 %r11, 255;
68+
; SM70-NEXT: shl.b32 %r12, %r11, %r1;
69+
; SM70-NEXT: not.b32 %r2, %r12;
70+
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
71+
; SM70-NEXT: and.b32 %r14, %r13, 255;
72+
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
73+
; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1];
74+
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
75+
; SM70-NEXT: ld.u32 %r16, [%rd1];
76+
; SM70-NEXT: and.b32 %r20, %r16, %r2;
7577
; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
7678
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
77-
; SM70-NEXT: or.b16 %rs14, %rs16, %rs3;
78-
; SM70-NEXT: or.b16 %rs15, %rs16, %rs4;
79-
; SM70-NEXT: atom.cas.b16 %rs7, [%rd1], %rs15, %rs14;
80-
; SM70-NEXT: setp.eq.s16 %p1, %rs7, %rs15;
79+
; SM70-NEXT: or.b32 %r17, %r20, %r3;
80+
; SM70-NEXT: or.b32 %r18, %r20, %r4;
81+
; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
82+
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
8183
; SM70-NEXT: @%p1 bra $L__BB0_3;
8284
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
8385
; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1
84-
; SM70-NEXT: and.b16 %rs8, %rs7, %rs2;
85-
; SM70-NEXT: setp.ne.s16 %p2, %rs16, %rs8;
86-
; SM70-NEXT: mov.u16 %rs16, %rs8;
86+
; SM70-NEXT: and.b32 %r8, %r7, %r2;
87+
; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
88+
; SM70-NEXT: mov.u32 %r20, %r8;
8789
; SM70-NEXT: @%p2 bra $L__BB0_1;
8890
; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end
89-
; SM70-NEXT: cvt.u32.u16 %r2, %rs9;
90-
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
91+
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
9192
; SM70-NEXT: ret;
92-
%pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
93+
%pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
9394
ret i8 %new
9495
}
9596

@@ -137,19 +138,44 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
137138
;
138139
; SM70-LABEL: relaxed_sys_i16(
139140
; SM70: {
140-
; SM70-NEXT: .reg .b16 %rs<4>;
141-
; SM70-NEXT: .reg .b32 %r<2>;
142-
; SM70-NEXT: .reg .b64 %rd<2>;
141+
; SM70-NEXT: .reg .pred %p<3>;
142+
; SM70-NEXT: .reg .b16 %rs<2>;
143+
; SM70-NEXT: .reg .b32 %r<20>;
144+
; SM70-NEXT: .reg .b64 %rd<3>;
143145
; SM70-EMPTY:
144146
; SM70-NEXT: // %bb.0:
145-
; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i16_param_0];
146-
; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_1];
147-
; SM70-NEXT: ld.param.u16 %rs2, [relaxed_sys_i16_param_2];
148-
; SM70-NEXT: atom.cas.b16 %rs3, [%rd1], %rs1, %rs2;
149-
; SM70-NEXT: cvt.u32.u16 %r1, %rs2;
150-
; SM70-NEXT: st.param.b32 [func_retval0], %r1;
147+
; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
148+
; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
149+
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
150+
; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1];
151+
; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
152+
; SM70-NEXT: and.b32 %r11, %r10, 3;
153+
; SM70-NEXT: shl.b32 %r1, %r11, 3;
154+
; SM70-NEXT: mov.b32 %r12, 65535;
155+
; SM70-NEXT: shl.b32 %r13, %r12, %r1;
156+
; SM70-NEXT: not.b32 %r2, %r13;
157+
; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
158+
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
159+
; SM70-NEXT: shl.b32 %r4, %r9, %r1;
160+
; SM70-NEXT: ld.u32 %r15, [%rd1];
161+
; SM70-NEXT: and.b32 %r19, %r15, %r2;
162+
; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
163+
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
164+
; SM70-NEXT: or.b32 %r16, %r19, %r3;
165+
; SM70-NEXT: or.b32 %r17, %r19, %r4;
166+
; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
167+
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
168+
; SM70-NEXT: @%p1 bra $L__BB1_3;
169+
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
170+
; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1
171+
; SM70-NEXT: and.b32 %r8, %r7, %r2;
172+
; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
173+
; SM70-NEXT: mov.u32 %r19, %r8;
174+
; SM70-NEXT: @%p2 bra $L__BB1_1;
175+
; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end
176+
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
151177
; SM70-NEXT: ret;
152-
%pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
178+
%pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
153179
ret i16 %new
154180
}
155181

@@ -180,7 +206,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
180206
; SM70-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
181207
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
182208
; SM70-NEXT: ret;
183-
%pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
209+
%pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
184210
ret i32 %new
185211
}
186212

@@ -209,7 +235,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
209235
; SM70-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
210236
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
211237
; SM70-NEXT: ret;
212-
%pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
238+
%pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
213239
ret i64 %new
214240
}
215241
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

0 commit comments

Comments
 (0)