Skip to content

Commit 86c63b2

Browse files
pendingchaos authored and Marge Bot committed
aco/gfx12: insert wait between VMEM WaW
llvm/llvm-project#105549

fossil-db (gfx1200):
Totals from 1783 (2.25% of 79395) affected shaders:
Instrs: 7398391 -> 7404566 (+0.08%); split: -0.00%, +0.08%
CodeSize: 38862456 -> 38886364 (+0.06%); split: -0.00%, +0.06%
Latency: 83191513 -> 84211504 (+1.23%); split: -0.00%, +1.23%
InvThroughput: 15185936 -> 15345744 (+1.05%); split: -0.01%, +1.06%

Signed-off-by: Rhys Perry <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32373>
1 parent 9f3607d commit 86c63b2

File tree

2 files changed

+11
-2
lines changed

2 files changed

+11
-2
lines changed

src/amd/compiler/aco_insert_waitcnt.cpp

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -278,9 +278,13 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
278278

279279
wait_imm reg_imm = it->second.imm;
280280

281-
/* Vector Memory reads and writes return in the order they were issued */
281+
/* Vector Memory reads and writes decrease the counter in the order they were issued.
282+
* Before GFX12, they also write VGPRs in order if they're of the same type.
283+
* TODO: We can do this for GFX12 and different types for GFX11 if we know that the two
284+
* VMEM loads do not write the same lanes. Since GFX11, we track VMEM operations on the
285+
* linear CFG, so this is difficult */
282286
uint8_t vmem_type = get_vmem_type(ctx.gfx_level, instr);
283-
if (vmem_type) {
287+
if (vmem_type && ctx.gfx_level < GFX12) {
284288
wait_event event = get_vmem_event(ctx, instr, vmem_type);
285289
wait_type type = (wait_type)(ffs(ctx.info->get_counters_for_event(event)) - 1);
286290
if ((it->second.events & ctx.info->events[type]) == event &&

src/amd/compiler/tests/test_insert_waitcnt.cpp

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -192,6 +192,7 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
192192

193193
//>> p_unit_test 0
194194
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
195+
//~gfx12! s_wait_loadcnt imm:0
195196
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
196197
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
197198
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
@@ -220,6 +221,7 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
220221

221222
//>> p_unit_test 3
222223
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
224+
//~gfx12! s_wait_samplecnt imm:0
223225
//! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3], v1: undef, %0:v[0] 1d
224226
bld.reset(program->create_and_insert_block());
225227
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
@@ -249,6 +251,7 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
249251

250252
//>> p_unit_test 6
251253
//! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d
254+
//~gfx12! s_wait_bvhcnt imm:0
252255
//! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[16-26] 1d
253256
bld.reset(program->create_and_insert_block());
254257
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
@@ -300,6 +303,7 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
300303
//>> BB11
301304
//! /* logical preds: BB9, BB10, / linear preds: BB9, BB10, / kind: uniform, */
302305
//! p_unit_test 9
306+
//~gfx12! s_wait_loadcnt imm:0
303307
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
304308
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
305309
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
@@ -326,6 +330,7 @@ BEGIN_TEST(insert_waitcnt.waw.vmem_types)
326330
//! /* logical preds: BB12, BB13, / linear preds: BB12, BB13, / kind: uniform, */
327331
//! p_unit_test 10
328332
//~gfx11! s_waitcnt vmcnt(0)
333+
//~gfx12! s_wait_loadcnt imm:0
329334
//~gfx12! s_wait_samplecnt imm:0
330335
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
331336
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));

0 commit comments

Comments
 (0)