Skip to content

Commit dc45797

Browse files
jimbothigpenclaude
andcommitted
phase 3a ggml-org#23: port turbo2_tcq encoder optimizations + preemptive s_outputs[] fix
Source: buun 2e239fb "perf: port turbo3_tcq optimizations to turbo2_tcq encoder" Ports six optimizations to k_set_rows_turboq2_tcq: - 128x64 backtrace (was 128x256); predecessor depends only on sid's low 6 bits - 64-group predecessor-minima precomputation (replaces 256 4-way scans/step) - shared-memory backtrace opt-in (CUDA only; HIP/MUSA stay on global bt) - FWHT warp shuffles (5 stages warp + 2 stages shared) - parallel 33-thread bitpack (one byte per thread, no atomics) - warp-shuffle final-state min reduction (replaces serial 8-warp scan) Additionally folds in the strict-aliasing fix from Phase 3a ggml-org#20 (commit 70b3dd5) to k_set_rows_turboq2_tcq's outputs[] handling. Reason: buun's port introduces a parallel-bitpack section that reads outputs[] cross-thread, which under HIP/ROCm would have triggered the same hoist-past-__syncthreads symbol corruption that produced the +12.7% PPL regression session-64 measured on turboq3. Declare __shared__ uint8_t s_outputs[128] and point the local outputs pointer at it instead of the (uint8_t *)x alias over __shared__ float x[128]. Per the 70b3dd5 commit-message forecast: "If turboq2 ever gains a parallel bitpack, it will need the same fix." Expected PPL chunks 1-4: bit-identical to the pre-port TURBOQ2_TCQ baseline measured in session-phase-3a-23-turbo2-encoder-opt §3.3 (turboq2-baseline.log). Mirrors ygg's ggml-org#21 dead-store removal (no pred_min_p[64] declared — the same value is already written to bt[t*64+sid] for backtrack). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent a29c222 commit dc45797

1 file changed

Lines changed: 146 additions & 52 deletions

File tree

ggml/src/ggml-cuda/set-rows.cu

Lines changed: 146 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1084,7 +1084,9 @@ static void set_rows_cuda_turboq4(
10841084
// =====================================================================================
10851085

10861086
// Global backtrace buffer for TCQ Viterbi (replaces 32KB shared/block + 16KB for 2-bit).
1087-
// Sized to ne_total_groups * 128 * BS (BS=512 for 3-bit, 256 for 2-bit). Grown on demand.
1087+
// Sized to ne_total_groups * 128 * 64 (compressed: 64 low-state groups per step,
1088+
// same layout for both turboq3 and turboq2). Grown on demand. Devices that opt in
1089+
// to per-block shared-memory backtrace bypass this buffer.
10881090
static uint8_t * tcq_bt_buf = nullptr;
10891091
static int64_t tcq_bt_buf_bytes = 0;
10901092

@@ -1412,6 +1414,7 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
14121414
const float * __restrict__ src0, const idx_t * __restrict__ src1,
14131415
block_turboq2_tcq * __restrict__ dst, const int64_t ne_total_groups,
14141416
uint8_t * __restrict__ bt_buf,
1417+
const int use_shared_bt,
14151418
const int64_t ne00, const int64_t ne01, const int64_t ne02,
14161419
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
14171420
const int64_t s01, const int64_t s02, const int64_t s03,
@@ -1438,12 +1441,32 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
14381441
block_turboq2_tcq * dst_blk = (block_turboq2_tcq *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3)
14391442
+ (i00 / QK_TURBOQ2_TCQ);
14401443

1444+
// Shared memory layout:
1445+
// x[128] : rotated+normalized input (also reused as scratch during reductions)
1446+
// cost[256] : path costs buffer A (also reused as norm-reduction scratch)
1447+
// cost_b[256]: path costs buffer B (double-buffering eliminates 2/3 of syncs)
1448+
// Backtrace: one predecessor byte for each of the 64 low-state groups per
1449+
// step (compressed from the old 128x256 layout — the predecessor depends
1450+
// only on sid's low 6 bits, never on the output bits in sid[7:6]). The
1451+
// backtrace lives in dynamic shared memory when the device opts in, else
1452+
// in bt_buf in global memory (still 128x64 bytes per block, byte-packed).
1453+
extern __shared__ uint8_t bt_shared[];
14411454
__shared__ float x[128];
14421455
__shared__ float cost[256];
1443-
__shared__ float cost_b[256]; // double-buffering for Viterbi (was bt[128][128], 16KB shared)
1456+
__shared__ float cost_b[256];
14441457
__shared__ int warp_min_idx[8];
14451458
__shared__ float warp_min_cost[8];
1459+
__shared__ float pred_min_cost[64];
14461460
__shared__ int shared_initial_state;
1461+
// Dedicated shared buffer for the Viterbi-backtrack output bytes. Previously
1462+
// aliased onto x[] via (uint8_t *)x, but writing uint8_t into a float-typed
1463+
// shared array is a strict-aliasing violation: under HIP/ROCm the compiler
1464+
// can hoist cross-thread reads of outputs[] above the __syncthreads() that
1465+
// follows the sid==0 backtrack write. The parallel bitpack introduced by
1466+
// this port reads outputs[sym_idx] from sids 0..32, which would hit the
1467+
// same hazard turboq3 hit in Phase 3a #20 (+12.7% PPL regression). Apply
1468+
// the same fix preemptively, as forecast in commit 70b3dd57c.
1469+
__shared__ uint8_t s_outputs[128];
14471470

14481471
// Parallel pre-Viterbi: load (threads 0-127)
14491472
if (sid < 128) x[sid] = grp_src[sid];
@@ -1483,17 +1506,31 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
14831506
if (sid < 128) x[sid] *= inv_norm;
14841507
__syncthreads();
14851508

1486-
// Parallel FWHT: signs1 → 7-stage butterfly → scale + signs2
1487-
if (sid < 128) x[sid] *= TURBO_WHT_SIGNS1[sid];
1488-
__syncthreads();
1489-
for (int h = 1; h < 128; h *= 2) {
1490-
if (sid < 64) {
1491-
int j = (sid / h) * (2 * h) + (sid % h);
1492-
float a = x[j], b = x[j + h];
1493-
x[j] = a + b; x[j + h] = a - b;
1509+
// Parallel FWHT: signs1 → 7-stage butterfly → scale + signs2.
1510+
// The first five stages run inside each warp via __shfl_xor_sync; the
1511+
// last two stages span warps so they fall back to shared memory.
1512+
if (sid < 128) {
1513+
float v = x[sid] * TURBO_WHT_SIGNS1[sid];
1514+
const int lane = sid & 31;
1515+
#pragma unroll
1516+
for (int h = 1; h < 32; h <<= 1) {
1517+
const float other = __shfl_xor_sync(0xFFFFFFFF, v, h, WARP_SIZE);
1518+
v = (lane & h) ? (other - v) : (v + other);
14941519
}
1495-
__syncthreads();
1520+
x[sid] = v;
14961521
}
1522+
__syncthreads();
1523+
if (sid < 64) {
1524+
const int j = ((sid >> 5) << 6) + (sid & 31);
1525+
float a = x[j], b = x[j + 32];
1526+
x[j] = a + b; x[j + 32] = a - b;
1527+
}
1528+
__syncthreads();
1529+
if (sid < 64) {
1530+
float a = x[sid], b = x[sid + 64];
1531+
x[sid] = a + b; x[sid + 64] = a - b;
1532+
}
1533+
__syncthreads();
14971534
constexpr float inv_sqrt_128 = 0.08838834764831845f;
14981535
if (sid < 128) x[sid] *= inv_sqrt_128 * TURBO_WHT_SIGNS2[sid];
14991536
__syncthreads();
@@ -1508,8 +1545,8 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
15081545
float saved_norm = cost[0];
15091546

15101547
// Initialize Viterbi: free initial state (all 256 states equally viable)
1511-
// Double-buffered cost (1 sync/step, was 3); byte-packed bt in global memory.
1512-
uint8_t * bt = bt_buf + (int64_t)blockIdx.x * (128 * 256);
1548+
// Double-buffered cost (1 sync/step, was 3); byte-packed bt in shared or global memory.
1549+
uint8_t * bt = use_shared_bt ? bt_shared : bt_buf + (int64_t)blockIdx.x * (128 * 64);
15131550
cost[sid] = 0.0f;
15141551
__syncthreads();
15151552

@@ -1520,24 +1557,31 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
15201557

15211558
float xt = x[t];
15221559

1523-
// Right-shift trellis (k=2, L=8): ns = (prev >> 2) | (out << 6)
1524-
// Predecessors of sid: prev = ((sid & 0x3F) << 2) | p, for p = 0..3
1525-
int base_prev = (sid & 0x3F) << 2;
1526-
float dist = xt - d_turboq2_tcq_codebook[sid];
1527-
dist = dist * dist;
1528-
1529-
float best = 1e30f;
1530-
int best_p = 0;
1531-
for (int p = 0; p < 4; p++) {
1532-
float c = cost_rd[base_prev | p];
1533-
if (c < best) {
1534-
best = c;
1535-
best_p = p;
1560+
// Right-shift trellis (k=2, L=8): ns = (prev >> 2) | (out << 6). The
1561+
// best predecessor depends only on sid's low 6 bits, so compute those
1562+
// 64 minima once instead of repeating the same 4-way scan per output.
1563+
if (sid < 64) {
1564+
const int base_prev = sid << 2;
1565+
float best = cost_rd[base_prev];
1566+
int best_p = 0;
1567+
#pragma unroll
1568+
for (int p = 1; p < 4; p++) {
1569+
float c = cost_rd[base_prev | p];
1570+
if (c < best) {
1571+
best = c;
1572+
best_p = p;
1573+
}
15361574
}
1575+
pred_min_cost[sid] = best;
1576+
bt[t * 64 + sid] = (uint8_t) best_p;
15371577
}
1578+
__syncthreads();
15381579

1539-
cost_wr[sid] = best + dist;
1540-
bt[t * 256 + sid] = (uint8_t)best_p;
1580+
const int pred_idx = sid & 0x3F;
1581+
float dist = xt - d_turboq2_tcq_codebook[sid];
1582+
dist = dist * dist;
1583+
1584+
cost_wr[sid] = pred_min_cost[pred_idx] + dist;
15411585
__syncthreads();
15421586
}
15431587
// After 128 steps (even count): final costs are in cost[] (step 127 is odd → cost_wr=cost)
@@ -1558,27 +1602,36 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
15581602
}
15591603
}
15601604
__syncthreads();
1561-
if (sid == 0) {
1562-
float best = warp_min_cost[0];
1563-
int best_idx = warp_min_idx[0];
1564-
for (int w = 1; w < 8; w++) {
1565-
if (warp_min_cost[w] < best) { best = warp_min_cost[w]; best_idx = warp_min_idx[w]; }
1605+
// Reduce 8 warp minima via a single-warp shuffle (32 lanes) instead of a
1606+
// serial single-thread loop. Upper 24 lanes seed FLT_MAX so they never win.
1607+
if (sid < 32) {
1608+
float best = (sid < 8) ? warp_min_cost[sid] : 3.4028234663852886e38f;
1609+
int best_idx = (sid < 8) ? warp_min_idx[sid] : 0;
1610+
#pragma unroll
1611+
for (int offset = 16; offset > 0; offset >>= 1) {
1612+
float other_cost = __shfl_down_sync(0xFFFFFFFF, best, offset, WARP_SIZE);
1613+
int other_idx = __shfl_down_sync(0xFFFFFFFF, best_idx, offset, WARP_SIZE);
1614+
if (other_cost < best) { best = other_cost; best_idx = other_idx; }
1615+
}
1616+
if (sid == 0) {
1617+
shared_initial_state = best_idx; // temporarily: best final state (becomes initial after backtrack)
15661618
}
1567-
shared_initial_state = best_idx; // temporarily: best final state (becomes initial after backtrack)
15681619
}
15691620
__syncthreads();
15701621

15711622
// TCQ error dump (port of buun 764c686b0): save post-FWHT x[] before backtrack overwrites it.
15721623
if (d_tcq_dump_max > 0 && grp < d_tcq_dump_max && sid < 128)
15731624
d_tcq_dump_x_buf[grp * 128 + sid] = x[sid];
15741625

1575-
// Thread 0: backtrack (inherently sequential, reads byte-packed bt from global memory)
1576-
uint8_t * outputs = (uint8_t *)x;
1626+
// Thread 0: backtrack (inherently sequential, reads byte-packed bt from
1627+
// shared or global memory). Writes the winning-path output bytes into
1628+
// __shared__ s_outputs[] (type-clean; see declaration comment above).
1629+
uint8_t * outputs = s_outputs;
15771630
if (sid == 0) {
15781631
int state = shared_initial_state;
15791632
for (int t = 127; t >= 0; t--) {
15801633
outputs[t] = (uint8_t)(state >> 6); // output = top 2 bits (k=2)
1581-
int p = bt[t * 256 + state];
1634+
int p = bt[t * 64 + (state & 0x3F)];
15821635
state = ((state & 0x3F) << 2) | p; // reconstruct predecessor
15831636
}
15841637
shared_initial_state = state;
@@ -1629,19 +1682,33 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
16291682
float corrected_norm = (recon_norm > 1e-10f) ? saved_norm / recon_norm : saved_norm;
16301683
corrected_norm *= iq_is_k ? d_tcq_norm_alpha : d_tcq_norm_alpha_v;
16311684

1632-
// Thread 0: pack bitstream (serial — avoids byte-alignment hazards with OR-into-byte)
1633-
if (sid == 0) {
1634-
for (int j = 0; j < 33; j++) dst_blk->qs[j] = 0;
1635-
// Write initial state prefix (upper 6 bits = initial_state >> 2)
1636-
dst_blk->qs[0] = (uint8_t)((shared_initial_state >> 2) & 0x3F);
1637-
for (int t = 0; t < 128; t++) {
1638-
const int bit_pos = 6 + t * 2;
1639-
const int byte_idx = bit_pos / 8;
1640-
const int bit_off = bit_pos % 8;
1641-
const int out = outputs[t] & 0x3;
1642-
dst_blk->qs[byte_idx] |= (uint8_t)(out << bit_off);
1643-
// 2 bits at even bit_off ∈ {0,2,4,6}: 6+2=8 fits one byte, never crosses
1685+
// Parallel bitpack: qs stores 6 initial-state bits followed by 128 two-bit
1686+
// output symbols. Each byte is independent (the 2-bit symbols never cross
1687+
// byte boundaries after the 6-bit prefix), so 33 threads can each pack one
1688+
// byte without atomics. Cross-thread reads of outputs[sym_idx] are safe
1689+
// because outputs aliases the type-clean __shared__ uint8_t s_outputs[]
1690+
// (see declaration comment).
1691+
if (sid < 33) {
1692+
const int init_bits = (shared_initial_state >> 2) & 0x3F;
1693+
uint8_t packed = 0;
1694+
#pragma unroll
1695+
for (int bit = 0; bit < 8; bit++) {
1696+
const int pos = sid * 8 + bit;
1697+
int v = 0;
1698+
if (pos < 6) {
1699+
v = (init_bits >> pos) & 1;
1700+
} else {
1701+
const int sym_bit_pos = pos - 6;
1702+
const int sym_idx = sym_bit_pos / 2;
1703+
if (sym_idx < 128) {
1704+
v = (outputs[sym_idx] >> (sym_bit_pos % 2)) & 1;
1705+
}
1706+
}
1707+
packed |= (uint8_t)(v << bit);
16441708
}
1709+
dst_blk->qs[sid] = packed;
1710+
}
1711+
if (sid == 0) {
16451712
dst_blk->norm = __float2half(corrected_norm);
16461713
}
16471714
}
@@ -1761,15 +1828,42 @@ static void set_rows_cuda_turboq2_tcq(
17611828

17621829
const int64_t ne_total_groups = (ne00 * ne01 * ne02 * ne03) / QK_TURBOQ2_TCQ;
17631830
if (ne_total_groups > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0) {
1764-
ensure_tcq_bt_buf(ne_total_groups * 128 * 256);
1831+
// One-shot probe: on CUDA, opt in to shared-memory backtrace if the device
1832+
// exposes enough opt-in shared memory per block (and the env knob allows).
1833+
// HIP/MUSA paths skip the probe and always use the global bt_buf branch.
1834+
static int tcq2_use_shared_bt = 0;
1835+
static bool tcq2_bt_checked = false;
1836+
constexpr int tcq2_bt_shared_bytes = 128 * 64;
1837+
if (!tcq2_bt_checked) {
1838+
tcq2_bt_checked = true;
1839+
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1840+
const char * tcq_shared_bt_env = getenv("TURBO_TCQ_SHARED_BT");
1841+
if (!tcq_shared_bt_env || atoi(tcq_shared_bt_env) != 0) {
1842+
int max_shared_optin = 0;
1843+
CUDA_CHECK(cudaDeviceGetAttribute(&max_shared_optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, ctx.device));
1844+
if (max_shared_optin >= tcq2_bt_shared_bytes) {
1845+
CUDA_SET_SHARED_MEMORY_LIMIT(k_set_rows_turboq2_tcq<idx_t>, tcq2_bt_shared_bytes);
1846+
tcq2_use_shared_bt = 1;
1847+
fprintf(stderr, "TCQ2 encode: using shared-memory backtrace (%d bytes/block)\n", tcq2_bt_shared_bytes);
1848+
} else {
1849+
fprintf(stderr, "TCQ2 encode: shared-memory backtrace unavailable, only %d bytes/block are available\n", max_shared_optin);
1850+
}
1851+
}
1852+
#endif
1853+
}
1854+
if (!tcq2_use_shared_bt) {
1855+
ensure_tcq_bt_buf(ne_total_groups * 128 * 64);
1856+
}
17651857
const uint3 ne00_fd = init_fastdiv_values((uint32_t) ne00);
17661858
const uint3 ne01_fd = init_fastdiv_values((uint32_t) ne01);
17671859
const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02);
17681860
const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11);
17691861
const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12);
1770-
k_set_rows_turboq2_tcq<idx_t><<<(int)ne_total_groups, 256, 0, stream>>>(
1862+
const int shared_bytes = tcq2_use_shared_bt ? tcq2_bt_shared_bytes : 0;
1863+
k_set_rows_turboq2_tcq<idx_t><<<(int)ne_total_groups, 256, shared_bytes, stream>>>(
17711864
src0_d, src1_d, (block_turboq2_tcq *)dst->data,
1772-
ne_total_groups, tcq_bt_buf, ne00, ne01, ne02, ne10, ne11, ne12, ne13,
1865+
ne_total_groups, tcq_bt_buf, tcq2_use_shared_bt,
1866+
ne00, ne01, ne02, ne10, ne11, ne12, ne13,
17731867
s01_f, s02_f, s03_f, s10_i, s11_i, s12_i,
17741868
iq_is_k,
17751869
nb1, nb2, nb3,

0 commit comments

Comments
 (0)