Skip to content

Commit 1cad502

Browse files
jimbothigpenclaude
andcommitted
phase 3a ggml-org#23: port turbo2_tcq encoder optimizations + preemptive s_outputs[] fix
Source: buun 2e239fb "perf: port turbo3_tcq optimizations to turbo2_tcq encoder" Ports six optimizations to k_set_rows_turboq2_tcq: - 128x64 backtrace (was 128x256); predecessor depends only on sid's low 6 bits - 64-group predecessor-minima precomputation (replaces 256 4-way scans/step) - shared-memory backtrace opt-in (CUDA only; HIP/MUSA stay on global bt) - FWHT warp shuffles (5 stages warp + 2 stages shared) - parallel 33-thread bitpack (one byte per thread, no atomics) - warp-shuffle final-state min reduction (replaces serial 8-warp scan) Additionally folds in the strict-aliasing fix from Phase 3a ggml-org#20 (commit 70b3dd5) to k_set_rows_turboq2_tcq's outputs[] handling. Reason: buun's port introduces a parallel-bitpack section that reads outputs[] cross-thread, which under HIP/ROCm would have triggered the same hoist-past-__syncthreads symbol corruption that produced the +12.7% PPL regression session-64 measured on turboq3. Declare __shared__ uint8_t s_outputs[128] and point the local outputs pointer at it instead of the (uint8_t *)x alias over __shared__ float x[128]. Per the 70b3dd5 commit-message forecast: "If turboq2 ever gains a parallel bitpack, it will need the same fix." Expected PPL chunks 1-4: bit-identical to the pre-port TURBOQ2_TCQ baseline measured in session-phase-3a-23-turbo2-encoder-opt §3.3 (turboq2-baseline.log). Mirrors ygg's ggml-org#21 dead-store removal (no pred_min_p[64] declared — the same value is already written to bt[t*64+sid] for backtrack). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 79f2a92 commit 1cad502

1 file changed

Lines changed: 146 additions & 52 deletions

File tree

ggml/src/ggml-cuda/set-rows.cu

Lines changed: 146 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,7 +1089,9 @@ static void set_rows_cuda_turboq4(
10891089
// =====================================================================================
10901090

10911091
// Global backtrace buffer for TCQ Viterbi (replaces 32KB shared/block + 16KB for 2-bit).
1092-
// Sized to ne_total_groups * 128 * BS (BS=512 for 3-bit, 256 for 2-bit). Grown on demand.
1092+
// Sized to ne_total_groups * 128 * 64 (compressed: 64 low-state groups per step,
1093+
// same layout for both turboq3 and turboq2). Grown on demand. Devices that opt in
1094+
// to per-block shared-memory backtrace bypass this buffer.
10931095
static uint8_t * tcq_bt_buf = nullptr;
10941096
static int64_t tcq_bt_buf_bytes = 0;
10951097

@@ -1417,6 +1419,7 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
14171419
const float * __restrict__ src0, const idx_t * __restrict__ src1,
14181420
block_turboq2_tcq * __restrict__ dst, const int64_t ne_total_groups,
14191421
uint8_t * __restrict__ bt_buf,
1422+
const int use_shared_bt,
14201423
const int64_t ne00, const int64_t ne01, const int64_t ne02,
14211424
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
14221425
const int64_t s01, const int64_t s02, const int64_t s03,
@@ -1443,12 +1446,32 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
14431446
block_turboq2_tcq * dst_blk = (block_turboq2_tcq *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3)
14441447
+ (i00 / QK_TURBOQ2_TCQ);
14451448

1449+
// Shared memory layout:
1450+
// x[128] : rotated+normalized input (also reused as scratch during reductions)
1451+
// cost[256] : path costs buffer A (also reused as norm-reduction scratch)
1452+
// cost_b[256]: path costs buffer B (double-buffering eliminates 2/3 of syncs)
1453+
// Backtrace: one predecessor byte for each of the 64 low-state groups per
1454+
// step (compressed from the old 128x256 layout — the predecessor depends
1455+
// only on sid's low 6 bits, never on the output bits in sid[7:6]). The
1456+
// backtrace lives in dynamic shared memory when the device opts in, else
1457+
// in bt_buf in global memory (still 128x64 bytes per block, byte-packed).
1458+
extern __shared__ uint8_t bt_shared[];
14461459
__shared__ float x[128];
14471460
__shared__ float cost[256];
1448-
__shared__ float cost_b[256]; // double-buffering for Viterbi (was bt[128][128], 16KB shared)
1461+
__shared__ float cost_b[256];
14491462
__shared__ int warp_min_idx[8];
14501463
__shared__ float warp_min_cost[8];
1464+
__shared__ float pred_min_cost[64];
14511465
__shared__ int shared_initial_state;
1466+
// Dedicated shared buffer for the Viterbi-backtrack output bytes. Previously
1467+
// aliased onto x[] via (uint8_t *)x, but writing uint8_t into a float-typed
1468+
// shared array is a strict-aliasing violation: under HIP/ROCm the compiler
1469+
// can hoist cross-thread reads of outputs[] above the __syncthreads() that
1470+
// follows the sid==0 backtrack write. The parallel bitpack introduced by
1471+
// this port reads outputs[sym_idx] from sids 0..32, which would hit the
1472+
// same hazard turboq3 hit in Phase 3a #20 (+12.7% PPL regression). Apply
1473+
// the same fix preemptively, as forecast in commit 70b3dd57c.
1474+
__shared__ uint8_t s_outputs[128];
14521475

14531476
// Parallel pre-Viterbi: load (threads 0-127)
14541477
if (sid < 128) x[sid] = grp_src[sid];
@@ -1488,17 +1511,31 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
14881511
if (sid < 128) x[sid] *= inv_norm;
14891512
__syncthreads();
14901513

1491-
// Parallel FWHT: signs1 → 7-stage butterfly → scale + signs2
1492-
if (sid < 128) x[sid] *= TURBO_WHT_SIGNS1[sid];
1493-
__syncthreads();
1494-
for (int h = 1; h < 128; h *= 2) {
1495-
if (sid < 64) {
1496-
int j = (sid / h) * (2 * h) + (sid % h);
1497-
float a = x[j], b = x[j + h];
1498-
x[j] = a + b; x[j + h] = a - b;
1514+
// Parallel FWHT: signs1 → 7-stage butterfly → scale + signs2.
1515+
// The first five stages run inside each warp via __shfl_xor_sync; the
1516+
// last two stages span warps so they fall back to shared memory.
1517+
if (sid < 128) {
1518+
float v = x[sid] * TURBO_WHT_SIGNS1[sid];
1519+
const int lane = sid & 31;
1520+
#pragma unroll
1521+
for (int h = 1; h < 32; h <<= 1) {
1522+
const float other = __shfl_xor_sync(0xFFFFFFFF, v, h, WARP_SIZE);
1523+
v = (lane & h) ? (other - v) : (v + other);
14991524
}
1500-
__syncthreads();
1525+
x[sid] = v;
15011526
}
1527+
__syncthreads();
1528+
if (sid < 64) {
1529+
const int j = ((sid >> 5) << 6) + (sid & 31);
1530+
float a = x[j], b = x[j + 32];
1531+
x[j] = a + b; x[j + 32] = a - b;
1532+
}
1533+
__syncthreads();
1534+
if (sid < 64) {
1535+
float a = x[sid], b = x[sid + 64];
1536+
x[sid] = a + b; x[sid + 64] = a - b;
1537+
}
1538+
__syncthreads();
15021539
constexpr float inv_sqrt_128 = 0.08838834764831845f;
15031540
if (sid < 128) x[sid] *= inv_sqrt_128 * TURBO_WHT_SIGNS2[sid];
15041541
__syncthreads();
@@ -1513,8 +1550,8 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
15131550
float saved_norm = cost[0];
15141551

15151552
// Initialize Viterbi: free initial state (all 256 states equally viable)
1516-
// Double-buffered cost (1 sync/step, was 3); byte-packed bt in global memory.
1517-
uint8_t * bt = bt_buf + (int64_t)blockIdx.x * (128 * 256);
1553+
// Double-buffered cost (1 sync/step, was 3); byte-packed bt in shared or global memory.
1554+
uint8_t * bt = use_shared_bt ? bt_shared : bt_buf + (int64_t)blockIdx.x * (128 * 64);
15181555
cost[sid] = 0.0f;
15191556
__syncthreads();
15201557

@@ -1525,24 +1562,31 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
15251562

15261563
float xt = x[t];
15271564

1528-
// Right-shift trellis (k=2, L=8): ns = (prev >> 2) | (out << 6)
1529-
// Predecessors of sid: prev = ((sid & 0x3F) << 2) | p, for p = 0..3
1530-
int base_prev = (sid & 0x3F) << 2;
1531-
float dist = xt - d_turboq2_tcq_codebook[sid];
1532-
dist = dist * dist;
1533-
1534-
float best = 1e30f;
1535-
int best_p = 0;
1536-
for (int p = 0; p < 4; p++) {
1537-
float c = cost_rd[base_prev | p];
1538-
if (c < best) {
1539-
best = c;
1540-
best_p = p;
1565+
// Right-shift trellis (k=2, L=8): ns = (prev >> 2) | (out << 6). The
1566+
// best predecessor depends only on sid's low 6 bits, so compute those
1567+
// 64 minima once instead of repeating the same 4-way scan per output.
1568+
if (sid < 64) {
1569+
const int base_prev = sid << 2;
1570+
float best = cost_rd[base_prev];
1571+
int best_p = 0;
1572+
#pragma unroll
1573+
for (int p = 1; p < 4; p++) {
1574+
float c = cost_rd[base_prev | p];
1575+
if (c < best) {
1576+
best = c;
1577+
best_p = p;
1578+
}
15411579
}
1580+
pred_min_cost[sid] = best;
1581+
bt[t * 64 + sid] = (uint8_t) best_p;
15421582
}
1583+
__syncthreads();
15431584

1544-
cost_wr[sid] = best + dist;
1545-
bt[t * 256 + sid] = (uint8_t)best_p;
1585+
const int pred_idx = sid & 0x3F;
1586+
float dist = xt - d_turboq2_tcq_codebook[sid];
1587+
dist = dist * dist;
1588+
1589+
cost_wr[sid] = pred_min_cost[pred_idx] + dist;
15461590
__syncthreads();
15471591
}
15481592
// After 128 steps (even count): final costs are in cost[] (step 127 is odd → cost_wr=cost)
@@ -1563,27 +1607,36 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
15631607
}
15641608
}
15651609
__syncthreads();
1566-
if (sid == 0) {
1567-
float best = warp_min_cost[0];
1568-
int best_idx = warp_min_idx[0];
1569-
for (int w = 1; w < 8; w++) {
1570-
if (warp_min_cost[w] < best) { best = warp_min_cost[w]; best_idx = warp_min_idx[w]; }
1610+
// Reduce 8 warp minima via a single-warp shuffle (32 lanes) instead of a
1611+
// serial single-thread loop. Upper 24 lanes seed FLT_MAX so they never win.
1612+
if (sid < 32) {
1613+
float best = (sid < 8) ? warp_min_cost[sid] : 3.4028234663852886e38f;
1614+
int best_idx = (sid < 8) ? warp_min_idx[sid] : 0;
1615+
#pragma unroll
1616+
for (int offset = 16; offset > 0; offset >>= 1) {
1617+
float other_cost = __shfl_down_sync(0xFFFFFFFF, best, offset, WARP_SIZE);
1618+
int other_idx = __shfl_down_sync(0xFFFFFFFF, best_idx, offset, WARP_SIZE);
1619+
if (other_cost < best) { best = other_cost; best_idx = other_idx; }
1620+
}
1621+
if (sid == 0) {
1622+
shared_initial_state = best_idx; // temporarily: best final state (becomes initial after backtrack)
15711623
}
1572-
shared_initial_state = best_idx; // temporarily: best final state (becomes initial after backtrack)
15731624
}
15741625
__syncthreads();
15751626

15761627
// TCQ error dump (port of buun 764c686b0): save post-FWHT x[] before backtrack overwrites it.
15771628
if (d_tcq_dump_max > 0 && grp < d_tcq_dump_max && sid < 128)
15781629
d_tcq_dump_x_buf[grp * 128 + sid] = x[sid];
15791630

1580-
// Thread 0: backtrack (inherently sequential, reads byte-packed bt from global memory)
1581-
uint8_t * outputs = (uint8_t *)x;
1631+
// Thread 0: backtrack (inherently sequential, reads byte-packed bt from
1632+
// shared or global memory). Writes the winning-path output bytes into
1633+
// __shared__ s_outputs[] (type-clean; see declaration comment above).
1634+
uint8_t * outputs = s_outputs;
15821635
if (sid == 0) {
15831636
int state = shared_initial_state;
15841637
for (int t = 127; t >= 0; t--) {
15851638
outputs[t] = (uint8_t)(state >> 6); // output = top 2 bits (k=2)
1586-
int p = bt[t * 256 + state];
1639+
int p = bt[t * 64 + (state & 0x3F)];
15871640
state = ((state & 0x3F) << 2) | p; // reconstruct predecessor
15881641
}
15891642
shared_initial_state = state;
@@ -1634,19 +1687,33 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
16341687
float corrected_norm = (recon_norm > 1e-10f) ? saved_norm / recon_norm : saved_norm;
16351688
corrected_norm *= iq_is_k ? d_tcq_norm_alpha : d_tcq_norm_alpha_v;
16361689

1637-
// Thread 0: pack bitstream (serial — avoids byte-alignment hazards with OR-into-byte)
1638-
if (sid == 0) {
1639-
for (int j = 0; j < 33; j++) dst_blk->qs[j] = 0;
1640-
// Write initial state prefix (upper 6 bits = initial_state >> 2)
1641-
dst_blk->qs[0] = (uint8_t)((shared_initial_state >> 2) & 0x3F);
1642-
for (int t = 0; t < 128; t++) {
1643-
const int bit_pos = 6 + t * 2;
1644-
const int byte_idx = bit_pos / 8;
1645-
const int bit_off = bit_pos % 8;
1646-
const int out = outputs[t] & 0x3;
1647-
dst_blk->qs[byte_idx] |= (uint8_t)(out << bit_off);
1648-
// 2 bits at even bit_off ∈ {0,2,4,6}: 6+2=8 fits one byte, never crosses
1690+
// Parallel bitpack: qs stores 6 initial-state bits followed by 128 two-bit
1691+
// output symbols. Each byte is independent (the 2-bit symbols never cross
1692+
// byte boundaries after the 6-bit prefix), so 33 threads can each pack one
1693+
// byte without atomics. Cross-thread reads of outputs[sym_idx] are safe
1694+
// because outputs aliases the type-clean __shared__ uint8_t s_outputs[]
1695+
// (see declaration comment).
1696+
if (sid < 33) {
1697+
const int init_bits = (shared_initial_state >> 2) & 0x3F;
1698+
uint8_t packed = 0;
1699+
#pragma unroll
1700+
for (int bit = 0; bit < 8; bit++) {
1701+
const int pos = sid * 8 + bit;
1702+
int v = 0;
1703+
if (pos < 6) {
1704+
v = (init_bits >> pos) & 1;
1705+
} else {
1706+
const int sym_bit_pos = pos - 6;
1707+
const int sym_idx = sym_bit_pos / 2;
1708+
if (sym_idx < 128) {
1709+
v = (outputs[sym_idx] >> (sym_bit_pos % 2)) & 1;
1710+
}
1711+
}
1712+
packed |= (uint8_t)(v << bit);
16491713
}
1714+
dst_blk->qs[sid] = packed;
1715+
}
1716+
if (sid == 0) {
16501717
dst_blk->norm = __float2half(corrected_norm);
16511718
}
16521719
}
@@ -1766,15 +1833,42 @@ static void set_rows_cuda_turboq2_tcq(
17661833

17671834
const int64_t ne_total_groups = (ne00 * ne01 * ne02 * ne03) / QK_TURBOQ2_TCQ;
17681835
if (ne_total_groups > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0) {
1769-
ensure_tcq_bt_buf(ne_total_groups * 128 * 256);
1836+
// One-shot probe: on CUDA, opt in to shared-memory backtrace if the device
1837+
// exposes enough opt-in shared memory per block (and the env knob allows).
1838+
// HIP/MUSA paths skip the probe and always use the global bt_buf branch.
1839+
static int tcq2_use_shared_bt = 0;
1840+
static bool tcq2_bt_checked = false;
1841+
constexpr int tcq2_bt_shared_bytes = 128 * 64;
1842+
if (!tcq2_bt_checked) {
1843+
tcq2_bt_checked = true;
1844+
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1845+
const char * tcq_shared_bt_env = getenv("TURBO_TCQ_SHARED_BT");
1846+
if (!tcq_shared_bt_env || atoi(tcq_shared_bt_env) != 0) {
1847+
int max_shared_optin = 0;
1848+
CUDA_CHECK(cudaDeviceGetAttribute(&max_shared_optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, ctx.device));
1849+
if (max_shared_optin >= tcq2_bt_shared_bytes) {
1850+
CUDA_SET_SHARED_MEMORY_LIMIT(k_set_rows_turboq2_tcq<idx_t>, tcq2_bt_shared_bytes);
1851+
tcq2_use_shared_bt = 1;
1852+
fprintf(stderr, "TCQ2 encode: using shared-memory backtrace (%d bytes/block)\n", tcq2_bt_shared_bytes);
1853+
} else {
1854+
fprintf(stderr, "TCQ2 encode: shared-memory backtrace unavailable, only %d bytes/block are available\n", max_shared_optin);
1855+
}
1856+
}
1857+
#endif
1858+
}
1859+
if (!tcq2_use_shared_bt) {
1860+
ensure_tcq_bt_buf(ne_total_groups * 128 * 64);
1861+
}
17701862
const uint3 ne00_fd = init_fastdiv_values((uint32_t) ne00);
17711863
const uint3 ne01_fd = init_fastdiv_values((uint32_t) ne01);
17721864
const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02);
17731865
const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11);
17741866
const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12);
1775-
k_set_rows_turboq2_tcq<idx_t><<<(int)ne_total_groups, 256, 0, stream>>>(
1867+
const int shared_bytes = tcq2_use_shared_bt ? tcq2_bt_shared_bytes : 0;
1868+
k_set_rows_turboq2_tcq<idx_t><<<(int)ne_total_groups, 256, shared_bytes, stream>>>(
17761869
src0_d, src1_d, (block_turboq2_tcq *)dst->data,
1777-
ne_total_groups, tcq_bt_buf, ne00, ne01, ne02, ne10, ne11, ne12, ne13,
1870+
ne_total_groups, tcq_bt_buf, tcq2_use_shared_bt,
1871+
ne00, ne01, ne02, ne10, ne11, ne12, ne13,
17781872
s01_f, s02_f, s03_f, s10_i, s11_i, s12_i,
17791873
iq_is_k,
17801874
nb1, nb2, nb3,

0 commit comments

Comments
 (0)