@@ -1089,7 +1089,9 @@ static void set_rows_cuda_turboq4(
10891089// =====================================================================================
10901090
10911091// Global backtrace buffer for TCQ Viterbi (replaces 32KB shared/block + 16KB for 2-bit).
1092- // Sized to ne_total_groups * 128 * BS (BS=512 for 3-bit, 256 for 2-bit). Grown on demand.
1092+ // Sized to ne_total_groups * 128 * 64 (compressed: 64 low-state groups per step,
1093+ // same layout for both turboq3 and turboq2). Grown on demand. Devices that opt in
1094+ // to per-block shared-memory backtrace bypass this buffer.
10931095static uint8_t * tcq_bt_buf = nullptr ;
10941096static int64_t tcq_bt_buf_bytes = 0 ;
10951097
@@ -1417,6 +1419,7 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
14171419 const float * __restrict__ src0, const idx_t * __restrict__ src1,
14181420 block_turboq2_tcq * __restrict__ dst, const int64_t ne_total_groups,
14191421 uint8_t * __restrict__ bt_buf,
1422+ const int use_shared_bt,
14201423 const int64_t ne00, const int64_t ne01, const int64_t ne02,
14211424 const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
14221425 const int64_t s01, const int64_t s02, const int64_t s03,
@@ -1443,12 +1446,32 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
14431446 block_turboq2_tcq * dst_blk = (block_turboq2_tcq *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3)
14441447 + (i00 / QK_TURBOQ2_TCQ );
14451448
1449+ // Shared memory layout:
1450+ // x[128] : rotated+normalized input (also reused as scratch during reductions)
1451+ // cost[256] : path costs buffer A (also reused as norm-reduction scratch)
1452+ // cost_b[256]: path costs buffer B (double-buffering eliminates 2/3 of syncs)
1453+ // Backtrace: one predecessor byte for each of the 64 low-state groups per
1454+ // step (compressed from the old 128x256 layout — the predecessor depends
1455+ // only on sid's low 6 bits, never on the output bits in sid[7:6]). The
1456+ // backtrace lives in dynamic shared memory when the device opts in, else
1457+ // in bt_buf in global memory (still 128x64 bytes per block, byte-packed).
1458+ extern __shared__ uint8_t bt_shared[];
14461459 __shared__ float x[128 ];
14471460 __shared__ float cost[256 ];
1448- __shared__ float cost_b[256 ]; // double-buffering for Viterbi (was bt[128][128], 16KB shared)
1461+ __shared__ float cost_b[256 ];
14491462 __shared__ int warp_min_idx[8 ];
14501463 __shared__ float warp_min_cost[8 ];
1464+ __shared__ float pred_min_cost[64 ];
14511465 __shared__ int shared_initial_state;
1466+ // Dedicated shared buffer for the Viterbi-backtrack output bytes. Previously
1467+ // aliased onto x[] via (uint8_t *)x, but writing uint8_t into a float-typed
1468+ // shared array is a strict-aliasing violation: under HIP/ROCm the compiler
1469+ // can hoist cross-thread reads of outputs[] above the __syncthreads() that
1470+ // follows the sid==0 backtrack write. The parallel bitpack introduced by
1471+ // this port reads outputs[sym_idx] from sids 0..32, which would hit the
1472+ // same hazard turboq3 hit in Phase 3a #20 (+12.7% PPL regression). Apply
1473+ // the same fix preemptively, as forecast in commit 70b3dd57c.
1474+ __shared__ uint8_t s_outputs[128 ];
14521475
14531476 // Parallel pre-Viterbi: load (threads 0-127)
14541477 if (sid < 128 ) x[sid] = grp_src[sid];
@@ -1488,17 +1511,31 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
14881511 if (sid < 128 ) x[sid] *= inv_norm;
14891512 __syncthreads ();
14901513
1491- // Parallel FWHT: signs1 → 7-stage butterfly → scale + signs2
1492- if (sid < 128 ) x[sid] *= TURBO_WHT_SIGNS1 [sid];
1493- __syncthreads ();
1494- for (int h = 1 ; h < 128 ; h *= 2 ) {
1495- if (sid < 64 ) {
1496- int j = (sid / h) * (2 * h) + (sid % h);
1497- float a = x[j], b = x[j + h];
1498- x[j] = a + b; x[j + h] = a - b;
1514+ // Parallel FWHT: signs1 → 7-stage butterfly → scale + signs2.
1515+ // The first five stages run inside each warp via __shfl_xor_sync; the
1516+ // last two stages span warps so they fall back to shared memory.
1517+ if (sid < 128 ) {
1518+ float v = x[sid] * TURBO_WHT_SIGNS1 [sid];
1519+ const int lane = sid & 31 ;
1520+ #pragma unroll
1521+ for (int h = 1 ; h < 32 ; h <<= 1 ) {
1522+ const float other = __shfl_xor_sync (0xFFFFFFFF , v, h, WARP_SIZE );
1523+ v = (lane & h) ? (other - v) : (v + other);
14991524 }
1500- __syncthreads () ;
1525+ x[sid] = v ;
15011526 }
1527+ __syncthreads ();
1528+ if (sid < 64 ) {
1529+ const int j = ((sid >> 5 ) << 6 ) + (sid & 31 );
1530+ float a = x[j], b = x[j + 32 ];
1531+ x[j] = a + b; x[j + 32 ] = a - b;
1532+ }
1533+ __syncthreads ();
1534+ if (sid < 64 ) {
1535+ float a = x[sid], b = x[sid + 64 ];
1536+ x[sid] = a + b; x[sid + 64 ] = a - b;
1537+ }
1538+ __syncthreads ();
15021539 constexpr float inv_sqrt_128 = 0 .08838834764831845f ;
15031540 if (sid < 128 ) x[sid] *= inv_sqrt_128 * TURBO_WHT_SIGNS2 [sid];
15041541 __syncthreads ();
@@ -1513,8 +1550,8 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
15131550 float saved_norm = cost[0 ];
15141551
15151552 // Initialize Viterbi: free initial state (all 256 states equally viable)
1516- // Double-buffered cost (1 sync/step, was 3); byte-packed bt in global memory.
1517- uint8_t * bt = bt_buf + (int64_t )blockIdx .x * (128 * 256 );
1553+ // Double-buffered cost (1 sync/step, was 3); byte-packed bt in shared or global memory.
1554+ uint8_t * bt = use_shared_bt ? bt_shared : bt_buf + (int64_t )blockIdx .x * (128 * 64 );
15181555 cost[sid] = 0 .0f ;
15191556 __syncthreads ();
15201557
@@ -1525,24 +1562,31 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
15251562
15261563 float xt = x[t];
15271564
1528- // Right-shift trellis (k=2, L=8): ns = (prev >> 2) | (out << 6)
1529- // Predecessors of sid: prev = ((sid & 0x3F) << 2) | p, for p = 0..3
1530- int base_prev = (sid & 0x3F ) << 2 ;
1531- float dist = xt - d_turboq2_tcq_codebook[sid];
1532- dist = dist * dist;
1533-
1534- float best = 1e30f;
1535- int best_p = 0 ;
1536- for (int p = 0 ; p < 4 ; p++) {
1537- float c = cost_rd[base_prev | p];
1538- if (c < best) {
1539- best = c;
1540- best_p = p;
1565+ // Right-shift trellis (k=2, L=8): ns = (prev >> 2) | (out << 6). The
1566+ // best predecessor depends only on sid's low 6 bits, so compute those
1567+ // 64 minima once instead of repeating the same 4-way scan per output.
1568+ if (sid < 64 ) {
1569+ const int base_prev = sid << 2 ;
1570+ float best = cost_rd[base_prev];
1571+ int best_p = 0 ;
1572+ #pragma unroll
1573+ for (int p = 1 ; p < 4 ; p++) {
1574+ float c = cost_rd[base_prev | p];
1575+ if (c < best) {
1576+ best = c;
1577+ best_p = p;
1578+ }
15411579 }
1580+ pred_min_cost[sid] = best;
1581+ bt[t * 64 + sid] = (uint8_t ) best_p;
15421582 }
1583+ __syncthreads ();
15431584
1544- cost_wr[sid] = best + dist;
1545- bt[t * 256 + sid] = (uint8_t )best_p;
1585+ const int pred_idx = sid & 0x3F ;
1586+ float dist = xt - d_turboq2_tcq_codebook[sid];
1587+ dist = dist * dist;
1588+
1589+ cost_wr[sid] = pred_min_cost[pred_idx] + dist;
15461590 __syncthreads ();
15471591 }
15481592 // After 128 steps (even count): final costs are in cost[] (step 127 is odd → cost_wr=cost)
@@ -1563,27 +1607,36 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
15631607 }
15641608 }
15651609 __syncthreads ();
1566- if (sid == 0 ) {
1567- float best = warp_min_cost[0 ];
1568- int best_idx = warp_min_idx[0 ];
1569- for (int w = 1 ; w < 8 ; w++) {
1570- if (warp_min_cost[w] < best) { best = warp_min_cost[w]; best_idx = warp_min_idx[w]; }
1610+ // Reduce 8 warp minima via a single-warp shuffle (32 lanes) instead of a
1611+ // serial single-thread loop. Upper 24 lanes seed FLT_MAX so they never win.
1612+ if (sid < 32 ) {
1613+ float best = (sid < 8 ) ? warp_min_cost[sid] : 3 .4028234663852886e38f;
1614+ int best_idx = (sid < 8 ) ? warp_min_idx[sid] : 0 ;
1615+ #pragma unroll
1616+ for (int offset = 16 ; offset > 0 ; offset >>= 1 ) {
1617+ float other_cost = __shfl_down_sync (0xFFFFFFFF , best, offset, WARP_SIZE );
1618+ int other_idx = __shfl_down_sync (0xFFFFFFFF , best_idx, offset, WARP_SIZE );
1619+ if (other_cost < best) { best = other_cost; best_idx = other_idx; }
1620+ }
1621+ if (sid == 0 ) {
1622+ shared_initial_state = best_idx; // temporarily: best final state (becomes initial after backtrack)
15711623 }
1572- shared_initial_state = best_idx; // temporarily: best final state (becomes initial after backtrack)
15731624 }
15741625 __syncthreads ();
15751626
15761627 // TCQ error dump (port of buun 764c686b0): save post-FWHT x[] before backtrack overwrites it.
15771628 if (d_tcq_dump_max > 0 && grp < d_tcq_dump_max && sid < 128 )
15781629 d_tcq_dump_x_buf[grp * 128 + sid] = x[sid];
15791630
1580- // Thread 0: backtrack (inherently sequential, reads byte-packed bt from global memory)
1581- uint8_t * outputs = (uint8_t *)x;
1631+ // Thread 0: backtrack (inherently sequential, reads byte-packed bt from
1632+ // shared or global memory). Writes the winning-path output bytes into
1633+ // __shared__ s_outputs[] (type-clean; see declaration comment above).
1634+ uint8_t * outputs = s_outputs;
15821635 if (sid == 0 ) {
15831636 int state = shared_initial_state;
15841637 for (int t = 127 ; t >= 0 ; t--) {
15851638 outputs[t] = (uint8_t )(state >> 6 ); // output = top 2 bits (k=2)
1586- int p = bt[t * 256 + state];
1639+ int p = bt[t * 64 + ( state & 0x3F ) ];
15871640 state = ((state & 0x3F ) << 2 ) | p; // reconstruct predecessor
15881641 }
15891642 shared_initial_state = state;
@@ -1634,19 +1687,33 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
16341687 float corrected_norm = (recon_norm > 1e-10f ) ? saved_norm / recon_norm : saved_norm;
16351688 corrected_norm *= iq_is_k ? d_tcq_norm_alpha : d_tcq_norm_alpha_v;
16361689
1637- // Thread 0: pack bitstream (serial — avoids byte-alignment hazards with OR-into-byte)
1638- if (sid == 0 ) {
1639- for (int j = 0 ; j < 33 ; j++) dst_blk->qs [j] = 0 ;
1640- // Write initial state prefix (upper 6 bits = initial_state >> 2)
1641- dst_blk->qs [0 ] = (uint8_t )((shared_initial_state >> 2 ) & 0x3F );
1642- for (int t = 0 ; t < 128 ; t++) {
1643- const int bit_pos = 6 + t * 2 ;
1644- const int byte_idx = bit_pos / 8 ;
1645- const int bit_off = bit_pos % 8 ;
1646- const int out = outputs[t] & 0x3 ;
1647- dst_blk->qs [byte_idx] |= (uint8_t )(out << bit_off);
1648- // 2 bits at even bit_off ∈ {0,2,4,6}: 6+2=8 fits one byte, never crosses
1690+ // Parallel bitpack: qs stores 6 initial-state bits followed by 128 two-bit
1691+ // output symbols. Each byte is independent (the 2-bit symbols never cross
1692+ // byte boundaries after the 6-bit prefix), so 33 threads can each pack one
1693+ // byte without atomics. Cross-thread reads of outputs[sym_idx] are safe
1694+ // because outputs aliases the type-clean __shared__ uint8_t s_outputs[]
1695+ // (see declaration comment).
1696+ if (sid < 33 ) {
1697+ const int init_bits = (shared_initial_state >> 2 ) & 0x3F ;
1698+ uint8_t packed = 0 ;
1699+ #pragma unroll
1700+ for (int bit = 0 ; bit < 8 ; bit++) {
1701+ const int pos = sid * 8 + bit;
1702+ int v = 0 ;
1703+ if (pos < 6 ) {
1704+ v = (init_bits >> pos) & 1 ;
1705+ } else {
1706+ const int sym_bit_pos = pos - 6 ;
1707+ const int sym_idx = sym_bit_pos / 2 ;
1708+ if (sym_idx < 128 ) {
1709+ v = (outputs[sym_idx] >> (sym_bit_pos % 2 )) & 1 ;
1710+ }
1711+ }
1712+ packed |= (uint8_t )(v << bit);
16491713 }
1714+ dst_blk->qs [sid] = packed;
1715+ }
1716+ if (sid == 0 ) {
16501717 dst_blk->norm = __float2half (corrected_norm);
16511718 }
16521719}
@@ -1766,15 +1833,42 @@ static void set_rows_cuda_turboq2_tcq(
17661833
17671834 const int64_t ne_total_groups = (ne00 * ne01 * ne02 * ne03) / QK_TURBOQ2_TCQ ;
17681835 if (ne_total_groups > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0 ) {
1769- ensure_tcq_bt_buf (ne_total_groups * 128 * 256 );
1836+ // One-shot probe: on CUDA, opt in to shared-memory backtrace if the device
1837+ // exposes enough opt-in shared memory per block (and the env knob allows).
1838+ // HIP/MUSA paths skip the probe and always use the global bt_buf branch.
1839+ static int tcq2_use_shared_bt = 0 ;
1840+ static bool tcq2_bt_checked = false ;
1841+ constexpr int tcq2_bt_shared_bytes = 128 * 64 ;
1842+ if (!tcq2_bt_checked) {
1843+ tcq2_bt_checked = true ;
1844+ #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1845+ const char * tcq_shared_bt_env = getenv (" TURBO_TCQ_SHARED_BT" );
1846+ if (!tcq_shared_bt_env || atoi (tcq_shared_bt_env) != 0 ) {
1847+ int max_shared_optin = 0 ;
1848+ CUDA_CHECK (cudaDeviceGetAttribute (&max_shared_optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, ctx.device ));
1849+ if (max_shared_optin >= tcq2_bt_shared_bytes) {
1850+ CUDA_SET_SHARED_MEMORY_LIMIT (k_set_rows_turboq2_tcq<idx_t >, tcq2_bt_shared_bytes);
1851+ tcq2_use_shared_bt = 1 ;
1852+ fprintf (stderr, " TCQ2 encode: using shared-memory backtrace (%d bytes/block)\n " , tcq2_bt_shared_bytes);
1853+ } else {
1854+ fprintf (stderr, " TCQ2 encode: shared-memory backtrace unavailable, only %d bytes/block are available\n " , max_shared_optin);
1855+ }
1856+ }
1857+ #endif
1858+ }
1859+ if (!tcq2_use_shared_bt) {
1860+ ensure_tcq_bt_buf (ne_total_groups * 128 * 64 );
1861+ }
17701862 const uint3 ne00_fd = init_fastdiv_values ((uint32_t ) ne00);
17711863 const uint3 ne01_fd = init_fastdiv_values ((uint32_t ) ne01);
17721864 const uint3 ne02_fd = init_fastdiv_values ((uint32_t ) ne02);
17731865 const uint3 ne11_fd = init_fastdiv_values ((uint32_t ) ne11);
17741866 const uint3 ne12_fd = init_fastdiv_values ((uint32_t ) ne12);
1775- k_set_rows_turboq2_tcq<idx_t ><<<(int )ne_total_groups, 256 , 0 , stream>>> (
1867+ const int shared_bytes = tcq2_use_shared_bt ? tcq2_bt_shared_bytes : 0 ;
1868+ k_set_rows_turboq2_tcq<idx_t ><<<(int )ne_total_groups, 256 , shared_bytes, stream>>> (
17761869 src0_d, src1_d, (block_turboq2_tcq *)dst->data ,
1777- ne_total_groups, tcq_bt_buf, ne00, ne01, ne02, ne10, ne11, ne12, ne13,
1870+ ne_total_groups, tcq_bt_buf, tcq2_use_shared_bt,
1871+ ne00, ne01, ne02, ne10, ne11, ne12, ne13,
17781872 s01_f, s02_f, s03_f, s10_i, s11_i, s12_i,
17791873 iq_is_k,
17801874 nb1, nb2, nb3,
0 commit comments