@@ -1084,7 +1084,9 @@ static void set_rows_cuda_turboq4(
10841084// =====================================================================================
10851085
10861086// Global backtrace buffer for TCQ Viterbi (replaces 32KB shared/block + 16KB for 2-bit).
1087- // Sized to ne_total_groups * 128 * BS (BS=512 for 3-bit, 256 for 2-bit). Grown on demand.
1087+ // Sized to ne_total_groups * 128 * 64 (compressed: 64 low-state groups per step,
1088+ // same layout for both turboq3 and turboq2). Grown on demand. Devices that opt in
1089+ // to per-block shared-memory backtrace bypass this buffer.
10881090static uint8_t * tcq_bt_buf = nullptr ;
10891091static int64_t tcq_bt_buf_bytes = 0 ;
10901092
@@ -1412,6 +1414,7 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
14121414 const float * __restrict__ src0, const idx_t * __restrict__ src1,
14131415 block_turboq2_tcq * __restrict__ dst, const int64_t ne_total_groups,
14141416 uint8_t * __restrict__ bt_buf,
1417+ const int use_shared_bt,
14151418 const int64_t ne00, const int64_t ne01, const int64_t ne02,
14161419 const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
14171420 const int64_t s01, const int64_t s02, const int64_t s03,
@@ -1438,12 +1441,32 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
14381441 block_turboq2_tcq * dst_blk = (block_turboq2_tcq *)((char *)dst + dst_row*s1 + i02*s2 + i03*s3)
14391442 + (i00 / QK_TURBOQ2_TCQ );
14401443
1444+ // Shared memory layout:
1445+ // x[128] : rotated+normalized input (also reused as scratch during reductions)
1446+ // cost[256] : path costs buffer A (also reused as norm-reduction scratch)
1447+ // cost_b[256]: path costs buffer B (double-buffering eliminates 2/3 of syncs)
1448+ // Backtrace: one predecessor byte for each of the 64 low-state groups per
1449+ // step (compressed from the old 128x256 layout — the predecessor depends
1450+ // only on sid's low 6 bits, never on the output bits in sid[7:6]). The
1451+ // backtrace lives in dynamic shared memory when the device opts in, else
1452+ // in bt_buf in global memory (still 128x64 bytes per block, byte-packed).
1453+ extern __shared__ uint8_t bt_shared[];
14411454 __shared__ float x[128 ];
14421455 __shared__ float cost[256 ];
1443- __shared__ float cost_b[256 ]; // double-buffering for Viterbi (was bt[128][128], 16KB shared)
1456+ __shared__ float cost_b[256 ];
14441457 __shared__ int warp_min_idx[8 ];
14451458 __shared__ float warp_min_cost[8 ];
1459+ __shared__ float pred_min_cost[64 ];
14461460 __shared__ int shared_initial_state;
1461+ // Dedicated shared buffer for the Viterbi-backtrack output bytes. Previously
1462+ // aliased onto x[] via (uint8_t *)x, but writing uint8_t into a float-typed
1463+ // shared array is a strict-aliasing violation: under HIP/ROCm the compiler
1464+ // can hoist cross-thread reads of outputs[] above the __syncthreads() that
1465+ // follows the sid==0 backtrack write. The parallel bitpack introduced by
1466+ // this port reads outputs[sym_idx] from sids 0..32, which would hit the
1467+ // same hazard turboq3 hit in Phase 3a #20 (+12.7% PPL regression). Apply
1468+ // the same fix preemptively, as forecast in commit 70b3dd57c.
1469+ __shared__ uint8_t s_outputs[128 ];
14471470
14481471 // Parallel pre-Viterbi: load (threads 0-127)
14491472 if (sid < 128 ) x[sid] = grp_src[sid];
@@ -1483,17 +1506,31 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
14831506 if (sid < 128 ) x[sid] *= inv_norm;
14841507 __syncthreads ();
14851508
1486- // Parallel FWHT: signs1 → 7-stage butterfly → scale + signs2
1487- if (sid < 128 ) x[sid] *= TURBO_WHT_SIGNS1 [sid];
1488- __syncthreads ();
1489- for (int h = 1 ; h < 128 ; h *= 2 ) {
1490- if (sid < 64 ) {
1491- int j = (sid / h) * (2 * h) + (sid % h);
1492- float a = x[j], b = x[j + h];
1493- x[j] = a + b; x[j + h] = a - b;
1509+ // Parallel FWHT: signs1 → 7-stage butterfly → scale + signs2.
1510+ // The first five stages run inside each warp via __shfl_xor_sync; the
1511+ // last two stages span warps so they fall back to shared memory.
1512+ if (sid < 128 ) {
1513+ float v = x[sid] * TURBO_WHT_SIGNS1 [sid];
1514+ const int lane = sid & 31 ;
1515+ #pragma unroll
1516+ for (int h = 1 ; h < 32 ; h <<= 1 ) {
1517+ const float other = __shfl_xor_sync (0xFFFFFFFF , v, h, WARP_SIZE );
1518+ v = (lane & h) ? (other - v) : (v + other);
14941519 }
1495- __syncthreads () ;
1520+ x[sid] = v ;
14961521 }
1522+ __syncthreads ();
1523+ if (sid < 64 ) {
1524+ const int j = ((sid >> 5 ) << 6 ) + (sid & 31 );
1525+ float a = x[j], b = x[j + 32 ];
1526+ x[j] = a + b; x[j + 32 ] = a - b;
1527+ }
1528+ __syncthreads ();
1529+ if (sid < 64 ) {
1530+ float a = x[sid], b = x[sid + 64 ];
1531+ x[sid] = a + b; x[sid + 64 ] = a - b;
1532+ }
1533+ __syncthreads ();
14971534 constexpr float inv_sqrt_128 = 0 .08838834764831845f ;
14981535 if (sid < 128 ) x[sid] *= inv_sqrt_128 * TURBO_WHT_SIGNS2 [sid];
14991536 __syncthreads ();
@@ -1508,8 +1545,8 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
15081545 float saved_norm = cost[0 ];
15091546
15101547 // Initialize Viterbi: free initial state (all 256 states equally viable)
1511- // Double-buffered cost (1 sync/step, was 3); byte-packed bt in global memory.
1512- uint8_t * bt = bt_buf + (int64_t )blockIdx .x * (128 * 256 );
1548+ // Double-buffered cost (1 sync/step, was 3); byte-packed bt in shared or global memory.
1549+ uint8_t * bt = use_shared_bt ? bt_shared : bt_buf + (int64_t )blockIdx .x * (128 * 64 );
15131550 cost[sid] = 0 .0f ;
15141551 __syncthreads ();
15151552
@@ -1520,24 +1557,31 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
15201557
15211558 float xt = x[t];
15221559
1523- // Right-shift trellis (k=2, L=8): ns = (prev >> 2) | (out << 6)
1524- // Predecessors of sid: prev = ((sid & 0x3F) << 2) | p, for p = 0..3
1525- int base_prev = (sid & 0x3F ) << 2 ;
1526- float dist = xt - d_turboq2_tcq_codebook[sid];
1527- dist = dist * dist;
1528-
1529- float best = 1e30f;
1530- int best_p = 0 ;
1531- for (int p = 0 ; p < 4 ; p++) {
1532- float c = cost_rd[base_prev | p];
1533- if (c < best) {
1534- best = c;
1535- best_p = p;
1560+ // Right-shift trellis (k=2, L=8): ns = (prev >> 2) | (out << 6). The
1561+ // best predecessor depends only on sid's low 6 bits, so compute those
1562+ // 64 minima once instead of repeating the same 4-way scan per output.
1563+ if (sid < 64 ) {
1564+ const int base_prev = sid << 2 ;
1565+ float best = cost_rd[base_prev];
1566+ int best_p = 0 ;
1567+ #pragma unroll
1568+ for (int p = 1 ; p < 4 ; p++) {
1569+ float c = cost_rd[base_prev | p];
1570+ if (c < best) {
1571+ best = c;
1572+ best_p = p;
1573+ }
15361574 }
1575+ pred_min_cost[sid] = best;
1576+ bt[t * 64 + sid] = (uint8_t ) best_p;
15371577 }
1578+ __syncthreads ();
15381579
1539- cost_wr[sid] = best + dist;
1540- bt[t * 256 + sid] = (uint8_t )best_p;
1580+ const int pred_idx = sid & 0x3F ;
1581+ float dist = xt - d_turboq2_tcq_codebook[sid];
1582+ dist = dist * dist;
1583+
1584+ cost_wr[sid] = pred_min_cost[pred_idx] + dist;
15411585 __syncthreads ();
15421586 }
15431587 // After 128 steps (even count): final costs are in cost[] (step 127 is odd → cost_wr=cost)
@@ -1558,27 +1602,36 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
15581602 }
15591603 }
15601604 __syncthreads ();
1561- if (sid == 0 ) {
1562- float best = warp_min_cost[0 ];
1563- int best_idx = warp_min_idx[0 ];
1564- for (int w = 1 ; w < 8 ; w++) {
1565- if (warp_min_cost[w] < best) { best = warp_min_cost[w]; best_idx = warp_min_idx[w]; }
1605+ // Reduce 8 warp minima via a single-warp shuffle (32 lanes) instead of a
1606+ // serial single-thread loop. Upper 24 lanes seed FLT_MAX so they never win.
1607+ if (sid < 32 ) {
1608+ float best = (sid < 8 ) ? warp_min_cost[sid] : 3 .4028234663852886e38f;
1609+ int best_idx = (sid < 8 ) ? warp_min_idx[sid] : 0 ;
1610+ #pragma unroll
1611+ for (int offset = 16 ; offset > 0 ; offset >>= 1 ) {
1612+ float other_cost = __shfl_down_sync (0xFFFFFFFF , best, offset, WARP_SIZE );
1613+ int other_idx = __shfl_down_sync (0xFFFFFFFF , best_idx, offset, WARP_SIZE );
1614+ if (other_cost < best) { best = other_cost; best_idx = other_idx; }
1615+ }
1616+ if (sid == 0 ) {
1617+ shared_initial_state = best_idx; // temporarily: best final state (becomes initial after backtrack)
15661618 }
1567- shared_initial_state = best_idx; // temporarily: best final state (becomes initial after backtrack)
15681619 }
15691620 __syncthreads ();
15701621
15711622 // TCQ error dump (port of buun 764c686b0): save post-FWHT x[] before backtrack overwrites it.
15721623 if (d_tcq_dump_max > 0 && grp < d_tcq_dump_max && sid < 128 )
15731624 d_tcq_dump_x_buf[grp * 128 + sid] = x[sid];
15741625
1575- // Thread 0: backtrack (inherently sequential, reads byte-packed bt from global memory)
1576- uint8_t * outputs = (uint8_t *)x;
1626+ // Thread 0: backtrack (inherently sequential, reads byte-packed bt from
1627+ // shared or global memory). Writes the winning-path output bytes into
1628+ // __shared__ s_outputs[] (type-clean; see declaration comment above).
1629+ uint8_t * outputs = s_outputs;
15771630 if (sid == 0 ) {
15781631 int state = shared_initial_state;
15791632 for (int t = 127 ; t >= 0 ; t--) {
15801633 outputs[t] = (uint8_t )(state >> 6 ); // output = top 2 bits (k=2)
1581- int p = bt[t * 256 + state];
1634+ int p = bt[t * 64 + ( state & 0x3F ) ];
15821635 state = ((state & 0x3F ) << 2 ) | p; // reconstruct predecessor
15831636 }
15841637 shared_initial_state = state;
@@ -1629,19 +1682,33 @@ static __global__ void __launch_bounds__(256, 1) k_set_rows_turboq2_tcq(
16291682 float corrected_norm = (recon_norm > 1e-10f ) ? saved_norm / recon_norm : saved_norm;
16301683 corrected_norm *= iq_is_k ? d_tcq_norm_alpha : d_tcq_norm_alpha_v;
16311684
1632- // Thread 0: pack bitstream (serial — avoids byte-alignment hazards with OR-into-byte)
1633- if (sid == 0 ) {
1634- for (int j = 0 ; j < 33 ; j++) dst_blk->qs [j] = 0 ;
1635- // Write initial state prefix (upper 6 bits = initial_state >> 2)
1636- dst_blk->qs [0 ] = (uint8_t )((shared_initial_state >> 2 ) & 0x3F );
1637- for (int t = 0 ; t < 128 ; t++) {
1638- const int bit_pos = 6 + t * 2 ;
1639- const int byte_idx = bit_pos / 8 ;
1640- const int bit_off = bit_pos % 8 ;
1641- const int out = outputs[t] & 0x3 ;
1642- dst_blk->qs [byte_idx] |= (uint8_t )(out << bit_off);
1643- // 2 bits at even bit_off ∈ {0,2,4,6}: 6+2=8 fits one byte, never crosses
1685+ // Parallel bitpack: qs stores 6 initial-state bits followed by 128 two-bit
1686+ // output symbols. Each byte is independent (the 2-bit symbols never cross
1687+ // byte boundaries after the 6-bit prefix), so 33 threads can each pack one
1688+ // byte without atomics. Cross-thread reads of outputs[sym_idx] are safe
1689+ // because outputs aliases the type-clean __shared__ uint8_t s_outputs[]
1690+ // (see declaration comment).
1691+ if (sid < 33 ) {
1692+ const int init_bits = (shared_initial_state >> 2 ) & 0x3F ;
1693+ uint8_t packed = 0 ;
1694+ #pragma unroll
1695+ for (int bit = 0 ; bit < 8 ; bit++) {
1696+ const int pos = sid * 8 + bit;
1697+ int v = 0 ;
1698+ if (pos < 6 ) {
1699+ v = (init_bits >> pos) & 1 ;
1700+ } else {
1701+ const int sym_bit_pos = pos - 6 ;
1702+ const int sym_idx = sym_bit_pos / 2 ;
1703+ if (sym_idx < 128 ) {
1704+ v = (outputs[sym_idx] >> (sym_bit_pos % 2 )) & 1 ;
1705+ }
1706+ }
1707+ packed |= (uint8_t )(v << bit);
16441708 }
1709+ dst_blk->qs [sid] = packed;
1710+ }
1711+ if (sid == 0 ) {
16451712 dst_blk->norm = __float2half (corrected_norm);
16461713 }
16471714}
@@ -1761,15 +1828,42 @@ static void set_rows_cuda_turboq2_tcq(
17611828
17621829 const int64_t ne_total_groups = (ne00 * ne01 * ne02 * ne03) / QK_TURBOQ2_TCQ ;
17631830 if (ne_total_groups > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0 ) {
1764- ensure_tcq_bt_buf (ne_total_groups * 128 * 256 );
1831+ // One-shot probe: on CUDA, opt in to shared-memory backtrace if the device
1832+ // exposes enough opt-in shared memory per block (and the env knob allows).
1833+ // HIP/MUSA paths skip the probe and always use the global bt_buf branch.
1834+ static int tcq2_use_shared_bt = 0 ;
1835+ static bool tcq2_bt_checked = false ;
1836+ constexpr int tcq2_bt_shared_bytes = 128 * 64 ;
1837+ if (!tcq2_bt_checked) {
1838+ tcq2_bt_checked = true ;
1839+ #if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
1840+ const char * tcq_shared_bt_env = getenv (" TURBO_TCQ_SHARED_BT" );
1841+ if (!tcq_shared_bt_env || atoi (tcq_shared_bt_env) != 0 ) {
1842+ int max_shared_optin = 0 ;
1843+ CUDA_CHECK (cudaDeviceGetAttribute (&max_shared_optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, ctx.device ));
1844+ if (max_shared_optin >= tcq2_bt_shared_bytes) {
1845+ CUDA_SET_SHARED_MEMORY_LIMIT (k_set_rows_turboq2_tcq<idx_t >, tcq2_bt_shared_bytes);
1846+ tcq2_use_shared_bt = 1 ;
1847+ fprintf (stderr, " TCQ2 encode: using shared-memory backtrace (%d bytes/block)\n " , tcq2_bt_shared_bytes);
1848+ } else {
1849+ fprintf (stderr, " TCQ2 encode: shared-memory backtrace unavailable, only %d bytes/block are available\n " , max_shared_optin);
1850+ }
1851+ }
1852+ #endif
1853+ }
1854+ if (!tcq2_use_shared_bt) {
1855+ ensure_tcq_bt_buf (ne_total_groups * 128 * 64 );
1856+ }
17651857 const uint3 ne00_fd = init_fastdiv_values ((uint32_t ) ne00);
17661858 const uint3 ne01_fd = init_fastdiv_values ((uint32_t ) ne01);
17671859 const uint3 ne02_fd = init_fastdiv_values ((uint32_t ) ne02);
17681860 const uint3 ne11_fd = init_fastdiv_values ((uint32_t ) ne11);
17691861 const uint3 ne12_fd = init_fastdiv_values ((uint32_t ) ne12);
1770- k_set_rows_turboq2_tcq<idx_t ><<<(int )ne_total_groups, 256 , 0 , stream>>> (
1862+ const int shared_bytes = tcq2_use_shared_bt ? tcq2_bt_shared_bytes : 0 ;
1863+ k_set_rows_turboq2_tcq<idx_t ><<<(int )ne_total_groups, 256 , shared_bytes, stream>>> (
17711864 src0_d, src1_d, (block_turboq2_tcq *)dst->data ,
1772- ne_total_groups, tcq_bt_buf, ne00, ne01, ne02, ne10, ne11, ne12, ne13,
1865+ ne_total_groups, tcq_bt_buf, tcq2_use_shared_bt,
1866+ ne00, ne01, ne02, ne10, ne11, ne12, ne13,
17731867 s01_f, s02_f, s03_f, s10_i, s11_i, s12_i,
17741868 iq_is_k,
17751869 nb1, nb2, nb3,
0 commit comments