@@ -48,11 +48,11 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
48
48
};
49
49
50
50
// Offset pair for block `block_index`:
//   first  = block_index * (QK4_0 / QR4_0)
//   second = 0 (always)
// The second parameter (named only by the /* nblocks */ comment) is unused.
// Diff note: the removed (-) line spelled the ratio as traits::qk / traits::qr;
// the added (+) line uses the QK4_0 / QR4_0 constants directly.
// NOTE(review): presumably both spellings evaluate to the same value - confirm against traits.
static constexpr std::pair<int , int > get_block_offset (const int block_index, const int /* nblocks */ ) {
51
- return { block_index * (traits::qk / traits::qr ), 0 };
51
+ return { block_index * (QK4_0 / QR4_0 ), 0 };
52
52
}
53
53
54
54
// Offset pair for the per-block `d` value of block `block_index`:
//   first  = (ncols / QR4_0 * nrows) + block_index * sizeof(ggml_half)
//   second = 0 (always)
// NOTE(review): the first term presumably skips over all packed quants so that the
// ggml_half values follow them - confirm against the reordered layout.
// Because of sizeof, the sum is evaluated as an unsigned size_t and converted to int on return.
// Diff note: the removed (-) line used traits::qr; the added (+) line uses QR4_0.
static constexpr std::pair<int , int > get_d_offset (int nrows, int ncols, const int block_index) {
55
- return { (ncols / traits::qr * nrows) + block_index * sizeof (ggml_half), 0 };
55
+ return { (ncols / QR4_0 * nrows) + block_index * sizeof (ggml_half), 0 };
56
56
}
57
57
58
58
// Returns traits::qk / QK8_1 (integer division).
// NOTE(review): presumably the number of q8_1 blocks covered by one block of this type - confirm.
static constexpr int block_to_q8_1_ratio () { return traits::qk / QK8_1; }
@@ -71,14 +71,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
71
71
}
72
72
73
73
// Offset pair for block `block_index`, with nblocks = nrows * (ncols / QK_K):
//   first  = nblocks * (QK_K / 2) + block_index * K_SCALE_SIZE
//   second = nblocks * QK_K / 2 + nblocks * K_SCALE_SIZE + block_index * sizeof(ggml_half2)
// NOTE(review): presumably `first` addresses this block's scales (stored after all quants)
// and `second` its ggml_half2 value (stored after all scales) - confirm against the reordered layout.
// Diff note: the removed (-) lines returned nblocks * (QK_K / 2) as `first` for every block
// and computed nblocks from traits::qk; the added (+) lines add the per-block
// block_index * K_SCALE_SIZE term and use QK_K directly.
static constexpr std::pair<int , int > get_d_offset (int nrows, int ncols, const int block_index) {
74
- auto nblocks = (nrows * (ncols / traits::qk ));
75
- return { nblocks * (QK_K / 2 ),
74
+ auto nblocks = (nrows * (ncols / QK_K ));
75
+ return { nblocks * (QK_K / 2 ) + (block_index * K_SCALE_SIZE) ,
76
76
(nblocks * QK_K / 2 ) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof (ggml_half2)) };
77
77
}
78
78
79
79
// Returns traits::qk / QK8_1 (integer division).
// NOTE(review): presumably the number of q8_1 blocks covered by one block of this type - confirm.
static constexpr int block_to_q8_1_ratio () { return traits::qk / QK8_1; }
80
-
81
- constexpr size_t get_total_qs_bytes (int nblocks) { return nblocks * QK_K / 2 ; }
82
80
};
83
81
84
82
template <> struct block_q_t <GGML_TYPE_Q6_K> {
@@ -90,22 +88,23 @@ template <> struct block_q_t<GGML_TYPE_Q6_K> {
90
88
};
91
89
92
90
// Offset pair for block `block_index` out of `n_blocks` blocks:
//   first  (low bits)  = block_index * (QK_K / QR6_K)
//   second (high bits) = n_blocks * (QK_K / 2) + block_index * (QK_K / 4)
// Diff note: the removed (-) line used traits::qk / traits::qr; the added (+) line uses QK_K / QR6_K.
static constexpr std::pair<int , int > get_block_offset (const int block_index, const int n_blocks) {
93
- auto low_bits_index = block_index * (traits::qk / traits::qr );
91
+ auto low_bits_index = block_index * (QK_K / QR6_K );
94
92
// the high bits are stored after all of the low bits
95
93
auto high_bits_index = n_blocks * (QK_K / 2 ) + (block_index * (QK_K / 4 ));
96
94
return { low_bits_index, high_bits_index };
97
95
}
98
96
99
97
// Offset pair for block `block_index`, with nblocks = nrows * (ncols / QK_K):
//   total_qs_bytes        = nblocks * (QK_K / 2) + nblocks * (QK_K / 4)
//   first  (block_scales) = total_qs_bytes + block_index * (QK_K / 16)
//   second (sb_scale)     = total_qs_bytes + nblocks * (QK_K / 16) + block_index * sizeof(ggml_half)
// NOTE(review): presumably the layout is all quant bytes, then all per-block scales,
// then one ggml_half per block - confirm against the reordered layout.
// Diff note: the removed (-) sb_scale line had no block_index term, so it returned the same
// offset for every block; the added (+) line adds block_index * sizeof(ggml_half).
// nblocks likewise switches from traits::qk to QK_K.
static constexpr std::pair<int , int > get_d_offset (int nrows, int ncols, const int block_index) {
100
- auto nblocks = (nrows * (ncols / traits::qk ));
98
+ auto nblocks = (nrows * (ncols / QK_K ));
101
99
auto total_qs_bytes = nblocks * (QK_K / 2 ) + nblocks * (QK_K / 4 );
102
100
auto block_scales = total_qs_bytes + block_index * (QK_K / 16 );
103
- auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16 );
101
+ auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16 ) + block_index * sizeof (ggml_half) ;
104
102
return { block_scales, sb_scale };
105
103
}
106
104
107
105
// Returns traits::qk / QK8_1 (integer division).
// NOTE(review): presumably the number of q8_1 blocks covered by one block of this type - confirm.
static constexpr int block_to_q8_1_ratio () { return traits::qk / QK8_1; }
108
106
};
107
+
109
108
} // namespace ggml_sycl_reordered
110
109
111
110
#endif // GGML_SYCL_QUANTS_HPP
0 commit comments