@@ -10,6 +10,11 @@ pub(crate) fn fwht(data: &mut [GfElement; GF_ORDER], m_truncated: usize) {
1010 // Note to self: fwht_8 is slightly faster on x86 (AMD Ryzen 5 3600),
1111 // but slower on ARM (Apple silicon M1).
1212 // fwht_16 is always slower. See branch: AndersTrier/FWHT_8_and_16
13+
14+ if m_truncated >= GF_ORDER {
15+ return fwht_8_full ( data) ;
16+ }
17+
1318 let mut dist = 1 ;
1419 let mut dist4 = 4 ;
1520 while dist4 <= GF_ORDER {
@@ -45,6 +50,7 @@ fn fwht_4(data: &mut [GfElement; GF_ORDER], offset: u16, dist: u16) {
4550
4651 let ( s0, d0) = fwht_2 ( data[ i0] , data[ i1] ) ;
4752 let ( s1, d1) = fwht_2 ( data[ i2] , data[ i3] ) ;
53+
4854 let ( s2, d2) = fwht_2 ( s0, s1) ;
4955 let ( s3, d3) = fwht_2 ( d0, d1) ;
5056
@@ -54,6 +60,69 @@ fn fwht_4(data: &mut [GfElement; GF_ORDER], offset: u16, dist: u16) {
5460 data[ i3] = d3;
5561}
5662
63+ #[ inline( always) ]
64+ fn fwht_8_full ( data : & mut [ GfElement ; GF_ORDER ] ) {
65+ fwht_8_truncated ( data, GF_ORDER )
66+ }
67+
68+ #[ inline( always) ]
69+ fn fwht_8_truncated ( data : & mut [ GfElement ; GF_ORDER ] , m_truncated : usize ) {
70+ let mut dist = 1 ;
71+ let mut dist8 = 8 ;
72+ while dist8 <= GF_ORDER {
73+ for r in ( 0 ..m_truncated) . step_by ( dist8) {
74+ for offset in r..r + dist {
75+ fwht_8 ( data, offset as u16 , dist as u16 ) ;
76+ }
77+ }
78+
79+ dist = dist8;
80+ dist8 <<= 3 ;
81+ }
82+
83+ for i in 0 ..32768 {
84+ let ( s0, d0) = fwht_2 ( data[ i] , data[ i + dist] ) ;
85+ data[ i] = s0;
86+ data[ i + dist] = d0;
87+ }
88+ }
89+
90+ #[ inline( always) ]
91+ fn fwht_8 ( data : & mut [ GfElement ; GF_ORDER ] , offset : u16 , dist : u16 ) {
92+ let t0 = usize:: from ( offset) ;
93+ let t1 = usize:: from ( offset + dist) ;
94+ let t2 = usize:: from ( offset + dist * 2 ) ;
95+ let t3 = usize:: from ( offset + dist * 3 ) ;
96+ let t4 = usize:: from ( offset + dist * 4 ) ;
97+ let t5 = usize:: from ( offset + dist * 5 ) ;
98+ let t6 = usize:: from ( offset + dist * 6 ) ;
99+ let t7 = usize:: from ( offset + dist * 7 ) ;
100+
101+ let ( s0, d0) = fwht_2 ( data[ t0] , data[ t1] ) ;
102+ let ( s1, d1) = fwht_2 ( data[ t2] , data[ t3] ) ;
103+ let ( s2, d2) = fwht_2 ( data[ t4] , data[ t5] ) ;
104+ let ( s3, d3) = fwht_2 ( data[ t6] , data[ t7] ) ;
105+
106+ let ( s4, d4) = fwht_2 ( s0, s1) ;
107+ let ( s5, d5) = fwht_2 ( s2, s3) ;
108+ let ( s6, d6) = fwht_2 ( d0, d1) ;
109+ let ( s7, d7) = fwht_2 ( d2, d3) ;
110+
111+ let ( s8, d8) = fwht_2 ( s4, s5) ;
112+ let ( s9, d9) = fwht_2 ( s6, s7) ;
113+ let ( s10, d10) = fwht_2 ( d4, d5) ;
114+ let ( s11, d11) = fwht_2 ( d6, d7) ;
115+
116+ data[ t0] = s8;
117+ data[ t1] = s9;
118+ data[ t2] = s10;
119+ data[ t3] = s11;
120+ data[ t4] = d8;
121+ data[ t5] = d9;
122+ data[ t6] = d10;
123+ data[ t7] = d11;
124+ }
125+
57126// ======================================================================
58127// FWHT - TESTS
59128
@@ -130,4 +199,53 @@ mod tests {
130199 assert_eq ! ( data1, data2) ;
131200 }
132201 }
202+
// Checks `fwht_8_full` against `fwht_naive` on one fully random array.
#[test]
fn test_8_full() {
    // Fixed seed keeps the input identical from run to run.
    let mut rng = ChaCha8Rng::from_seed([0; 32]);
    let input: [GfElement; GF_ORDER] = [(); GF_ORDER].map(|_| rng.gen());

    let mut actual = input;
    let mut expected = input;

    fwht_naive(&mut expected);
    fwht_8_full(&mut actual);

    assert_eq!(actual, expected);
}
215+
// Checks `fwht_8_truncated` against `fwht_naive` for arrays whose first
// `prefix_len` elements are random and whose remaining elements are zero.
#[test]
fn test_8_truncated() {
    // Fixed seed keeps the input identical from run to run.
    let mut rng = ChaCha8Rng::from_seed([0; 32]);
    let random: Vec<GfElement> = (0..GF_ORDER).map(|_| rng.gen()).collect();

    // Prefix lengths around the small sizes, the quarter and half marks, and
    // the very end of the array.
    let prefix_lengths = [
        0,
        1,
        2,
        3,
        4,
        64,
        127,
        16384 - 1,
        16384 + 1,
        GF_ORDER / 2 - 1,
        GF_ORDER / 2,
        GF_ORDER / 2 + 1,
        GF_ORDER - 4,
        GF_ORDER - 3,
        GF_ORDER - 2,
        GF_ORDER - 1,
        GF_ORDER,
    ];

    for prefix_len in prefix_lengths {
        let mut actual = [0; GF_ORDER];
        actual[..prefix_len].copy_from_slice(&random[..prefix_len]);
        let mut expected = actual;

        fwht_8_truncated(&mut actual, prefix_len);
        fwht_naive(&mut expected);

        assert_eq!(actual, expected);
    }
}
133251}
0 commit comments