@@ -245,3 +245,243 @@ found_scalar:
245245 MOVQ SI, pos+32 (FP)
246246 MOVQ BX, bucket+40 (FP)
247247 RET // No VZEROUPPER needed (SSSE3)
248+
249+ // func teddySlimSSSE3_2(masks *teddyMasks, haystack []byte) (pos, bucket int)
250+ //
251+ // SSSE3 implementation of Teddy Slim with 2-byte fingerprint.
252+ // A position is reported only when both of its fingerprint bytes match, which is meant to cut false positives relative to the 1-byte variant (the original note cites ~90%).
253+ //
// Returns:
//   pos    - byte offset from the start of haystack of the first position whose
//            combined (position 0 AND position 1) bucket mask is non-zero
//   bucket - index of the lowest set bit in that combined mask
// Both results are -1 when no such position exists or len(haystack) < 2.
// NOTE(review): a hit is only a fingerprint candidate; presumably the caller
// verifies it against the real patterns - confirm at the call site.
//
254+ // Algorithm:
255+ // 1. Load nibble masks for positions 0 and 1
256+ // 2. Main loop: process 16 bytes per iteration (needs 17 readable bytes)
257+ // a. Load haystack[i:i+16] for position 0
258+ // b. Load haystack[i+1:i+17] for position 1 (overlapping)
259+ // c. For each position: extract nibbles, PSHUFB lookup, AND lo/hi
260+ // d. AND results from both positions
261+ // e. Non-zero result = candidate
// 3. Scalar tail: the same table lookups one byte pair at a time for the
//    remaining positions that cannot fill a 17-byte window
262+ //
263+ // teddyMasks struct layout (assumed by the hard-coded offsets below; must stay in sync with the Go struct definition):
264+ // +0: fingerprintLen (4 bytes)
265+ // +4: padding (4 bytes)
266+ // +8: loMasks[0] (32 bytes, we use first 16)
267+ // +40: loMasks[1] (32 bytes, we use first 16)
268+ // +72: loMasks[2] (32 bytes, unused)
269+ // +104: loMasks[3] (32 bytes, unused)
270+ // +136: hiMasks[0] (32 bytes, we use first 16)
271+ // +168: hiMasks[1] (32 bytes, we use first 16)
272+ // +200: hiMasks[2] (32 bytes, unused)
273+ // +232: hiMasks[3] (32 bytes, unused)
274+ TEXT ·teddySlimSSSE3_2(SB), NOSPLIT, $0 -48
275+ // Load parameters
276+ MOVQ masks+0 (FP), R8 // R8 = pointer to teddyMasks
277+ MOVQ haystack_base+8 (FP), SI // SI = haystack pointer (cursor, advanced by the loops)
278+ MOVQ haystack_len+16 (FP), DX // DX = haystack length
279+
280+ // Empty haystack check (already implied by the length < 2 check below)
281+ TESTQ DX, DX
282+ JZ not_found_2
283+
284+ // Check minimum length (need at least 2 bytes for 2-byte fingerprint)
285+ CMPQ DX, $2
286+ JB not_found_2 // unsigned compare: length 0 or 1 -> (-1, -1)
287+
288+ // Load nibble masks for positions 0 and 1 (kept in registers for the whole vector loop)
289+ // Position 0: loMasks[0] at +8, hiMasks[0] at +136
290+ // Position 1: loMasks[1] at +40, hiMasks[1] at +168
291+ MOVOU 8 (R8), X0 // X0 = loMasks[0]
292+ MOVOU 136 (R8), X1 // X1 = hiMasks[0]
293+ MOVOU 40 (R8), X8 // X8 = loMasks[1]
294+ MOVOU 168 (R8), X9 // X9 = hiMasks[1]
295+
296+ // Create nibble extraction mask: 0x0F repeated 16 times
297+ MOVQ $0x0F0F0F0F0F0F0F0F , AX
298+ MOVQ AX, X2 // low 8 bytes of X2 = 0x0F x 8
299+ PUNPCKLQDQ X2, X2 // X2 = [0x0F x 16] (low quadword duplicated into the high half)
300+
301+ // Save original haystack pointer for offset calculation
302+ MOVQ SI, DI // DI = haystack start (preserved)
303+
304+ // Calculate end pointer (need 1 extra byte for overlapping load)
305+ LEAQ (SI)(DX*1 ), R9 // R9 = SI + length (end pointer)
306+ SUBQ $1 , R9 // R9 = end - 1, so that SI+16 <= R9 guarantees 17 readable bytes at SI
307+
308+ loop16_2:
309+ // Check that at least 17 bytes remain (16 for position 0 plus 1 for the shifted position-1 load)
310+ LEAQ 16 (SI), R10 // R10 = SI + 16
311+ CMPQ R10, R9 // Compare with adjusted end pointer
312+ JA handle_tail_2 // If SI+16 > end-1, fewer than 17 bytes are left
313+
314+ // Load 16 bytes from haystack for position 0
315+ MOVOU (SI), X3 // X3 = haystack[SI:SI+16]
316+ // Load 16 bytes from haystack for position 1 (offset by 1); the last byte read is SI+16, covered by the guard above
317+ MOVOU 1 (SI), X10 // X10 = haystack[SI+1:SI+17]
318+
319+ // === Process position 0 ===
320+ // Extract low nibbles
321+ MOVOA X3, X4 // X4 = copy
322+ PAND X2, X4 // X4 = low nibbles
323+
324+ // Extract high nibbles
325+ MOVOA X3, X5 // X5 = copy
326+ PSRLW $4 , X5 // Shift each 16-bit word right 4 bits (bits of the neighbouring byte leak into each byte's top nibble)
327+ PAND X2, X5 // X5 = high nibbles (the mask also discards the leaked bits)
328+
329+ // PSHUFB lookups for position 0 (PSHUFB overwrites its table operand, hence the copies)
330+ MOVOA X0, X6 // X6 = loMasks[0]
331+ PSHUFB X4, X6 // X6 = lo lookup results
332+ MOVOA X1, X7 // X7 = hiMasks[0]
333+ PSHUFB X5, X7 // X7 = hi lookup results
334+ PAND X7, X6 // X6 = position 0 result
335+
336+ // === Process position 1 ===
337+ // Extract low nibbles
338+ MOVOA X10, X4 // X4 = copy
339+ PAND X2, X4 // X4 = low nibbles
340+
341+ // Extract high nibbles
342+ MOVOA X10, X5 // X5 = copy
343+ PSRLW $4 , X5 // Shift each 16-bit word right 4 bits (same word-shift caveat as position 0)
344+ PAND X2, X5 // X5 = high nibbles
345+
346+ // PSHUFB lookups for position 1
347+ MOVOA X8, X11 // X11 = loMasks[1]
348+ PSHUFB X4, X11 // X11 = lo lookup results
349+ MOVOA X9, X12 // X12 = hiMasks[1]
350+ PSHUFB X5, X12 // X12 = hi lookup results
351+ PAND X12, X11 // X11 = position 1 result
352+
353+ // === Combine both positions ===
354+ PAND X11, X6 // X6 = pos0 & pos1 (final result)
355+
356+ // Detect non-zero bytes
357+ PXOR X13, X13 // X13 = zero vector (re-zeroed on every iteration)
358+ PCMPEQB X13, X6 // X6[i] = 0xFF if zero, else 0x00 (the bucket bits in X6 are lost here)
359+ PMOVMSKB X6, CX // CX = bitmask where bytes were ZERO
360+ XORL $0xFFFF , CX // Invert: CX = bitmask where bytes were NON-ZERO
361+
362+ // Check if any candidates found
363+ TESTL CX, CX
364+ JNZ found_candidate_2
365+
366+ // No candidates, advance to next 16 bytes
367+ ADDQ $16 , SI
368+ JMP loop16_2
369+
370+ handle_tail_2:
371+ // Restore R9 to the true end pointer (undo the -1 applied for the vector loop guard)
372+ ADDQ $1 , R9
373+
374+ // Process remaining bytes with scalar loop
375+ CMPQ SI, R9
376+ JAE not_found_2 // SI >= end: nothing left to scan
377+
378+ // Need at least 2 bytes for fingerprint
379+ LEAQ 1 (SI), R10
380+ CMPQ R10, R9
381+ JAE not_found_2 // SI+1 >= end: only one byte left, cannot form a 2-byte fingerprint
382+
383+ tail_loop_2:
384+ // Load two consecutive bytes (both in bounds: the loop is only entered while SI+1 < end)
385+ MOVBLZX (SI), AX // AX = byte at position 0
386+ MOVBLZX 1 (SI), R10 // R10 = byte at position 1 (R10 is reused; it is recomputed before the loop test)
387+
388+ // === Position 0 lookup ===
389+ MOVL AX, BX
390+ ANDL $0x0F , BX // BX = low nibble pos0
391+ MOVL AX, CX
392+ SHRL $4 , CX
393+ ANDL $0x0F , CX // CX = high nibble pos0
394+
395+ MOVBLZX 8 (R8)(BX*1 ), AX // AX = loMasks[0][lowNibble]
396+ MOVBLZX 136 (R8)(CX*1 ), CX // CX = hiMasks[0][highNibble]
397+ ANDL CX, AX // AX = pos0 bucket bits
398+
399+ // === Position 1 lookup ===
400+ MOVL R10, BX
401+ ANDL $0x0F , BX // BX = low nibble pos1
402+ MOVL R10, CX
403+ SHRL $4 , CX
404+ ANDL $0x0F , CX // CX = high nibble pos1
405+
406+ MOVBLZX 40 (R8)(BX*1 ), BX // BX = loMasks[1][lowNibble]
407+ MOVBLZX 168 (R8)(CX*1 ), CX // CX = hiMasks[1][highNibble]
408+ ANDL CX, BX // BX = pos1 bucket bits
409+
410+ // === Combine ===
411+ ANDL BX, AX // AX = pos0 & pos1
412+
413+ // Check if any bucket matched
414+ TESTL AX, AX
415+ JNZ found_scalar_2
416+
417+ // Advance to next byte; continue only while a full 2-byte pair is still available
418+ INCQ SI
419+ LEAQ 1 (SI), R10
420+ CMPQ R10, R9
421+ JB tail_loop_2 // SI+1 < end: another pair to test; otherwise fall through to not_found_2
422+
423+ not_found_2: // No candidate: return pos = -1, bucket = -1
424+ MOVQ $-1 , AX
425+ MOVQ AX, pos+32 (FP)
426+ MOVQ $-1 , BX
427+ MOVQ BX, bucket+40 (FP)
428+ RET
429+
430+ found_candidate_2: // Entered from the vector loop: CX = non-zero 16-bit candidate mask, SI = start of the current chunk
431+ // Find first set bit in mask (lowest bit = earliest byte in the chunk)
432+ BSFL CX, AX // AX = position of first set bit (0-15)
433+
434+ // Save chunk start
435+ MOVQ SI, R10
436+
437+ // Calculate absolute position
438+ SUBQ DI, SI // SI = offset from haystack start
439+ ADDQ SI, AX // AX = absolute position
440+
441+ // Get chunk offset for byte lookup (AX - SI is the BSFL result again)
442+ MOVQ AX, R11
443+ SUBQ SI, R11 // R11 = chunk offset (0-15)
444+
// The per-byte bucket bits of the vector result were overwritten by PCMPEQB,
// so the bucket mask for the candidate is recomputed here with scalar table
// lookups on its two bytes.
445+ // Load two consecutive bytes at candidate position (offset+1 <= 16, inside the 17 bytes guaranteed by the loop guard)
446+ MOVBLZX (R10)(R11*1 ), BX // BX = byte at pos0
447+ MOVBLZX 1 (R10)(R11*1 ), R12 // R12 = byte at pos1
448+
449+ // === Position 0 nibble lookup ===
450+ MOVL BX, CX
451+ ANDL $0x0F , CX // CX = low nibble
452+ SHRL $4 , BX // BX = high nibble (no mask needed: BX held a zero-extended byte)
453+
454+ MOVBLZX 8 (R8)(CX*1 ), CX // CX = loMasks[0][low]
455+ MOVBLZX 136 (R8)(BX*1 ), BX // BX = hiMasks[0][high]
456+ ANDL BX, CX // CX = pos0 bucket bits
457+
458+ // === Position 1 nibble lookup ===
459+ MOVL R12, BX
460+ ANDL $0x0F , BX // BX = low nibble
461+ MOVL R12, R13
462+ SHRL $4 , R13 // R13 = high nibble (R12 held a zero-extended byte)
463+
464+ MOVBLZX 40 (R8)(BX*1 ), BX // BX = loMasks[1][low]
465+ MOVBLZX 168 (R8)(R13*1 ), R13 // R13 = hiMasks[1][high]
466+ ANDL R13, BX // BX = pos1 bucket bits
467+
468+ // === Combine and find bucket ===
469+ ANDL BX, CX // CX = final bucket bits (same value the vector path found non-zero for this byte)
470+ BSFL CX, BX // BX = bucket ID (index of the lowest set bucket bit)
471+
472+ // Return results
473+ MOVQ AX, pos+32 (FP)
474+ MOVQ BX, bucket+40 (FP)
475+ RET
476+
477+ found_scalar_2: // Entered from the tail loop: AX = non-zero combined bucket bits, SI = pointer to the candidate byte
478+ // Calculate position
479+ SUBQ DI, SI // SI = offset from haystack start
480+
481+ // Find first set bucket bit
482+ BSFL AX, BX // BX = bucket ID (index of the lowest set bucket bit)
483+
484+ // Return results
485+ MOVQ SI, pos+32 (FP)
486+ MOVQ BX, bucket+40 (FP)
487+ RET
0 commit comments