MOVQ SI, DI
- CMPQ BX, $32
- JA avx2
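+ // The AVX2 loop below now consumes 64 bytes per iteration, so
+ // only take it for inputs of at least 64 bytes (JAE, not JA);
+ // anything shorter falls through to the SSE loop.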
+ CMPQ BX, $64
+ JAE avx2
sse:
LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
JMP sseloopentry
avx2:
#ifndef hasAVX2
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
JNE sse
#endif
MOVD AX, X0
- LEAQ -32(SI)(BX*1), R11
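+ // R11 = address of the last full 64-byte block (SI+BX-64);
+ // R13 = one past the last byte of the buffer (SI+BX).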
+ LEAQ -64(SI)(BX*1), R11
+ LEAQ (SI)(BX*1), R13
VPBROADCASTB X0, Y1
PCALIGN $32
avx2_loop:
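+ // Each iteration compares two 32-byte lanes (Y2 and Y4) against
+ // the byte broadcast into Y1 and adds both match counts to R12.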
VMOVDQU (DI), Y2
+ VMOVDQU 32(DI), Y4
VPCMPEQB Y1, Y2, Y3
+ VPCMPEQB Y1, Y4, Y5
VPMOVMSKB Y3, DX
+ VPMOVMSKB Y5, CX
POPCNTL DX, DX
+ POPCNTL CX, CX
ADDQ DX, R12
- ADDQ $32, DI
+ ADDQ CX, R12
+ ADDQ $64, DI
CMPQ DI, R11
JLE avx2_loop
// If last block is already processed,
// skip to the end.
- CMPQ DI, R11
+ //
+ // This check is NOT an optimization; if the input length is a
+ // multiple of 64, we must not go through the last leg of the
+ // function, because the shift count passed to SALQ below would
+ // be 64, which is outside the 0-63 range supported by that
+ // instruction.
+ //
+ // Tests in the bytes and strings packages with input lengths that
+ // are multiples of 64 would break if this check were removed.
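+ //
+ // For example, with BX = 128 the loop runs twice and exits with
+ // DI = SI+128 = R13, so we jump straight to endavx instead of
+ // reaching SALQ with a shift count of 64 - (128&63) = 64.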
+ CMPQ DI, R13
JEQ endavx
- // Load address of the last 32 bytes.
+ // Load address of the last 64 bytes.
// There is an overlap with the previous block.
MOVQ R11, DI
VMOVDQU (DI), Y2
+ VMOVDQU 32(DI), Y4
VPCMPEQB Y1, Y2, Y3
+ VPCMPEQB Y1, Y4, Y5
VPMOVMSKB Y3, DX
+ VPMOVMSKB Y5, CX
// Exit AVX mode.
VZEROUPPER
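+ // Merge the two 32-bit match masks into a single 64-bit mask for
+ // the whole final block: DX holds matches for the low 32 bytes,
+ // CX for the high 32.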
+ SALQ $32, CX
+ ORQ CX, DX
- // Create mask to ignore overlap between previous 32 byte block
+ // Create mask to ignore overlap between previous 64 byte block
// and the next.
- ANDQ $31, BX
- MOVQ $32, CX
+ ANDQ $63, BX
+ MOVQ $64, CX
SUBQ BX, CX
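+ // CX = 64 - BX%64 is the number of low bytes in the final block
+ // that the loop above already counted; shifting all-ones left by
+ // CL clears exactly those bits from the mask.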
- MOVQ $0xFFFFFFFF, R10
- SARQ CL, R10
+ MOVQ $0xFFFFFFFFFFFFFFFF, R10
SALQ CL, R10
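+ // For example, BX = 100: the loop counted bytes 0-63 and the
+ // final block covers bytes 36-99, so BX&63 = 36, CL = 28, and
+ // ~0<<28 keeps bits 28-63: the 36 bytes 64-99 counted only here.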
// Apply mask
ANDQ R10, DX
- POPCNTL DX, DX
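+ // The combined match mask is 64 bits wide, so a 64-bit popcount
+ // is needed.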
+ POPCNTQ DX, DX
ADDQ DX, R12
MOVQ R12, (R8)
RET