internal/bytealg: process two AVX2 lanes per Count loop

author Achille Roussel <achille.roussel@gmail.com>

Wed, 4 Oct 2023 04:58:03 +0000 (04:58 +0000)

committer Gopher Robot <gobot@golang.org>

Fri, 6 Oct 2023 20:54:43 +0000 (20:54 +0000)
author Achille Roussel <achille.roussel@gmail.com>
Wed, 4 Oct 2023 04:58:03 +0000 (04:58 +0000)
committer Gopher Robot <gobot@golang.org>
Fri, 6 Oct 2023 20:54:43 +0000 (20:54 +0000)
diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s

index 807c289113922a7c0030d81b34e9f406a750c6e8..3a8dc3675a7669f903360d14b60ff9d3d3a0b1f9 100644 (file)
--- a/src/internal/bytealg/count_amd64.s
+++ b/src/internal/bytealg/count_amd64.s
@@ -51,8 +51,8 @@ TEXT countbody<>(SB),NOSPLIT,$0
  
         MOVQ SI, DI
  
-       CMPQ BX, $32
-       JA avx2
+       CMPQ BX, $64
+       JAE avx2
  sse:
         LEAQ    -16(SI)(BX*1), AX       // AX = address of last 16 bytes
         JMP     sseloopentry
@@ -162,44 +162,63 @@ avx2:
         JNE sse
  #endif
         MOVD AX, X0
-       LEAQ -32(SI)(BX*1), R11
+       LEAQ -64(SI)(BX*1), R11
+       LEAQ (SI)(BX*1), R13
         VPBROADCASTB  X0, Y1
         PCALIGN $32
  avx2_loop:
         VMOVDQU (DI), Y2
+       VMOVDQU 32(DI), Y4
         VPCMPEQB Y1, Y2, Y3
+       VPCMPEQB Y1, Y4, Y5
         VPMOVMSKB Y3, DX
+       VPMOVMSKB Y5, CX
         POPCNTL DX, DX
+       POPCNTL CX, CX
         ADDQ DX, R12
-       ADDQ $32, DI
+       ADDQ CX, R12
+       ADDQ $64, DI
         CMPQ DI, R11
         JLE avx2_loop
  
         // If last block is already processed,
         // skip to the end.
-       CMPQ DI, R11
+       //
+       // This check is NOT an optimization; if the input length is a
+       // multiple of 64, we must not go through the last leg of the
+       // function because the bit shift count passed to SALQ below would
+       // be 64, which is outside of the 0-63 range supported by those
+       // instructions.
+       //
+       // Tests in the bytes and strings packages with input lengths that
+       // are multiples of 64 would break if this condition were removed.
+       CMPQ DI, R13
         JEQ endavx
  
-       // Load address of the last 32 bytes.
+       // Load address of the last 64 bytes.
         // There is an overlap with the previous block.
         MOVQ R11, DI
         VMOVDQU (DI), Y2
+       VMOVDQU 32(DI), Y4
         VPCMPEQB Y1, Y2, Y3
+       VPCMPEQB Y1, Y4, Y5
         VPMOVMSKB Y3, DX
+       VPMOVMSKB Y5, CX
         // Exit AVX mode.
         VZEROUPPER
+       SALQ $32, CX
+       ORQ CX, DX
  
-       // Create mask to ignore overlap between previous 32 byte block
+       // Create mask to ignore overlap between previous 64 byte block
         // and the next.
-       ANDQ $31, BX
-       MOVQ $32,CX
+       ANDQ $63, BX
+       MOVQ $64, CX
         SUBQ BX, CX
-       MOVQ $0xFFFFFFFF, R10
-       SARQ CL, R10
+       MOVQ $0xFFFFFFFFFFFFFFFF, R10
         SALQ CL, R10
         // Apply mask
         ANDQ R10, DX
-       POPCNTL DX, DX
+       POPCNTQ DX, DX
         ADDQ DX, R12
         MOVQ R12, (R8)
         RET
author	Achille Roussel <achille.roussel@gmail.com>
	Wed, 4 Oct 2023 04:58:03 +0000 (04:58 +0000)
committer	Gopher Robot <gobot@golang.org>
	Fri, 6 Oct 2023 20:54:43 +0000 (20:54 +0000)