UPD: XXH3-128 speed up

This commit is contained in:
Alexander Koblov 2024-09-14 21:27:15 +03:00
commit 27aec457a7

View file

@ -111,7 +111,6 @@ const
XXH_SECRET_CONSUME_RATE = 8;
XXH_STRIPE_LEN = 64;
XXH_ACC_SIZE = 64;
XXH_ACC_NB = 8;
XXH3_SECRET_SIZE_MIN = 136;
XXH_SECRET_DEFAULT_SIZE = 192;
@ -146,6 +145,7 @@ type
var
XXH3_accumulate: TXXH3_accumulate_f;
XXH3_scrambleAcc: TXXH3_scrambleAcc_f;
XXH3_accumulate_512: TXXH3_accumulate_512_f;
function XXH_readLE32(const ptr: Pointer): UInt32; inline;
@ -163,6 +163,12 @@ begin
Result:= (x and $FFFFFFFF) * (y and $FFFFFFFF);
end;
// Xor-shift mixing step: returns v64 combined (xor) with a copy of itself
// shifted right by `shift` bit positions.
function XXH_xorshift64(v64: UInt64; shift: Integer): UInt64; inline;
var
  Shifted: UInt64;
begin
  // Unchecked precondition (kept from the disabled assert): 0 <= shift < 64
  Shifted:= v64 shr shift;
  Result:= Shifted xor v64;
end;
function XXH64_avalanche(hash: UInt64): UInt64;
begin
hash := hash xor hash shr 33;
@ -272,6 +278,9 @@ end;
{$IF DEFINED(CPUX86_64)}
const
SSE_PRIME32_1: array[0..3] of UInt32 = (XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1);
procedure XXH3_accumulate_512_sse2(acc: PByte; const input: PByte; const secret: PByte); assembler; nostackframe;
// UNIX RDI, RSI, RDX
// WIN64: RCX, RDX, R8
@ -467,8 +476,102 @@ asm
ret
end;
// SSE2 scramble of the 64-byte accumulator, processed as four 16-byte groups.
// For every 64-bit lane:
//   x   := acc xor (acc shr 47) xor secret
//   acc := lo32(x) * PRIME + ((hi32(x) * PRIME) shl 32)
// where PRIME is the 32-bit value broadcast in SSE_PRIME32_1.
//
// acc    - accumulator, read and updated in place (64 bytes)
// secret - 64 bytes of secret material, read only
//
// All accesses to acc and secret go through unaligned moves. The xor of acc
// into the secret is done from the register that already holds the acc group
// instead of a memory operand: a legacy-SSE PXOR with a memory source faults
// when the address is not 16-byte aligned.
//
// UNIX RDI, RSI
// WIN64: RCX, RDX
procedure XXH3_scrambleAcc_sse2(acc: PByte; const secret: PByte); assembler; nostackframe;
asm
{$IF DEFINED(UNIX)}
  // Normalise to the WIN64 register assignment used by the body
  movq    %rdi, %rcx
  movq    %rsi, %rdx
{$ENDIF}
  // ---- group 0: bytes 0..15 ----
  movdqu  (%rcx), %xmm1
  movdqu  (%rdx), %xmm0
  // xmm1 still holds the unshifted acc group here
  pxor    %xmm1, %xmm0
  psrlq   $47, %xmm1
  pxor    %xmm1, %xmm0
  // xmm1 becomes the multiplier for the rest of the routine
  movdqu  SSE_PRIME32_1(%rip), %xmm1
  // Bring the high dword of each 64-bit lane into the low dword position
  pshufd  $49, %xmm0, %xmm2
  pmuludq %xmm1, %xmm2
  pmuludq %xmm1, %xmm0
  psllq   $32, %xmm2
  paddq   %xmm2, %xmm0
  // Pre-load the next acc group, then store the finished one
  movdqu  16(%rcx), %xmm2
  movups  %xmm0, (%rcx)
  // ---- group 1: bytes 16..31 ----
  movdqu  16(%rdx), %xmm0
  pxor    %xmm2, %xmm0
  psrlq   $47, %xmm2
  pxor    %xmm2, %xmm0
  pshufd  $49, %xmm0, %xmm2
  pmuludq %xmm1, %xmm0
  pmuludq %xmm1, %xmm2
  psllq   $32, %xmm2
  paddq   %xmm2, %xmm0
  movdqu  32(%rcx), %xmm2
  movups  %xmm0, 16(%rcx)
  // ---- group 2: bytes 32..47 ----
  movdqu  32(%rdx), %xmm0
  pxor    %xmm2, %xmm0
  psrlq   $47, %xmm2
  pxor    %xmm2, %xmm0
  pshufd  $49, %xmm0, %xmm2
  pmuludq %xmm1, %xmm0
  pmuludq %xmm1, %xmm2
  psllq   $32, %xmm2
  paddq   %xmm2, %xmm0
  movdqu  48(%rcx), %xmm2
  movups  %xmm0, 32(%rcx)
  // ---- group 3: bytes 48..63 ----
  movdqu  48(%rdx), %xmm0
  pxor    %xmm2, %xmm0
  psrlq   $47, %xmm2
  pxor    %xmm2, %xmm0
  pshufd  $49, %xmm0, %xmm2
  pmuludq %xmm1, %xmm0
  // Last use of the multiplier: its register receives the high product
  pmuludq %xmm2, %xmm1
  psllq   $32, %xmm1
  paddq   %xmm1, %xmm0
  movups  %xmm0, 48(%rcx)
end;
// AVX2 scramble of the 64-byte accumulator, processed as two 32-byte halves.
// For every 64-bit lane:
//   x   := acc xor (acc shr 47) xor secret
//   acc := lo32(x) * M + ((hi32(x) * M) shl 32)
// where M is the 32-bit constant loaded through EAX below.
//
// acc    - accumulator, read and updated in place (64 bytes)
// secret - 64 bytes of secret material, read only
// Uses EAX and YMM0..YMM5 as scratch.
procedure XXH3_scrambleAcc_avx2(acc: PByte; const secret: PByte); assembler; nostackframe;
// UNIX RDI, RSI
// WIN64: RCX, RDX
asm
{$IF DEFINED(UNIX)}
// Normalise to the WIN64 register assignment used by the body
movq %rdi, %rcx
movq %rsi, %rdx
{$ENDIF}
// -1640531535 is $9E3779B1 viewed as a signed 32-bit value
// NOTE(review): presumably XXH_PRIME32_1 - confirm against its declaration
movl $-1640531535, %eax
// ymm3 = acc[0..31], ymm4 = secret[0..31], ymm5 = acc[32..63]
vmovdqu (%rcx), %ymm3
vmovdqu (%rdx), %ymm4
vmovdqu 32(%rcx), %ymm5
// First half: ymm0 = acc xor secret xor (acc shr 47)
vpxor %ymm3, %ymm4, %ymm0
vpsrlq $47, %ymm3, %ymm1
// ymm3 is free again: reuse it for secret[32..63]
vmovdqu 32(%rdx), %ymm3
vpxor %ymm1, %ymm0, %ymm0
// Broadcast the multiplier to all eight dwords of ymm1
vmovd %eax, %xmm1
vpbroadcastd %xmm1, %ymm1
// ymm2 = high dword of each lane moved down; multiply both halves, recombine
vpsrlq $32, %ymm0, %ymm2
vpmuludq %ymm1, %ymm2, %ymm2
vpmuludq %ymm1, %ymm0, %ymm0
vpsllq $32, %ymm2, %ymm2
vpaddq %ymm2, %ymm0, %ymm0
// Start the second half (shift of acc[32..63]) before storing the first
vpsrlq $47, %ymm5, %ymm2
vmovdqu %ymm0, (%rcx)
vpxor %ymm5, %ymm3, %ymm0
vpxor %ymm2, %ymm0, %ymm0
vpsrlq $32, %ymm0, %ymm2
vpmuludq %ymm1, %ymm0, %ymm0
// Last use of the multiplier: ymm1 is overwritten with the high product
vpmuludq %ymm1, %ymm2, %ymm1
vpsllq $32, %ymm1, %ymm1
vpaddq %ymm1, %ymm0, %ymm0
vmovdqu %ymm0, 32(%rcx)
// Clear the upper halves of the YMM registers before returning
vzeroupper
end;
{$ELSE}
const
XXH_ACC_NB = 8;
function XXH_mult32to64_add64(lhs, rhs, acc: UInt64): UInt64; inline;
begin
Result:= XXH_mult32to64(UInt32(lhs), UInt32(rhs)) + acc;
@ -514,14 +617,6 @@ begin
end;
end;
{$ENDIF}
// Xor-shift mixing step: returns v64 combined (xor) with a copy of itself
// shifted right by `shift` bit positions.
function XXH_xorshift64(v64: UInt64; shift: Integer): UInt64; inline;
var
  Shifted: UInt64;
begin
  // Unchecked precondition (kept from the disabled assert): 0 <= shift < 64
  Shifted:= v64 shr shift;
  Result:= Shifted xor v64;
end;
procedure XXH3_scalarScrambleRound(acc: PByte; const secret: PByte; lane: UIntPtr); inline;
var
acc64: UInt64;
@ -555,6 +650,8 @@ begin
XXH3_scalarScrambleRound(acc, secret, 7);
end;
{$ENDIF}
function XXH3_consumeStripes(acc: PByte; nbStripesSoFarPtr: PUIntPtr; nbStripesPerBlock: UIntPtr;
input: PByte; nbStripes: UIntPtr;
const secret: PByte; secretLimit: UIntPtr;
@ -661,7 +758,7 @@ end;
// Streaming update entry point: forwards state, input and len to XXH3_update
// together with the accumulate and scramble routines to use.
procedure XXH3_64bits_update(state: PXXH3_state_t; const input: Pointer; len: UIntPtr); inline;
begin
// NOTE(review): the two calls below differ only in the scramble argument
// (fixed scalar routine vs. the XXH3_scrambleAcc variable). This listing looks
// like a flattened diff, so they are presumably the removed and the added form
// of a single call rather than two consecutive updates - confirm against the file.
XXH3_update(state, input, len, XXH3_accumulate, @XXH3_scrambleAcc_scalar);
XXH3_update(state, input, len, XXH3_accumulate, XXH3_scrambleAcc);
end;
procedure XXH3_128bits_update(state: PXXH3_state_t; const input: PByte; len: UIntPtr);
@ -690,7 +787,7 @@ begin
@nbStripesSoFar, state^.nbStripesPerBlock,
state^.buffer, nbStripes,
secret, state^.secretLimit,
XXH3_accumulate, @XXH3_scrambleAcc_scalar);
XXH3_accumulate, XXH3_scrambleAcc);
lastStripePtr:= @state^.buffer[state^.bufferedSize - XXH_STRIPE_LEN];
end else begin //* bufferedSize < XXH_STRIPE_LEN */
//* Copy to temp buffer */
@ -1091,14 +1188,17 @@ initialization
if AVX2Support then
begin
XXH3_accumulate:= @XXH3_accumulate_avx2;
XXH3_scrambleAcc:= @XXH3_scrambleAcc_avx2;
XXH3_accumulate_512:= @XXH3_accumulate_512_avx2;
end
else begin
XXH3_accumulate:= @XXH3_accumulate_sse2;
XXH3_scrambleAcc:= @XXH3_scrambleAcc_sse2;
XXH3_accumulate_512:= @XXH3_accumulate_512_sse2;
end;
{$ELSE}
XXH3_accumulate:= @XXH3_accumulate_scalar;
XXH3_scrambleAcc:= @XXH3_scrambleAcc_scalar;
XXH3_accumulate_512:= @XXH3_accumulate_512_scalar;
{$ENDIF}
end.