delphimvcframework/lib/dmustache/mormot.core.base.asmx64.inc

{
This file is a part of the Open Source Synopse mORMot framework 2,
licensed under a MPL/GPL/LGPL three license - see LICENSE.md
x86_64 assembly used by mormot.core.base.pas
}
{$ifdef FPC}
// disable some FPC paranoid warnings
{$WARN 7119 off : Exported/global symbols should be accessed via the GOT }
{$WARN 7121 off : Check size of memory operand "$1: memory-operand-size is $2 bits, but expected [$3 bits]" }
{$WARN 7122 off : Check size of memory operand "$1: memory-operand-size is $2 bits, but expected [$3 bits + $4 byte offset]" }
{$WARN 7123 off : Check "$1: offset of memory operand is negative "$2 byte" }
{$endif FPC}
{$ifdef ASMX64}
{
FillCharFast/MoveFast implementation notes:
- assume ERMSB is available (its cpuid flag may not be propagated within VMs)
- use branchless sub-functions for smallest buffers, then SSE2 up to 255 bytes
- use "rep movsb" for 256..512K input (should work on all CPUs, even oldest)
- don't use backward "std rep movsb" since it is not covered by ERMSB (slow)
- use non-temporal SSE2 or AVX loop >= 512KB (to not pollute the CPU cache)
- will check X64CpuFeatures global to adjust the algorithm if cpuAVX is set
- regarding benchmark, run TTestLowLevelCommon.CustomRTL on x86_64
-> FillCharFast/MoveFast seem faster, especially for small lengths (strings)
-> Delphi RTL is slower than FPC's, and it does not support AVX assembly yet
}
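{
  a plain pascal sketch of the size dispatch described above (illustration
  only - the exact small-move limit is 96 bytes on Win64 or 144 bytes on
  SystemV, and the ERMSB branch is used for forward copies only):

  function MoveFastStrategy(cnt: PtrInt): string;
  begin
    if cnt <= 16 then
      result := 'branchless jump-table sub-function'   // @00 .. @16 below
    else if cnt <= 96 then
      result := 'unrolled unaligned SSE2 moves'        // @sml10 .. @sml70
    else if cnt < SSE2MAXSIZE then
      result := 'SSE2 loop with aligned writes'        // @fsse2 / @bsse2
    else if cnt < NONTEMPORALSIZE then
      result := 'rep movsb (ERMSB)'                    // 256 bytes .. 512KB
    else
      result := 'non-temporal SSE2/AVX loop';          // don't pollute caches
  end;
}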
const
// non-temporal writes should bypass the cache when the size is bigger than
// half the size of the largest level cache = 512KB, assuming a low 1MB cache
// - today's CPUs could have 144MB of (3D) cache (!) so we favor a fixed value
// and rely on the CPU hardware cache prefetch when accessing the data
NONTEMPORALSIZE = 512 * 1024;
{$ifdef NO_ERMS}
{$undef WITH_ERMS}
{$else}
{$define WITH_ERMS}
// we enable forward rep movsb/stosb over SSE2MAXSIZE=256 bytes on x86_64
// and we don't try to detect it because CPUID is unset in some VMs
{$endif NO_ERMS}
// minimum size triggering ASMX64AVXNOCONST or WITH_ERMS optimized asm
// - pre-ERMS expects at least 144/256 bytes, IvyBridge+ with ERMS is good
// from 64 - copy_user_enhanced_fast_string() in recent Linux kernel uses 64
// see https://stackoverflow.com/a/43837564/458259 for explanations and timing
// - see also mormot.core.fpcx64mm.pas as reference
SSE2MAXSIZE = 256;
// identify Intel/AMD AVX2+BMI support at Haswell level
CPUAVX2HASWELL = [cfAVX2, cfSSE42, cfBMI1, cfBMI2, cfCLMUL];
procedure MoveFast(const src; var dst; cnt: PtrInt);
{$ifdef FPC}nostackframe; assembler;
asm {$else} asm .noframe {$endif} // rcx/rdi=src rdx/rsi=dst r8/rdx=cnt
mov rax, cnt // rax=r8/rdx=cnt
lea r10, [rip + @jmptab] // 0..16 dedicated sub-functions
sub rax, 16
ja @up16 // >16 or <0
{$ifdef WIN64ABI} // circumvent FPC asm bug and adapt to xmm ABI
jmp qword ptr [r10 + r8 * 8]
@up16: // >16 or <0
jng @z // <0
movups xmm0, oword ptr [src + rax] // last 16 = xmm0
movups xmm1, oword ptr [src] // first 16 = xmm1
cmp rax, 96 - 16
{$else}
jmp qword ptr [r10 + rdx * 8]
@neg: ret
@up16: // >16 or <0
jng @neg // <0
mov r8, rdx
movups xmm0, oword ptr [src + rax] // last 16 = xmm0
movups xmm1, oword ptr [src] // first 16 = xmm1
cmp rdx, 144 // more volatile xmm registers on SystemV ABI
{$endif WIN64ABI}
ja @lrg // >96/144
// cnt = 17..96/144
cmp al, $10
jbe @sml10
movups xmm2, oword ptr [src + $10] // second 16
cmp al, $20
jbe @sml20
movups xmm3, oword ptr [src + $20] // third 16
cmp al, $30
jbe @sml30
movups xmm4, oword ptr [src + $30] // fourth 16
cmp al, $40
jbe @sml40
movups xmm5, oword ptr [src + $40] // fifth 16
// xmm0..xmm5 are volatile on both Win64 and SystemV ABI
// xmm6 and up are also volatile on SystemV ABI so allow more bytes
{$ifdef SYSVABI}
cmp al, $50
jbe @sml50
movups xmm6, oword ptr [src + $50]
cmp al, $60
jbe @sml60
movups xmm7, oword ptr [src + $60]
cmp al, $70
jbe @sml70
movups xmm8, oword ptr [src + $70]
// using more registers would increase code size ([dst + $80]) so they are not used
movups oword ptr [dst + $70], xmm8
@sml70: movups oword ptr [dst + $60], xmm7
@sml60: movups oword ptr [dst + $50], xmm6
@sml50: {$endif SYSVABI}
movups oword ptr [dst + $40], xmm5 // fifth 16
@sml40: movups oword ptr [dst + $30], xmm4 // fourth 16
@sml30: movups oword ptr [dst + $20], xmm3 // third 16
@sml20: movups oword ptr [dst + $10], xmm2 // second 16
@sml10: movups oword ptr [dst], xmm1 // first 16
movups oword ptr [dst + rax], xmm0 // last 16 (may be overlapping)
@z: ret
@lrg: // cnt > 96/144 or cnt < 0
mov r11d, NONTEMPORALSIZE
mov r10, dst
add rax, 16 // restore rax=cnt as expected below
jl @z // cnt < 0
sub r10, src
jz @z // src=dst
cmp r10, cnt // move backwards if unsigned(dst-src) < cnt
jb @lrgbwd
// forward ERMSB/SSE2/AVX move for cnt > 96/144 bytes
mov r9, dst // dst will be 16/32 bytes aligned for writes
{$ifdef WITH_ERMS}
{$ifdef WIN64ABI} // 145 bytes seems good enough for ERMSB on a server
cmp rax, SSE2MAXSIZE
jb @fsse2 // 97..255 bytes may be not enough for ERMSB nor AVX
{$endif WIN64ABI}
cmp rax, r11
jae @lrgfwd // non-temporal move > 512KB is better than ERMSB
// 256/145..512K could use the "rep movsb" ERMSB pattern on all CPUs
cld
{$ifdef WIN64ABI}
push rsi
push rdi
mov rsi, src
mov rdi, dst
mov rcx, r8
rep movsb
pop rdi
pop rsi
{$else}
xchg rsi, rdi // dst=rsi and src=rdi -> swap
mov rcx, r8
rep movsb
{$endif WIN64ABI}
ret
{$else}
jmp @lrgfwd
{$endif WITH_ERMS}
{$ifdef ASMX64AVXNOCONST} // limited AVX asm on Delphi 11
@lrgbwd:// backward SSE2/AVX move
cmp rax, SSE2MAXSIZE
jb @bsse2 // 97/129..255 bytes is not worth AVX context transition
test byte ptr [rip + X64CpuFeatures], 1 shl cpuAVX
jz @bsse2
jmp @bavx
@lrgfwd:// forward SSE2/AVX move
test byte ptr [rip + X64CpuFeatures], 1 shl cpuAVX
jnz @favx
{$else}
@lrgfwd:{$endif ASMX64AVXNOCONST}
@fsse2: // forward SSE2 move
lea src, [src + rax - 16]
lea rax, [rax + dst - 16]
mov r10, rax
neg rax
and dst, -16 // 16-byte aligned writes
lea rax, [rax + dst + 16]
cmp r8, r11
jb @fwd // bypass cache for cnt>512KB
jmp @fwdnt
// backward SSE2/AVX move for cnt > 96/144 bytes
// note: ERMSB is not available on backward "std rep movsb" which is slower
{$ifndef ASMX64AVXNOCONST}
@lrgbwd:{$endif ASMX64AVXNOCONST}
@bsse2: // backward SSE2 move
sub rax, 16
mov r9, rax
add rax, dst
and rax, -16 // 16-byte aligned writes
sub rax, dst
cmp r8, r11
jae @bwdnt // bypass cache for cnt>512KB
jmp @bwd
{$ifdef ASMX64AVXNOCONST}
@bavx: // backward AVX move
sub rax, 32
mov r9, rax
vmovups ymm2, yword ptr [src + rax] // last 32
vmovups ymm1, yword ptr [src] // first 32
add rax, dst
and rax, -32 // 32-byte aligned writes
sub rax, dst
cmp r8, r11
jae @bavxn // bypass cache for cnt>512KB
jmp @bavxr
@favx: // forward AVX move
vmovups ymm2, yword ptr [src] // first 32
lea src, [src + rax - 32]
lea dst, [dst + rax - 32]
vmovups ymm1, yword ptr [src] // last 32
neg rax
add rax, dst
and rax, -32 // 32-byte aligned writes
sub rax, dst
add rax, 64
cmp r8, r11
jb @favxr // bypass cache for cnt>512KB
jmp @favxn
// forward temporal AVX loop
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@favxr: vmovups ymm0, yword ptr [src + rax]
vmovaps yword ptr [dst + rax], ymm0 // most CPUs have one store unit
add rax, 32
jl @favxr
@favxe: vmovups yword ptr [dst], ymm1 // last 32
vmovups yword ptr [r9], ymm2 // first 32
// https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
vzeroupper
ret
// forward non-temporal AVX loop
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@favxn: vmovups ymm0, yword ptr [src + rax]
// circumvent FPC inline asm bug with vmovntps [dst + rax], ymm0
{$ifdef WIN64ABI}
vmovntps [rdx + rax], ymm0 // rdx=dst on Win64
{$else}
vmovntps [rsi + rax], ymm0 // rsi=dst on POSIX
{$endif WIN64ABI}
add rax, 32
jl @favxn
sfence
jmp @favxe
{$endif ASMX64AVXNOCONST}
// forward temporal SSE2 loop
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@fwd: movups xmm2, oword ptr [src + rax]
movaps [r10 + rax], xmm2
add rax, 16
jl @fwd
movups oword ptr [r10], xmm0 // last 16
movups oword ptr [r9], xmm1 // first 16
ret
// forward non-temporal SSE2 loop
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@fwdnt: movups xmm2, oword ptr [src + rax]
movntdq [r10 + rax], xmm2
add rax, 16
jl @fwdnt
sfence
movups oword ptr [r10], xmm0 // last 16
movups oword ptr [r9], xmm1 // first 16
ret
// backward temporal SSE2 loop
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@bwd: movups xmm2, oword ptr [src + rax]
movaps oword ptr [dst + rax], xmm2
sub rax, 16
jg @bwd
movups oword ptr [dst], xmm1 // first 16
movups oword ptr [dst + r9], xmm0 // last 16
ret
// backward non-temporal SSE2 loop
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@bwdnt: movups xmm2, oword ptr [src + rax]
movntdq oword ptr [dst + rax], xmm2
sub rax, 16
jg @bwdnt
sfence
movups oword ptr [dst], xmm1 // first 16
movups oword ptr [dst + r9], xmm0 // last 16
ret
{$ifdef ASMX64AVXNOCONST}
// backward temporal AVX loop
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@bavxr: vmovups ymm0, yword ptr [src + rax]
vmovaps yword ptr [dst + rax], ymm0
sub rax, 32
jg @bavxr
@bavxe: vmovups yword ptr [dst], ymm1 // first 32
vmovups yword ptr [dst + r9], ymm2 // last 32
vzeroupper
ret
// backward non-temporal AVX loop
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@bavxn: vmovups ymm0, yword ptr [src + rax]
// circumvent FPC inline asm bug with vmovntps [dst + rax], ymm0
{$ifdef WIN64ABI}
vmovntps [rdx + rax], ymm0 // rdx=dst on Win64
{$else}
vmovntps [rsi + rax], ymm0 // rsi=dst on POSIX
{$endif WIN64ABI}
sub rax, 32
jg @bavxn
sfence
jmp @bavxe
{$endif ASMX64AVXNOCONST}
// dedicated branchless sub-functions for 0..16 bytes
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@jmptab:dq @00, @01, @02, @03, @04, @05, @06, @07
dq @08, @09, @10, @11, @12, @13, @14, @15, @16
@01: mov al, byte ptr [src]
mov byte ptr [dst], al
@00: ret
@02: movzx eax, word ptr [src]
mov word ptr [dst], ax
ret
@03: movzx eax, word ptr [src]
mov cl, byte ptr [src + 2]
mov word ptr [dst], ax
mov byte ptr [dst + 2], cl
ret
@04: mov eax, dword ptr [src]
mov dword ptr [dst], eax
ret
@05: mov eax, dword ptr [src]
mov cl, byte ptr [src + 4]
mov dword ptr [dst], eax
mov byte ptr [dst + 4], cl
ret
@06: mov eax, dword ptr [src]
movzx ecx, word ptr [src + 4]
mov dword ptr [dst], eax
mov word ptr [dst + 4], cx
ret
@07: mov r8d, dword ptr [src] // faster with no overlapping
movzx eax, word ptr [src + 4]
mov cl, byte ptr [src + 6]
mov dword ptr [dst], r8d
mov word ptr [dst + 4], ax
mov byte ptr [dst + 6], cl
ret
@08: mov rax, qword ptr [src]
mov [dst], rax
ret
@09: mov rax, qword ptr [src]
mov cl, byte ptr [src + 8]
mov [dst], rax
mov byte ptr [dst + 8], cl
ret
@10: mov rax, qword ptr [src]
movzx ecx, word ptr [src + 8]
mov [dst], rax
mov word ptr [dst + 8], cx
ret
@11: mov r8, qword ptr [src]
movzx eax, word ptr [src + 8]
mov cl, byte ptr [src + 10]
mov [dst], r8
mov word ptr [dst + 8], ax
mov byte ptr [dst + 10], cl
ret
@12: mov rax, qword ptr [src]
mov ecx, dword ptr [src + 8]
mov [dst], rax
mov dword ptr [dst + 8], ecx
ret
@13: mov r8, qword ptr [src]
mov eax, dword ptr [src + 8]
mov cl, byte ptr [src + 12]
mov [dst], r8
mov dword ptr [dst + 8], eax
mov byte ptr [dst + 12], cl
ret
@14: mov r8, qword ptr [src]
mov eax, dword ptr [src + 8]
movzx ecx, word ptr [src + 12]
mov [dst], r8
mov dword ptr [dst + 8], eax
mov word ptr [dst + 12], cx
ret
@15: mov r8, qword ptr [src]
mov rax, qword ptr [src + 7] // overlap is the easiest solution
mov [dst], r8
mov [dst + 7], rax
ret
@16: movups xmm0, oword ptr [src]
movups oword [dst], xmm0
end;
procedure FillCharFast(var dst; cnt: PtrInt; value: byte);
{$ifdef FPC}nostackframe; assembler;
asm {$else} asm .noframe {$endif} // rcx/rdi=dst rdx/rsi=cnt r8b/dl=val
mov r9, $0101010101010101
lea r10, [rip + @jmptab]
{$ifdef WIN64ABI}
movzx eax, r8b
{$else}
movzx eax, dl
mov rdx, rsi // rdx=cnt
{$endif WIN64ABI}
imul rax, r9 // broadcast value into all bytes of rax (in 1 cycle)
cmp cnt, 32
ja @abv32 // >32 or <0
sub rdx, 8
jg @sml32 // 9..32
jmp qword ptr [r10 + 64 + rdx * 8] // tiniest 0..8 bytes
@sml32: cmp dl, 8 // 9..32 bytes
jle @sml16
cmp dl, 16
jle @sml24
mov qword ptr [dst + 16], rax
@sml24: mov qword ptr [dst + 8], rax
@sml16: mov qword ptr [dst + rdx], rax // last 8 (may be overlapping)
@08: mov qword ptr [dst], rax
@00: ret
@abv32: jng @00 // < 0
// cnt > 32 bytes
movd xmm0, eax
lea r8, [dst + cnt] // r8 point to end
mov r9d, NONTEMPORALSIZE
pshufd xmm0, xmm0, 0 // broadcast value into all bytes of xmm0
mov r10, rdx // save rdx=cnt
cmp rdx, 255 // = SSE2MAXSIZE-1, but hardcoded in move below
ja @abv255
// 33..255 bytes is not good for ERMSB or AVX, and needs no alignment
test dl, $80
jz @sml80
movups oword ptr [dst], xmm0
movups oword ptr [dst + $10], xmm0
movups oword ptr [dst + $20], xmm0
movups oword ptr [dst + $30], xmm0
movups oword ptr [dst + $40], xmm0
movups oword ptr [dst + $50], xmm0
movups oword ptr [dst + $60], xmm0
movups oword ptr [dst + $70], xmm0
add dst, $80
@sml80: test dl, $40
jz @sml40
movups oword ptr [dst], xmm0
movups oword ptr [dst + $10], xmm0
movups oword ptr [dst + $20], xmm0
movups oword ptr [dst + $30], xmm0
add dst, $40
@sml40: test dl, $20
jz @sml20
movups oword ptr [dst], xmm0
movups oword ptr [dst + $10], xmm0
add dst, $20
@sml20: test dl, $10
jz @sml10
movups oword ptr [dst], xmm0
@sml10: movups oword ptr [r8 - 16], xmm0 // last 16 bytes (may be overlapping)
ret
@abv255:{$ifdef WITH_ERMS}
cmp rax, r9 // non-temporal move > 512KB
{$ifdef ASMX64AVXNOCONST}
jae @avx
{$else}
jae @sse2
{$endif ASMX64AVXNOCONST}
// 256..512K could use the "rep stosb" ERMSB pattern on all CPUs
cld
{$ifdef WIN64ABI}
mov r8, rdi
mov rdi, dst
mov rcx, cnt
rep stosb
mov rdi, r8
{$else}
mov rcx, cnt
rep stosb
{$endif WIN64ABI}
ret
{$endif WITH_ERMS}
@sse2: movups oword ptr [dst], xmm0 // first unaligned 16 bytes
lea rdx, [dst + rdx - 1]
and rdx, -16
add dst, 16
and dst, -16 // dst is 16-bytes aligned
sub dst, rdx
jnb @last
cmp r10, r9
jae @sse2nt // bypass cache for cnt>512KB
jmp @reg
{$ifdef ASMX64AVXNOCONST}
@avx: test byte ptr [rip + X64CpuFeatures], 1 shl cpuAVX
jz @sse2
movups oword ptr [dst], xmm0 // first unaligned 1..16 bytes
add dst, 16
and dst, -16
movaps oword ptr [dst], xmm0 // aligned 17..32 bytes
vinsertf128 ymm0, ymm0, xmm0, 1
add dst, 16
and dst, -32 // dst is 32-bytes aligned
mov rdx, r8
and rdx, -32
sub dst, rdx
cmp r10, r9
jb @avxreg
jmp @avxnt
{$endif ASMX64AVXNOCONST}
// temporal SSE2 loop
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@reg: movaps oword ptr [rdx + dst], xmm0 // regular loop
add dst, 16
jnz @reg
@last: movups oword ptr [r8 - 16], xmm0 // last unaligned 16 bytes
ret
// non-temporal SSE2 loop
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@sse2nt:movntdq [rdx + dst], xmm0 // non-temporal loop
add dst, 16
jnz @sse2nt
sfence
movups oword ptr [r8 - 16], xmm0
ret
{$ifdef ASMX64AVXNOCONST}
// temporal AVX loop
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@avxreg:vmovaps yword ptr [rdx + dst], ymm0 // regular loop
add dst, 32
jnz @avxreg
@avxok: vmovups yword ptr [r8 - 32], ymm0 // last unaligned 32 bytes
vzeroupper
ret
// non-temporal AVX loop
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
{$ifdef WIN64}
@avxnt: vmovntps [rdx + rcx], ymm0 // non-temporal loop - rcx=dst on Win64
{$else}
@avxnt: vmovntps [rdx + rdi], ymm0 // non-temporal loop - rdi=dst on POSIX
{$endif WIN64}
add dst, 32
jnz @avxnt
sfence
jmp @avxok
{$endif ASMX64AVXNOCONST}
// dedicated branchless sub-functions for 0..8 bytes
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@jmptab:dq @00, @01, @02, @03, @04, @05, @06, @07, @08
@07: mov dword ptr [dst + 3], eax
@03: mov word ptr [dst + 1], ax
@01: mov byte ptr [dst], al
ret
@06: mov dword ptr [dst + 2], eax
@02: mov word ptr [dst], ax
ret
@05: mov byte ptr [dst + 4], al
@04: mov dword ptr [dst], eax
end;
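// crc32fasttab() below is the classical "slicing-by-8" crc32: the per-byte
// step which it unrolls can be sketched in pascal as (assuming tab^[0] is the
// usual crc32 lookup table and tab^[1..7] its byte-shifted variants):
//   crc := not crc;
//   for i := 0 to len - 1 do
//     crc := tab^[0, (crc xor ord(buf[i])) and $ff] xor (crc shr 8);
//   result := not crc;
// the @by8 loop consumes 8 aligned input bytes at once, replacing 8 dependent
// single-table lookups by one independent lookup in each of the 8 sub-tables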
function crc32fasttab(crc: cardinal; buf: PAnsiChar; len: cardinal;
tab: PCrc32tab): cardinal;
{$ifdef FPC} nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
{$ifdef SYSVABI}
mov r8, rdx
mov r9, tab
{$endif SYSVABI}
mov eax, crc
xor ecx, ecx
test buf, buf // crc=eax buf=rdx/rsi len=r8 tab=r9
jz @z
neg r8
jz @z
not eax
cmp r8, -8
jb @head
@sml: mov cl, byte ptr [buf]
add buf, 1
xor cl, al
shr eax, 8
xor eax, dword ptr [rcx * 4 + r9]
add r8, 1
jnz @sml
@0: not eax
@z: ret
@head: {$ifdef SYSVABI}
test sil, 7
{$else}
test dl, 7
{$endif SYSVABI}
jz @align
mov cl, byte ptr [buf]
add buf, 1
xor cl, al
shr eax, 8
xor eax, dword ptr [rcx * 4 + r9]
add r8, 1
jnz @head
not eax
ret
@align: sub buf, r8
add r8, 8
jg @done
push rbx
@by8: mov r10d, eax
mov rcx, qword ptr [buf + r8 - 8]
xor r10, rcx
shr rcx, 32
movzx ebx, cl
mov eax, dword ptr [rbx * 4 + r9 + 1024 * 3]
movzx ebx, ch
shr ecx, 16
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 2]
movzx ebx, cl
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 1]
movzx ebx, ch
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 0]
mov rcx, r10
movzx ebx, cl
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 7]
movzx ebx, ch
shr ecx, 16
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 6]
movzx ebx, cl
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 5]
movzx ebx, ch
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 4]
add r8, 8
jle @by8
xor ecx, ecx
pop rbx
@done: sub r8, 8
jge @e
@tail: mov cl, byte ptr [buf + r8]
xor cl, al
shr eax, 8
xor eax, dword ptr [rcx * 4 + r9]
add r8, 1
jnz @tail
@e: not eax
end;
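// note on the 2951479051793528259 ($28F5C28F5C28F5C3) constant used by
// StrInt32/StrUInt32 below: it equals ceil(2^66 / 25), so for any unsigned
// 64-bit val
//   ((val shr 2) * $28F5C28F5C28F5C3) shr 66 = (val div 4) div 25 = val div 100
// hence "shr rax, 2 / mul rdx / shr rdx, 2" leaves rdx = val div 100 without
// a div instruction, and "imul rdx, -200" combined with the val * 2 index in
// r9 recovers 2 * (val mod 100) as an offset into the TwoDigitLookup table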
function StrInt32(P: PAnsiChar; val: PtrInt): PAnsiChar;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
mov r10, val
sar r10, 63 // r10=0 if val>=0 or -1 if val<0
xor val, r10
sub val, r10 // val=abs(val)
mov rax, val
cmp val, 10
jb @1 // direct process of common val<10
lea r8, [rip + TwoDigitLookup]
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@s: lea P, [P - 2]
cmp rax, 100
jb @2
lea r9, [rax * 2]
shr rax, 2
mov rdx, 2951479051793528259
mul rdx // use power of two reciprocal to avoid division
shr rdx, 2
mov rax, rdx
imul rdx, -200
lea rdx, [rdx + r8]
movzx edx, word ptr [rdx + r9]
mov word ptr [P], dx
cmp rax, 10
jae @s
@1: or al, '0'
mov byte ptr [P - 2], '-'
mov byte ptr [P - 1], al
lea rax, [P + r10 - 1] // includes '-' if val<0
ret
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@2: movzx eax, word ptr [r8 + rax * 2]
mov byte ptr [P - 1], '-'
mov word ptr [P], ax
lea rax, [P + r10] // includes '-' if val<0
end;
function StrUInt32(P: PAnsiChar; val: PtrUInt): PAnsiChar;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
mov rax, val
cmp val, 10
jb @1 // direct process of common val<10
lea r8, [rip + TwoDigitLookup]
@s: lea P, [P - 2]
cmp rax, 100
jb @2
lea r9, [rax * 2]
shr rax, 2
mov rdx, 2951479051793528259
mul rdx // use power of two reciprocal to avoid division
shr rdx, 2
mov rax, rdx
imul rdx, -200
add rdx, r8
movzx rdx, word ptr [rdx + r9]
mov word ptr [P], dx
cmp rax, 10
jae @s
@1: sub P, 1
or al, '0'
mov byte ptr [P], al
@0: mov rax, P
ret
@2: movzx eax, word ptr [r8 + rax * 2]
mov word ptr [P], ax
mov rax, P
end;
{$endif ASMX64}
{$ifdef CPUX64ASM} /// proper compilation on FPC and Delphi XE7+
{$ifdef FPC}
procedure fpc_freemem; external name 'FPC_FREEMEM'; // access to RTL from asm
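// FastAssignNew/FastAssignNewNotVoid below are hand-tuned equivalents of this
// pascal sketch, where refcnt() stands for the counter stored at -_STRCNT in
// the FPC string header (32-bit or SizeInt depending on STRCNT32):
//   old := pointer(d);
//   pointer(d) := s;
//   if old = nil then exit;              // nil test skipped in the NotVoid variant
//   if refcnt(old) < 0 then exit;        // refcnt < 0 means a constant string
//   if InterLockedDecrement(refcnt(old)) = 0 then
//     fpc_freemem(old - _STRRECSIZE);    // last reference: release the whole block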
procedure FastAssignNew(var d; s: pointer); nostackframe; assembler;
asm
mov rax, qword ptr [d]
mov qword ptr [d], s
test rax, rax
jz @z
lea d, qword ptr [rax - _STRRECSIZE] // for fpc_freemem() below
{$ifdef STRCNT32}
cmp dword ptr [rax - _STRCNT], 0 // refcnt=-1 for const
jl @z
lock dec dword ptr [rax - _STRCNT]
{$else}
cmp qword ptr [rax - _STRCNT], 0 // refcnt=-1 for const
jl @z
lock dec qword ptr [rax - _STRCNT]
{$endif STRCNT32}
jbe fpc_freemem
@z:
end;
procedure FastAssignNewNotVoid(var d; s: pointer);
nostackframe; assembler;
asm
mov rax, qword ptr [d]
mov qword ptr [d], s
lea d, qword ptr [rax - _STRRECSIZE] // for fpc_freemem() below
{$ifdef STRCNT32}
cmp dword ptr [rax - _STRCNT], 0 // refcnt=-1 for const
jl @z
lock dec dword ptr [rax - _STRCNT]
{$else}
cmp qword ptr [rax - _STRCNT], 0 // refcnt=-1 for const
jl @z
lock dec qword ptr [rax - _STRCNT]
{$endif STRCNT32}
jbe fpc_freemem
@z:
end;
{$endif FPC}
{
Some numbers, with CITIES_MAX=200000, deleting 1/128 entries
first column (3..23) is the max number of indexes[] chunk to rehash
#abc is the number of slots in the hash table
adjust=.. match DynArrayHashTableAdjust() time fixing the indexes
hash=ms is the time needed to hash input (not impacted by adjusting algorithm)
-> TDynArray.Delete move() now takes more time than the HashTable update :)
1. naive loop
for i := 0 to HashTableSize-1 do
if HashTable[i]>aArrayIndex then
dec(HashTable[i]);
3 #257 adjust=7.95ms 191.7MB/s hash=8us
23 #195075 adjust=4.27s 548.6MB/s hash=2.47ms
2. branchless pure pascal code is about 10x faster!
3 #257 adjust=670us 2.2GB hash=8us
23 #195075 adjust=520.85ms 4.3GB/s hash=2.45ms
3. SSE2 simd assembly makes about 3x improvement
3 #257 adjust=290us 5.1GB hash=8us
23 #195075 adjust=201.53ms 11.3GB/s hash=2.44ms
4. AVX2 simd assembly gives some additional 40% (on my iCore3 cpu)
3 #257 adjust=262us 5.6GB hash=8us
23 #195075 adjust=161.73ms 14.1GB/s hash=2.57ms
}
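// a minimal pascal sketch of the "branchless pure pascal" variant measured
// as 2. above (the compare result is used directly as a 0/1 decrement):
//   for i := 0 to HashTableSize - 1 do
//     dec(HashTable[i], ord(HashTable[i] > aArrayIndex));
// the SSE2/AVX2 code below applies exactly the same adjustment to 16 or 32
// indexes per iteration, via pcmpgtd/vpcmpgtd masks added back to the values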
// brute force O(n) indexes fix after deletion (much faster than full ReHash)
procedure DynArrayHashTableAdjust(P: PIntegerArray; deleted: integer; count: PtrInt);
{$ifdef WIN64ABI}
var
bak6, bak7, bak8: THash128;
asm // Windows x64 calling convention expects to preserve XMM6-XMM15
movups dqword ptr [bak6], xmm6
movups dqword ptr [bak7], xmm7
movups dqword ptr [bak8], xmm8
{$else} {$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif}
mov r8, rdx
mov rcx, rdi
mov rdx, rsi
{$endif WIN64ABI}
xor eax, eax // reset eax high bits for setg al below
movq xmm0, rdx // xmm0 = 128-bit of quad deleted
pshufd xmm0, xmm0, 0
test cl, 3
jnz @1 // paranoid: a dword dynamic array is always dword-aligned
// ensure P is 256-bit aligned (for avx2)
@align: test cl, 31
jz @ok
cmp dword ptr [rcx], edx
setg al // P[]>deleted -> al=1, 0 otherwise
sub dword ptr [rcx], eax // branchless dec(P[])
add rcx, 4
sub r8, 1
jmp @align
@ok: {$ifdef ASMX64AVXNOCONST}
test byte ptr [rip + X64CpuFeatures], 1 shl cpuAVX2
jz @sse2
vpshufd ymm0, ymm0, 0 // shuffle to ymm0 128-bit low lane
vperm2f128 ymm0, ymm0, ymm0, 0 // copy to ymm0 128-bit high lane
// avx process of 128 bytes (32 indexes) per loop iteration
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@avx2: sub r8, 32
vmovdqa ymm1, [rcx] // 4 x 256-bit process = 4 x 8 integers
vmovdqa ymm3, [rcx + 32]
vmovdqa ymm5, [rcx + 64]
vmovdqa ymm7, [rcx + 96]
vpcmpgtd ymm2, ymm1, ymm0 // compare P[]>deleted -> -1, 0 otherwise
vpcmpgtd ymm4, ymm3, ymm0
vpcmpgtd ymm6, ymm5, ymm0
vpcmpgtd ymm8, ymm7, ymm0
vpaddd ymm1, ymm1, ymm2 // adjust by adding -1 / 0
vpaddd ymm3, ymm3, ymm4
vpaddd ymm5, ymm5, ymm6
vpaddd ymm7, ymm7, ymm8
vmovdqa [rcx], ymm1
vmovdqa [rcx + 32], ymm3
vmovdqa [rcx + 64], ymm5
vmovdqa [rcx + 96], ymm7
add rcx, 128
cmp r8, 32
jae @avx2
vzeroupper
jmp @2
{$endif ASMX64AVXNOCONST}
// SSE2 process of 64 bytes (16 indexes) per loop iteration
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@sse2: sub r8, 16
movaps xmm1, dqword [rcx] // 4 x 128-bit process = 4 x 4 integers
movaps xmm3, dqword [rcx + 16]
movaps xmm5, dqword [rcx + 32]
movaps xmm7, dqword [rcx + 48]
movaps xmm2, xmm1 // keep copy for paddd below
movaps xmm4, xmm3
movaps xmm6, xmm5
movaps xmm8, xmm7
pcmpgtd xmm1, xmm0 // quad compare P[]>deleted -> -1 / 0
pcmpgtd xmm3, xmm0
pcmpgtd xmm5, xmm0
pcmpgtd xmm7, xmm0
paddd xmm1, xmm2 // quad adjust by adding -1 / 0
paddd xmm3, xmm4
paddd xmm5, xmm6
paddd xmm7, xmm8
movaps dqword [rcx], xmm1 // quad store back
movaps dqword [rcx + 16], xmm3
movaps dqword [rcx + 32], xmm5
movaps dqword [rcx + 48], xmm7
add rcx, 64
cmp r8, 16
jae @sse2
jmp @2
// trailing indexes (never appearing within DYNARRAYHASH_PO2 range)
@1: sub r8, 1
cmp dword ptr [rcx + r8 * 4], edx
setg al
sub dword ptr [rcx + r8 * 4], eax
@2: test r8, r8
jnz @1
{$ifdef WIN64ABI}
movups xmm6, dqword ptr [bak6]
movups xmm7, dqword ptr [bak7]
movups xmm8, dqword ptr [bak8]
{$endif WIN64ABI}
end;
// DYNARRAYHASH_16BIT version for 16-bit HashTable[] - no AVX2 since count < 64K
procedure DynArrayHashTableAdjust16(P: PWordArray; deleted: cardinal; count: PtrInt);
{$ifdef WIN64ABI}
var
bak6, bak7, bak8: THash128;
asm // Windows x64 calling convention expects to preserve XMM6-XMM15
movups dqword ptr [bak6], xmm6
movups dqword ptr [bak7], xmm7
movups dqword ptr [bak8], xmm8
{$else} {$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif}
mov r8, rdx
mov rcx, rdi
mov rdx, rsi
{$endif WIN64ABI}
mov eax, deleted
shl eax, 16 // for pshufd (inline asm doesn't know about pshufw)
or edx, eax // edx = 32-bit of 2x 16-bit deleted
movq xmm0, rdx // xmm0 = 128-bit of 8x deleted
pshufd xmm0, xmm0, 0
xor eax, eax // reset eax high bits for setg al below
test cl, 1
jnz @1 // paranoid: a word dynamic array is always word-aligned
// ensure P is 128-bit aligned (for movaps)
@align: test cl, 15
jz @sse2
cmp word ptr [rcx], dx
setg al // P[]>deleted -> al=1, 0 otherwise
sub word ptr [rcx], ax // branchless dec(P[])
add rcx, 2
sub r8, 1
jmp @align
// SSE2 process of 64 bytes (32 indexes) per loop iteration
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@sse2: sub r8, 32
movaps xmm1, dqword [rcx] // 4 x 128-bit process = 4 x 8 words
movaps xmm3, dqword [rcx + 16]
movaps xmm5, dqword [rcx + 32]
movaps xmm7, dqword [rcx + 48]
movaps xmm2, xmm1 // keep copy for paddd below
movaps xmm4, xmm3
movaps xmm6, xmm5
movaps xmm8, xmm7
pcmpgtw xmm1, xmm0 // 8x compare P[]>deleted -> -1 / 0
pcmpgtw xmm3, xmm0
pcmpgtw xmm5, xmm0
pcmpgtw xmm7, xmm0
paddw xmm1, xmm2 // 8x adjust by adding -1 / 0
paddw xmm3, xmm4
paddw xmm5, xmm6
paddw xmm7, xmm8
movaps dqword [rcx], xmm1 // 8x store back
movaps dqword [rcx + 16], xmm3
movaps dqword [rcx + 32], xmm5
movaps dqword [rcx + 48], xmm7
add rcx, 64
cmp r8, 32
jae @sse2
jmp @2
// trailing indexes (never appearing within DYNARRAYHASH_PO2 range)
@1: sub r8, 1
cmp word ptr [rcx + r8 * 2], dx
setg al
sub word ptr [rcx + r8 * 2], ax
@2: test r8, r8
jnz @1
{$ifdef WIN64ABI}
movups xmm6, dqword ptr [bak6]
movups xmm7, dqword ptr [bak7]
movups xmm8, dqword ptr [bak8]
{$endif WIN64ABI}
end;
{$ifdef ASMX64AVXNOCONST}
// AVX2 ASM .align 32 for const is not available on Delphi :(
// adapted from https://github.com/simdjson/simdjson - Apache License 2.0
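// the main loop below checks 64 input bytes per iteration: if no byte has its
// most significant bit set (vpmovmskb returns 0), the block contains only
// single-byte (ASCII) characters and needs no further per-byte classification,
// so only blocks with 8-bit bytes go through the simdjson shuffle-table checks
// - the fast path is equivalent to this scalar sketch, p being the current
// 64-byte block (illustration only):
//   ascii := true;
//   for i := 0 to 63 do
//     if ord(p[i]) >= $80 then
//       ascii := false;   // needs the full multi-byte validation (@check)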
function IsValidUtf8Avx2(source: PUtf8Char; sourcelen: PtrInt): boolean;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
test source, source
jz @ok
test sourcelen, sourcelen
jle @ok
{$ifdef WIN64ABI} // Win64 ABI doesn't consider rsi/rdi as volatile
push rsi
push rdi
{$endif WIN64ABI}
push rbp
mov r8, source
mov rdx, sourcelen
mov rsi, r8
mov ecx, 64
mov rax, rsi
mov rdi, rdx
mov rbp, rsp
and rsp, 0FFFFFFFFFFFFFFE0H // align stack at 32 bytes
sub rsp, 160
{$ifdef WIN64ABI}
movaps dqword ptr [rsp + 00H], xmm6
movaps dqword ptr [rsp + 10H], xmm7
movaps dqword ptr [rsp + 20H], xmm8
movaps dqword ptr [rsp + 30H], xmm9
movaps dqword ptr [rsp + 40H], xmm10
movaps dqword ptr [rsp + 50H], xmm11
movaps dqword ptr [rsp + 60H], xmm12
movaps dqword ptr [rsp + 70H], xmm13
movaps dqword ptr [rsp + 80H], xmm14
movaps dqword ptr [rsp + 90H], xmm15
sub rsp, 100H
{$endif WIN64ABI}
cmp rdx, 64
cmovnc rcx, rdx
sub rcx, 64
je @small
vpxor xmm3, xmm3, xmm3
xor esi, esi
vmovdqu ymm7, yword ptr [rip + @0f]
vmovdqu ymm15, yword ptr [rip + @_6]
vmovdqu ymm14, yword ptr [rip + @_7]
vmovdqu ymm13, yword ptr [rip + @_8]
vmovdqa ymm5, ymm3
vmovdqa ymm2, ymm3
// main processing loop, 64 bytes per iteration
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@loop: vmovdqu xmm6, oword ptr [rax + rsi]
vinserti128 ymm0, ymm6, [rax + rsi + 10H], 01H
vmovdqu xmm6, oword ptr [rax + rsi + 20H]
vinserti128 ymm1, ymm6, [rax + rsi + 30H], 01H
add rsi, 64
vpor ymm4, ymm1, ymm0
vpmovmskb rdx, ymm4 // check set MSB of each 64 bytes
test edx, edx
jne @check
vpor ymm2, ymm5, ymm2
vmovdqa ymm4, ymm2
cmp rcx, rsi
ja @loop
// process trailing 0..63 bytes
@trail: sub rdi, rsi
jz @ended
add rsi, rax
vmovdqu xmm0, oword ptr [rip + @20]
lea rdx, qword ptr [rsp + 60H] // copy on stack with space padding
sub rsi, rdx
vmovdqa oword ptr [rdx], xmm0
vmovdqa oword ptr [rdx + 10H], xmm0
vmovdqa oword ptr [rdx + 20H], xmm0
vmovdqa oword ptr [rdx + 30H], xmm0
@by8: sub rdi, 8
jb @by1
mov rax, qword ptr [rsi + rdx]
mov qword ptr [rdx], rax
add rdx, 8 // in-order copy to preserve UTF-8 encoding
jmp @by8
@by1: add rdi, 8
jz @0
@sml: mov al, byte ptr [rsi + rdx]
mov byte ptr [rdx], al
add rdx, 1
sub rdi, 1
jnz @sml
@0: vmovdqa ymm1, yword ptr [rsp + 60H]
vmovdqa ymm2, yword ptr [rsp + 80H]
vpor ymm0, ymm1, ymm2
vpmovmskb rax, ymm0 // check any set MSB
test eax, eax
jne @last
@ended: vpor ymm5, ymm5, ymm4
@final: vptest ymm5, ymm5
sete al
vzeroupper
{$ifdef WIN64ABI}
add rsp, 100H
movaps xmm6, dqword ptr [rsp + 00H]
movaps xmm7, dqword ptr [rsp + 10H]
movaps xmm8, dqword ptr [rsp + 20H]
movaps xmm9, dqword ptr [rsp + 30H]
movaps xmm10, dqword ptr [rsp + 40H]
movaps xmm11, dqword ptr [rsp + 50H]
movaps xmm12, dqword ptr [rsp + 60H]
movaps xmm13, dqword ptr [rsp + 70H]
movaps xmm14, dqword ptr [rsp + 80H]
movaps xmm15, dqword ptr [rsp + 90H]
leave // mov rsp,rbp + pop rbp
pop rdi
pop rsi
{$else}
leave
{$endif WIN64ABI}
ret
@ok: mov al, 1
ret
@small: vpxor xmm4, xmm4, xmm4
xor esi, esi
vmovdqa ymm3, ymm4
vmovdqa ymm5, ymm4
jmp @trail
// validate UTF-8 extra bytes from main loop
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@check: vpsrlw ymm9, ymm0, 4
vpsrlw ymm12, ymm1, 4
vperm2i128 ymm3, ymm3, ymm0, 21H
vpalignr ymm5, ymm0, ymm3, 0FH
vpalignr ymm11, ymm0, ymm3, 0EH
vpsubusb ymm11, ymm11, yword ptr [rip + @_9]
vpalignr ymm3, ymm0, ymm3, 0DH
vperm2i128 ymm0, ymm0, ymm1, 21H
vpsubusb ymm3, ymm3, yword ptr [rip + @_10]
vpalignr ymm8, ymm1, ymm0, 0FH
vpsrlw ymm10, ymm5, 4
vpand ymm5, ymm7, ymm5
vpsrlw ymm6, ymm8, 4
vpalignr ymm4, ymm1, ymm0, 0EH
vpsubusb ymm4, ymm4, yword ptr [rip + @_9]
vpalignr ymm0, ymm1, ymm0, 0DH
vpsubusb ymm0, ymm0, yword ptr [rip + @_10]
vpand ymm10, ymm10, ymm7
vpand ymm6, ymm6, ymm7
vpand ymm8, ymm7, ymm8
vpor ymm3, ymm3, ymm11
vpor ymm0, ymm4, ymm0
vpxor xmm11, xmm11, xmm11
vpshufb ymm10, ymm15, ymm10
vpshufb ymm5, ymm14, ymm5
vpand ymm9, ymm9, ymm7
vpshufb ymm6, ymm15, ymm6
vpshufb ymm8, ymm14, ymm8
vpand ymm12, ymm12, ymm7
vpand ymm5, ymm5, ymm10
vpcmpgtb ymm3, ymm3, ymm11
vpcmpgtb ymm0, ymm0, ymm11
vpshufb ymm9, ymm13, ymm9
vpand ymm3, ymm3, yword ptr [rip + @_11]
vpand ymm0, ymm0, yword ptr [rip + @_11]
vpshufb ymm12, ymm13, ymm12
vpand ymm6, ymm6, ymm8
vpand ymm9, ymm5, ymm9
vpsubusb ymm5, ymm1, yword ptr [rip + @_12]
vpand ymm12, ymm6, ymm12
vpxor ymm9, ymm3, ymm9
vmovdqa ymm3, ymm1
vpxor ymm12, ymm0, ymm12
vpor ymm9, ymm9, ymm12
vpor ymm2, ymm9, ymm2
vmovdqa ymm4, ymm2
cmp rcx, rsi
ja @loop
jmp @trail
// validate UTF-8 extra bytes from input ending
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@last: vmovdqu ymm5, yword ptr [rip + @0f]
vperm2i128 ymm3, ymm3, ymm1, 21H
vmovdqu ymm9, yword ptr [rip + @_7]
vpsrlw ymm11, ymm1, 4
vpalignr ymm0, ymm1, ymm3, 0FH
vmovdqu ymm13, yword ptr [rip + @_10]
vmovdqu ymm14, yword ptr [rip + @_9]
vpsrlw ymm6, ymm0, 4
vpand ymm0, ymm5, ymm0
vpand ymm11, ymm11, ymm5
vmovdqu ymm7, yword ptr [rip + @_6]
vpshufb ymm10, ymm9, ymm0
vpalignr ymm0, ymm1, ymm3, 0EH
vpand ymm6, ymm6, ymm5
vmovdqu ymm8, yword ptr [rip + @_8]
vpalignr ymm3, ymm1, ymm3, 0DH
vperm2i128 ymm1, ymm1, ymm2, 21H
vpsubusb ymm0, ymm0, ymm14
vpsubusb ymm12, ymm3, ymm13
vpalignr ymm3, ymm2, ymm1, 0FH
vpshufb ymm6, ymm7, ymm6
vpsrlw ymm15, ymm3, 4
vpand ymm3, ymm5, ymm3
vpor ymm0, ymm0, ymm12
vpshufb ymm9, ymm9, ymm3
vpsrlw ymm3, ymm2, 4
vpand ymm15, ymm15, ymm5
vpand ymm5, ymm3, ymm5
vpalignr ymm3, ymm2, ymm1, 0EH
vpxor xmm12, xmm12, xmm12
vpalignr ymm1, ymm2, ymm1, 0DH
vpsubusb ymm3, ymm3, ymm14
vpshufb ymm11, ymm8, ymm11
vpsubusb ymm1, ymm1, ymm13
vpcmpgtb ymm0, ymm0, ymm12
vpshufb ymm7, ymm7, ymm15
vpor ymm1, ymm3, ymm1
vpshufb ymm8, ymm8, ymm5
vpsubusb ymm5, ymm2, yword ptr [rip + @_12]
vmovdqu ymm2, yword ptr [rip + @_11]
vpcmpgtb ymm1, ymm1, ymm12
vpand ymm6, ymm6, ymm10
vpand ymm7, ymm7, ymm9
vpand ymm0, ymm0, ymm2
vpand ymm11, ymm6, ymm11
vpand ymm8, ymm7, ymm8
vpxor ymm0, ymm0, ymm11
vpor ymm5, ymm4, ymm5
vpand ymm1, ymm1, ymm2
vpxor ymm1, ymm1, ymm8
vpor ymm0, ymm0, ymm1
vpor ymm5, ymm0, ymm5
jmp @final
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@20: dq 2020202020202020H
dq 2020202020202020H
{$ifdef FPC} align 32 {$else} .align 16 {Delphi doesn't allow 32 :(} {$endif}
@0f: dq 0F0F0F0F0F0F0F0FH
dq 0F0F0F0F0F0F0F0FH
dq 0F0F0F0F0F0F0F0FH
dq 0F0F0F0F0F0F0F0FH
@_6: dq 0202020202020202H
dq 4915012180808080H
dq 0202020202020202H
dq 4915012180808080H
@_7: dq 0CBCBCB8B8383A3E7H
dq 0CBCBDBCBCBCBCBCBH
dq 0CBCBCB8B8383A3E7H
dq 0CBCBDBCBCBCBCBCBH
@_8: dq 0101010101010101H
dq 01010101BABAAEE6H
dq 0101010101010101H
dq 01010101BABAAEE6H
@_9: dq 0DFDFDFDFDFDFDFDFH
dq 0DFDFDFDFDFDFDFDFH
dq 0DFDFDFDFDFDFDFDFH
dq 0DFDFDFDFDFDFDFDFH
@_10: dq 0EFEFEFEFEFEFEFEFH
dq 0EFEFEFEFEFEFEFEFH
dq 0EFEFEFEFEFEFEFEFH
dq 0EFEFEFEFEFEFEFEFH
@_11: dq 8080808080808080H
dq 8080808080808080H
dq 8080808080808080H
dq 8080808080808080H
@_12: db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH
db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH
db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH
db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0EFH, 0DFH, 0BFH
end;
// inspired by https://github.com/aklomp/base64 - BSD-2-Clause License
// - less unrolled, but (much) faster thanks to manually tuned asm
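// the asm below encodes 24 input bytes into 32 base64 chars per round
// (3 bytes -> 4 chars), two rounds per main loop iteration; the
// 0AAAAAAAAAAAAAAABH multiplier equals ceil(2^65 / 3), so "mul r10" followed
// by "shr rdx, 4" yields
//   rounds = ((blen - 4) * ceil(2^65 / 3)) shr 68 = (blen - 4) div 24
// without any div instruction - the remaining 4..27 bytes are left in b/blen
// for the caller to finish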
procedure Base64EncodeAvx2(var b: PAnsiChar; var blen: PtrUInt;
var b64: PAnsiChar);
{$ifdef WIN64ABI}
var
bak6, bak7, bak8, bak9, bak10, bak11, bak12, bak13, bak14, bak15: THash128;
asm // Windows x64 calling convention expects to preserve XMM6-XMM15
movups dqword ptr [bak6], xmm6
movups dqword ptr [bak7], xmm7
movups dqword ptr [bak8], xmm8
movups dqword ptr [bak9], xmm9
movups dqword ptr [bak10], xmm10
movups dqword ptr [bak11], xmm11
movups dqword ptr [bak12], xmm12
movups dqword ptr [bak13], xmm13
movups dqword ptr [bak14], xmm14
movups dqword ptr [bak15], xmm15
push rsi // Win64 ABI doesn't consider rsi/rdi as volatile
push rdi
mov rsi, r8 // rsi = b64
mov r8, rdx // r8 = blen
mov rdi, rcx // rdi = b
{$else} {$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif}
mov r8, rsi // r8 = blen
mov rsi, rdx // rsi = b64 rdi = b
{$endif WIN64ABI}
mov rcx, qword ptr [r8]
cmp rcx, 31
jbe @done
lea rdx, qword ptr [rcx - 4]
vmovdqu ymm0, yword ptr [rip + @c9]
mov r10, 0AAAAAAAAAAAAAAABH
vmovdqu ymm7, yword ptr [rip + @c10]
mov rax, rdx
vmovdqu ymm5, yword ptr [rip + @c11]
vmovdqu ymm8, yword ptr [rip + @c12]
mul r10
vmovdqu ymm9, yword ptr [rip + @c13]
vmovdqu ymm10, yword ptr [rip + @c14]
vmovdqu ymm6, yword ptr [rip + @c16]
vmovdqu ymm4, yword ptr [rip + @c15]
vmovdqu ymm11, yword ptr [rip + @c17]
shr rdx, 4 // rdx = rounds = (blen - 4) / 24
lea rax, qword ptr [rdx + rdx * 2]
shl rax, 3
sub rcx, rax
mov qword ptr [r8], rcx // blen := blen - rounds * 24 = bytes left for the caller
mov rcx, qword ptr [rdi] // rcx = [rdi] = b
mov rax, qword ptr [rsi] // rax = [rsi] = b64
// initial 24 bytes output process
vmovdqu xmm3, oword ptr [rcx]
vinserti128 ymm1, ymm3, oword ptr [rcx + 16], 1
vpermd ymm1, ymm0, ymm1
vpshufb ymm1, ymm1, ymm7
vpand ymm0, ymm5, ymm1
vpmulhuw ymm2, ymm0, ymm8
vpand ymm0, ymm9, ymm1
vpmullw ymm0, ymm10, ymm0
vpor ymm0, ymm0, ymm2
vpcmpgtb ymm2, ymm0, ymm6
vpsubusb ymm1, ymm0, ymm4
vpsubb ymm1, ymm1, ymm2
vpshufb ymm1, ymm11, ymm1
vpaddb ymm0, ymm1, ymm0
vmovdqu oword ptr [rax], xmm0
vextracti128 oword ptr [rax + 16], ymm0, 1
add rax, 32
add rcx, 20
sub rdx, 1
je @10
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
// process 48 input bytes per loop iteration into 64 encoded bytes
@9: cmp rdx, 1
je @12
// whole loop logic is fully interleaved to leverage out-of-order CPU execution units
vmovdqu xmm1, oword ptr [rcx]
vmovdqu xmm3, oword ptr [rcx + 24]
vinserti128 ymm1, ymm1, oword ptr [rcx + 16], 1
vinserti128 ymm3, ymm3, oword ptr [rcx + 24 + 16], 1
vpshufb ymm1, ymm1, ymm7
vpshufb ymm3, ymm3, ymm7
vpand ymm0, ymm5, ymm1
vpand ymm12, ymm5, ymm3
vpmulhuw ymm2, ymm0, ymm8
vpmulhuw ymm14, ymm12, ymm8
vpand ymm0, ymm9, ymm1
vpand ymm12, ymm9, ymm3
vpmullw ymm0, ymm10, ymm0
vpmullw ymm12, ymm10, ymm12
vpor ymm0, ymm0, ymm2
vpor ymm12, ymm12, ymm14
vpcmpgtb ymm2, ymm0, ymm6
vpcmpgtb ymm15, ymm12, ymm6
vpsubusb ymm1, ymm0, ymm4
vpsubusb ymm14, ymm12, ymm4
vpsubb ymm1, ymm1, ymm2
vpsubb ymm14, ymm14, ymm15
vpshufb ymm1, ymm11, ymm1
vpshufb ymm14, ymm11, ymm14
vpaddb ymm0, ymm1, ymm0
vpaddb ymm12, ymm14, ymm12
vmovdqu oword ptr [rax], xmm0
vextracti128 oword ptr [rax + 16], ymm0, 1
vmovdqu oword ptr [rax + 32], xmm12
vextracti128 oword ptr [rax + 48], ymm12, 1
add rcx, 48
add rax, 64
sub rdx, 2
jne @9
@10: add rcx, 4
mov qword ptr [rsi], rax
mov qword ptr [rdi], rcx
vzeroupper
{$ifdef WIN64ABI}
jmp @done
{$else}
ret
{$endif WIN64ABI}
// trailing 24 bytes
@12: vmovdqu xmm3, oword ptr [rcx]
vinserti128 ymm1, ymm3, oword ptr [rcx + 16], 1
vpshufb ymm1, ymm1, ymm7
vpand ymm0, ymm5, ymm1
vpmulhuw ymm8, ymm0, ymm8
vpand ymm0, ymm9, ymm1
vpmullw ymm0, ymm10, ymm0
vpor ymm0, ymm0, ymm8
vpcmpgtb ymm6, ymm0, ymm6
vpsubusb ymm4, ymm0, ymm4
vpsubb ymm4, ymm4, ymm6
vpshufb ymm11, ymm11, ymm4
vpaddb ymm0, ymm11, ymm0
vmovdqu oword ptr [rax], xmm0
vextracti128 oword ptr [rax + 16], ymm0, 1
add rcx, 24
add rax, 32
jmp @10
{$ifdef FPC} align 32 {$else} .align 16 {Delphi doesn't allow 32 :(} {$endif}
@c9: dq 0000000000000000H
dq 0000000200000001H
dq 0000000400000003H
dq 0000000600000005H
@c10: dq 0809070805060405H
dq 0E0F0D0E0B0C0A0BH
dq 0405030401020001H
dq 0A0B090A07080607H
@c11: dq 0FC0FC000FC0FC00H
dq 0FC0FC000FC0FC00H
dq 0FC0FC000FC0FC00H
dq 0FC0FC000FC0FC00H
@c12: dq 0400004004000040H
dq 0400004004000040H
dq 0400004004000040H
dq 0400004004000040H
@c13: dq 003F03F0003F03F0H
dq 003F03F0003F03F0H
dq 003F03F0003F03F0H
dq 003F03F0003F03F0H
@c14: dq 0100001001000010H
dq 0100001001000010H
dq 0100001001000010H
dq 0100001001000010H
@c15: dq 3333333333333333H
dq 3333333333333333H
dq 3333333333333333H
dq 3333333333333333H
@c16: dq 1919191919191919H
dq 1919191919191919H
dq 1919191919191919H
dq 1919191919191919H
@c17: dq 0FCFCFCFCFCFC4741H
dq 0000F0EDFCFCFCFCH
dq 0FCFCFCFCFCFC4741H
dq 0000F0EDFCFCFCFCH
@done: {$ifdef WIN64ABI}
pop rdi
pop rsi
movups xmm6, dqword ptr [bak6]
movups xmm7, dqword ptr [bak7]
movups xmm8, dqword ptr [bak8]
movups xmm9, dqword ptr [bak9]
movups xmm10, dqword ptr [bak10]
movups xmm11, dqword ptr [bak11]
movups xmm12, dqword ptr [bak12]
movups xmm13, dqword ptr [bak13]
movups xmm14, dqword ptr [bak14]
movups xmm15, dqword ptr [bak15]
{$endif WIN64ABI}
end;
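// Base64DecodeAvx2() below is the reverse transform: 4 base64 chars carry
// 4 x 6 = 24 bits = 3 bytes, so each loop iteration decodes 32 input chars
// into 24 output bytes; the two vpshufb lookups classify and translate the
// characters, and vptest triggers the @err path on any invalid one, restoring
// b64/b64len so the caller can locate the faulty input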
procedure Base64DecodeAvx2(var b64: PAnsiChar; var b64len: PtrInt;
var b: PAnsiChar);
{$ifdef WIN64ABI}
var
bak6, bak7, bak8, bak9, bak10, bak11, bak12: THash128;
asm // Windows x64 calling convention expects to preserve XMM6-XMM15
movups dqword ptr [bak6], xmm6
movups dqword ptr [bak7], xmm7
movups dqword ptr [bak8], xmm8
movups dqword ptr [bak9], xmm9
movups dqword ptr [bak10], xmm10
movups dqword ptr [bak11], xmm11
movups dqword ptr [bak12], xmm12
push rsi // Win64 ABI doesn't consider rsi/rdi as volatile
push rdi
mov rsi, rdx
mov rdx, r8
mov rdi, rcx
{$else} {$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif}
{$endif WIN64ABI}
// rcx/rdi=b64 rdx/rsi=b64len r8/rdx=b
// on decoding error, b64 will point to the faulty input
mov r8, qword ptr [rsi]
cmp r8, 44
jbe @done
lea r9, qword ptr [r8 - 0DH]
vmovdqu ymm1, yword ptr [rip + @c0]
vmovdqu ymm5, yword ptr [rip + @c1]
mov rax, r9
and r9, 0FFFFFFFFFFFFFFE0H
vmovdqu ymm4, yword ptr [rip + @c2]
vmovdqu ymm9, yword ptr [rip + @c3]
sub r8, r9
shr rax, 5 // rax = rounds
vmovdqu ymm8, yword ptr [rip + @c4]
vmovdqu ymm3, yword ptr [rip + @c5]
mov qword ptr [rsi], r8 // set final b64len
vmovdqu ymm2, yword ptr [rip + @c6]
vmovdqu ymm7, yword ptr [rip + @c7]
vmovdqu ymm6, yword ptr [rip + @c8]
mov r8, qword ptr [rdi] // r8 = [rdi] = b64
mov r9, qword ptr [rdx] // r9 = [rdx] = b
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
// decode 32 bytes on input into 24 binary bytes per loop iteration
@1: vmovdqu xmm0, oword ptr [r8]
vinserti128 ymm10, ymm0, oword ptr [r8 + 16], 1
vpsrld ymm0, ymm10, 4
vpand ymm11, ymm1, ymm0
vpand ymm0, ymm1, ymm10
vpshufb ymm12, ymm5, ymm11
vpshufb ymm0, ymm4, ymm0
vptest ymm0, ymm12
jnz @err
add r8, 32
vpcmpeqb ymm0, ymm10, ymm9
vpaddb ymm0, ymm0, ymm11
vpshufb ymm0, ymm8, ymm0
vpaddb ymm0, ymm0, ymm10
vpmaddubsw ymm0, ymm0, ymm3
vpmaddwd ymm0, ymm0, ymm2
vpshufb ymm0, ymm0, ymm7
vpermd ymm0, ymm6, ymm0
vmovdqu oword ptr [r9], xmm0
vextracti128 oword ptr [r9 + 16], ymm0, 1
add r9, 24
sub rax, 1
jne @1
jmp @8
@err: shl rax, 5
add qword ptr [rsi], rax // restore proper b64len on error
@8: mov qword ptr [rdi], r8
mov qword ptr [rdx], r9
vzeroupper
{$ifdef WIN64ABI}
jmp @done
{$else}
ret
{$endif WIN64ABI}
{$ifdef FPC} align 32 {$else} .align 16 {Delphi doesn't allow 32 :(} {$endif}
@c0: dq 2F2F2F2F2F2F2F2FH
dq 2F2F2F2F2F2F2F2FH
dq 2F2F2F2F2F2F2F2FH
dq 2F2F2F2F2F2F2F2FH
@c1: dq 0804080402011010H
dq 1010101010101010H
dq 0804080402011010H
dq 1010101010101010H
@c2: dq 1111111111111115H
dq 1A1B1B1B1A131111H
dq 1111111111111115H
dq 1A1B1B1B1A131111H
@c3: dq 2F2F2F2F2F2F2F2FH
dq 2F2F2F2F2F2F2F2FH
dq 2F2F2F2F2F2F2F2FH
dq 2F2F2F2F2F2F2F2FH
@c4: dq 0B9B9BFBF04131000H
dq 0000000000000000H
dq 0B9B9BFBF04131000H
dq 0000000000000000H
@c5: dq 0140014001400140H
dq 0140014001400140H
dq 0140014001400140H
dq 0140014001400140H
@c6: dq 0001100000011000H
dq 0001100000011000H
dq 0001100000011000H
dq 0001100000011000H
@c7: dq 090A040506000102H
dq 0FFFFFFFF0C0D0E08H
dq 090A040506000102H
dq 0FFFFFFFF0C0D0E08H
@c8: dq 0000000100000000H
dq 0000000400000002H
dq 0000000600000005H
dq 0FFFFFFFFFFFFFFFFH
@done: {$ifdef WIN64ABI}
pop rdi
pop rsi
movups xmm6, dqword ptr [bak6]
movups xmm7, dqword ptr [bak7]
movups xmm8, dqword ptr [bak8]
movups xmm9, dqword ptr [bak9]
movups xmm10, dqword ptr [bak10]
movups xmm11, dqword ptr [bak11]
movups xmm12, dqword ptr [bak12]
{$endif WIN64ABI}
end;
{$endif ASMX64AVXNOCONST}
{$endif CPUX64ASM} // SSE2 asm is invalid prior to Delphi XE7 (to be refined)
// functions below are always available, even on DARWIN
function SortDynArrayInteger(const A, B): integer;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
mov r8d, dword ptr [A]
mov edx, dword ptr [B]
xor eax, eax
xor ecx, ecx
cmp r8d, edx
setl cl
setg al
sub eax, ecx
end;
function SortDynArrayCardinal(const A, B): integer;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
mov ecx, dword ptr [A]
mov edx, dword ptr [B]
xor eax, eax
cmp ecx, edx
seta al
sbb eax, 0
end;
function SortDynArrayInt64(const A, B): integer;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
mov r8, qword ptr [A]
mov rdx, qword ptr [B]
xor eax, eax
xor ecx, ecx
cmp r8, rdx
setl cl
setg al
sub eax, ecx
end;
function SortDynArrayQWord(const A, B): integer;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
mov rcx, qword ptr [A]
mov rdx, qword ptr [B]
xor eax, eax
cmp rcx, rdx
seta al
sbb eax, 0
end;
function SortDynArrayPointer(const A, B): integer;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
mov rcx, qword ptr [A]
mov rdx, qword ptr [B]
xor eax, eax
cmp rcx, rdx
seta al
sbb eax, 0
end;
function SortDynArrayDouble(const A, B): integer;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
movsd xmm0, qword ptr [A]
movsd xmm1, qword ptr [B]
xor eax, eax
xor edx, edx
comisd xmm0, xmm1
seta al
setb dl
sub eax, edx
end;
function SortDynArraySingle(const A, B): integer;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
movss xmm0, dword ptr [A]
movss xmm1, dword ptr [B]
xor eax, eax
xor edx, edx
comiss xmm0, xmm1
seta al
setb dl
sub eax, edx
end;
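// SortDynArrayAnsiString() below is a hand-tuned equivalent of this pascal
// sketch, treating A/B as RawUtf8 variables (empty strings are nil pointers,
// so dereferencing a non-nil value is always safe - illustration only):
//   if pointer(A) = pointer(B) then exit(0);
//   if pointer(B) = nil then exit(1);
//   if pointer(A) = nil then exit(-1);
//   result := ord(A[1]) - ord(B[1]);      // quick first-char check
//   if result <> 0 then exit;
//   n := min(length(A), length(B));
//   for i := 1 to n do
//     if A[i] <> B[i] then
//       exit(ord(A[i]) - ord(B[i]));      // first differing byte
//   result := length(A) - length(B);      // common part is equal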
function SortDynArrayAnsiString(const A, B): integer;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
// x86_64 version optimized for AnsiString/RawUtf8/RawByteString types
mov rax, qword ptr [A]
mov rdx, qword ptr [B]
cmp rax, rdx
je @z
test rax, rdx
jz @maynil
@f: movzx ecx, byte ptr [rax] // first char comparison
movzx r8d, byte ptr [rdx] // 32-bit to avoid overflow
sub ecx, r8d
je @s
mov eax, ecx // branchless execution on Quicksort
ret
@1: mov eax, 1
ret
@z: xor eax, eax
ret
@maynil:test rdx, rdx // A or B may be ''
jz @1
test rax, rax
jnz @f
dec eax
ret
{$ifdef FPC}
@s: mov r9, qword ptr [rax - _STRLEN] // TStrLen=SizeInt on FPC
mov r8, r9
sub r9, qword ptr [rdx - _STRLEN] // r9 = length(A)-length(B)
{$else}
.align 16
@s: mov r9d, dword ptr [rax - _STRLEN] // TStrLen=integer on Delphi
mov r8, r9
mov r10d, dword ptr [rdx - _STRLEN]
sub r9, r10 // r9 = length(A)-length(B)
{$endif FPC}
adc rcx, -1
and rcx, r9
sub rcx, r8 // rcx = -min(length(A),length(B))
sub rax, rcx
sub rdx, rcx
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@by8: mov r10, qword ptr [rax + rcx] // compare 8 bytes at once
xor r10, qword ptr [rdx + rcx]
jnz @d
add rcx, 8
js @by8
@eq: mov eax, r9d // all chars equal -> returns length(A)-length(B)
ret
@d: bsf r10, r10 // char differs -> returns pbyte(A)^-pbyte(B)^
shr r10, 3
add rcx, r10
jns @eq
movzx eax, byte ptr [rax + rcx]
movzx edx, byte ptr [rdx + rcx]
sub eax, edx
end; // note: SSE4.2 versions may read up to 16 bytes after the buffer - this one won't
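// Hash32() below computes a simple "sum of sums" checksum over 32-bit words,
// equivalent to this pascal sketch (s1 = eax, s2 = r9d in the asm):
//   s1 := 0;
//   s2 := 0;
//   for i := 0 to (Len div 4) - 1 do
//   begin
//     inc(s1, Data[i]);     // plain cardinal sum (with wrap-around)
//     inc(s2, s1);          // sum of the running sums
//   end;
//   inc(s1, trailing 1..3 bytes, zero-extended);  // if Len mod 4 <> 0
//   inc(s2, s1);
//   result := s1 xor (s2 shl 16);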
function Hash32(Data: PCardinalArray; Len: integer): cardinal;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif}
xor eax, eax
xor r9d, r9d
test Data, Data
jz @z
{$ifdef WIN64ABI} // rcx/rdi=Data edx/esi=Len
mov r8, rdx
shr r8, 4
{$else}
mov edx, esi
shr esi, 4
{$endif WIN64ABI}
jz @by4
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@by16: add eax, dword ptr [Data]
add r9d, eax
add eax, dword ptr [Data + 4]
add r9d, eax
add eax, dword ptr [Data + 8]
add r9d, eax
add eax, dword ptr [Data + 12]
add r9d, eax
add Data, 16
{$ifdef WIN64ABI}
sub r8d, 1
{$else}
sub esi, 1
{$endif WIN64ABI}
jnz @by16
@by4: mov dh, dl
and dl, 15
jz @0
shr dl, 2
jz @rem
@4: add eax, dword ptr [Data]
add r9d, eax
add Data, 4
dec dl
jnz @4
@rem: and dh, 3
jz @0
dec dh
jz @1
dec dh
jz @2
mov ecx, dword ptr [Data]
and ecx, $ffffff
jmp @e
@2: movzx ecx, word ptr [Data]
jmp @e
@1: movzx ecx, byte ptr [Data]
@e: add eax, ecx
@0: add r9d, eax
shl r9d, 16
xor eax, r9d
@z:
end;
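// xxHash32() below uses the standard xxHash 32-bit primes, which appear as
// signed decimal literals in the asm:
//   -1640531535 = $9E3779B1 = PRIME32_1     -2048144777 = $85EBCA77 = PRIME32_2
//   -1028477379 = $C2B2AE3D = PRIME32_3      668265263 = $27D4EB2F = PRIME32_4
//     374761393 = $165667B1 = PRIME32_5
// and the four accumulators are seeded as usual: v1 = crc + PRIME32_1 + PRIME32_2
// ($24234428), v2 = crc + PRIME32_2 (-$7A143589), v3 = crc, v4 = crc - PRIME32_1
// ($61C8864F)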
function xxHash32(crc: cardinal; P: PAnsiChar; len: cardinal): cardinal;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe{$endif}
{$ifdef SYSVABI} // crc=rdi P=rsi len=rdx
mov r8, rdi
mov rcx, rsi
{$else} // crc=rcx P=rdx len=r8
mov r10, r8
mov r8, rcx
mov rcx, rdx
mov rdx, r10
push rsi // Win64 expects those registers to be preserved
push rdi
{$endif SYSVABI}
// P=rcx len=rdx crc=r8
push r12
push rbx
mov r12d, -1640531535
lea r10, [rcx + rdx]
lea eax, [r8 + 165667B1H]
cmp rdx, 15
jbe @2
lea rsi, [r10 - 16]
lea ebx, [r8 + 24234428H]
lea edi, [r8 - 7A143589H]
lea eax, [r8 + 61C8864FH]
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@1: imul r9d, dword ptr [rcx], -2048144777
add rcx, 16
imul r11d, dword ptr [rcx - 12], -2048144777
add ebx, r9d
lea r9d, [r11 + rdi]
rol ebx, 13
rol r9d, 13
imul ebx, r12d
imul edi, r9d, -1640531535
imul r9d, dword ptr [rcx - 8], -2048144777
add r8d, r9d
imul r9d, dword ptr [rcx - 4], -2048144777
rol r8d, 13
imul r8d, r12d
add eax, r9d
rol eax, 13
imul eax, r12d
cmp rsi, rcx
jnc @1
rol edi, 7
rol ebx, 1
rol r8d, 12
mov r9d, edi
ror eax, 14
add r9d, ebx
add r8d, r9d
add eax, r8d
@2: lea r9, [rcx+4H]
add eax, edx
cmp r10, r9
jc @4
mov r8, r9
@3: imul edx, dword ptr [r8 - 4], -1028477379
add r8, 4
add eax, edx
ror eax, 15
imul eax, 668265263
cmp r10, r8
jnc @3
lea rdx, [r10 - 4]
sub rdx, rcx
mov rcx, rdx
and rcx, 0FFFFFFFFFFFFFFFCH
add rcx, r9
@4: cmp r10, rcx
jbe @6
@5: movzx edx, byte ptr [rcx]
add rcx, 1
imul edx, 374761393
add eax, edx
rol eax, 11
imul eax, r12d
cmp r10, rcx
jnz @5
@6: mov edx, eax
shr edx, 15
xor eax, edx
imul eax, -2048144777
mov edx, eax
shr edx, 13
xor eax, edx
imul eax, -1028477379
mov edx, eax
shr edx, 16
xor eax, edx
pop rbx
pop r12
{$ifndef SYSVABI}
pop rdi
pop rsi
{$endif SYSVABI}
end;
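// GetBitsCountPas() below is the classical SWAR population count: pairs of
// bits, then nibbles, then bytes are summed in parallel inside the qword - a
// pascal sketch of the very same steps as the asm:
//   v := v - ((v shr 1) and $5555555555555555);                         // 2-bit sums
//   v := (v and $3333333333333333) + ((v shr 2) and $3333333333333333); // 4-bit sums
//   v := (v + (v shr 4)) and $0f0f0f0f0f0f0f0f;                         // 8-bit sums
//   result := (v * $0101010101010101) shr 56;                           // add all bytes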
function GetBitsCountPas(value: PtrInt): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
mov rax, value
mov rdx, value
shr rax, 1
mov rcx, $5555555555555555
mov r8, $3333333333333333
mov r10, $0f0f0f0f0f0f0f0f
mov r11, $0101010101010101
and rax, rcx
sub rdx, rax
mov rax, rdx
shr rdx, 2
and rax, r8
and rdx, r8
add rax, rdx
mov rdx, rax
shr rax, 4
add rax, rdx
and rax, r10
imul rax, r11
shr rax, 56
end;
procedure mul64x64(const left, right: QWord; out product: THash128Rec);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif}
// uses built-in 64-bit -> 128-bit multiplication
{$ifdef WIN64ABI}
mov rax, rcx
mul rdx
{$else}
mov r8, rdx
mov rax, rdi
mul rsi
{$endif WIN64ABI}
mov qword ptr [r8], rax
mov qword ptr [r8 + 8], rdx
end;
function bswap32(a: cardinal): cardinal;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
mov eax, a
bswap eax
end;
function bswap64(const a: QWord): QWord;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
mov rax, a
bswap rax
end;
procedure bswap64array(a, b: PQWordArray; n: PtrInt);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
@1: mov rax, qword ptr [a]
bswap rax
mov qword ptr [b], rax
add a, 8
add b, 8
sub n, 1
jnz @1
end;
function StrLen(S: pointer): PtrInt;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
// from GPL strlen64.asm by Agner Fog - www.agner.org/optimize
{$ifdef WIN64ABI}
mov rax, rcx // get pointer to string from rcx
mov r8, rcx // copy pointer
test rcx, rcx
{$else}
mov rax, rdi
mov ecx, edi
test rdi, rdi
{$endif WIN64ABI}
jz @null // returns 0 if S=nil
// rax=s,ecx=32-bit of s
pxor xmm0, xmm0 // set to zero
and ecx, 15 // lower 4 bits indicate misalignment
and rax, -16 // align pointer by 16
// will never read outside a memory page boundary, so won't trigger GPF
movaps xmm1, [rax] // read from nearest preceding boundary
pcmpeqb xmm1, xmm0 // compare 16 bytes with zero
pmovmskb edx, xmm1 // get one bit for each byte result
shr edx, cl // shift out false bits
shl edx, cl // shift back again
bsf edx, edx // find first 1-bit
jnz @L2 // found
// Main loop, search 16 bytes at a time
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@L1: add rax, 10H // increment pointer by 16
movaps xmm1, [rax] // read 16 bytes aligned
pcmpeqb xmm1, xmm0 // compare 16 bytes with zero
pmovmskb edx, xmm1 // get one bit for each byte result
bsf edx, edx // find first 1-bit
// (moving the bsf out of the loop and using test here would be faster
// for long strings on old processors, but we are assuming that most
// strings are short, and newer processors have higher priority)
jz @L1 // loop if not found
@L2: // Zero-byte found. Compute string length
{$ifdef WIN64ABI}
sub rax, r8 // subtract start address
{$else}
sub rax, rdi
{$endif WIN64ABI}
add rax, rdx // add byte index
@null:
end;
function PosChar(Str: PUtf8Char; Chr: AnsiChar): PUtf8Char;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
// fast SSE2 version searching for both Chr and #0 over 16 bytes at once
{$ifdef WIN64ABI}
movzx edx, dl
mov rax, rcx // get pointer to string from rcx
imul edx, $01010101
test rcx, rcx
{$else}
imul edx, esi, $01010101
mov rax, rdi
mov ecx, edi
test rdi, rdi
{$endif WIN64ABI}
jz @null // returns 0 if S=nil
movd xmm1, edx
// rax=Str, ecx=32-bit of Str, xmm1=Chr
pxor xmm0, xmm0 // set xmm0 to zero
pshufd xmm1, xmm1, 0 // set Chr into all bytes of xmm1
and ecx, 15 // lower 4 bits indicate misalignment
and rax, -16 // align pointer by 16
// will never read outside a memory page boundary, so won't trigger GPF
movaps xmm2, [rax] // read from nearest preceding boundary
movaps xmm3, xmm2
pcmpeqb xmm2, xmm0 // compare 16 bytes with zero
pcmpeqb xmm3, xmm1 // compare 16 bytes with Chr
por xmm2, xmm3
pmovmskb edx, xmm2 // get one bit for each byte result
shr edx, cl // shift out false bits
shl edx, cl // shift back again
bsf edx, edx // find first 1-bit
jnz @L2 // found
// Main loop, search 16 bytes at a time
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@L1: add rax, 10H // increment pointer by 16
movaps xmm2, [rax] // read 16 bytes aligned
movaps xmm3, xmm2
pcmpeqb xmm2, xmm0 // compare 16 bytes with zero
pcmpeqb xmm3, xmm1 // compare 16 bytes with Chr
por xmm2, xmm3
pmovmskb edx, xmm2 // get one bit for each byte result
bsf edx, edx // find first 1-bit
// (moving the bsf out of the loop and using test here would be faster
// for long strings on old processors, but we are assuming that most
// strings are short, and newer processors have higher priority)
jz @L1 // loop if not found
@L2: // Zero or Chr byte found
add rax, rdx // add byte index for rax = match address
cmp byte ptr [rax], 0
jz @z // return nil if zero was reached
@null: ret
@z: xor eax, eax
end;
function BufferLineLength(Text, TextEnd: PUtf8Char): PtrInt;
{$ifdef FPC} nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
{$ifdef WIN64ABI} // Win64 ABI to System-V ABI
push rsi
push rdi
mov rdi, rcx
mov rsi, rdx
{$endif WIN64ABI}
mov r8, rsi
sub r8, rdi // rdi=Text, rsi=TextEnd, r8=TextLen
jz @fail
mov ecx, edi
movaps xmm0, [rip + @for10]
movaps xmm1, [rip + @for13]
and rdi, -16 // check first aligned 16 bytes
and ecx, 15 // lower cl 4 bits indicate misalignment
movaps xmm2, [rdi]
movaps xmm3, xmm2
pcmpeqb xmm2, xmm0
pcmpeqb xmm3, xmm1
por xmm3, xmm2
pmovmskb eax, xmm3
shr eax, cl // shift out unaligned bytes
bsf eax, eax
jz @main
add rax, rcx
add rax, rdi
sub rax, rsi
jae @fail // don't exceed TextEnd
add rax, r8 // rax = TextFound - TextEnd + (TextEnd - Text) = offset
{$ifdef WIN64ABI}
pop rdi
pop rsi
{$endif WIN64ABI}
ret
@main: add rdi, 16
sub rdi, rsi
jae @fail
jmp @by16
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@for10: dq $0a0a0a0a0a0a0a0a
dq $0a0a0a0a0a0a0a0a
@for13: dq $0d0d0d0d0d0d0d0d
dq $0d0d0d0d0d0d0d0d
@by16: movaps xmm2, [rdi + rsi] // check 16 bytes per loop
movaps xmm3, xmm2
pcmpeqb xmm2, xmm0
pcmpeqb xmm3, xmm1
por xmm3, xmm2
pmovmskb eax, xmm3
test eax, eax
jnz @found
add rdi, 16
jnc @by16
@fail: mov rax, r8 // returns TextLen if no CR/LF found
{$ifdef WIN64ABI}
pop rdi
pop rsi
{$endif WIN64ABI}
ret
@found: bsf eax, eax
add rax, rdi
jc @fail
add rax, r8
{$ifdef WIN64ABI}
pop rdi
pop rsi
{$endif WIN64ABI}
end;
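// ByteScanIndex/WordScanIndex/IntegerScanIndex below share the same SSE2
// opening trick: the first load is rounded down to a 16-byte boundary, and
// with m = misalignment of P the sequence
//   shl eax, cl / and eax, $FFFF0000 / shr eax, cl    (cl = 16 - m)
// discards the match bits of the m bytes preceding P while keeping the
// remaining bits at their original positions in the 16-bit mask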
function ByteScanIndex(P: PByteArray; Count: PtrInt; Value: byte): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
{$ifdef WIN64ABI}
movzx eax, r8b
mov r8, rcx
test rdx, rdx
{$else}
movzx eax, dl
mov rcx, rdi
mov r8, rdi
mov rdx, rsi
test rsi, rsi
{$endif WIN64ABI}
jbe @no
// eax=Value, rcx=P rdx=Count
imul eax, $01010101
and rcx, -16
movd xmm1, eax
movaps xmm0, [rcx] // check first aligned 16 bytes
add rcx, 16
pshufd xmm1, xmm1, 0
sub rcx, r8
pcmpeqb xmm0, xmm1
pmovmskb eax, xmm0
shl eax, cl
and eax, $FFFF0000
shr eax, cl
jnz @fnd
cmp rdx, rcx
jbe @no
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@by16: movaps xmm0, [r8 + rcx] // next 16 bytes
add rcx, 16
pcmpeqb xmm0, xmm1
pmovmskb eax, xmm0
test eax, eax
jnz @fnd
cmp rdx, rcx
ja @by16
@no: mov rax, -1
ret
@fnd: bsf eax, eax
lea rax, qword ptr [rcx + rax - 16]
cmp rdx, rax
jbe @no
end;
function WordScanIndex(P: PWordArray; Count: PtrInt; Value: word): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
{$ifdef WIN64ABI}
movzx eax, r8w
mov r8, rcx
test rdx, rdx
{$else}
movzx eax, dx
mov rcx, rdi
mov r8, rdi
mov rdx, rsi
test rsi, rsi
{$endif WIN64ABI}
jbe @no
test rcx, 1
jnz @unal
// eax=Value, rcx=P rdx=Count
movd xmm1, eax
and rcx, -16
punpcklwd xmm1, xmm1
movaps xmm0, [rcx] // check first aligned 16 bytes
add rcx, 16
pshufd xmm1, xmm1, 0
sub rcx, r8
pcmpeqw xmm0, xmm1
pmovmskb eax, xmm0
shl eax, cl
and eax, $FFFF0000
shr eax, cl
shr ecx, 1
bsf eax, eax
jnz @fnd
cmp rdx, rcx
jbe @no
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@by16: movaps xmm0, [r8 + rcx * 2] // next 16 bytes
add rcx, 8
pcmpeqw xmm0, xmm1
pmovmskb eax, xmm0
bsf eax, eax
jnz @fnd
cmp rdx, rcx
ja @by16
@no: mov rax, -1
ret
bsf eax, eax
@fnd: shr eax, 1
lea rax, qword ptr [rcx + rax - 8]
cmp rdx, rax
jbe @no
ret
// 16bit-unaligned loop (seldom called)
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@unal: lea rcx, [rcx + rdx * 2]
neg rdx
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@unals: cmp word ptr [rcx + rdx * 2], ax
jz @unale
add rdx, 1
jnz @unals
jmp @no
@unale: lea rax, [rcx + rdx * 2]
sub rax, r8
shr rax, 1
end;
function IntegerScanIndex(P: PCardinalArray; Count: PtrInt; Value: cardinal): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
{$ifdef WIN64ABI}
mov eax, r8d
movd xmm1, r8d
mov r8, rcx
test rdx, rdx
{$else}
mov eax, edx
movd xmm1, edx
mov rcx, rdi
mov r8, rdi
mov rdx, rsi
test rsi, rsi
{$endif WIN64ABI}
jbe @no
test rcx, 3
jnz @unal
// eax=Value, rcx=P rdx=Count
and rcx, -16
movaps xmm0, [rcx] // check first aligned 16 bytes
add rcx, 16
pshufd xmm1, xmm1, 0
sub rcx, r8
pcmpeqd xmm0, xmm1
pmovmskb eax, xmm0
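        // mask out matches from bytes located before P, as in ByteScanIndex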
shl eax, cl
and eax, $FFFF0000
shr eax, cl
shr ecx, 2
bsf eax, eax
jnz @fnd
cmp rdx, rcx
jbe @no
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@by16: movaps xmm0, [r8 + rcx * 4] // next 16 bytes
add rcx, 4
pcmpeqd xmm0, xmm1
pmovmskb eax, xmm0
bsf eax, eax
jnz @fnd
cmp rdx, rcx
ja @by16
@no: mov rax, -1
ret
@fnd: shr eax, 2
lea rax, qword ptr [rcx + rax - 4]
cmp rdx, rax
jbe @no
ret
        // 32-bit unaligned loop (seldom called)
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@unal: lea rcx, [rcx + rdx * 4]
neg rdx
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@unals: cmp dword ptr [rcx + rdx * 4], eax
jz @unale
add rdx, 1
jnz @unals
jmp @no
@unale: lea rax, [rcx + rdx * 4]
sub rax, r8
shr rax, 2
end;
function MemCmp(P1, P2: PByteArray; L: PtrInt): integer;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
// from GPL memcmp64.asm by Agner Fog - www.agner.org/optimize
add P1, L // use negative index from end of memory block
add P2, L
neg L
jge @eq
mov r9d, $FFFF // 16 bits mask = 16 bytes
cmp L, -16
ja @sml
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@by16: // loop comparing 16 bytes
movups xmm1, oword ptr [P1 + L]
movups xmm2, oword ptr [P2 + L]
pcmpeqb xmm1, xmm2 // compare 16 bytes
pmovmskb eax, xmm1 // get byte mask
xor eax, r9d // not ax
jnz @diff // difference found
add L, 16
jz @eq // finished, equal
cmp L, -16
jna @by16 // next 16 bytes
jmp @sml
@diff: // difference found: find position
bsf eax, eax
add L, rax
@last: movzx eax, byte ptr [P1 + L]
movzx r9d, byte ptr [P2 + L]
sub eax, r9d // return result
ret
@eq: // equal
xor eax, eax
ret
@sml: // less than 16 bytes left
cmp L, -8
ja @less8
// compare 8 bytes
movq xmm1, qword ptr [P1 + L]
movq xmm2, qword ptr [P2 + L]
pcmpeqb xmm1, xmm2 // compare 8 bytes
pmovmskb eax, xmm1 // get byte mask
xor eax, r9d // not ax
jnz @diff // difference found
add L, 8
jz @eq
@less8: // less than 8 bytes left
cmp L, -4
ja @less4
// compare 4 bytes
movd xmm1, dword ptr [P1 + L]
movd xmm2, dword ptr [P2 + L]
pcmpeqb xmm1, xmm2 // compare 4 bytes
pmovmskb eax, xmm1 // get byte mask
xor eax, r9d // not ax
jnz @diff // difference found
add L, 4
jz @eq
@less4: // less than 4 bytes left
cmp L, -2
ja @less2
movzx eax, word ptr [P1 + L]
movzx r9d, word ptr [P2 + L]
sub eax, r9d
jnz @last2 // difference in byte 0 or 1
add L, 2
jz @eq
@less2: // less than 2 bytes left
test L, L
jz @eq // no bytes left
jmp @last // one byte left
@last2: // difference in byte 0 or 1
neg al
sbb L, -1 // add 1 to L if al == 0
jmp @last
end;
function FastFindWordSorted(P: PWordArray; R: PtrInt; Value: Word): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
{$ifdef WIN64ABI}
push rdi
mov rdi, P // rdi=P - we use ecx to read the word ptr value
{$endif WIN64ABI}
xor r9, r9 // r9=L rax=result
test R, R
jl @ko
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@s: lea rax, [r9 + R]
shr rax, 1
        lea r10, qword ptr [rax - 1] // branchless binary search
lea r11, qword ptr [rax + 1]
movzx ecx, word ptr [rdi + rax * 2]
{$ifdef WIN64ABI}
cmp ecx, r8d
{$else}
cmp ecx, edx // 'cmp cx,Value' is silently rejected by Darwin asm
{$endif WIN64ABI}
je @ok
cmovg R, r10
cmovl r9, r11
cmp r9, R
jle @s
@ko: mov rax, -1
@ok: {$ifdef WIN64ABI}
pop rdi
{$endif WIN64ABI}
end;
function FastFindIntegerSorted(P: PIntegerArray; R: PtrInt; Value: integer): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
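        // branchless binary search over P[0..R] (R = last index)
        // - rough Pascal equivalent of the cmov logic below, for reference only:
        //     L := 0;
        //     while L <= R do begin
        //       result := (L + R) shr 1;
        //       if P[result] = Value then exit
        //       else if P[result] < Value then L := result + 1
        //       else R := result - 1;
        //     end;
        //     result := -1;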
xor r9, r9 // r9=L rax=result
test R, R
jl @ko
lea rax, [r9 + R]
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@s: shr rax, 1
lea r10, qword ptr [rax - 1] // efficient branchless binary search
lea r11, qword ptr [rax + 1]
cmp Value, dword ptr [P + rax * 4]
je @ok
cmovl R, r10
cmovg r9, r11
lea rax, [r9 + R]
cmp r9, R
jle @s
@ko: mov rax, -1
@ok:
end;
function FastFindInt64Sorted(P: PInt64Array; R: PtrInt; const Value: Int64): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
xor r9, r9 // r9=L rax=result
test R, R
jl @ko
lea rax, [r9 + R]
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@s: shr rax, 1
lea r10, qword ptr [rax - 1] // efficient branchless binary search
lea r11, qword ptr [rax + 1]
cmp Value, qword ptr [P + rax * 8]
je @ok
cmovl R, r10
cmovg r9, r11
lea rax, [r9 + R]
cmp r9, R
jle @s
@ko: mov rax, -1
@ok:
end;
function GetBitsCountSse42(value: PtrInt): PtrInt;
{$ifdef FPC} assembler; nostackframe;
asm
popcnt rax, value
{$else} // older Delphi versions don't support this opcode
asm .noframe
{$ifdef WIN64ABI}
db $f3,$48,$0f,$B8,$c1
{$else}
db $f3,$48,$0f,$B8,$c7
{$endif WIN64ABI}
{$endif FPC}
end;
function crc32cby4sse42(crc, value: cardinal): cardinal;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
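        // the SSE4.2 crc32 opcode implements the CRC-32C (Castagnoli) polynomial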
mov eax, crc
crc32 eax, value
end;
procedure crcblocksse42(crc128, data128: PBlock128);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
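        // update the four independent 32-bit CRC32C lanes of crc128[] with the
        // four dwords of data128[]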
mov eax, dword ptr [crc128] // we can't use two qword ptr here
mov r8d, dword ptr [crc128 + 4]
mov r9d, dword ptr [crc128 + 8]
mov r10d, dword ptr [crc128 + 12]
crc32 eax, dword ptr [data128]
crc32 r8d, dword ptr [data128 + 4]
crc32 r9d, dword ptr [data128 + 8]
crc32 r10d, dword ptr [data128 + 12]
mov dword ptr [crc128], eax
mov dword ptr [crc128 + 4], r8d
mov dword ptr [crc128 + 8], r9d
mov dword ptr [crc128 + 12], r10d
end;
procedure crcblockssse42(crc128, data128: PBlock128; count: integer);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
test count, count
jle @z
mov rax, data128
{$ifdef WIN64ABI}
mov rdx, rcx
mov ecx, r8d
{$else}
mov ecx, edx
mov rdx, rdi
{$endif WIN64ABI}
mov r8d, dword ptr [rdx] // we can't use qword ptr here
mov r9d, dword ptr [rdx + 4]
mov r10d, dword ptr [rdx + 8]
mov r11d, dword ptr [rdx + 12]
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@s: crc32 r8d, dword ptr [rax]
crc32 r9d, dword ptr [rax + 4]
crc32 r10d, dword ptr [rax + 8]
crc32 r11d, dword ptr [rax + 12]
add rax, 16
sub ecx, 1
jnz @s
mov dword ptr [rdx], r8d
mov dword ptr [rdx + 4], r9d
mov dword ptr [rdx + 8], r10d
mov dword ptr [rdx + 12], r11d
@z:
end;
function crc32csse42(crc: cardinal; buf: PAnsiChar; len: cardinal): cardinal;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
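        // CRC-32C with the usual "not" pre/post-conditioning of the crc value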
mov eax, crc
test len, len
jz @z
test buf, buf
jz @z
not eax
mov ecx, len
shr len, 3
        jnz @by8 // no read-alignment handling here - but done in crypto.core
@0: test cl, 4
jz @4
crc32 eax, dword ptr [buf]
add buf, 4
@4: test cl, 2
jz @2
crc32 eax, word ptr [buf]
add buf, 2
@2: test cl, 1
jz @1
crc32 eax, byte ptr [buf]
@1: not eax
@z: ret
{$ifdef FPC}
align 16
@by8: crc32 rax, qword ptr [buf] // hash 8 bytes per loop
{$else}
.align 16
@by8: db $F2,$48,$0F,$38,$F1,$02 // circumvent Delphi inline asm compiler bug
{$endif FPC}
add buf, 8
sub len, 1
jnz @by8
jmp @0
end;
function SynLZcompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
var
off: TOffsets;
cache: array[0..4095] of cardinal; // uses 32KB+16KB=48KB on stack
asm // rcx=src, edx=size, r8=dest
{$ifdef WIN64ABI} // additional registers to preserve
push rdi
push rsi
{$else} // Linux 64-bit ABI
mov r8, rdx
mov rdx, rsi
mov rcx, rdi
{$endif WIN64ABI}
push rbx
push r12
push r13
push r14
push r15
mov r15, r8 // r8=dest r15=dst_beg
mov rbx, rcx // rbx=src
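        // store the uncompressed size: one word if size < 32768, otherwise two
        // words with bit 15 of the first one set - as read by SynLZdecompress1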
cmp edx, 32768
jc @03
mov eax, edx
and eax, 7FFFH
or eax, 8000H
mov word ptr [r8], ax
mov eax, edx
shr eax, 15
mov word ptr [r8 + 2], ax
add r8, 4
jmp @05
@03: mov word ptr [r8], dx
test edx, edx
jnz @04
mov r15d, 2
jmp @19
nop
@04: add r8, 2
@05: lea r9, [rdx + rbx] // r9=src_end
lea r10, [r9 - 11] // r10=src_endmatch
mov ecx, 1 // ecx=CWBits
mov r11, r8 // r11=CWpoint
mov dword ptr [r8], 0
add r8, 4
pxor xmm0, xmm0
mov eax, 32768 - 64
@06: movaps dqword ptr [off + rax - 48], xmm0 // stack is 16 bytes aligned
movaps dqword ptr [off + rax - 32], xmm0
movaps dqword ptr [off + rax - 16], xmm0
movaps dqword ptr [off + rax], xmm0
sub eax, 64
jae @06
cmp rbx, r10
ja @15
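        // main loop: hash the next dword as h = (v xor (v shr 12)) and $fff,
        // then emit either one literal byte or a 16-bit match token combining
        // h and the match length (plus one extra length byte for long matches)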
@07: mov edx, dword ptr [rbx]
mov rax, rdx
mov r12, rdx
shr rax, 12
xor rax, rdx
and rax, 0FFFH // rax=h
mov r14, qword ptr [off + rax * 8] // r14=o
mov edx, dword ptr [cache + rax * 4]
mov qword ptr [off + rax * 8], rbx
mov dword ptr [cache + rax * 4], r12d
xor rdx, r12
test r14, r14
        lea rdi, [r9 - 1]
je @12
and rdx, 0FFFFFFH
jne @12
mov rdx, rbx
sub rdx, r14
cmp rdx, 2
jbe @12
or dword ptr [r11], ecx
add rbx, 2
add r14, 2
mov esi, 1
sub rdi, rbx
cmp rdi, 271
jc @09
mov edi, 271
jmp @09
@08: add rsi, 1
@09: mov edx, dword ptr [r14 + rsi]
cmp dl, byte ptr [rbx + rsi]
jnz @10
cmp rsi, rdi
jge @10
add rsi, 1
cmp dh, byte ptr [rbx + rsi]
jnz @10
shr edx, 16
cmp rsi, rdi
jge @10
add rsi, 1
cmp dl, byte ptr [rbx + rsi]
jnz @10
cmp rsi, rdi
jge @10
add rsi, 1
cmp dh, byte ptr [rbx + rsi]
jnz @10
cmp rsi, rdi
jc @08
@10: add rbx, rsi
shl rax, 4
cmp rsi, 15
ja @11
or rax, rsi
mov word ptr [r8], ax
add r8, 2
jmp @13
@11: sub rsi, 16
mov word ptr [r8], ax
mov byte ptr [r8 + 2], sil
add r8, 3
jmp @13
@12: mov al, byte ptr [rbx]
mov byte ptr [r8], al
add rbx, 1
add r8, 1
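        // advance the control-word bit: when all 32 bits have been used, start
        // a new zeroed control dword at the current output position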
@13: add ecx, ecx
jnz @14
mov r11, r8
mov [r8], ecx
add r8, 4
add ecx, 1
@14: cmp rbx, r10
jbe @07
@15: cmp rbx, r9
jnc @18
@16: mov al, byte ptr [rbx]
mov byte ptr [r8], al
add rbx, 1
add r8, 1
add ecx, ecx
jnz @17
mov [r8], ecx
add r8, 4
add ecx, 1
@17: cmp rbx, r9
jc @16
@18: sub r8, r15
mov r15, r8
@19: mov rax, r15
pop r15
pop r14
pop r13
pop r12
pop rbx
{$ifdef WIN64ABI} // additional registers to preserve
pop rsi
pop rdi
{$endif WIN64ABI}
end;
function SynLZdecompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
var
off: TOffsets;
asm // rcx=src, edx=size, r8=dest
{$ifdef WIN64ABI} // additional registers to preserve
push rsi
push rdi
{$else} // Linux 64-bit ABI
mov r8, rdx
mov rdx, rsi
mov rcx, rdi
{$endif WIN64ABI}
push rbx
push r12
push r13
push r14
push r15
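        // read the uncompressed size from the 2- or 4-byte SynLZcompress1 header
        // (bit 15 of the first word flags the 4-byte form)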
movzx eax, word ptr [rcx] // rcx=src eax=result
lea r9, [rcx + rdx] // r9=src_end
test eax, eax
je @35
add rcx, 2
mov r10d, eax
and r10d, 8000H
jz @21
movzx ebx, word ptr [rcx]
shl ebx, 15
mov r10d, eax
and r10d, 7FFFH
or r10d, ebx
mov eax, r10d
add rcx, 2
@21: lea r10, [r8 - 1] // r10=last_hashed r8=dest
@22: mov edi, dword ptr [rcx] // edi=CW
add rcx, 4
mov r13d, 1 // r13d=CWBit
cmp rcx, r9
jnc @35
@23: test r13d, edi
jnz @25
mov bl, byte ptr [rcx]
mov byte ptr [r8], bl
add rcx, 1
lea rbx, [r8 - 2]
add r8, 1
cmp rcx, r9
jnc @35
cmp rbx, r10
jbe @24
add r10, 1
mov esi, dword ptr [r10]
mov rbx, rsi
shr esi, 12
xor ebx, esi
and ebx, 0FFFH
mov qword ptr [off + rbx * 8], r10
@24: shl r13d, 1
jnz @23
jmp @22
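        // decode a match token: h = t shr 4, len = (t and 15) + 2, or when this
        // nibble is 0, len = (next byte) + 18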
@25: movzx r11, word ptr [rcx] // r11=t
add rcx, 2
mov ebx, r11d // ebx=h
shr ebx, 4
and r11, 0FH
lea r11, [r11 + 2]
jnz @26
movzx r11, byte ptr [rcx]
add rcx, 1
lea r11, [r11 + 12H]
@26: mov r14, qword ptr [off + rbx * 8] // r14=o
mov rbx, r8
xor rsi, rsi
sub rbx, r14
mov r12, r11
mov r15, r11
cmp rbx, r11
jc @29
shr r12, 3
jz @30
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@27: mov rbx, qword ptr [r14 + rsi] // inline move by 8 bytes
mov qword ptr [r8 + rsi], rbx
add rsi, 8
sub r12, 1
jnz @27
mov rbx, qword ptr [r14 + rsi] // 1..7 remaining bytes
and r15, 7
jz @31
@28: mov byte ptr [r8 + rsi], bl
shr rbx, 8
add rsi, 1
sub r15, 1
jnz @28
jmp @31
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@29: mov bl, byte ptr [r14 + rsi] // overlapping move
mov byte ptr [r8 + rsi], bl
add rsi, 1
sub r12, 1
jnz @29
cmp rcx, r9
jnz @33
jmp @35
@30: mov rbx, qword ptr [r14]
mov qword ptr [r8], rbx
@31: cmp rcx, r9
jz @35
cmp r10, r8
jnc @34
@32: add r10, 1
mov ebx, dword ptr [r10]
mov rsi, rbx
shr ebx, 12
xor esi, ebx
and esi, 0FFFH
mov qword ptr [off + rsi * 8], r10
@33: cmp r10, r8
jc @32
@34: add r8, r11
lea r10, [r8 - 1]
shl r13d, 1
jnz @23
jmp @22
@35: pop r15
pop r14
pop r13
pop r12
pop rbx
{$ifdef WIN64ABI} // additional registers to preserve
pop rdi
pop rsi
{$endif WIN64ABI}
end;
function RdRand32: cardinal;
{$ifdef FPC}nostackframe; assembler; asm{$else} asm .noframe {$endif FPC}
// rdrand eax: same opcodes for x86 and x64
db $0f, $c7, $f0
// we ignore the carry flag (handled once in TestIntelCpuFeatures)
end;
function Rdtsc: Int64;
{$ifdef FPC}nostackframe; assembler; asm{$else} asm .noframe {$endif FPC}
rdtsc // returns the TSC in EDX:EAX
shl rdx, 32
or rax, rdx
end;
procedure LockedInc32(int32: PInteger);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
lock inc dword ptr [int32]
end;
procedure LockedDec32(int32: PInteger);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
lock dec dword ptr [int32]
end;
procedure LockedInc64(int64: PInt64);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
lock inc qword ptr [int64]
end;
function InterlockedIncrement(var I: integer): integer;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
mov eax, 1
lock xadd dword ptr [I], eax // atomic eax=[I] + [I]:=[I]+eax
add eax, 1
end;
function InterlockedDecrement(var I: integer): integer;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
mov eax, -1
lock xadd dword ptr [I], eax // atomic eax=[I] + [I]:=[I]+eax
sub eax, 1
end;
function StrCntDecFree(var refcnt: TStrCnt): boolean;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
{$ifdef STRCNT32}
lock dec dword ptr [refcnt] // TStrCnt=longint on Delphi Win64 and FPC>=3.4
{$else}
lock dec qword ptr [refcnt] // on FPC<3.4
{$endif STRCNT32}
setbe al
end; // don't check for ismultithread global since lock is cheaper on new CPUs
function DACntDecFree(var refcnt: TDACnt): boolean;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
{$ifdef DACNT32}
lock dec dword ptr [refcnt] // TDACnt=longint on Delphi
{$else}
lock dec qword ptr [refcnt]
{$endif DACNT32}
setbe al
end;
function LockedExc(var Target: PtrUInt; NewValue, Comperand: PtrUInt): boolean;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
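        // atomic "if Target = Comperand then Target := NewValue" - true on success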
mov rax, Comperand
lock cmpxchg qword ptr [Target], NewValue
setz al
end;
procedure LockedAdd(var Target: PtrUInt; Increment: PtrUInt);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
lock add qword ptr [Target], Increment
end;
procedure LockedAdd32(var Target: cardinal; Increment: cardinal);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
lock add dword ptr [Target], Increment
end;
procedure LockedDec(var Target: PtrUInt; Decrement: PtrUInt);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
lock sub qword ptr [Target], Decrement
end;
{$ifdef ISDELPHI}
// those functions are intrinsics with FPC :)
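// both return the index of the highest set bit, or 255 when the input is 0
// (bsr leaves its destination undefined in that case)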
function BSRdword(c: cardinal): cardinal;
asm
.noframe
mov eax, c
bsr eax, eax
jnz @nz
mov eax, 255
@nz:
end;
function BSRqword(const q: qword): cardinal;
asm
.noframe
mov rax, q
bsr rax, rax
jnz @nz
mov eax, 255
@nz:
end;
// FPC will properly inline multiplication by reciprocal
procedure Div100(Y: cardinal; var res: TDiv100Rec);
asm
.noframe
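        // exact for any 32-bit Y: 1374389535 = $51EB851F = round-up(2^37 / 100)
        // so that D := (Y * 1374389535) shr 37 and M := Y - D * 100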
mov r8, res
mov edx, Y
mov dword ptr [r8].TDiv100Rec.M,edx
mov eax, 1374389535
mul edx
shr edx, 5
mov dword ptr [r8].TDiv100Rec.D, edx
imul eax, edx, 100
sub dword ptr [r8].TDiv100Rec.M, eax
end;
{$endif ISDELPHI}
function IsXmmYmmOSEnabled: boolean;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
// see https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
xor ecx, ecx // specify control register XCR0=XFEATURE_ENABLED_MASK
db $0f, $01, $d0 // XGETBV opcode reads XCR0 into EDX:EAX
and eax, 6
cmp al, 6 // check XMM (bit 1=2) and YMM (bit 2=4)
sete al // true if OS enabled both XMM and YMM
end;
procedure GetCpuid(cpueax, cpuecx: cardinal; var regs: TIntelRegisters);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
mov eax, cpueax
mov ecx, cpuecx
mov r9, regs
mov r10, rbx // preserve rbx
xor ebx, ebx
xor edx, edx
cpuid
mov TIntelRegisters(r9).&eax, eax
mov TIntelRegisters(r9).&ebx, ebx
mov TIntelRegisters(r9).&ecx, ecx
mov TIntelRegisters(r9).&edx, edx
mov rbx, r10
end;