{
  This file is a part of the Open Source Synopse mORMot framework 2,
  licensed under a MPL/GPL/LGPL three license - see LICENSE.md

    x86_64 assembly used by mormot.core.base.pas
}
|
|
|
|
{$ifdef FPC}
|
|
// disable some FPC paranoid warnings
|
|
{$WARN 7119 off : Exported/global symbols should be accessed via the GOT }
|
|
{$WARN 7121 off : Check size of memory operand "$1: memory-operand-size is $2 bits, but expected [$3 bits]" }
|
|
{$WARN 7122 off : Check size of memory operand "$1: memory-operand-size is $2 bits, but expected [$3 bits + $4 byte offset]" }
|
|
{$WARN 7123 off : Check "$1: offset of memory operand is negative "$2 byte" }
|
|
{$endif FPC}
|
|
|
|
{$ifdef ASMX64}
|
|
|
|
{
|
|
FillCharFast/MoveFast implementation notes:
- assume ERMSB is available (the cpuid flag may not be propagated within VMs)
- use branchless sub-functions for the smallest buffers, then SSE2 up to 255 bytes
- use "rep movsb" for 256..512K input (should work on all CPUs, even the oldest)
- don't use backward "std rep movsb" since it is not covered by ERMSB (slow)
- use a non-temporal SSE2 or AVX loop for >= 512KB (to not pollute the CPU cache)
- will check the X64CpuFeatures global to adjust the algorithm if cpuAVX is set
- for benchmarks, run TTestLowLevelCommon.CustomRTL on x86_64
 -> FillCharFast/MoveFast seem faster, especially for small lengths (strings)
 -> the Delphi RTL is slower than FPC's, and it does not support AVX assembly yet
|
|
}
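
{
  For reference, the size thresholds described above roughly translate into
  this dispatch (a simplified pascal-like sketch only - the asm below selects
  the path branchlessly, and overlapping buffers use the backward variants):

    if cnt <= 16 then
      // jump table of dedicated branchless sub-functions (@00..@16 below)
    else if cnt <= 96 then   // 144 on SystemV, which has more volatile xmm regs
      // unrolled SSE2 movups of the first/middle/last 16-byte blocks
    else if cnt < SSE2MAXSIZE then
      // Win64 only: plain SSE2 loop with 16-byte aligned writes
    else if cnt < NONTEMPORALSIZE then
      // forward "rep movsb" (ERMSB) - SSE2/AVX loop for backward moves
    else
      // non-temporal SSE2/AVX loop + sfence, bypassing the CPU cache
}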
|
|
|
|
const
|
|
// non-temporal writes should bypass the cache when the size is bigger than
// half the size of the largest level cache = 512KB, assuming a low-end 1MB cache
// - today's CPUs may have 144MB of (3D) cache (!) so we favor a fixed value
// and rely on the CPU hardware cache prefetch when accessing the data
|
|
NONTEMPORALSIZE = 512 * 1024;
|
|
|
|
{$ifdef NO_ERMS}
|
|
{$undef WITH_ERMS}
|
|
{$else}
|
|
{$define WITH_ERMS}
|
|
// we enable forward rep movsb/stosb over SSE2MAXSIZE=256 bytes on x86_64
// and we don't try to detect it because the CPUID flag is not set in some VMs
|
|
{$endif NO_ERMS}
|
|
|
|
// minimum size triggering ASMX64AVXNOCONST or WITH_ERMS optimized asm
|
|
// - pre-ERMS CPUs expect at least 144/256 bytes; IvyBridge+ with ERMS is good
// from 64 on - copy_user_enhanced_fast_string() in recent Linux kernels uses 64
// see https://stackoverflow.com/a/43837564/458259 for explanations and timings
|
|
// - see also mormot.core.fpcx64mm.pas as reference
|
|
SSE2MAXSIZE = 256;
|
|
|
|
// identify Intel/AMD AVX2+BMI support at Haswell level
|
|
CPUAVX2HASWELL = [cfAVX2, cfSSE42, cfBMI1, cfBMI2, cfCLMUL];
|
|
|
|
procedure MoveFast(const src; var dst; cnt: PtrInt);
|
|
{$ifdef FPC}nostackframe; assembler;
|
|
asm {$else} asm .noframe {$endif} // rcx/rdi=src rdx/rsi=dst r8/rdx=cnt
|
|
mov rax, cnt // rax=r8/rdx=cnt
|
|
lea r10, [rip + @jmptab] // 0..16 dedicated sub-functions
|
|
sub rax, 16
|
|
ja @up16 // >16 or <0
|
|
{$ifdef WIN64ABI} // circumvent FPC asm bug and adapt to xmm ABI
|
|
jmp qword ptr [r10 + r8 * 8]
|
|
@up16: // >16 or <0
|
|
jng @z // <0
|
|
movups xmm0, oword ptr [src + rax] // last 16 = xmm0
|
|
movups xmm1, oword ptr [src] // first 16 = xmm1
|
|
cmp rax, 96 - 16
|
|
{$else}
|
|
jmp qword ptr [r10 + rdx * 8]
|
|
@neg: ret
|
|
@up16: // >16 or <0
|
|
jng @neg // <0
|
|
mov r8, rdx
|
|
movups xmm0, oword ptr [src + rax] // last 16 = xmm0
|
|
movups xmm1, oword ptr [src] // first 16 = xmm1
|
|
cmp rdx, 144 // more volatile xmm registers on SystemV ABI
|
|
{$endif WIN64ABI}
|
|
ja @lrg // >96/144
|
|
// cnt = 17..96/144
|
|
cmp al, $10
|
|
jbe @sml10
|
|
movups xmm2, oword ptr [src + $10] // second 16
|
|
cmp al, $20
|
|
jbe @sml20
|
|
movups xmm3, oword ptr [src + $20] // third 16
|
|
cmp al, $30
|
|
jbe @sml30
|
|
movups xmm4, oword ptr [src + $30] // fourth 16
|
|
cmp al, $40
|
|
jbe @sml40
|
|
movups xmm5, oword ptr [src + $40] // fifth 16
|
|
// xmm0..xmm5 are volatile on both Win64 and SystemV ABI
|
|
// xmm6 and up are also volatile on SystemV ABI so allow more bytes
|
|
{$ifdef SYSVABI}
|
|
cmp al, $50
|
|
jbe @sml50
|
|
movups xmm6, oword ptr [src + $50]
|
|
cmp al, $60
|
|
jbe @sml60
|
|
movups xmm7, oword ptr [src + $60]
|
|
cmp al, $70
|
|
jbe @sml70
|
|
movups xmm8, oword ptr [src + $70]
|
|
// more registers would increase code size ([dst+$80]) so they are not used
|
|
movups oword ptr [dst + $70], xmm8
|
|
@sml70: movups oword ptr [dst + $60], xmm7
|
|
@sml60: movups oword ptr [dst + $50], xmm6
|
|
@sml50: {$endif SYSVABI}
|
|
movups oword ptr [dst + $40], xmm5 // fifth 16
|
|
@sml40: movups oword ptr [dst + $30], xmm4 // fourth 16
|
|
@sml30: movups oword ptr [dst + $20], xmm3 // third 16
|
|
@sml20: movups oword ptr [dst + $10], xmm2 // second 16
|
|
@sml10: movups oword ptr [dst], xmm1 // first 16
|
|
movups oword ptr [dst + rax], xmm0 // last 16 (may be overlapping)
|
|
@z: ret
|
|
@lrg: // cnt > 96/144 or cnt < 0
|
|
mov r11d, NONTEMPORALSIZE
|
|
mov r10, dst
|
|
add rax, 16 // restore rax=cnt as expected below
|
|
jl @z // cnt < 0
|
|
sub r10, src
|
|
jz @z // src=dst
|
|
cmp r10, cnt // move backwards if unsigned(dst-src) < cnt
|
|
jb @lrgbwd
|
|
// forward ERMSB/SSE2/AVX move for cnt > 96/144 bytes
|
|
mov r9, dst // dst will be 16/32 bytes aligned for writes
|
|
{$ifdef WITH_ERMS}
|
|
{$ifdef WIN64ABI} // 145 bytes seems good enough for ERMSB on a server
|
|
cmp rax, SSE2MAXSIZE
|
|
jb @fsse2 // 97..255 bytes may be not enough for ERMSB nor AVX
|
|
{$endif WIN64ABI}
|
|
cmp rax, r11
|
|
jae @lrgfwd // non-temporal move > 512KB is better than ERMSB
|
|
// 256/145..512K could use the "rep movsb" ERMSB pattern on all CPUs
|
|
cld
|
|
{$ifdef WIN64ABI}
|
|
push rsi
|
|
push rdi
|
|
mov rsi, src
|
|
mov rdi, dst
|
|
mov rcx, r8
|
|
rep movsb
|
|
pop rdi
|
|
pop rsi
|
|
{$else}
|
|
xchg rsi, rdi // dst=rsi and src=rdi -> swap
|
|
mov rcx, r8
|
|
rep movsb
|
|
{$endif WIN64ABI}
|
|
ret
|
|
{$else}
|
|
jmp @lrgfwd
|
|
{$endif WITH_ERMS}
|
|
{$ifdef ASMX64AVXNOCONST} // limited AVX asm on Delphi 11
|
|
@lrgbwd:// backward SSE2/AVX move
|
|
cmp rax, SSE2MAXSIZE
|
|
jb @bsse2 // 97/129..255 bytes is not worth AVX context transition
|
|
test byte ptr [rip + X64CpuFeatures], 1 shl cpuAVX
|
|
jz @bsse2
|
|
jmp @bavx
|
|
@lrgfwd:// forward SSE2/AVX move
|
|
test byte ptr [rip + X64CpuFeatures], 1 shl cpuAVX
|
|
jnz @favx
|
|
{$else}
|
|
@lrgfwd:{$endif ASMX64AVXNOCONST}
|
|
@fsse2: // forward SSE2 move
|
|
lea src, [src + rax - 16]
|
|
lea rax, [rax + dst - 16]
|
|
mov r10, rax
|
|
neg rax
|
|
and dst, -16 // 16-byte aligned writes
|
|
lea rax, [rax + dst + 16]
|
|
cmp r8, r11
|
|
jb @fwd // bypass cache for cnt>512KB
|
|
jmp @fwdnt
|
|
// backward SSE2/AVX move for cnt > 96/144 bytes
|
|
// note: ERMSB is not available on "std rep move" which is slower
|
|
{$ifndef ASMX64AVXNOCONST}
|
|
@lrgbwd:{$endif ASMX64AVXNOCONST}
|
|
@bsse2: // backward SSE2 move
|
|
sub rax, 16
|
|
mov r9, rax
|
|
add rax, dst
|
|
and rax, -16 // 16-byte aligned writes
|
|
sub rax, dst
|
|
cmp r8, r11
|
|
jae @bwdnt // bypass cache for cnt>512KB
|
|
jmp @bwd
|
|
{$ifdef ASMX64AVXNOCONST}
|
|
@bavx: // backward AVX move
|
|
sub rax, 32
|
|
mov r9, rax
|
|
vmovups ymm2, yword ptr [src + rax] // last 32
|
|
vmovups ymm1, yword ptr [src] // first 32
|
|
add rax, dst
|
|
and rax, -32 // 32-byte aligned writes
|
|
sub rax, dst
|
|
cmp r8, r11
|
|
jae @bavxn // bypass cache for cnt>512KB
|
|
jmp @bavxr
|
|
@favx: // forward AVX move
|
|
vmovups ymm2, yword ptr [src] // first 32
|
|
lea src, [src + rax - 32]
|
|
lea dst, [dst + rax - 32]
|
|
vmovups ymm1, yword ptr [src] // last 32
|
|
neg rax
|
|
add rax, dst
|
|
and rax, -32 // 32-byte aligned writes
|
|
sub rax, dst
|
|
add rax, 64
|
|
cmp r8, r11
|
|
jb @favxr // bypass cache for cnt>512KB
|
|
jmp @favxn
|
|
// forward temporal AVX loop
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@favxr: vmovups ymm0, yword ptr [src + rax]
|
|
vmovaps yword ptr [dst + rax], ymm0 // most CPUs have one store unit
|
|
add rax, 32
|
|
jl @favxr
|
|
@favxe: vmovups yword ptr [dst], ymm1 // last 32
|
|
vmovups yword ptr [r9], ymm2 // first 32
|
|
// https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
|
|
vzeroupper
|
|
ret
|
|
// forward non-temporal AVX loop
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@favxn: vmovups ymm0, yword ptr [src + rax]
|
|
// circumvent FPC inline asm bug with vmovntps [dst + rax], ymm0
|
|
{$ifdef WIN64ABI}
|
|
vmovntps [rdx + rax], ymm0 // rdx=dst on Win64
|
|
{$else}
|
|
vmovntps [rsi + rax], ymm0 // rsi=dst on POSIX
|
|
{$endif WIN64ABI}
|
|
add rax, 32
|
|
jl @favxn
|
|
sfence
|
|
jmp @favxe
|
|
{$endif ASMX64AVXNOCONST}
|
|
// forward temporal SSE2 loop
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@fwd: movups xmm2, oword ptr [src + rax]
|
|
movaps [r10 + rax], xmm2
|
|
add rax, 16
|
|
jl @fwd
|
|
movups oword ptr [r10], xmm0 // last 16
|
|
movups oword ptr [r9], xmm1 // first 16
|
|
ret
|
|
// forward non-temporal SSE2 loop
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@fwdnt: movups xmm2, oword ptr [src + rax]
|
|
movntdq [r10 + rax], xmm2
|
|
add rax, 16
|
|
jl @fwdnt
|
|
sfence
|
|
movups oword ptr [r10], xmm0 // last 16
|
|
movups oword ptr [r9], xmm1 // first 16
|
|
ret
|
|
// backward temporal SSE2 loop
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@bwd: movups xmm2, oword ptr [src + rax]
|
|
movaps oword ptr [dst + rax], xmm2
|
|
sub rax, 16
|
|
jg @bwd
|
|
movups oword ptr [dst], xmm1 // first 16
|
|
movups oword ptr [dst + r9], xmm0 // last 16
|
|
ret
|
|
// backward non-temporal SSE2 loop
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@bwdnt: movups xmm2, oword ptr [src + rax]
|
|
movntdq oword ptr [dst + rax], xmm2
|
|
sub rax, 16
|
|
jg @bwdnt
|
|
sfence
|
|
movups oword ptr [dst], xmm1 // first 16
|
|
movups oword ptr [dst + r9], xmm0 // last 16
|
|
ret
|
|
{$ifdef ASMX64AVXNOCONST}
|
|
// backward temporal AVX loop
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@bavxr: vmovups ymm0, yword ptr [src + rax]
|
|
vmovaps yword ptr [dst + rax], ymm0
|
|
sub rax, 32
|
|
jg @bavxr
|
|
@bavxe: vmovups yword ptr [dst], ymm1 // first 32
|
|
vmovups yword ptr [dst + r9], ymm2 // last 32
|
|
vzeroupper
|
|
ret
|
|
// backward non-temporal AVX loop
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@bavxn: vmovups ymm0, yword ptr [src + rax]
|
|
// circumvent FPC inline asm bug with vmovntps [dst + rax], ymm0
|
|
{$ifdef WIN64ABI}
|
|
vmovntps [rdx + rax], ymm0 // rdx=dst on Win64
|
|
{$else}
|
|
vmovntps [rsi + rax], ymm0 // rsi=dst on POSIX
|
|
{$endif WIN64ABI}
|
|
sub rax, 32
|
|
jg @bavxn
|
|
sfence
|
|
jmp @bavxe
|
|
{$endif ASMX64AVXNOCONST}
|
|
// dedicated branchless sub-functions for 0..16 bytes
|
|
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
|
|
@jmptab:dq @00, @01, @02, @03, @04, @05, @06, @07
|
|
dq @08, @09, @10, @11, @12, @13, @14, @15, @16
|
|
@01: mov al, byte ptr [src]
|
|
mov byte ptr [dst], al
|
|
@00: ret
|
|
@02: movzx eax, word ptr [src]
|
|
mov word ptr [dst], ax
|
|
ret
|
|
@03: movzx eax, word ptr [src]
|
|
mov cl, byte ptr [src + 2]
|
|
mov word ptr [dst], ax
|
|
mov byte ptr [dst + 2], cl
|
|
ret
|
|
@04: mov eax, dword ptr [src]
|
|
mov dword ptr [dst], eax
|
|
ret
|
|
@05: mov eax, dword ptr [src]
|
|
mov cl, byte ptr [src + 4]
|
|
mov dword ptr [dst], eax
|
|
mov byte ptr [dst + 4], cl
|
|
ret
|
|
@06: mov eax, dword ptr [src]
|
|
movzx ecx, word ptr [src + 4]
|
|
mov dword ptr [dst], eax
|
|
mov word ptr [dst + 4], cx
|
|
ret
|
|
@07: mov r8d, dword ptr [src] // faster with no overlapping
|
|
movzx eax, word ptr [src + 4]
|
|
mov cl, byte ptr [src + 6]
|
|
mov dword ptr [dst], r8d
|
|
mov word ptr [dst + 4], ax
|
|
mov byte ptr [dst + 6], cl
|
|
ret
|
|
@08: mov rax, qword ptr [src]
|
|
mov [dst], rax
|
|
ret
|
|
@09: mov rax, qword ptr [src]
|
|
mov cl, byte ptr [src + 8]
|
|
mov [dst], rax
|
|
mov byte ptr [dst + 8], cl
|
|
ret
|
|
@10: mov rax, qword ptr [src]
|
|
movzx ecx, word ptr [src + 8]
|
|
mov [dst], rax
|
|
mov word ptr [dst + 8], cx
|
|
ret
|
|
@11: mov r8, qword ptr [src]
|
|
movzx eax, word ptr [src + 8]
|
|
mov cl, byte ptr [src + 10]
|
|
mov [dst], r8
|
|
mov word ptr [dst + 8], ax
|
|
mov byte ptr [dst + 10], cl
|
|
ret
|
|
@12: mov rax, qword ptr [src]
|
|
mov ecx, dword ptr [src + 8]
|
|
mov [dst], rax
|
|
mov dword ptr [dst + 8], ecx
|
|
ret
|
|
@13: mov r8, qword ptr [src]
|
|
mov eax, dword ptr [src + 8]
|
|
mov cl, byte ptr [src + 12]
|
|
mov [dst], r8
|
|
mov dword ptr [dst + 8], eax
|
|
mov byte ptr [dst + 12], cl
|
|
ret
|
|
@14: mov r8, qword ptr [src]
|
|
mov eax, dword ptr [src + 8]
|
|
movzx ecx, word ptr [src + 12]
|
|
mov [dst], r8
|
|
mov dword ptr [dst + 8], eax
|
|
mov word ptr [dst + 12], cx
|
|
ret
|
|
@15: mov r8, qword ptr [src]
|
|
mov rax, qword ptr [src + 7] // overlap is the easiest solution
|
|
mov [dst], r8
|
|
mov [dst + 7], rax
|
|
ret
|
|
@16: movups xmm0, oword ptr [src]
|
|
movups oword [dst], xmm0
|
|
end;
|
|
|
|
procedure FillCharFast(var dst; cnt: PtrInt; value: byte);
|
|
{$ifdef FPC}nostackframe; assembler;
|
|
asm {$else} asm .noframe {$endif} // rcx/rdi=dst rdx/rsi=cnt r8b/dl=val
|
|
mov r9, $0101010101010101
|
|
lea r10, [rip + @jmptab]
|
|
{$ifdef WIN64ABI}
|
|
movzx eax, r8b
|
|
{$else}
|
|
movzx eax, dl
|
|
mov rdx, rsi // rdx=cnt
|
|
{$endif WIN64ABI}
|
|
imul rax, r9 // broadcast value into all bytes of rax (in 1 cycle)
|
|
cmp cnt, 32
|
|
ja @abv32 // >32 or <0
|
|
sub rdx, 8
|
|
jg @sml32 // 9..32
|
|
jmp qword ptr [r10 + 64 + rdx * 8] // tiniest 0..8 bytes
|
|
@sml32: cmp dl, 8 // 9..32 bytes
|
|
jle @sml16
|
|
cmp dl, 16
|
|
jle @sml24
|
|
mov qword ptr [dst + 16], rax
|
|
@sml24: mov qword ptr [dst + 8], rax
|
|
@sml16: mov qword ptr [dst + rdx], rax // last 8 (may be overlapping)
|
|
@08: mov qword ptr [dst], rax
|
|
@00: ret
|
|
@abv32: jng @00 // < 0
|
|
// cnt > 32 bytes
|
|
movd xmm0, eax
|
|
lea r8, [dst + cnt] // r8 point to end
|
|
mov r9d, NONTEMPORALSIZE
|
|
pshufd xmm0, xmm0, 0 // broadcast value into all bytes of xmm0
|
|
mov r10, rdx // save rdx=cnt
|
|
cmp rdx, 255 // = SSE2MAXSIZE-1, but hardcoded in move below
|
|
ja @abv255
|
|
// 33..255 bytes is not good for ERMSB or AVX, and needs no alignment
|
|
test dl, $80
|
|
jz @sml80
|
|
movups oword ptr [dst], xmm0
|
|
movups oword ptr [dst + $10], xmm0
|
|
movups oword ptr [dst + $20], xmm0
|
|
movups oword ptr [dst + $30], xmm0
|
|
movups oword ptr [dst + $40], xmm0
|
|
movups oword ptr [dst + $50], xmm0
|
|
movups oword ptr [dst + $60], xmm0
|
|
movups oword ptr [dst + $70], xmm0
|
|
add dst, $80
|
|
@sml80: test dl, $40
|
|
jz @sml40
|
|
movups oword ptr [dst], xmm0
|
|
movups oword ptr [dst + $10], xmm0
|
|
movups oword ptr [dst + $20], xmm0
|
|
movups oword ptr [dst + $30], xmm0
|
|
add dst, $40
|
|
@sml40: test dl, $20
|
|
jz @sml20
|
|
movups oword ptr [dst], xmm0
|
|
movups oword ptr [dst + $10], xmm0
|
|
add dst, $20
|
|
@sml20: test dl, $10
|
|
jz @sml10
|
|
movups oword ptr [dst], xmm0
|
|
@sml10: movups oword ptr [r8 - 16], xmm0 // last 16 bytes (may be overlapping)
|
|
ret
|
|
@abv255:{$ifdef WITH_ERMS}
|
|
cmp rdx, r9 // non-temporal move > 512KB (rdx = cnt)
|
|
{$ifdef ASMX64AVXNOCONST}
|
|
jae @avx
|
|
{$else}
|
|
jae @sse2
|
|
{$endif ASMX64AVXNOCONST}
|
|
// 256..512K could use the "rep stosb" ERMSB pattern on all CPUs
|
|
cld
|
|
{$ifdef WIN64ABI}
|
|
mov r8, rdi
|
|
mov rdi, dst
|
|
mov rcx, cnt
|
|
rep stosb
|
|
mov rdi, r8
|
|
{$else}
|
|
mov rcx, cnt
|
|
rep stosb
|
|
{$endif WIN64ABI}
|
|
ret
|
|
{$endif WITH_ERMS}
|
|
@sse2: movups oword ptr [dst], xmm0 // first unaligned 16 bytes
|
|
lea rdx, [dst + rdx - 1]
|
|
and rdx, -16
|
|
add dst, 16
|
|
and dst, -16 // dst is 16-bytes aligned
|
|
sub dst, rdx
|
|
jnb @last
|
|
cmp r10, r9
|
|
jae @sse2nt // bypass cache for cnt>512KB
|
|
jmp @reg
|
|
{$ifdef ASMX64AVXNOCONST}
|
|
@avx: test byte ptr [rip + X64CpuFeatures], 1 shl cpuAVX
|
|
jz @sse2
|
|
movups oword ptr [dst], xmm0 // first unaligned 1..16 bytes
|
|
add dst, 16
|
|
and dst, -16
|
|
movaps oword ptr [dst], xmm0 // aligned 17..32 bytes
|
|
vinsertf128 ymm0, ymm0, xmm0, 1
|
|
add dst, 16
|
|
and dst, -32 // dst is 32-bytes aligned
|
|
mov rdx, r8
|
|
and rdx, -32
|
|
sub dst, rdx
|
|
cmp r10, r9
|
|
jb @avxreg
|
|
jmp @avxnt
|
|
{$endif ASMX64AVXNOCONST}
|
|
// temporal SSE2 loop
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@reg: movaps oword ptr [rdx + dst], xmm0 // regular loop
|
|
add dst, 16
|
|
jnz @reg
|
|
@last: movups oword ptr [r8 - 16], xmm0 // last unaligned 16 bytes
|
|
ret
|
|
// non-temporal SSE2 loop
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@sse2nt:movntdq [rdx + dst], xmm0 // non-temporal loop
|
|
add dst, 16
|
|
jnz @sse2nt
|
|
sfence
|
|
movups oword ptr [r8 - 16], xmm0
|
|
ret
|
|
{$ifdef ASMX64AVXNOCONST}
|
|
// temporal AVX loop
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@avxreg:vmovaps yword ptr [rdx + dst], ymm0 // regular loop
|
|
add dst, 32
|
|
jnz @avxreg
|
|
@avxok: vmovups yword ptr [r8 - 32], ymm0 // last unaligned 32 bytes
|
|
vzeroupper
|
|
ret
|
|
// non-temporal AVX loop
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
{$ifdef WIN64}
|
|
@avxnt: vmovntps [rdx + rcx], ymm0 // non-temporal loop - rcx=dst on Win64
|
|
{$else}
|
|
@avxnt: vmovntps [rdx + rdi], ymm0 // non-temporal loop - rdi=dst on POSIX
|
|
{$endif WIN64}
|
|
add dst, 32
|
|
jnz @avxnt
|
|
sfence
|
|
jmp @avxok
|
|
{$endif ASMX64AVXNOCONST}
|
|
// dedicated branchless sub-functions for 0..8 bytes
|
|
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
|
|
@jmptab:dq @00, @01, @02, @03, @04, @05, @06, @07, @08
|
|
@07: mov dword ptr [dst + 3], eax
|
|
@03: mov word ptr [dst + 1], ax
|
|
@01: mov byte ptr [dst], al
|
|
ret
|
|
@06: mov dword ptr [dst + 2], eax
|
|
@02: mov word ptr [dst], ax
|
|
ret
|
|
@05: mov byte ptr [dst + 4], al
|
|
@04: mov dword ptr [dst], eax
|
|
end;
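
{
  FillCharFast() above follows the same threshold scheme as MoveFast(): 0..8
  bytes via the jump table, 9..32 bytes via overlapping qword stores, 33..255
  bytes via unrolled SSE2 movups, 256..NONTEMPORALSIZE-1 via "rep stosb"
  (ERMSB), and bigger counts via a non-temporal SSE2/AVX loop. For instance,
  the 9..32 bytes path is equivalent to this pascal sketch (pattern being
  value broadcast to all 8 bytes, as computed by the "imul rax, r9" above):

    PQWord(@dst)^ := pattern;                          // first 8 bytes
    if cnt > 16 then
      PQWord(PAnsiChar(@dst) + 8)^ := pattern;
    if cnt > 24 then
      PQWord(PAnsiChar(@dst) + 16)^ := pattern;
    PQWord(PAnsiChar(@dst) + cnt - 8)^ := pattern;     // last 8 (may overlap)
}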
|
|
|
|
function crc32fasttab(crc: cardinal; buf: PAnsiChar; len: cardinal;
|
|
tab: PCrc32tab): cardinal;
|
|
{$ifdef FPC} nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
|
|
{$ifdef SYSVABI}
|
|
mov r8, rdx
|
|
mov r9, tab
|
|
{$endif SYSVABI}
|
|
mov eax, crc
|
|
xor ecx, ecx
|
|
test buf, buf // crc=eax buf=rdx/rsi len=r8 tab=r9
|
|
jz @z
|
|
neg r8
|
|
jz @z
|
|
not eax
|
|
cmp r8, -8
|
|
jb @head
|
|
@sml: mov cl, byte ptr [buf]
|
|
add buf, 1
|
|
xor cl, al
|
|
shr eax, 8
|
|
xor eax, dword ptr [rcx * 4 + r9]
|
|
add r8, 1
|
|
jnz @sml
|
|
@0: not eax
|
|
@z: ret
|
|
@head: {$ifdef SYSVABI}
|
|
test sil, 7
|
|
{$else}
|
|
test dl, 7
|
|
{$endif SYSVABI}
|
|
jz @align
|
|
mov cl, byte ptr [buf]
|
|
add buf, 1
|
|
xor cl, al
|
|
shr eax, 8
|
|
xor eax, dword ptr [rcx * 4 + r9]
|
|
add r8, 1
|
|
jnz @head
|
|
not eax
|
|
ret
|
|
@align: sub buf, r8
|
|
add r8, 8
|
|
jg @done
|
|
push rbx
|
|
@by8: mov r10d, eax
|
|
mov rcx, qword ptr [buf + r8 - 8]
|
|
xor r10, rcx
|
|
shr rcx, 32
|
|
movzx ebx, cl
|
|
mov eax, dword ptr [rbx * 4 + r9 + 1024 * 3]
|
|
movzx ebx, ch
|
|
shr ecx, 16
|
|
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 2]
|
|
movzx ebx, cl
|
|
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 1]
|
|
movzx ebx, ch
|
|
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 0]
|
|
mov rcx, r10
|
|
movzx ebx, cl
|
|
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 7]
|
|
movzx ebx, ch
|
|
shr ecx, 16
|
|
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 6]
|
|
movzx ebx, cl
|
|
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 5]
|
|
movzx ebx, ch
|
|
xor eax, dword ptr [rbx * 4 + r9 + 1024 * 4]
|
|
add r8, 8
|
|
jle @by8
|
|
xor ecx, ecx
|
|
pop rbx
|
|
@done: sub r8, 8
|
|
jge @e
|
|
@tail: mov cl, byte ptr [buf + r8]
|
|
xor cl, al
|
|
shr eax, 8
|
|
xor eax, dword ptr [rcx * 4 + r9]
|
|
add r8, 1
|
|
jnz @tail
|
|
@e: not eax
|
|
end;
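
{
  crc32fasttab() above implements the classical "slicing-by-8" crc32: the
  unaligned head and the trailing 1..7 bytes use the usual one-byte-at-a-time
  lookup, which in pascal would read (a sketch, assuming tab^ holds 8 x 256
  cardinal entries, as the 1024-byte row stride used above implies):

    crc := not crc;
    for i := 0 to len - 1 do
      crc := tab^[0, (crc xor ord(buf[i])) and $ff] xor (crc shr 8);
    result := not crc;

  while the aligned @by8 loop consumes 8 input bytes per iteration, xoring the
  current crc into them and combining one lookup from each of the 8 sub-tables.
}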
|
|
|
|
function StrInt32(P: PAnsiChar; val: PtrInt): PAnsiChar;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
|
|
mov r10, val
|
|
sar r10, 63 // r10=0 if val>=0 or -1 if val<0
|
|
xor val, r10
|
|
sub val, r10 // val=abs(val)
|
|
mov rax, val
|
|
cmp val, 10
|
|
jb @1 // direct process of common val<10
|
|
lea r8, [rip + TwoDigitLookup]
|
|
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
|
|
@s: lea P, [P - 2]
|
|
cmp rax, 100
|
|
jb @2
|
|
lea r9, [rax * 2]
|
|
shr rax, 2
|
|
mov rdx, 2951479051793528259
|
|
mul rdx // use power of two reciprocal to avoid division
|
|
shr rdx, 2
|
|
mov rax, rdx
|
|
imul rdx, -200
|
|
lea rdx, [rdx + r8]
|
|
movzx edx, word ptr [rdx + r9]
|
|
mov word ptr [P], dx
|
|
cmp rax, 10
|
|
jae @s
|
|
@1: or al, '0'
|
|
mov byte ptr [P - 2], '-'
|
|
mov byte ptr [P - 1], al
|
|
lea rax, [P + r10 - 1] // includes '-' if val<0
|
|
ret
|
|
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
|
|
@2: movzx eax, word ptr [r8 + rax * 2]
|
|
mov byte ptr [P - 1], '-'
|
|
mov word ptr [P], ax
|
|
lea rax, [P + r10] // includes '-' if val<0
|
|
end;
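
{
  About the 2951479051793528259 "magic" constant used above and in StrUInt32
  below (a derivation, not part of the original comments): it equals
  ceil(2^66 / 25), so the mul/shr sequence computes

    q := ((val shr 2) * 2951479051793528259) shr 66;   // = val div 100

  exactly for any unsigned 64-bit val, and the following "imul rdx, -200" plus
  the lookup at [rdx + r9] (with r9 = val * 2) reads TwoDigitLookup[] at byte
  offset 2 * (val - q * 100) = 2 * (val mod 100), i.e. two decimal digits per
  round, without any div/mod instruction.
}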
|
|
|
|
function StrUInt32(P: PAnsiChar; val: PtrUInt): PAnsiChar;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
|
|
mov rax, val
|
|
cmp val, 10
|
|
jb @1 // direct process of common val<10
|
|
lea r8, [rip + TwoDigitLookup]
|
|
@s: lea P, [P - 2]
|
|
cmp rax, 100
|
|
jb @2
|
|
lea r9, [rax * 2]
|
|
shr rax, 2
|
|
mov rdx, 2951479051793528259
|
|
mul rdx // use power of two reciprocal to avoid division
|
|
shr rdx, 2
|
|
mov rax, rdx
|
|
imul rdx, -200
|
|
add rdx, r8
|
|
movzx rdx, word ptr [rdx + r9]
|
|
mov word ptr [P], dx
|
|
cmp rax, 10
|
|
jae @s
|
|
@1: sub P, 1
|
|
or al, '0'
|
|
mov byte ptr [P], al
|
|
@0: mov rax, P
|
|
ret
|
|
@2: movzx eax, word ptr [r8 + rax * 2]
|
|
mov word ptr [P], ax
|
|
mov rax, P
|
|
end;
|
|
|
|
{$endif ASMX64}
|
|
|
|
{$ifdef CPUX64ASM} /// proper compilation on FPC and Delphi XE7+
|
|
|
|
{$ifdef FPC}
|
|
|
|
procedure fpc_freemem; external name 'FPC_FREEMEM'; // access to RTL from asm
|
|
|
|
procedure FastAssignNew(var d; s: pointer); nostackframe; assembler;
|
|
asm
|
|
mov rax, qword ptr [d]
|
|
mov qword ptr [d], s
|
|
test rax, rax
|
|
jz @z
|
|
lea d, qword ptr [rax - _STRRECSIZE] // for fpc_freemem() below
|
|
{$ifdef STRCNT32}
|
|
cmp dword ptr [rax - _STRCNT], 0 // refcnt=-1 for const
|
|
jl @z
|
|
lock dec dword ptr [rax - _STRCNT]
|
|
{$else}
|
|
cmp qword ptr [rax - _STRCNT], 0 // refcnt=-1 for const
|
|
jl @z
|
|
lock dec qword ptr [rax - _STRCNT]
|
|
{$endif STRCNT32}
|
|
jbe fpc_freemem
|
|
@z:
|
|
end;
|
|
|
|
procedure FastAssignNewNotVoid(var d; s: pointer);
|
|
nostackframe; assembler;
|
|
asm
|
|
mov rax, qword ptr [d]
|
|
mov qword ptr [d], s
|
|
lea d, qword ptr [rax - _STRRECSIZE] // for fpc_freemem() below
|
|
{$ifdef STRCNT32}
|
|
cmp dword ptr [rax - _STRCNT], 0 // refcnt=-1 for const
|
|
jl @z
|
|
lock dec dword ptr [rax - _STRCNT]
|
|
{$else}
|
|
cmp qword ptr [rax - _STRCNT], 0 // refcnt=-1 for const
|
|
jl @z
|
|
lock dec qword ptr [rax - _STRCNT]
|
|
{$endif STRCNT32}
|
|
jbe fpc_freemem
|
|
@z:
|
|
end;
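
{
  pascal sketch of what FastAssignNew/FastAssignNewNotVoid above implement
  (RefCntOf() is an indicative helper for the counter stored at -_STRCNT;
  names and casts are illustrative, not from the actual source):

    old := pointer(d);
    pointer(d) := s;
    if old <> nil then                     // FastAssignNewNotVoid skips this test
      if RefCntOf(old) >= 0 then           // refcnt = -1 for string constants
        if InterlockedDecrement(RefCntOf(old)) <= 0 then
          fpc_freemem(old - _STRRECSIZE);  // release header + text in one block
}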
|
|
|
|
{$endif FPC}
|
|
|
|
{
|
|
Some numbers, with CITIES_MAX=200000, deleting 1/128 entries
first column (3..23) is the max number of indexes[] chunks to rehash
#abc is the number of slots in the hash table
adjust=.. is the DynArrayHashTableAdjust() time spent fixing the indexes
hash=ms is the time needed to hash the input (not impacted by the adjusting algorithm)
-> TDynArray.Delete move() now takes more time than the HashTable update :)
1. naive loop
     for i := 0 to HashTableSize-1 do
       if HashTable[i]>aArrayIndex then
         dec(HashTable[i]);
   3 #257 adjust=7.95ms 191.7MB/s hash=8us
   23 #195075 adjust=4.27s 548.6MB/s hash=2.47ms
2. branchless pure pascal code is about 10x faster!
   3 #257 adjust=670us 2.2GB/s hash=8us
   23 #195075 adjust=520.85ms 4.3GB/s hash=2.45ms
3. SSE2 simd assembly makes about a 3x improvement
   3 #257 adjust=290us 5.1GB/s hash=8us
   23 #195075 adjust=201.53ms 11.3GB/s hash=2.44ms
4. AVX2 simd assembly gives an additional 40% (on my Core i3 cpu)
   3 #257 adjust=262us 5.6GB/s hash=8us
   23 #195075 adjust=161.73ms 14.1GB/s hash=2.57ms
|
|
}
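
{
  The "branchless pure pascal" variant mentioned in 2. above could read
  (a sketch of the technique - ord() of a boolean being 0 or 1, the loop
  body contains no conditional jump):

    for i := 0 to HashTableSize - 1 do
      dec(HashTable[i], ord(HashTable[i] > aArrayIndex));

  and the SSE2/AVX2 code below applies the very same idea to 16/32 indexes at
  once: pcmpgtd/vpcmpgtd yields -1 when P[] > deleted (0 otherwise), which is
  then just added back via paddd/vpaddd.
}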
|
|
|
|
// brute force O(n) indexes fix after deletion (much faster than full ReHash)
|
|
procedure DynArrayHashTableAdjust(P: PIntegerArray; deleted: integer; count: PtrInt);
|
|
{$ifdef WIN64ABI}
|
|
var
|
|
bak6, bak7, bak8: THash128;
|
|
asm // Windows x64 calling convention expects to preserve XMM6-XMM15
|
|
movups dqword ptr [bak6], xmm6
|
|
movups dqword ptr [bak7], xmm7
|
|
movups dqword ptr [bak8], xmm8
|
|
{$else} {$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif}
|
|
mov r8, rdx
|
|
mov rcx, rdi
|
|
mov rdx, rsi
|
|
{$endif WIN64ABI}
|
|
xor eax, eax // reset eax high bits for setg al below
|
|
movq xmm0, rdx // xmm0 = 128-bit of quad deleted
|
|
pshufd xmm0, xmm0, 0
|
|
test cl, 3
|
|
jnz @1 // paranoid: a dword dynamic array is always dword-aligned
|
|
// ensure P is 256-bit aligned (for avx2)
|
|
@align: test cl, 31
|
|
jz @ok
|
|
cmp dword ptr [rcx], edx
|
|
setg al // P[]>deleted -> al=1, 0 otherwise
|
|
sub dword ptr [rcx], eax // branchless dec(P[])
|
|
add rcx, 4
|
|
sub r8, 1
|
|
jmp @align
|
|
@ok: {$ifdef ASMX64AVXNOCONST}
|
|
test byte ptr [rip + X64CpuFeatures], 1 shl cpuAVX2
|
|
jz @sse2
|
|
vpshufd ymm0, ymm0, 0 // shuffle to ymm0 128-bit low lane
|
|
vperm2f128 ymm0, ymm0, ymm0, 0 // copy to ymm0 128-bit high lane
|
|
// avx process of 128 bytes (32 indexes) per loop iteration
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@avx2: sub r8, 32
|
|
vmovdqa ymm1, [rcx] // 4 x 256-bit process = 4 x 8 integers
|
|
vmovdqa ymm3, [rcx + 32]
|
|
vmovdqa ymm5, [rcx + 64]
|
|
vmovdqa ymm7, [rcx + 96]
|
|
vpcmpgtd ymm2, ymm1, ymm0 // compare P[]>deleted -> -1, 0 otherwise
|
|
vpcmpgtd ymm4, ymm3, ymm0
|
|
vpcmpgtd ymm6, ymm5, ymm0
|
|
vpcmpgtd ymm8, ymm7, ymm0
|
|
vpaddd ymm1, ymm1, ymm2 // adjust by adding -1 / 0
|
|
vpaddd ymm3, ymm3, ymm4
|
|
vpaddd ymm5, ymm5, ymm6
|
|
vpaddd ymm7, ymm7, ymm8
|
|
vmovdqa [rcx], ymm1
|
|
vmovdqa [rcx + 32], ymm3
|
|
vmovdqa [rcx + 64], ymm5
|
|
vmovdqa [rcx + 96], ymm7
|
|
add rcx, 128
|
|
cmp r8, 32
|
|
jae @avx2
|
|
vzeroupper
|
|
jmp @2
|
|
{$endif ASMX64AVXNOCONST}
|
|
// SSE2 process of 64 bytes (16 indexes) per loop iteration
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@sse2: sub r8, 16
|
|
movaps xmm1, dqword [rcx] // 4 x 128-bit process = 4 x 4 integers
|
|
movaps xmm3, dqword [rcx + 16]
|
|
movaps xmm5, dqword [rcx + 32]
|
|
movaps xmm7, dqword [rcx + 48]
|
|
movaps xmm2, xmm1 // keep copy for paddd below
|
|
movaps xmm4, xmm3
|
|
movaps xmm6, xmm5
|
|
movaps xmm8, xmm7
|
|
pcmpgtd xmm1, xmm0 // quad compare P[]>deleted -> -1 / 0
|
|
pcmpgtd xmm3, xmm0
|
|
pcmpgtd xmm5, xmm0
|
|
pcmpgtd xmm7, xmm0
|
|
paddd xmm1, xmm2 // quad adjust by adding -1 / 0
|
|
paddd xmm3, xmm4
|
|
paddd xmm5, xmm6
|
|
paddd xmm7, xmm8
|
|
movaps dqword [rcx], xmm1 // quad store back
|
|
movaps dqword [rcx + 16], xmm3
|
|
movaps dqword [rcx + 32], xmm5
|
|
movaps dqword [rcx + 48], xmm7
|
|
add rcx, 64
|
|
cmp r8, 16
|
|
jae @sse2
|
|
jmp @2
|
|
// trailing indexes (never appearing within DYNARRAYHASH_PO2 range)
|
|
@1: sub r8, 1
|
|
cmp dword ptr [rcx + r8 * 4], edx
|
|
setg al
|
|
sub dword ptr [rcx + r8 * 4], eax
|
|
@2: test r8, r8
|
|
jnz @1
|
|
{$ifdef WIN64ABI}
|
|
movups xmm6, dqword ptr [bak6]
|
|
movups xmm7, dqword ptr [bak7]
|
|
movups xmm8, dqword ptr [bak8]
|
|
{$endif WIN64ABI}
|
|
end;
|
|
|
|
// DYNARRAYHASH_16BIT version for 16-bit HashTable[] - no AVX2 since count < 64K
|
|
procedure DynArrayHashTableAdjust16(P: PWordArray; deleted: cardinal; count: PtrInt);
|
|
{$ifdef WIN64ABI}
|
|
var
|
|
bak6, bak7, bak8: THash128;
|
|
asm // Windows x64 calling convention expects to preserve XMM6-XMM15
|
|
movups dqword ptr [bak6], xmm6
|
|
movups dqword ptr [bak7], xmm7
|
|
movups dqword ptr [bak8], xmm8
|
|
{$else} {$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif}
|
|
mov r8, rdx
|
|
mov rcx, rdi
|
|
mov rdx, rsi
|
|
{$endif WIN64ABI}
|
|
mov eax, deleted
|
|
shl eax, 16 // for pshufd (inline asm doesn't know about pshufw)
|
|
or edx, eax // edx = 32-bit of 2x 16-bit deleted
|
|
movq xmm0, rdx // xmm0 = 128-bit of 8x deleted
|
|
pshufd xmm0, xmm0, 0
|
|
xor eax, eax // reset eax high bits for setg al below
|
|
test cl, 1
|
|
jnz @1 // paranoid: a word dynamic array is always word-aligned
|
|
// ensure P is 128-bit aligned (for movaps)
|
|
@align: test cl, 15
|
|
jz @sse2
|
|
cmp word ptr [rcx], dx
|
|
setg al // P[]>deleted -> al=1, 0 otherwise
|
|
sub word ptr [rcx], ax // branchless dec(P[])
|
|
add rcx, 2
|
|
sub r8, 1
|
|
jmp @align
|
|
// SSE2 process of 64 bytes (32 indexes) per loop iteration
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@sse2: sub r8, 32
|
|
movaps xmm1, dqword [rcx] // 4 x 128-bit process = 4 x 8 words
|
|
movaps xmm3, dqword [rcx + 16]
|
|
movaps xmm5, dqword [rcx + 32]
|
|
movaps xmm7, dqword [rcx + 48]
|
|
movaps xmm2, xmm1 // keep copy for paddd below
|
|
movaps xmm4, xmm3
|
|
movaps xmm6, xmm5
|
|
movaps xmm8, xmm7
|
|
pcmpgtw xmm1, xmm0 // 8x compare P[]>deleted -> -1 / 0
|
|
pcmpgtw xmm3, xmm0
|
|
pcmpgtw xmm5, xmm0
|
|
pcmpgtw xmm7, xmm0
|
|
paddw xmm1, xmm2 // 8x adjust by adding -1 / 0
|
|
paddw xmm3, xmm4
|
|
paddw xmm5, xmm6
|
|
paddw xmm7, xmm8
|
|
movaps dqword [rcx], xmm1 // 8x store back
|
|
movaps dqword [rcx + 16], xmm3
|
|
movaps dqword [rcx + 32], xmm5
|
|
movaps dqword [rcx + 48], xmm7
|
|
add rcx, 64
|
|
cmp r8, 32
|
|
jae @sse2
|
|
jmp @2
|
|
// trailing indexes (never appearing within DYNARRAYHASH_PO2 range)
|
|
@1: sub r8, 1
|
|
cmp word ptr [rcx + r8 * 2], dx
setg al
sub word ptr [rcx + r8 * 2], ax
|
|
@2: test r8, r8
|
|
jnz @1
|
|
{$ifdef WIN64ABI}
|
|
movups xmm6, dqword ptr [bak6]
|
|
movups xmm7, dqword ptr [bak7]
|
|
movups xmm8, dqword ptr [bak8]
|
|
{$endif WIN64ABI}
|
|
end;
|
|
|
|
{$ifdef ASMX64AVXNOCONST}
|
|
// AVX2 ASM .align 32 for const is not available on Delphi :(
|
|
|
|
// adapted from https://github.com/simdjson/simdjson - Apache License 2.0
|
|
function IsValidUtf8Avx2(source: PUtf8Char; sourcelen: PtrInt): boolean;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
|
|
test source, source
|
|
jz @ok
|
|
test sourcelen, sourcelen
|
|
jle @ok
|
|
{$ifdef WIN64ABI} // Win64 ABI doesn't consider rsi/rdi as volatile
|
|
push rsi
|
|
push rdi
|
|
{$endif WIN64ABI}
|
|
push rbp
|
|
mov r8, source
|
|
mov rdx, sourcelen
|
|
mov rsi, r8
|
|
mov ecx, 64
|
|
mov rax, rsi
|
|
mov rdi, rdx
|
|
mov rbp, rsp
|
|
and rsp, 0FFFFFFFFFFFFFFE0H // align stack at 32 bytes
|
|
sub rsp, 160
|
|
{$ifdef WIN64ABI}
|
|
movaps dqword ptr [rsp + 00H], xmm6
|
|
movaps dqword ptr [rsp + 10H], xmm7
|
|
movaps dqword ptr [rsp + 20H], xmm8
|
|
movaps dqword ptr [rsp + 30H], xmm9
|
|
movaps dqword ptr [rsp + 40H], xmm10
|
|
movaps dqword ptr [rsp + 50H], xmm11
|
|
movaps dqword ptr [rsp + 60H], xmm12
|
|
movaps dqword ptr [rsp + 70H], xmm13
|
|
movaps dqword ptr [rsp + 80H], xmm14
|
|
movaps dqword ptr [rsp + 90H], xmm15
|
|
sub rsp, 100H
|
|
{$endif WIN64ABI}
|
|
cmp rdx, 64
|
|
cmovnc rcx, rdx
|
|
sub rcx, 64
|
|
je @small
|
|
vpxor xmm3, xmm3, xmm3
|
|
xor esi, esi
|
|
vmovdqu ymm7, yword ptr [rip + @0f]
|
|
vmovdqu ymm15, yword ptr [rip + @_6]
|
|
vmovdqu ymm14, yword ptr [rip + @_7]
|
|
vmovdqu ymm13, yword ptr [rip + @_8]
|
|
vmovdqa ymm5, ymm3
|
|
vmovdqa ymm2, ymm3
|
|
// main processing loop, 64 bytes per iteration
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@loop: vmovdqu xmm6, oword ptr [rax + rsi]
|
|
vinserti128 ymm0, ymm6, [rax + rsi + 10H], 01H
|
|
vmovdqu xmm6, oword ptr [rax + rsi + 20H]
|
|
vinserti128 ymm1, ymm6, [rax + rsi + 30H], 01H
|
|
add rsi, 64
|
|
vpor ymm4, ymm1, ymm0
|
|
vpmovmskb rdx, ymm4 // check set MSB of each 64 bytes
|
|
test edx, edx
|
|
jne @check
|
|
vpor ymm2, ymm5, ymm2
|
|
vmovdqa ymm4, ymm2
|
|
cmp rcx, rsi
|
|
ja @loop
|
|
// process trailing 0..63 bytes
|
|
@trail: sub rdi, rsi
|
|
jz @ended
|
|
add rsi, rax
|
|
vmovdqu xmm0, oword ptr [rip + @20]
|
|
lea rdx, qword ptr [rsp + 60H] // copy on stack with space padding
|
|
sub rsi, rdx
|
|
vmovdqa oword ptr [rdx], xmm0
|
|
vmovdqa oword ptr [rdx + 10H], xmm0
|
|
vmovdqa oword ptr [rdx + 20H], xmm0
|
|
vmovdqa oword ptr [rdx + 30H], xmm0
|
|
@by8: sub rdi, 8
|
|
jb @by1
|
|
mov rax, qword ptr [rsi + rdx]
|
|
mov qword ptr [rdx], rax
|
|
add rdx, 8 // in-order copy to preserve UTF-8 encoding
|
|
jmp @by8
|
|
@by1: add rdi, 8
|
|
jz @0
|
|
@sml: mov al, byte ptr [rsi + rdx]
|
|
mov byte ptr [rdx], al
|
|
add rdx, 1
|
|
sub rdi, 1
|
|
jnz @sml
|
|
@0: vmovdqa ymm1, yword ptr [rsp + 60H]
|
|
vmovdqa ymm2, yword ptr [rsp + 80H]
|
|
vpor ymm0, ymm1, ymm2
|
|
vpmovmskb rax, ymm0 // check any set MSB
|
|
test eax, eax
|
|
jne @last
|
|
@ended: vpor ymm5, ymm5, ymm4
|
|
@final: vptest ymm5, ymm5
|
|
sete al
|
|
vzeroupper
|
|
{$ifdef WIN64ABI}
|
|
add rsp, 100H
|
|
movaps xmm6, dqword ptr [rsp + 00H]
|
|
movaps xmm7, dqword ptr [rsp + 10H]
|
|
movaps xmm8, dqword ptr [rsp + 20H]
|
|
movaps xmm9, dqword ptr [rsp + 30H]
|
|
movaps xmm10, dqword ptr [rsp + 40H]
|
|
movaps xmm11, dqword ptr [rsp + 50H]
|
|
movaps xmm12, dqword ptr [rsp + 60H]
|
|
movaps xmm13, dqword ptr [rsp + 70H]
|
|
movaps xmm14, dqword ptr [rsp + 80H]
|
|
movaps xmm15, dqword ptr [rsp + 90H]
|
|
leave // mov rsp,rbp + pop rbp
|
|
pop rdi
|
|
pop rsi
|
|
{$else}
|
|
leave
|
|
{$endif WIN64ABI}
|
|
ret
|
|
@ok: mov al, 1
|
|
ret
|
|
@small: vpxor xmm4, xmm4, xmm4
|
|
xor esi, esi
|
|
vmovdqa ymm3, ymm4
|
|
vmovdqa ymm5, ymm4
|
|
jmp @trail
|
|
// validate UTF-8 extra bytes from main loop
|
|
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
|
|
@check: vpsrlw ymm9, ymm0, 4
|
|
vpsrlw ymm12, ymm1, 4
|
|
vperm2i128 ymm3, ymm3, ymm0, 21H
|
|
vpalignr ymm5, ymm0, ymm3, 0FH
|
|
vpalignr ymm11, ymm0, ymm3, 0EH
|
|
vpsubusb ymm11, ymm11, yword ptr [rip + @_9]
|
|
vpalignr ymm3, ymm0, ymm3, 0DH
|
|
vperm2i128 ymm0, ymm0, ymm1, 21H
|
|
vpsubusb ymm3, ymm3, yword ptr [rip + @_10]
|
|
vpalignr ymm8, ymm1, ymm0, 0FH
|
|
vpsrlw ymm10, ymm5, 4
|
|
vpand ymm5, ymm7, ymm5
|
|
vpsrlw ymm6, ymm8, 4
|
|
vpalignr ymm4, ymm1, ymm0, 0EH
|
|
vpsubusb ymm4, ymm4, yword ptr [rip + @_9]
|
|
vpalignr ymm0, ymm1, ymm0, 0DH
|
|
vpsubusb ymm0, ymm0, yword ptr [rip + @_10]
|
|
vpand ymm10, ymm10, ymm7
|
|
vpand ymm6, ymm6, ymm7
|
|
vpand ymm8, ymm7, ymm8
|
|
vpor ymm3, ymm3, ymm11
|
|
vpor ymm0, ymm4, ymm0
|
|
vpxor xmm11, xmm11, xmm11
|
|
vpshufb ymm10, ymm15, ymm10
|
|
vpshufb ymm5, ymm14, ymm5
|
|
vpand ymm9, ymm9, ymm7
|
|
vpshufb ymm6, ymm15, ymm6
|
|
vpshufb ymm8, ymm14, ymm8
|
|
vpand ymm12, ymm12, ymm7
|
|
vpand ymm5, ymm5, ymm10
|
|
vpcmpgtb ymm3, ymm3, ymm11
|
|
vpcmpgtb ymm0, ymm0, ymm11
|
|
vpshufb ymm9, ymm13, ymm9
|
|
vpand ymm3, ymm3, yword ptr [rip + @_11]
|
|
vpand ymm0, ymm0, yword ptr [rip + @_11]
|
|
vpshufb ymm12, ymm13, ymm12
|
|
vpand ymm6, ymm6, ymm8
|
|
vpand ymm9, ymm5, ymm9
|
|
vpsubusb ymm5, ymm1, yword ptr [rip + @_12]
|
|
vpand ymm12, ymm6, ymm12
|
|
vpxor ymm9, ymm3, ymm9
|
|
vmovdqa ymm3, ymm1
|
|
vpxor ymm12, ymm0, ymm12
|
|
vpor ymm9, ymm9, ymm12
|
|
vpor ymm2, ymm9, ymm2
|
|
vmovdqa ymm4, ymm2
|
|
cmp rcx, rsi
|
|
ja @loop
|
|
jmp @trail
|
|
// validate UTF-8 extra bytes from input ending
|
|
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
|
|
@last: vmovdqu ymm5, yword ptr [rip + @0f]
|
|
vperm2i128 ymm3, ymm3, ymm1, 21H
|
|
vmovdqu ymm9, yword ptr [rip + @_7]
|
|
vpsrlw ymm11, ymm1, 4
|
|
vpalignr ymm0, ymm1, ymm3, 0FH
|
|
vmovdqu ymm13, yword ptr [rip + @_10]
|
|
vmovdqu ymm14, yword ptr [rip + @_9]
|
|
vpsrlw ymm6, ymm0, 4
|
|
vpand ymm0, ymm5, ymm0
|
|
vpand ymm11, ymm11, ymm5
|
|
vmovdqu ymm7, yword ptr [rip + @_6]
|
|
vpshufb ymm10, ymm9, ymm0
|
|
vpalignr ymm0, ymm1, ymm3, 0EH
|
|
vpand ymm6, ymm6, ymm5
|
|
vmovdqu ymm8, yword ptr [rip + @_8]
|
|
vpalignr ymm3, ymm1, ymm3, 0DH
|
|
vperm2i128 ymm1, ymm1, ymm2, 21H
|
|
vpsubusb ymm0, ymm0, ymm14
|
|
vpsubusb ymm12, ymm3, ymm13
|
|
vpalignr ymm3, ymm2, ymm1, 0FH
|
|
vpshufb ymm6, ymm7, ymm6
|
|
vpsrlw ymm15, ymm3, 4
|
|
vpand ymm3, ymm5, ymm3
|
|
vpor ymm0, ymm0, ymm12
|
|
vpshufb ymm9, ymm9, ymm3
|
|
vpsrlw ymm3, ymm2, 4
|
|
vpand ymm15, ymm15, ymm5
|
|
vpand ymm5, ymm3, ymm5
|
|
vpalignr ymm3, ymm2, ymm1, 0EH
|
|
vpxor xmm12, xmm12, xmm12
|
|
vpalignr ymm1, ymm2, ymm1, 0DH
|
|
vpsubusb ymm3, ymm3, ymm14
|
|
vpshufb ymm11, ymm8, ymm11
|
|
vpsubusb ymm1, ymm1, ymm13
|
|
vpcmpgtb ymm0, ymm0, ymm12
|
|
vpshufb ymm7, ymm7, ymm15
|
|
vpor ymm1, ymm3, ymm1
|
|
vpshufb ymm8, ymm8, ymm5
|
|
vpsubusb ymm5, ymm2, yword ptr [rip + @_12]
|
|
vmovdqu ymm2, yword ptr [rip + @_11]
|
|
vpcmpgtb ymm1, ymm1, ymm12
|
|
vpand ymm6, ymm6, ymm10
|
|
vpand ymm7, ymm7, ymm9
|
|
vpand ymm0, ymm0, ymm2
|
|
vpand ymm11, ymm6, ymm11
|
|
vpand ymm8, ymm7, ymm8
|
|
vpxor ymm0, ymm0, ymm11
|
|
vpor ymm5, ymm4, ymm5
|
|
vpand ymm1, ymm1, ymm2
|
|
vpxor ymm1, ymm1, ymm8
|
|
vpor ymm0, ymm0, ymm1
|
|
vpor ymm5, ymm0, ymm5
|
|
jmp @final
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@20: dq 2020202020202020H
|
|
dq 2020202020202020H
|
|
{$ifdef FPC} align 32 {$else} .align 16 {Delphi doesn't allow 32 :(} {$endif}
|
|
@0f: dq 0F0F0F0F0F0F0F0FH
|
|
dq 0F0F0F0F0F0F0F0FH
|
|
dq 0F0F0F0F0F0F0F0FH
|
|
dq 0F0F0F0F0F0F0F0FH
|
|
@_6: dq 0202020202020202H
|
|
dq 4915012180808080H
|
|
dq 0202020202020202H
|
|
dq 4915012180808080H
|
|
@_7: dq 0CBCBCB8B8383A3E7H
|
|
dq 0CBCBDBCBCBCBCBCBH
|
|
dq 0CBCBCB8B8383A3E7H
|
|
dq 0CBCBDBCBCBCBCBCBH
|
|
@_8: dq 0101010101010101H
|
|
dq 01010101BABAAEE6H
|
|
dq 0101010101010101H
|
|
dq 01010101BABAAEE6H
|
|
@_9: dq 0DFDFDFDFDFDFDFDFH
|
|
dq 0DFDFDFDFDFDFDFDFH
|
|
dq 0DFDFDFDFDFDFDFDFH
|
|
dq 0DFDFDFDFDFDFDFDFH
|
|
@_10: dq 0EFEFEFEFEFEFEFEFH
|
|
dq 0EFEFEFEFEFEFEFEFH
|
|
dq 0EFEFEFEFEFEFEFEFH
|
|
dq 0EFEFEFEFEFEFEFEFH
|
|
@_11: dq 8080808080808080H
|
|
dq 8080808080808080H
|
|
dq 8080808080808080H
|
|
dq 8080808080808080H
|
|
@_12: db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH
|
|
db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH
|
|
db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0FFH
|
|
db 0FFH, 0FFH, 0FFH, 0FFH, 0FFH, 0EFH, 0DFH, 0BFH
|
|
end;
|
|
|
|
// inspired by https://github.com/aklomp/base64 - BSD-2-Clause License
|
|
// - less unrolled, but (much) faster thanks to manually tuned asm
|
|
|
|
procedure Base64EncodeAvx2(var b: PAnsiChar; var blen: PtrUInt;
|
|
var b64: PAnsiChar);
|
|
{$ifdef WIN64ABI}
|
|
var
|
|
bak6, bak7, bak8, bak9, bak10, bak11, bak12, bak13, bak14, bak15: THash128;
|
|
asm // Windows x64 calling convention expects to preserve XMM6-XMM15
|
|
movups dqword ptr [bak6], xmm6
|
|
movups dqword ptr [bak7], xmm7
|
|
movups dqword ptr [bak8], xmm8
|
|
movups dqword ptr [bak9], xmm9
|
|
movups dqword ptr [bak10], xmm10
|
|
movups dqword ptr [bak11], xmm11
|
|
movups dqword ptr [bak12], xmm12
|
|
movups dqword ptr [bak13], xmm13
|
|
movups dqword ptr [bak14], xmm14
|
|
movups dqword ptr [bak15], xmm15
|
|
push rsi // Win64 ABI doesn't consider rsi/rdi as volatile
|
|
push rdi
|
|
mov rsi, r8 // rsi = b64
|
|
mov r8, rdx // r8 = blen
|
|
mov rdi, rcx // rdi = b
|
|
{$else} {$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif}
|
|
mov r8, rsi // r8 = blen
|
|
mov rsi, rdx // rsi = b64 rdi = b
|
|
{$endif WIN64ABI}
|
|
mov rcx, qword ptr [r8]
|
|
cmp rcx, 31
|
|
jbe @done
|
|
lea rdx, qword ptr [rcx - 4]
|
|
vmovdqu ymm0, yword ptr [rip + @c9]
|
|
mov r10, 0AAAAAAAAAAAAAAABH
|
|
vmovdqu ymm7, yword ptr [rip + @c10]
|
|
mov rax, rdx
|
|
vmovdqu ymm5, yword ptr [rip + @c11]
|
|
vmovdqu ymm8, yword ptr [rip + @c12]
|
|
mul r10
|
|
vmovdqu ymm9, yword ptr [rip + @c13]
|
|
vmovdqu ymm10, yword ptr [rip + @c14]
|
|
vmovdqu ymm6, yword ptr [rip + @c16]
|
|
vmovdqu ymm4, yword ptr [rip + @c15]
|
|
vmovdqu ymm11, yword ptr [rip + @c17]
|
|
shr rdx, 4 // rdx = rounds = (blen - 4) / 24
|
|
lea rax, qword ptr [rdx + rdx * 2]
|
|
shl rax, 3
|
|
sub rcx, rax
|
|
mov qword ptr [r8], rcx // blen := blen - rounds * 24 (bytes left for the caller)
|
|
mov rcx, qword ptr [rdi] // rcx = [rdi] = b
|
|
mov rax, qword ptr [rsi] // rax = [rsi] = b64
|
|
// initial 24 bytes output process
|
|
vmovdqu xmm3, oword ptr [rcx]
|
|
vinserti128 ymm1, ymm3, oword ptr [rcx + 16], 1
|
|
vpermd ymm1, ymm0, ymm1
|
|
vpshufb ymm1, ymm1, ymm7
|
|
vpand ymm0, ymm5, ymm1
|
|
vpmulhuw ymm2, ymm0, ymm8
|
|
vpand ymm0, ymm9, ymm1
|
|
vpmullw ymm0, ymm10, ymm0
|
|
vpor ymm0, ymm0, ymm2
|
|
vpcmpgtb ymm2, ymm0, ymm6
|
|
vpsubusb ymm1, ymm0, ymm4
|
|
vpsubb ymm1, ymm1, ymm2
|
|
vpshufb ymm1, ymm11, ymm1
|
|
vpaddb ymm0, ymm1, ymm0
|
|
vmovdqu oword ptr [rax], xmm0
|
|
vextracti128 oword ptr [rax + 16], ymm0, 1
|
|
add rax, 32
|
|
add rcx, 20
|
|
sub rdx, 1
|
|
je @10
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
// process 48 input bytes per loop iteration into 64 encoded bytes
|
|
@9: cmp rdx, 1
|
|
je @12
|
|
// whole loop logic is fully interleaved to unleash the CPU potential
|
|
vmovdqu xmm1, oword ptr [rcx]
|
|
vmovdqu xmm3, oword ptr [rcx + 24]
|
|
vinserti128 ymm1, ymm1, oword ptr [rcx + 16], 1
|
|
vinserti128 ymm3, ymm3, oword ptr [rcx + 24 + 16], 1
|
|
vpshufb ymm1, ymm1, ymm7
|
|
vpshufb ymm3, ymm3, ymm7
|
|
vpand ymm0, ymm5, ymm1
|
|
vpand ymm12, ymm5, ymm3
|
|
vpmulhuw ymm2, ymm0, ymm8
|
|
vpmulhuw ymm14, ymm12, ymm8
|
|
vpand ymm0, ymm9, ymm1
|
|
vpand ymm12, ymm9, ymm3
|
|
vpmullw ymm0, ymm10, ymm0
|
|
vpmullw ymm12, ymm10, ymm12
|
|
vpor ymm0, ymm0, ymm2
|
|
vpor ymm12, ymm12, ymm14
|
|
vpcmpgtb ymm2, ymm0, ymm6
|
|
vpcmpgtb ymm15, ymm12, ymm6
|
|
vpsubusb ymm1, ymm0, ymm4
|
|
vpsubusb ymm14, ymm12, ymm4
|
|
vpsubb ymm1, ymm1, ymm2
|
|
vpsubb ymm14, ymm14, ymm15
|
|
vpshufb ymm1, ymm11, ymm1
|
|
vpshufb ymm14, ymm11, ymm14
|
|
vpaddb ymm0, ymm1, ymm0
|
|
vpaddb ymm12, ymm14, ymm12
|
|
vmovdqu oword ptr [rax], xmm0
|
|
vextracti128 oword ptr [rax + 16], ymm0, 1
|
|
vmovdqu oword ptr [rax + 32], xmm12
|
|
vextracti128 oword ptr [rax + 48], ymm12, 1
|
|
add rcx, 48
|
|
add rax, 64
|
|
sub rdx, 2
|
|
jne @9
|
|
@10: add rcx, 4
|
|
mov qword ptr [rsi], rax
|
|
mov qword ptr [rdi], rcx
|
|
vzeroupper
|
|
{$ifdef WIN64ABI}
|
|
jmp @done
|
|
{$else}
|
|
ret
|
|
{$endif WIN64ABI}
|
|
// trailing 24 bytes
|
|
@12: vmovdqu xmm3, oword ptr [rcx]
|
|
vinserti128 ymm1, ymm3, oword ptr [rcx + 16], 1
|
|
vpshufb ymm1, ymm1, ymm7
|
|
vpand ymm0, ymm5, ymm1
|
|
vpmulhuw ymm8, ymm0, ymm8
|
|
vpand ymm0, ymm9, ymm1
|
|
vpmullw ymm0, ymm10, ymm0
|
|
vpor ymm0, ymm0, ymm8
|
|
vpcmpgtb ymm6, ymm0, ymm6
|
|
vpsubusb ymm4, ymm0, ymm4
|
|
vpsubb ymm4, ymm4, ymm6
|
|
vpshufb ymm11, ymm11, ymm4
|
|
vpaddb ymm0, ymm11, ymm0
|
|
vmovdqu oword ptr [rax], xmm0
|
|
vextracti128 oword ptr [rax + 16], ymm0, 1
|
|
add rcx, 24
|
|
add rax, 32
|
|
jmp @10
|
|
{$ifdef FPC} align 32 {$else} .align 16 {Delphi doesn't allow 32 :(} {$endif}
|
|
@c9: dq 0000000000000000H
|
|
dq 0000000200000001H
|
|
dq 0000000400000003H
|
|
dq 0000000600000005H
|
|
@c10: dq 0809070805060405H
|
|
dq 0E0F0D0E0B0C0A0BH
|
|
dq 0405030401020001H
|
|
dq 0A0B090A07080607H
|
|
@c11: dq 0FC0FC000FC0FC00H
|
|
dq 0FC0FC000FC0FC00H
|
|
dq 0FC0FC000FC0FC00H
|
|
dq 0FC0FC000FC0FC00H
|
|
@c12: dq 0400004004000040H
|
|
dq 0400004004000040H
|
|
dq 0400004004000040H
|
|
dq 0400004004000040H
|
|
@c13: dq 003F03F0003F03F0H
|
|
dq 003F03F0003F03F0H
|
|
dq 003F03F0003F03F0H
|
|
dq 003F03F0003F03F0H
|
|
@c14: dq 0100001001000010H
|
|
dq 0100001001000010H
|
|
dq 0100001001000010H
|
|
dq 0100001001000010H
|
|
@c15: dq 3333333333333333H
|
|
dq 3333333333333333H
|
|
dq 3333333333333333H
|
|
dq 3333333333333333H
|
|
@c16: dq 1919191919191919H
|
|
dq 1919191919191919H
|
|
dq 1919191919191919H
|
|
dq 1919191919191919H
|
|
@c17: dq 0FCFCFCFCFCFC4741H
|
|
dq 0000F0EDFCFCFCFCH
|
|
dq 0FCFCFCFCFCFC4741H
|
|
dq 0000F0EDFCFCFCFCH
|
|
@done: {$ifdef WIN64ABI}
|
|
pop rdi
|
|
pop rsi
|
|
movups xmm6, dqword ptr [bak6]
|
|
movups xmm7, dqword ptr [bak7]
|
|
movups xmm8, dqword ptr [bak8]
|
|
movups xmm9, dqword ptr [bak9]
|
|
movups xmm10, dqword ptr [bak10]
|
|
movups xmm11, dqword ptr [bak11]
|
|
movups xmm12, dqword ptr [bak12]
|
|
movups xmm13, dqword ptr [bak13]
|
|
movups xmm14, dqword ptr [bak14]
|
|
movups xmm15, dqword ptr [bak15]
|
|
{$endif WIN64ABI}
|
|
end;
|
|
|
|
procedure Base64DecodeAvx2(var b64: PAnsiChar; var b64len: PtrInt;
|
|
var b: PAnsiChar);
|
|
{$ifdef WIN64ABI}
|
|
var
|
|
bak6, bak7, bak8, bak9, bak10, bak11, bak12: THash128;
|
|
asm // Windows x64 calling convention expects to preserve XMM6-XMM15
|
|
movups dqword ptr [bak6], xmm6
|
|
movups dqword ptr [bak7], xmm7
|
|
movups dqword ptr [bak8], xmm8
|
|
movups dqword ptr [bak9], xmm9
|
|
movups dqword ptr [bak10], xmm10
|
|
movups dqword ptr [bak11], xmm11
|
|
movups dqword ptr [bak12], xmm12
|
|
push rsi // Win64 ABI doesn't consider rsi/rdi as volatile
|
|
push rdi
|
|
mov rsi, rdx
|
|
mov rdx, r8
|
|
mov rdi, rcx
|
|
{$else} {$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif}
|
|
{$endif WIN64ABI}
|
|
// rcx/rdi=b64 rdx/rsi=b64len r8/rdx=b
|
|
// on decoding error, b64 will point to the faulty input
|
|
mov r8, qword ptr [rsi]
|
|
cmp r8, 44
|
|
jbe @done
|
|
lea r9, qword ptr [r8 - 0DH]
|
|
vmovdqu ymm1, yword ptr [rip + @c0]
|
|
vmovdqu ymm5, yword ptr [rip + @c1]
|
|
mov rax, r9
|
|
and r9, 0FFFFFFFFFFFFFFE0H
|
|
vmovdqu ymm4, yword ptr [rip + @c2]
|
|
vmovdqu ymm9, yword ptr [rip + @c3]
|
|
sub r8, r9
|
|
shr rax, 5 // rax = rounds
|
|
vmovdqu ymm8, yword ptr [rip + @c4]
|
|
vmovdqu ymm3, yword ptr [rip + @c5]
|
|
mov qword ptr [rsi], r8 // set final b64len
|
|
vmovdqu ymm2, yword ptr [rip + @c6]
|
|
vmovdqu ymm7, yword ptr [rip + @c7]
|
|
vmovdqu ymm6, yword ptr [rip + @c8]
|
|
mov r8, qword ptr [rdi] // r8 = [rdi] = b64
|
|
mov r9, qword ptr [rdx] // r9 = [rdx] = b
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
// decode 32 bytes on input into 24 binary bytes per loop iteration
|
|
@1: vmovdqu xmm0, oword ptr [r8]
|
|
vinserti128 ymm10, ymm0, oword ptr [r8 + 16], 1
|
|
vpsrld ymm0, ymm10, 4
|
|
vpand ymm11, ymm1, ymm0
|
|
vpand ymm0, ymm1, ymm10
|
|
vpshufb ymm12, ymm5, ymm11
|
|
vpshufb ymm0, ymm4, ymm0
|
|
vptest ymm0, ymm12
|
|
jnz @err
|
|
add r8, 32
|
|
vpcmpeqb ymm0, ymm10, ymm9
|
|
vpaddb ymm0, ymm0, ymm11
|
|
vpshufb ymm0, ymm8, ymm0
|
|
vpaddb ymm0, ymm0, ymm10
|
|
vpmaddubsw ymm0, ymm0, ymm3
|
|
vpmaddwd ymm0, ymm0, ymm2
|
|
vpshufb ymm0, ymm0, ymm7
|
|
vpermd ymm0, ymm6, ymm0
|
|
vmovdqu oword ptr [r9], xmm0
|
|
vextracti128 oword ptr [r9 + 16], ymm0, 1
|
|
add r9, 24
|
|
sub rax, 1
|
|
jne @1
|
|
jmp @8
|
|
@err: shl rax, 5
|
|
add qword ptr [rsi], rax // restore proper b64len on error
|
|
@8: mov qword ptr [rdi], r8
|
|
mov qword ptr [rdx], r9
|
|
vzeroupper
|
|
{$ifdef WIN64ABI}
|
|
jmp @done
|
|
{$else}
|
|
ret
|
|
{$endif WIN64ABI}
|
|
{$ifdef FPC} align 32 {$else} .align 16 {Delphi doesn't allow 32 :(} {$endif}
|
|
@c0: dq 2F2F2F2F2F2F2F2FH
|
|
dq 2F2F2F2F2F2F2F2FH
|
|
dq 2F2F2F2F2F2F2F2FH
|
|
dq 2F2F2F2F2F2F2F2FH
|
|
@c1: dq 0804080402011010H
|
|
dq 1010101010101010H
|
|
dq 0804080402011010H
|
|
dq 1010101010101010H
|
|
@c2: dq 1111111111111115H
|
|
dq 1A1B1B1B1A131111H
|
|
dq 1111111111111115H
|
|
dq 1A1B1B1B1A131111H
|
|
@c3: dq 2F2F2F2F2F2F2F2FH
|
|
dq 2F2F2F2F2F2F2F2FH
|
|
dq 2F2F2F2F2F2F2F2FH
|
|
dq 2F2F2F2F2F2F2F2FH
|
|
@c4: dq 0B9B9BFBF04131000H
|
|
dq 0000000000000000H
|
|
dq 0B9B9BFBF04131000H
|
|
dq 0000000000000000H
|
|
@c5: dq 0140014001400140H
|
|
dq 0140014001400140H
|
|
dq 0140014001400140H
|
|
dq 0140014001400140H
|
|
@c6: dq 0001100000011000H
|
|
dq 0001100000011000H
|
|
dq 0001100000011000H
|
|
dq 0001100000011000H
|
|
@c7: dq 090A040506000102H
|
|
dq 0FFFFFFFF0C0D0E08H
|
|
dq 090A040506000102H
|
|
dq 0FFFFFFFF0C0D0E08H
|
|
@c8: dq 0000000100000000H
|
|
dq 0000000400000002H
|
|
dq 0000000600000005H
|
|
dq 0FFFFFFFFFFFFFFFFH
|
|
@done: {$ifdef WIN64ABI}
|
|
pop rdi
|
|
pop rsi
|
|
movups xmm6, dqword ptr [bak6]
|
|
movups xmm7, dqword ptr [bak7]
|
|
movups xmm8, dqword ptr [bak8]
|
|
movups xmm9, dqword ptr [bak9]
|
|
movups xmm10, dqword ptr [bak10]
|
|
movups xmm11, dqword ptr [bak11]
|
|
movups xmm12, dqword ptr [bak12]
|
|
{$endif WIN64ABI}
|
|
end;
|
|
|
|
{$endif ASMX64AVXNOCONST}
|
|
|
|
{$endif CPUX64ASM} // SSE2 asm is invalid prior to Delphi XE7 (to be refined)
|
|
|
|
|
|
// functions below are always available, even on DARWIN
|
|
|
|
function SortDynArrayInteger(const A, B): integer;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
|
|
mov r8d, dword ptr [A]
|
|
mov edx, dword ptr [B]
|
|
xor eax, eax
|
|
xor ecx, ecx
|
|
cmp r8d, edx
|
|
setl cl
|
|
setg al
|
|
sub eax, ecx
|
|
end;
|
|
|
|
function SortDynArrayCardinal(const A, B): integer;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
|
|
mov ecx, dword ptr [A]
|
|
mov edx, dword ptr [B]
|
|
xor eax, eax
|
|
cmp ecx, edx
|
|
seta al
|
|
sbb eax, 0
|
|
end;
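
{ note on the seta/sbb (and setg/setl) pairs used by the SortDynArray*()
  functions: they perform a branchless three-way compare, i.e.
    result := ord(A > B) - ord(A < B)
  since for unsigned operands "cmp" leaves CF = ord(A < B), which is then
  subtracted by "sbb eax, 0" from the ord(A > B) computed by "seta al" }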
|
|
|
|
function SortDynArrayInt64(const A, B): integer;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
|
|
mov r8, qword ptr [A]
|
|
mov rdx, qword ptr [B]
|
|
xor eax, eax
|
|
xor ecx, ecx
|
|
cmp r8, rdx
|
|
setl cl
|
|
setg al
|
|
sub eax, ecx
|
|
end;
|
|
|
|
function SortDynArrayQWord(const A, B): integer;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
|
|
mov rcx, qword ptr [A]
|
|
mov rdx, qword ptr [B]
|
|
xor eax, eax
|
|
cmp rcx, rdx
|
|
seta al
|
|
sbb eax, 0
|
|
end;
|
|
|
|
function SortDynArrayPointer(const A, B): integer;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
|
|
mov rcx, qword ptr [A]
|
|
mov rdx, qword ptr [B]
|
|
xor eax, eax
|
|
cmp rcx, rdx
|
|
seta al
|
|
sbb eax, 0
|
|
end;
|
|
|
|
function SortDynArrayDouble(const A, B): integer;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
|
|
movsd xmm0, qword ptr [A]
|
|
movsd xmm1, qword ptr [B]
|
|
xor eax, eax
|
|
xor edx, edx
|
|
comisd xmm0, xmm1
|
|
seta al
|
|
setb dl
|
|
sub eax, edx
|
|
end;
|
|
|
|
function SortDynArraySingle(const A, B): integer;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
|
|
movss xmm0, dword ptr [A]
|
|
movss xmm1, dword ptr [B]
|
|
xor eax, eax
|
|
xor edx, edx
|
|
comiss xmm0, xmm1
|
|
seta al
|
|
setb dl
|
|
sub eax, edx
|
|
end;
|
|
|
|
function SortDynArrayAnsiString(const A, B): integer;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
|
|
// x86_64 version optimized for AnsiString/RawUtf8/RawByteString types
|
|
mov rax, qword ptr [A]
|
|
mov rdx, qword ptr [B]
|
|
cmp rax, rdx
|
|
je @z
|
|
test rax, rdx
|
|
jz @maynil
|
|
@f: movzx ecx, byte ptr [rax] // first char comparison
|
|
movzx r8d, byte ptr [rdx] // 32-bit to avoid overflow
|
|
sub ecx, r8d
|
|
je @s
|
|
mov eax, ecx // branchless execution on Quicksort
|
|
ret
|
|
@1: mov eax, 1
|
|
ret
|
|
@z: xor eax, eax
|
|
ret
|
|
@maynil:test rdx, rdx // A or B may be ''
|
|
jz @1
|
|
test rax, rax
|
|
jnz @f
|
|
dec eax
|
|
ret
|
|
{$ifdef FPC}
|
|
@s: mov r9, qword ptr [rax - _STRLEN] // TStrLen=SizeInt on FPC
|
|
mov r8, r9
|
|
sub r9, qword ptr [rdx - _STRLEN] // r9 = length(A)-length(B)
|
|
{$else}
|
|
.align 16
|
|
@s: mov r9d, dword ptr [rax - _STRLEN] // TStrLen=integer on Delphi
|
|
mov r8, r9
|
|
mov r10d, dword ptr [rdx - _STRLEN]
|
|
sub r9, r10 // r9 = length(A)-length(B)
|
|
{$endif FPC}
|
|
adc rcx, -1
|
|
and rcx, r9
|
|
sub rcx, r8 // rcx = -min(length(A),length(B))
|
|
sub rax, rcx
|
|
sub rdx, rcx
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@by8: mov r10, qword ptr [rax + rcx] // compare 8 bytes at once
|
|
xor r10, qword ptr [rdx + rcx]
|
|
jnz @d
|
|
add rcx, 8
|
|
js @by8
|
|
@eq: mov eax, r9d // all chars equal -> returns length(A)-length(B)
|
|
ret
|
|
@d: bsf r10, r10 // char differs -> returns pbyte(A)^-pbyte(B)^
|
|
shr r10, 3
|
|
add rcx, r10
|
|
jns @eq
|
|
movzx eax, byte ptr [rax + rcx]
|
|
movzx edx, byte ptr [rdx + rcx]
|
|
sub eax, edx
|
|
end; // note: SSE4.2 versions may read up to 16 bytes past the buffer - this one won't
|
|
|
|
function Hash32(Data: PCardinalArray; Len: integer): cardinal;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif}
|
|
xor eax, eax
|
|
xor r9d, r9d
|
|
test Data, Data
|
|
jz @z
|
|
{$ifdef WIN64ABI} // rcx/rdi=Data edx/esi=Len
|
|
mov r8, rdx
|
|
shr r8, 4
|
|
{$else}
|
|
mov edx, esi
|
|
shr esi, 4
|
|
{$endif WIN64ABI}
|
|
jz @by4
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@by16: add eax, dword ptr [Data]
|
|
add r9d, eax
|
|
add eax, dword ptr [Data + 4]
|
|
add r9d, eax
|
|
add eax, dword ptr [Data + 8]
|
|
add r9d, eax
|
|
add eax, dword ptr [Data + 12]
|
|
add r9d, eax
|
|
add Data, 16
|
|
{$ifdef WIN64ABI}
|
|
sub r8d, 1
|
|
{$else}
|
|
sub esi, 1
|
|
{$endif WIN64ABI}
|
|
jnz @by16
|
|
@by4: mov dh, dl
|
|
and dl, 15
|
|
jz @0
|
|
shr dl, 2
|
|
jz @rem
|
|
@4: add eax, dword ptr [Data]
|
|
add r9d, eax
|
|
add Data, 4
|
|
dec dl
|
|
jnz @4
|
|
@rem: and dh, 3
|
|
jz @0
|
|
dec dh
|
|
jz @1
|
|
dec dh
|
|
jz @2
|
|
mov ecx, dword ptr [Data]
|
|
and ecx, $ffffff
|
|
jmp @e
|
|
@2: movzx ecx, word ptr [Data]
|
|
jmp @e
|
|
@1: movzx ecx, byte ptr [Data]
|
|
@e: add eax, ecx
|
|
@0: add r9d, eax
|
|
shl r9d, 16
|
|
xor eax, r9d
|
|
@z:
|
|
end;
|
|
|
|
function xxHash32(crc: cardinal; P: PAnsiChar; len: cardinal): cardinal;
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe{$endif}
|
|
{$ifdef SYSVABI} // crc=rdi P=rsi len=rdx
|
|
mov r8, rdi
|
|
mov rcx, rsi
|
|
{$else} // crc=r8 P=rcx len=rdx
|
|
mov r10, r8
|
|
mov r8, rcx
|
|
mov rcx, rdx
|
|
mov rdx, r10
|
|
push rsi // Win64 expects those registers to be preserved
|
|
push rdi
|
|
{$endif SYSVABI}
|
|
// crc=r8 P=rcx len=rdx
|
|
push r12
|
|
push rbx
|
|
mov r12d, -1640531535
|
|
lea r10, [rcx + rdx]
|
|
lea eax, [r8 + 165667B1H]
|
|
cmp rdx, 15
|
|
jbe @2
|
|
lea rsi, [r10 - 16]
|
|
lea ebx, [r8 + 24234428H]
|
|
lea edi, [r8 - 7A143589H]
|
|
lea eax, [r8 + 61C8864FH]
|
|
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
|
|
@1: imul r9d, dword ptr [rcx], -2048144777
|
|
add rcx, 16
|
|
imul r11d, dword ptr [rcx - 12], -2048144777
|
|
add ebx, r9d
|
|
lea r9d, [r11 + rdi]
|
|
rol ebx, 13
|
|
rol r9d, 13
|
|
imul ebx, r12d
|
|
imul edi, r9d, -1640531535
|
|
imul r9d, dword ptr [rcx - 8], -2048144777
|
|
add r8d, r9d
|
|
imul r9d, dword ptr [rcx - 4], -2048144777
|
|
rol r8d, 13
|
|
imul r8d, r12d
|
|
add eax, r9d
|
|
rol eax, 13
|
|
imul eax, r12d
|
|
cmp rsi, rcx
|
|
jnc @1
|
|
rol edi, 7
|
|
rol ebx, 1
|
|
rol r8d, 12
|
|
mov r9d, edi
|
|
ror eax, 14
|
|
add r9d, ebx
|
|
add r8d, r9d
|
|
add eax, r8d
|
|
@2: lea r9, [rcx+4H]
|
|
add eax, edx
|
|
cmp r10, r9
|
|
jc @4
|
|
mov r8, r9
|
|
@3: imul edx, dword ptr [r8 - 4], -1028477379
|
|
add r8, 4
|
|
add eax, edx
|
|
ror eax, 15
|
|
imul eax, 668265263
|
|
cmp r10, r8
|
|
jnc @3
|
|
lea rdx, [r10 - 4]
|
|
sub rdx, rcx
|
|
mov rcx, rdx
|
|
and rcx, 0FFFFFFFFFFFFFFFCH
|
|
add rcx, r9
|
|
@4: cmp r10, rcx
|
|
jbe @6
|
|
@5: movzx edx, byte ptr [rcx]
|
|
add rcx, 1
|
|
imul edx, 374761393
|
|
add eax, edx
|
|
rol eax, 11
|
|
imul eax, r12d
|
|
cmp r10, rcx
|
|
jnz @5
|
|
@6: mov edx, eax
|
|
shr edx, 15
|
|
xor eax, edx
|
|
imul eax, -2048144777
|
|
mov edx, eax
|
|
shr edx, 13
|
|
xor eax, edx
|
|
imul eax, -1028477379
|
|
mov edx, eax
|
|
shr edx, 16
|
|
xor eax, edx
|
|
pop rbx
|
|
pop r12
|
|
{$ifndef SYSVABI}
|
|
pop rdi
|
|
pop rsi
|
|
{$endif SYSVABI}
|
|
end;
|
|
|
|
function GetBitsCountPas(value: PtrInt): PtrInt;
|
|
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
|
|
mov rax, value
|
|
mov rdx, value
|
|
shr rax, 1
|
|
mov rcx, $5555555555555555
|
|
mov r8, $3333333333333333
|
|
mov r10, $0f0f0f0f0f0f0f0f
|
|
mov r11, $0101010101010101
|
|
and rax, rcx
|
|
sub rdx, rax
|
|
mov rax, rdx
|
|
shr rdx, 2
|
|
and rax, r8
|
|
and rdx, r8
|
|
add rax, rdx
|
|
mov rdx, rax
|
|
shr rax, 4
|
|
add rax, rdx
|
|
and rax, r10
|
|
imul rax, r11
|
|
shr rax, 56
|
|
end;
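
{
  the constants above implement the classical SWAR population count - the same
  steps in pascal (a reference sketch of what the asm computes):

    v := v - ((v shr 1) and $5555555555555555);
    v := (v and $3333333333333333) + ((v shr 2) and $3333333333333333);
    v := (v + (v shr 4)) and $0f0f0f0f0f0f0f0f;
    result := (v * $0101010101010101) shr 56;
}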
|
|
|
|
procedure mul64x64(const left, right: QWord; out product: THash128Rec);
|
|
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif}
|
|
// uses built-in 64-bit -> 128-bit multiplication
|
|
{$ifdef WIN64ABI}
|
|
mov rax, rcx
|
|
mul rdx
|
|
{$else}
|
|
mov r8, rdx
|
|
mov rax, rdi
|
|
mul rsi
|
|
{$endif WIN64ABI}
|
|
mov qword ptr [r8], rax
|
|
mov qword ptr [r8 + 8], rdx
|
|
end;
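
{ usage sketch of mul64x64() above - assuming the THash128Rec Lo/Hi variant
  maps the first/second QWord stored at [r8]/[r8 + 8]:
    mul64x64(a, b, p);
    // p.Lo = (a * b) and $FFFFFFFFFFFFFFFF, p.Hi = (a * b) shr 64 }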

function bswap32(a: cardinal): cardinal;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        mov eax, a
        bswap eax
end;

function bswap64(const a: QWord): QWord;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        mov rax, a
        bswap rax
end;

procedure bswap64array(a, b: PQWordArray; n: PtrInt);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
@1:     mov rax, qword ptr [a]
        bswap rax
        mov qword ptr [b], rax
        add a, 8
        add b, 8
        sub n, 1
        jnz @1
end;

function StrLen(S: pointer): PtrInt;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        // from GPL strlen64.asm by Agner Fog - www.agner.org/optimize
        {$ifdef WIN64ABI}
        mov rax, rcx // get pointer to string from rcx
        mov r8, rcx // copy pointer
        test rcx, rcx
        {$else}
        mov rax, rdi
        mov ecx, edi
        test rdi, rdi
        {$endif WIN64ABI}
        jz @null // returns 0 if S=nil
        // rax=S, ecx=lower 32-bit of S
        pxor xmm0, xmm0 // set to zero
        and ecx, 15 // lower 4 bits indicate misalignment
        and rax, -16 // align pointer by 16
        // will never read outside a memory page boundary, so won't trigger GPF
        movaps xmm1, [rax] // read from nearest preceding boundary
        pcmpeqb xmm1, xmm0 // compare 16 bytes with zero
        pmovmskb edx, xmm1 // get one bit for each byte result
        shr edx, cl // shift out false bits
        shl edx, cl // shift back again
        bsf edx, edx // find first 1-bit
        jnz @L2 // found
        // Main loop, search 16 bytes at a time
        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
@L1:    add rax, 10H // increment pointer by 16
        movaps xmm1, [rax] // read 16 bytes aligned
        pcmpeqb xmm1, xmm0 // compare 16 bytes with zero
        pmovmskb edx, xmm1 // get one bit for each byte result
        bsf edx, edx // find first 1-bit
        // (moving the bsf out of the loop and using test here would be faster
        // for long strings on old processors, but we assume that most strings
        // are short, and that newer processors matter more)
        jz @L1 // loop if not found
@L2:    // Zero-byte found. Compute string length
        {$ifdef WIN64ABI}
        sub rax, r8 // subtract start address
        {$else}
        sub rax, rdi
        {$endif WIN64ABI}
        add rax, rdx // add byte index
@null:
end;
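
{
  Reading 16 bytes below the aligned pointer is safe because an aligned
  16-byte load can never cross a memory page boundary. A plain Pascal
  fallback with the same contract (nil input returns 0) - a minimal sketch:

  function StrLenPas(S: pointer): PtrInt;
  begin
    result := 0;
    if S <> nil then
      while PAnsiChar(S)[result] <> #0 do
        inc(result);
  end;
}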

function PosChar(Str: PUtf8Char; Chr: AnsiChar): PUtf8Char;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        // fast SSE2 version searching for both Chr and #0 over 16 bytes at once
        {$ifdef WIN64ABI}
        movzx edx, dl
        mov rax, rcx // get pointer to string from rcx
        imul edx, $01010101
        test rcx, rcx
        {$else}
        imul edx, esi, $01010101
        mov rax, rdi
        mov ecx, edi
        test rdi, rdi
        {$endif WIN64ABI}
        jz @null // returns nil if Str=nil
        movd xmm1, edx
        // rax=Str, ecx=lower 32-bit of Str, xmm1=Chr
        pxor xmm0, xmm0 // set xmm0 to zero
        pshufd xmm1, xmm1, 0 // set Chr into all bytes of xmm1
        and ecx, 15 // lower 4 bits indicate misalignment
        and rax, -16 // align pointer by 16
        // will never read outside a memory page boundary, so won't trigger GPF
        movaps xmm2, [rax] // read from nearest preceding boundary
        movaps xmm3, xmm2
        pcmpeqb xmm2, xmm0 // compare 16 bytes with zero
        pcmpeqb xmm3, xmm1 // compare 16 bytes with Chr
        por xmm2, xmm3
        pmovmskb edx, xmm2 // get one bit for each byte result
        shr edx, cl // shift out false bits
        shl edx, cl // shift back again
        bsf edx, edx // find first 1-bit
        jnz @L2 // found
        // Main loop, search 16 bytes at a time
        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
@L1:    add rax, 10H // increment pointer by 16
        movaps xmm2, [rax] // read 16 bytes aligned
        movaps xmm3, xmm2
        pcmpeqb xmm2, xmm0 // compare 16 bytes with zero
        pcmpeqb xmm3, xmm1 // compare 16 bytes with Chr
        por xmm2, xmm3
        pmovmskb edx, xmm2 // get one bit for each byte result
        bsf edx, edx // find first 1-bit
        // (moving the bsf out of the loop and using test here would be faster
        // for long strings on old processors, but we assume that most strings
        // are short, and that newer processors matter more)
        jz @L1 // loop if not found
@L2:    // Zero or Chr byte found
        add rax, rdx // add byte index for rax = match address
        cmp byte ptr [rax], 0
        jz @z // return nil if zero was reached
@null:  ret
@z:     xor eax, eax
end;

function BufferLineLength(Text, TextEnd: PUtf8Char): PtrInt;
{$ifdef FPC} nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        {$ifdef WIN64ABI} // Win64 ABI to System-V ABI
        push rsi
        push rdi
        mov rdi, rcx
        mov rsi, rdx
        {$endif WIN64ABI}
        mov r8, rsi
        sub r8, rdi // rdi=Text, rsi=TextEnd, r8=TextLen
        jz @fail
        mov ecx, edi
        movaps xmm0, [rip + @for10]
        movaps xmm1, [rip + @for13]
        and rdi, -16 // check first aligned 16 bytes
        and ecx, 15 // lower cl 4 bits indicate misalignment
        movaps xmm2, [rdi]
        movaps xmm3, xmm2
        pcmpeqb xmm2, xmm0
        pcmpeqb xmm3, xmm1
        por xmm3, xmm2
        pmovmskb eax, xmm3
        shr eax, cl // shift out unaligned bytes
        bsf eax, eax
        jz @main
        add rax, rcx
        add rax, rdi
        sub rax, rsi
        jae @fail // don't exceed TextEnd
        add rax, r8 // rax = TextFound - TextEnd + (TextEnd - Text) = offset
        {$ifdef WIN64ABI}
        pop rdi
        pop rsi
        {$endif WIN64ABI}
        ret
@main:  add rdi, 16
        sub rdi, rsi
        jae @fail
        jmp @by16
        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
@for10: dq $0a0a0a0a0a0a0a0a
        dq $0a0a0a0a0a0a0a0a
@for13: dq $0d0d0d0d0d0d0d0d
        dq $0d0d0d0d0d0d0d0d
@by16:  movaps xmm2, [rdi + rsi] // check 16 bytes per loop
        movaps xmm3, xmm2
        pcmpeqb xmm2, xmm0
        pcmpeqb xmm3, xmm1
        por xmm3, xmm2
        pmovmskb eax, xmm3
        test eax, eax
        jnz @found
        add rdi, 16
        jnc @by16
@fail:  mov rax, r8 // returns TextLen if no CR/LF found
        {$ifdef WIN64ABI}
        pop rdi
        pop rsi
        {$endif WIN64ABI}
        ret
@found: bsf eax, eax
        add rax, rdi
        jc @fail
        add rax, r8
        {$ifdef WIN64ABI}
        pop rdi
        pop rsi
        {$endif WIN64ABI}
end;
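
{
  BufferLineLength() returns the offset of the next #13 or #10 within
  Text..TextEnd, or TextEnd - Text if none is found - a minimal pure-Pascal
  sketch of the same contract:

  function BufferLineLengthPas(Text, TextEnd: PUtf8Char): PtrInt;
  begin
    result := 0;
    while (Text + result < TextEnd) and
          not (Text[result] in [#10, #13]) do
      inc(result);
  end;
}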

function ByteScanIndex(P: PByteArray; Count: PtrInt; Value: byte): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
        {$ifdef WIN64ABI}
        movzx eax, r8b
        mov r8, rcx
        test rdx, rdx
        {$else}
        movzx eax, dl
        mov rcx, rdi
        mov r8, rdi
        mov rdx, rsi
        test rsi, rsi
        {$endif WIN64ABI}
        jbe @no
        // eax=Value, rcx=P rdx=Count
        imul eax, $01010101
        and rcx, -16
        movd xmm1, eax
        movaps xmm0, [rcx] // check first aligned 16 bytes
        add rcx, 16
        pshufd xmm1, xmm1, 0
        sub rcx, r8
        pcmpeqb xmm0, xmm1
        pmovmskb eax, xmm0
        shl eax, cl
        and eax, $FFFF0000
        shr eax, cl
        jnz @fnd
        cmp rdx, rcx
        jbe @no
        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
@by16:  movaps xmm0, [r8 + rcx] // next 16 bytes
        add rcx, 16
        pcmpeqb xmm0, xmm1
        pmovmskb eax, xmm0
        test eax, eax
        jnz @fnd
        cmp rdx, rcx
        ja @by16
@no:    mov rax, -1
        ret
@fnd:   bsf eax, eax
        lea rax, qword ptr [rcx + rax - 16]
        cmp rdx, rax
        jbe @no
end;
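
{
  ByteScanIndex() - like the Word/Integer variants below - returns the index
  of the first match in P[0..Count-1], or -1 if none. A minimal pure-Pascal
  sketch of that contract:

  function ByteScanIndexPas(P: PByteArray; Count: PtrInt; Value: byte): PtrInt;
  var
    i: PtrInt;
  begin
    for i := 0 to Count - 1 do
      if P[i] = Value then
      begin
        result := i;
        exit;
      end;
    result := -1;
  end;
}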

function WordScanIndex(P: PWordArray; Count: PtrInt; Value: word): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
        {$ifdef WIN64ABI}
        movzx eax, r8w
        mov r8, rcx
        test rdx, rdx
        {$else}
        movzx eax, dx
        mov rcx, rdi
        mov r8, rdi
        mov rdx, rsi
        test rsi, rsi
        {$endif WIN64ABI}
        jbe @no
        test rcx, 1
        jnz @unal
        // eax=Value, rcx=P rdx=Count
        movd xmm1, eax
        and rcx, -16
        punpcklwd xmm1, xmm1
        movaps xmm0, [rcx] // check first aligned 16 bytes
        add rcx, 16
        pshufd xmm1, xmm1, 0
        sub rcx, r8
        pcmpeqw xmm0, xmm1
        pmovmskb eax, xmm0
        shl eax, cl
        and eax, $FFFF0000
        shr eax, cl
        shr ecx, 1
        bsf eax, eax
        jnz @fnd
        cmp rdx, rcx
        jbe @no
        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
@by16:  movaps xmm0, [r8 + rcx * 2] // next 16 bytes
        add rcx, 8
        pcmpeqw xmm0, xmm1
        pmovmskb eax, xmm0
        bsf eax, eax
        jnz @fnd
        cmp rdx, rcx
        ja @by16
@no:    mov rax, -1
        ret
        bsf eax, eax
@fnd:   shr eax, 1
        lea rax, qword ptr [rcx + rax - 8]
        cmp rdx, rax
        jbe @no
        ret
        // 16bit-unaligned loop (seldom called)
        {$ifdef FPC} align 8 {$else} .align 8 {$endif}
@unal:  lea rcx, [rcx + rdx * 2]
        neg rdx
        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
@unals: cmp word ptr [rcx + rdx * 2], ax
        jz @unale
        add rdx, 1
        jnz @unals
        jmp @no
@unale: lea rax, [rcx + rdx * 2]
        sub rax, r8
        shr rax, 1
end;

function IntegerScanIndex(P: PCardinalArray; Count: PtrInt; Value: cardinal): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
        {$ifdef WIN64ABI}
        mov eax, r8d
        movd xmm1, r8d
        mov r8, rcx
        test rdx, rdx
        {$else}
        mov eax, edx
        movd xmm1, edx
        mov rcx, rdi
        mov r8, rdi
        mov rdx, rsi
        test rsi, rsi
        {$endif WIN64ABI}
        jbe @no
        test rcx, 3
        jnz @unal
        // eax=Value, rcx=P rdx=Count
        and rcx, -16
        movaps xmm0, [rcx] // check first aligned 16 bytes
        add rcx, 16
        pshufd xmm1, xmm1, 0
        sub rcx, r8
        pcmpeqd xmm0, xmm1
        pmovmskb eax, xmm0
        shl eax, cl
        and eax, $FFFF0000
        shr eax, cl
        shr ecx, 2
        bsf eax, eax
        jnz @fnd
        cmp rdx, rcx
        jbe @no
        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
@by16:  movaps xmm0, [r8 + rcx * 4] // next 16 bytes
        add rcx, 4
        pcmpeqd xmm0, xmm1
        pmovmskb eax, xmm0
        bsf eax, eax
        jnz @fnd
        cmp rdx, rcx
        ja @by16
@no:    mov rax, -1
        ret
        bsf eax, eax
@fnd:   shr eax, 2
        lea rax, qword ptr [rcx + rax - 4]
        cmp rdx, rax
        jbe @no
        ret
        // 32bit-unaligned loop (seldom called)
        {$ifdef FPC} align 8 {$else} .align 8 {$endif}
@unal:  lea rcx, [rcx + rdx * 4]
        neg rdx
        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
@unals: cmp dword ptr [rcx + rdx * 4], eax
        jz @unale
        add rdx, 1
        jnz @unals
        jmp @no
@unale: lea rax, [rcx + rdx * 4]
        sub rax, r8
        shr rax, 2
end;

function MemCmp(P1, P2: PByteArray; L: PtrInt): integer;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
        // from GPL memcmp64.asm by Agner Fog - www.agner.org/optimize
        add P1, L // use negative index from end of memory block
        add P2, L
        neg L
        jge @eq
        mov r9d, $FFFF // 16 bits mask = 16 bytes
        cmp L, -16
        ja @sml
        {$ifdef FPC} align 8 {$else} .align 8 {$endif}
@by16:  // loop comparing 16 bytes
        movups xmm1, oword ptr [P1 + L]
        movups xmm2, oword ptr [P2 + L]
        pcmpeqb xmm1, xmm2 // compare 16 bytes
        pmovmskb eax, xmm1 // get byte mask
        xor eax, r9d // not ax
        jnz @diff // difference found
        add L, 16
        jz @eq // finished, equal
        cmp L, -16
        jna @by16 // next 16 bytes
        jmp @sml
@diff:  // difference found: find position
        bsf eax, eax
        add L, rax
@last:  movzx eax, byte ptr [P1 + L]
        movzx r9d, byte ptr [P2 + L]
        sub eax, r9d // return result
        ret
@eq:    // equal
        xor eax, eax
        ret
@sml:   // less than 16 bytes left
        cmp L, -8
        ja @less8
        // compare 8 bytes
        movq xmm1, qword ptr [P1 + L]
        movq xmm2, qword ptr [P2 + L]
        pcmpeqb xmm1, xmm2 // compare 8 bytes
        pmovmskb eax, xmm1 // get byte mask
        xor eax, r9d // not ax
        jnz @diff // difference found
        add L, 8
        jz @eq
@less8: // less than 8 bytes left
        cmp L, -4
        ja @less4
        // compare 4 bytes
        movd xmm1, dword ptr [P1 + L]
        movd xmm2, dword ptr [P2 + L]
        pcmpeqb xmm1, xmm2 // compare 4 bytes
        pmovmskb eax, xmm1 // get byte mask
        xor eax, r9d // not ax
        jnz @diff // difference found
        add L, 4
        jz @eq
@less4: // less than 4 bytes left
        cmp L, -2
        ja @less2
        movzx eax, word ptr [P1 + L]
        movzx r9d, word ptr [P2 + L]
        sub eax, r9d
        jnz @last2 // difference in byte 0 or 1
        add L, 2
        jz @eq
@less2: // less than 2 bytes left
        test L, L
        jz @eq // no bytes left
        jmp @last // one byte left
@last2: // difference in byte 0 or 1
        neg al
        sbb L, -1 // add 1 to L if al == 0
        jmp @last
end;

function FastFindWordSorted(P: PWordArray; R: PtrInt; Value: Word): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
        {$ifdef WIN64ABI}
        push rdi
        mov rdi, P // rdi=P - we use ecx to read the word ptr value
        {$endif WIN64ABI}
        xor r9, r9 // r9=L rax=result
        test R, R
        jl @ko
        {$ifdef FPC} align 8 {$else} .align 8 {$endif}
@s:     lea rax, [r9 + R]
        shr rax, 1
        lea r10, qword ptr [rax - 1] // branchless loop
        lea r11, qword ptr [rax + 1]
        movzx ecx, word ptr [rdi + rax * 2]
        {$ifdef WIN64ABI}
        cmp ecx, r8d
        {$else}
        cmp ecx, edx // 'cmp cx, Value' is silently rejected by Darwin asm
        {$endif WIN64ABI}
        je @ok
        cmovg R, r10
        cmovl r9, r11
        cmp r9, R
        jle @s
@ko:    mov rax, -1
@ok:    {$ifdef WIN64ABI}
        pop rdi
        {$endif WIN64ABI}
end;

function FastFindIntegerSorted(P: PIntegerArray; R: PtrInt; Value: integer): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
        xor r9, r9 // r9=L rax=result
        test R, R
        jl @ko
        lea rax, [r9 + R]
        {$ifdef FPC} align 8 {$else} .align 8 {$endif}
@s:     shr rax, 1
        lea r10, qword ptr [rax - 1] // efficient branchless binary search
        lea r11, qword ptr [rax + 1]
        cmp Value, dword ptr [P + rax * 4]
        je @ok
        cmovl R, r10
        cmovg r9, r11
        lea rax, [r9 + R]
        cmp r9, R
        jle @s
@ko:    mov rax, -1
@ok:
end;

function FastFindInt64Sorted(P: PInt64Array; R: PtrInt; const Value: Int64): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
        xor r9, r9 // r9=L rax=result
        test R, R
        jl @ko
        lea rax, [r9 + R]
        {$ifdef FPC} align 8 {$else} .align 8 {$endif}
@s:     shr rax, 1
        lea r10, qword ptr [rax - 1] // efficient branchless binary search
        lea r11, qword ptr [rax + 1]
        cmp Value, qword ptr [P + rax * 8]
        je @ok
        cmovl R, r10
        cmovg r9, r11
        lea rax, [r9 + R]
        cmp r9, R
        jle @s
@ko:    mov rax, -1
@ok:
end;
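
{
  The three FastFind*Sorted() functions above run the same binary search,
  using cmov to avoid mispredicted branches; note that R is the highest
  index, not a count. Equivalent plain Pascal, as a minimal sketch for the
  integer case:

  function FastFindIntegerSortedPas(P: PIntegerArray; R: PtrInt;
    Value: integer): PtrInt;
  var
    L, i: PtrInt;
  begin
    L := 0;
    while L <= R do
    begin
      i := (L + R) shr 1;
      if P[i] = Value then
      begin
        result := i;
        exit;
      end
      else if P[i] < Value then
        L := i + 1
      else
        R := i - 1;
    end;
    result := -1;
  end;
}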

function GetBitsCountSse42(value: PtrInt): PtrInt;
{$ifdef FPC} assembler; nostackframe;
asm
        popcnt rax, value
{$else} // older Delphi versions don't support this opcode
asm .noframe
        {$ifdef WIN64ABI}
        db $f3, $48, $0f, $b8, $c1
        {$else}
        db $f3, $48, $0f, $b8, $c7
        {$endif WIN64ABI}
{$endif FPC}
end;

function crc32cby4sse42(crc, value: cardinal): cardinal;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        mov eax, crc
        crc32 eax, value
end;

procedure crcblocksse42(crc128, data128: PBlock128);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        mov eax, dword ptr [crc128] // we can't use two qword ptr here
        mov r8d, dword ptr [crc128 + 4]
        mov r9d, dword ptr [crc128 + 8]
        mov r10d, dword ptr [crc128 + 12]
        crc32 eax, dword ptr [data128]
        crc32 r8d, dword ptr [data128 + 4]
        crc32 r9d, dword ptr [data128 + 8]
        crc32 r10d, dword ptr [data128 + 12]
        mov dword ptr [crc128], eax
        mov dword ptr [crc128 + 4], r8d
        mov dword ptr [crc128 + 8], r9d
        mov dword ptr [crc128 + 12], r10d
end;

procedure crcblockssse42(crc128, data128: PBlock128; count: integer);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        test count, count
        jle @z
        mov rax, data128
        {$ifdef WIN64ABI}
        mov rdx, rcx
        mov ecx, r8d
        {$else}
        mov ecx, edx
        mov rdx, rdi
        {$endif WIN64ABI}
        mov r8d, dword ptr [rdx] // we can't use qword ptr here
        mov r9d, dword ptr [rdx + 4]
        mov r10d, dword ptr [rdx + 8]
        mov r11d, dword ptr [rdx + 12]
        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
@s:     crc32 r8d, dword ptr [rax]
        crc32 r9d, dword ptr [rax + 4]
        crc32 r10d, dword ptr [rax + 8]
        crc32 r11d, dword ptr [rax + 12]
        add rax, 16
        sub ecx, 1
        jnz @s
        mov dword ptr [rdx], r8d
        mov dword ptr [rdx + 4], r9d
        mov dword ptr [rdx + 8], r10d
        mov dword ptr [rdx + 12], r11d
@z:
end;

function crc32csse42(crc: cardinal; buf: PAnsiChar; len: cardinal): cardinal;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        mov eax, crc
        test len, len
        jz @z
        test buf, buf
        jz @z
        not eax
        mov ecx, len
        shr len, 3
        jnz @by8 // no read alignment handled here - but it is in crypto.core
@0:     test cl, 4
        jz @4
        crc32 eax, dword ptr [buf]
        add buf, 4
@4:     test cl, 2
        jz @2
        crc32 eax, word ptr [buf]
        add buf, 2
@2:     test cl, 1
        jz @1
        crc32 eax, byte ptr [buf]
@1:     not eax
@z:     ret
        {$ifdef FPC}
        align 16
@by8:   crc32 rax, qword ptr [buf] // hash 8 bytes per loop
        {$else}
        .align 16
@by8:   db $F2, $48, $0F, $38, $F1, $02 // circumvent Delphi inline asm compiler bug
        {$endif FPC}
        add buf, 8
        sub len, 1
        jnz @by8
        jmp @0
end;
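
{
  These crc32c*sse42() routines use the Castagnoli polynomial wired into the
  SSE4.2 crc32 instruction, so they assume SSE4.2 support has already been
  detected (e.g. cfSSE42 in X64CpuFeatures) - a minimal usage sketch, with
  SomeBuffer/SomeLen being hypothetical:

  var
    crc: cardinal;
  ...
    crc := crc32csse42(0, SomeBuffer, SomeLen);
}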

function SynLZcompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
var
  off: TOffsets;
  cache: array[0..4095] of cardinal; // uses 32KB+16KB=48KB on stack
asm // rcx=src, edx=size, r8=dest
        {$ifdef WIN64ABI} // additional registers to preserve
        push rdi
        push rsi
        {$else} // Linux 64-bit ABI
        mov r8, rdx
        mov rdx, rsi
        mov rcx, rdi
        {$endif WIN64ABI}
        push rbx
        push r12
        push r13
        push r14
        push r15
        mov r15, r8 // r8=dest r15=dst_beg
        mov rbx, rcx // rbx=src
        cmp edx, 32768
        jc @03
        mov eax, edx
        and eax, 7FFFH
        or eax, 8000H
        mov word ptr [r8], ax
        mov eax, edx
        shr eax, 15
        mov word ptr [r8 + 2], ax
        add r8, 4
        jmp @05
@03:    mov word ptr [r8], dx
        test edx, edx
        jnz @04
        mov r15d, 2
        jmp @19
        nop
@04:    add r8, 2
@05:    lea r9, [rdx + rbx] // r9=src_end
        lea r10, [r9 - 11] // r10=src_endmatch
        mov ecx, 1 // ecx=CWBits
        mov r11, r8 // r11=CWpoint
        mov dword ptr [r8], 0
        add r8, 4
        pxor xmm0, xmm0
        mov eax, 32768 - 64
@06:    movaps dqword ptr [off + rax - 48], xmm0 // stack is 16 bytes aligned
        movaps dqword ptr [off + rax - 32], xmm0
        movaps dqword ptr [off + rax - 16], xmm0
        movaps dqword ptr [off + rax], xmm0
        sub eax, 64
        jae @06
        cmp rbx, r10
        ja @15
@07:    mov edx, dword ptr [rbx]
        mov rax, rdx
        mov r12, rdx
        shr rax, 12
        xor rax, rdx
        and rax, 0FFFH // rax=h
        mov r14, qword ptr [off + rax * 8] // r14=o
        mov edx, dword ptr [cache + rax * 4]
        mov qword ptr [off + rax * 8], rbx
        mov dword ptr [cache + rax * 4], r12d
        xor rdx, r12
        test r14, r14
        lea rdi, [r9 - 1]
        je @12
        and rdx, 0FFFFFFH
        jne @12
        mov rdx, rbx
        sub rdx, r14
        cmp rdx, 2
        jbe @12
        or dword ptr [r11], ecx
        add rbx, 2
        add r14, 2
        mov esi, 1
        sub rdi, rbx
        cmp rdi, 271
        jc @09
        mov edi, 271
        jmp @09
@08:    add rsi, 1
@09:    mov edx, dword ptr [r14 + rsi]
        cmp dl, byte ptr [rbx + rsi]
        jnz @10
        cmp rsi, rdi
        jge @10
        add rsi, 1
        cmp dh, byte ptr [rbx + rsi]
        jnz @10
        shr edx, 16
        cmp rsi, rdi
        jge @10
        add rsi, 1
        cmp dl, byte ptr [rbx + rsi]
        jnz @10
        cmp rsi, rdi
        jge @10
        add rsi, 1
        cmp dh, byte ptr [rbx + rsi]
        jnz @10
        cmp rsi, rdi
        jc @08
@10:    add rbx, rsi
        shl rax, 4
        cmp rsi, 15
        ja @11
        or rax, rsi
        mov word ptr [r8], ax
        add r8, 2
        jmp @13
@11:    sub rsi, 16
        mov word ptr [r8], ax
        mov byte ptr [r8 + 2], sil
        add r8, 3
        jmp @13
@12:    mov al, byte ptr [rbx]
        mov byte ptr [r8], al
        add rbx, 1
        add r8, 1
@13:    add ecx, ecx
        jnz @14
        mov r11, r8
        mov [r8], ecx
        add r8, 4
        add ecx, 1
@14:    cmp rbx, r10
        jbe @07
@15:    cmp rbx, r9
        jnc @18
@16:    mov al, byte ptr [rbx]
        mov byte ptr [r8], al
        add rbx, 1
        add r8, 1
        add ecx, ecx
        jnz @17
        mov [r8], ecx
        add r8, 4
        add ecx, 1
@17:    cmp rbx, r9
        jc @16
@18:    sub r8, r15
        mov r15, r8
@19:    mov rax, r15
        pop r15
        pop r14
        pop r13
        pop r12
        pop rbx
        {$ifdef WIN64ABI} // additional registers to preserve
        pop rsi
        pop rdi
        {$endif WIN64ABI}
end;

function SynLZdecompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
var
  off: TOffsets;
asm // rcx=src, edx=size, r8=dest
        {$ifdef WIN64ABI} // additional registers to preserve
        push rsi
        push rdi
        {$else} // Linux 64-bit ABI
        mov r8, rdx
        mov rdx, rsi
        mov rcx, rdi
        {$endif WIN64ABI}
        push rbx
        push r12
        push r13
        push r14
        push r15
        movzx eax, word ptr [rcx] // rcx=src eax=result
        lea r9, [rcx + rdx] // r9=src_end
        test eax, eax
        je @35
        add rcx, 2
        mov r10d, eax
        and r10d, 8000H
        jz @21
        movzx ebx, word ptr [rcx]
        shl ebx, 15
        mov r10d, eax
        and r10d, 7FFFH
        or r10d, ebx
        mov eax, r10d
        add rcx, 2
@21:    lea r10, [r8 - 1] // r10=last_hashed r8=dest
@22:    mov edi, dword ptr [rcx] // edi=CW
        add rcx, 4
        mov r13d, 1 // r13d=CWBit
        cmp rcx, r9
        jnc @35
@23:    test r13d, edi
        jnz @25
        mov bl, byte ptr [rcx]
        mov byte ptr [r8], bl
        add rcx, 1
        lea rbx, [r8 - 2]
        add r8, 1
        cmp rcx, r9
        jnc @35
        cmp rbx, r10
        jbe @24
        add r10, 1
        mov esi, dword ptr [r10]
        mov rbx, rsi
        shr esi, 12
        xor ebx, esi
        and ebx, 0FFFH
        mov qword ptr [off + rbx * 8], r10
@24:    shl r13d, 1
        jnz @23
        jmp @22
@25:    movzx r11, word ptr [rcx] // r11=t
        add rcx, 2
        mov ebx, r11d // ebx=h
        shr ebx, 4
        and r11, 0FH
        lea r11, [r11 + 2]
        jnz @26
        movzx r11, byte ptr [rcx]
        add rcx, 1
        lea r11, [r11 + 12H]
@26:    mov r14, qword ptr [off + rbx * 8] // r14=o
        mov rbx, r8
        xor rsi, rsi
        sub rbx, r14
        mov r12, r11
        mov r15, r11
        cmp rbx, r11
        jc @29
        shr r12, 3
        jz @30
        {$ifdef FPC} align 8 {$else} .align 8 {$endif}
@27:    mov rbx, qword ptr [r14 + rsi] // inline move by 8 bytes
        mov qword ptr [r8 + rsi], rbx
        add rsi, 8
        sub r12, 1
        jnz @27
        mov rbx, qword ptr [r14 + rsi] // 1..7 remaining bytes
        and r15, 7
        jz @31
@28:    mov byte ptr [r8 + rsi], bl
        shr rbx, 8
        add rsi, 1
        sub r15, 1
        jnz @28
        jmp @31
        {$ifdef FPC} align 8 {$else} .align 8 {$endif}
@29:    mov bl, byte ptr [r14 + rsi] // overlapping move
        mov byte ptr [r8 + rsi], bl
        add rsi, 1
        sub r12, 1
        jnz @29
        cmp rcx, r9
        jnz @33
        jmp @35
@30:    mov rbx, qword ptr [r14]
        mov qword ptr [r8], rbx
@31:    cmp rcx, r9
        jz @35
        cmp r10, r8
        jnc @34
@32:    add r10, 1
        mov ebx, dword ptr [r10]
        mov rsi, rbx
        shr ebx, 12
        xor esi, ebx
        and esi, 0FFFH
        mov qword ptr [off + rsi * 8], r10
@33:    cmp r10, r8
        jc @32
@34:    add r8, r11
        lea r10, [r8 - 1]
        shl r13d, 1
        jnz @23
        jmp @22
@35:    pop r15
        pop r14
        pop r13
        pop r12
        pop rbx
        {$ifdef WIN64ABI} // additional registers to preserve
        pop rdi
        pop rsi
        {$endif WIN64ABI}
end;
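
{
  SynLZcompress1/SynLZdecompress1 expect dst to be large enough - a minimal
  usage sketch, assuming the SynLZcompressdestlen()/SynLZdecompressdestlen()
  helpers from the regular SynLZ pascal code are available to size the
  buffers:

  var
    src, comp, plain: RawByteString;
  ...
    SetLength(comp, SynLZcompressdestlen(length(src)));
    SetLength(comp, SynLZcompress1(pointer(src), length(src), pointer(comp)));
    SetLength(plain, SynLZdecompressdestlen(pointer(comp)));
    SynLZdecompress1(pointer(comp), length(comp), pointer(plain));
}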

function RdRand32: cardinal;
{$ifdef FPC}nostackframe; assembler; asm{$else} asm .noframe {$endif FPC}
        // rdrand eax: same opcodes for x86 and x64
        db $0f, $c7, $f0
        // we ignore the carry flag (handled once in TestIntelCpuFeatures)
end;

function Rdtsc: Int64;
{$ifdef FPC}nostackframe; assembler; asm{$else} asm .noframe {$endif FPC}
        rdtsc // returns the TSC in EDX:EAX
        shl rdx, 32
        or rax, rdx
end;

procedure LockedInc32(int32: PInteger);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        lock inc dword ptr [int32]
end;

procedure LockedDec32(int32: PInteger);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        lock dec dword ptr [int32]
end;

procedure LockedInc64(int64: PInt64);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        lock inc qword ptr [int64]
end;

function InterlockedIncrement(var I: integer): integer;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        mov eax, 1
        lock xadd dword ptr [I], eax // atomic eax=[I] + [I]:=[I]+eax
        add eax, 1
end;

function InterlockedDecrement(var I: integer): integer;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        mov eax, -1
        lock xadd dword ptr [I], eax // atomic eax=[I] + [I]:=[I]+eax
        sub eax, 1
end;

function StrCntDecFree(var refcnt: TStrCnt): boolean;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        {$ifdef STRCNT32}
        lock dec dword ptr [refcnt] // TStrCnt=longint on Delphi Win64 and FPC>=3.4
        {$else}
        lock dec qword ptr [refcnt] // on FPC<3.4
        {$endif STRCNT32}
        setbe al
end; // don't check the IsMultiThread global since lock is cheaper on new CPUs

function DACntDecFree(var refcnt: TDACnt): boolean;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        {$ifdef DACNT32}
        lock dec dword ptr [refcnt] // TDACnt=longint on Delphi
        {$else}
        lock dec qword ptr [refcnt]
        {$endif DACNT32}
        setbe al
end;

function LockedExc(var Target: PtrUInt; NewValue, Comperand: PtrUInt): boolean;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        mov rax, Comperand
        lock cmpxchg qword ptr [Target], NewValue
        setz al
end;

procedure LockedAdd(var Target: PtrUInt; Increment: PtrUInt);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        lock add qword ptr [Target], Increment
end;

procedure LockedAdd32(var Target: cardinal; Increment: cardinal);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        lock add dword ptr [Target], Increment
end;

procedure LockedDec(var Target: PtrUInt; Decrement: PtrUInt);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        lock sub qword ptr [Target], Decrement
end;
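
{
  LockedExc() is a compare-and-swap: it sets Target to NewValue only if it
  still equals Comperand, and returns true on success. A minimal spin-lock
  sketch built on it (hypothetical helpers, not part of this unit):

  procedure SpinEnter(var Lock: PtrUInt);
  begin
    while not LockedExc(Lock, 1, 0) do
      Sleep(0); // yield instead of burning the CPU
  end;

  procedure SpinLeave(var Lock: PtrUInt);
  begin
    Lock := 0; // a plain aligned write is enough to release on x86_64
  end;
}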

{$ifdef ISDELPHI}

// those functions are intrinsics with FPC :)

function BSRdword(c: cardinal): cardinal;
asm
        .noframe
        mov eax, c
        bsr eax, eax
        jnz @nz
        mov eax, 255
@nz:
end;

function BSRqword(const q: qword): cardinal;
asm
        .noframe
        mov rax, q
        bsr rax, rax
        jnz @nz
        mov eax, 255
@nz:
end;

// FPC will properly inline the multiplication by reciprocal, but Delphi won't
procedure Div100(Y: cardinal; var res: TDiv100Rec);
asm
        .noframe
        mov r8, res
        mov edx, Y
        mov dword ptr [r8].TDiv100Rec.M, edx
        mov eax, 1374389535
        mul edx
        shr edx, 5
        mov dword ptr [r8].TDiv100Rec.D, edx
        imul eax, edx, 100
        sub dword ptr [r8].TDiv100Rec.M, eax
end;
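
{
  Div100() returns both Y div 100 and Y mod 100 from a single multiplication
  by the fixed-point reciprocal of 100 - in plain Pascal (a minimal sketch;
  good compilers lower the div into the very same mul+shift sequence):

  procedure Div100Pas(Y: cardinal; var res: TDiv100Rec);
  begin
    res.D := Y div 100; // = (QWord(Y) * 1374389535) shr 37
    res.M := Y - res.D * 100;
  end;
}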

{$endif ISDELPHI}

function IsXmmYmmOSEnabled: boolean;
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        // see https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
        xor ecx, ecx // specify control register XCR0=XFEATURE_ENABLED_MASK
        db $0f, $01, $d0 // XGETBV opcode reads XCR0 into EDX:EAX
        and eax, 6
        cmp al, 6 // check XMM (bit 1=2) and YMM (bit 2=4)
        sete al // true if OS enabled both XMM and YMM
end;

procedure GetCpuid(cpueax, cpuecx: cardinal; var regs: TIntelRegisters);
{$ifdef FPC}nostackframe; assembler; asm {$else} asm .noframe {$endif FPC}
        mov eax, cpueax
        mov ecx, cpuecx
        mov r9, regs
        mov r10, rbx // preserve rbx
        xor ebx, ebx
        xor edx, edx
        cpuid
        mov TIntelRegisters(r9).&eax, eax
        mov TIntelRegisters(r9).&ebx, ebx
        mov TIntelRegisters(r9).&ecx, ecx
        mov TIntelRegisters(r9).&edx, edx
        mov rbx, r10
end;
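
{
  GetCpuid() fills a TIntelRegisters record from the cpuid instruction for
  the supplied leaf (eax) and sub-leaf (ecx) - e.g. (a minimal usage sketch):

  var
    regs: TIntelRegisters;
  ...
    GetCpuid(1, 0, regs); // leaf 1 = standard feature flags
    if regs.ecx and (1 shl 20) <> 0 then
      writeln('SSE4.2 supported'); // ecx bit 20 = SSE4.2
}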