mirror of
https://github.com/danieleteti/delphimvcframework.git
synced 2024-11-15 15:55:54 +01:00
2603 lines
79 KiB
PHP
2603 lines
79 KiB
PHP
|
{
|
||
|
This file is a part of the Open Source Synopse mORMot framework 2,
|
||
|
licensed under a MPL/GPL/LGPL three license - see LICENSE.md
|
||
|
|
||
|
x86 32-bit assembly used by mormot.core.base.pas
|
||
|
}
|
||
|
|
||
|
{$ifdef FPC}
|
||
|
// disabled some FPC paranoid warnings
|
||
|
{$WARN 7102 off : Use of +offset(%ebp) for parameters invalid here }
|
||
|
{$WARN 7104 off : Use of -offset(%ebp) is not recommended for local variable access }
|
||
|
{$WARN 7121 off : Check size of XMM memory operand }
|
||
|
{$WARN 7122 off : Check size of XMM memory operand }
|
||
|
{$endif FPC}
|
||
|
|
||
|
{$ifdef ASMX86} // i386 asm with global - disabled on PIC targets
|
||
|
|
||
|
{
|
||
|
FillCharFast/MoveFast implementation notes:
|
||
|
- use SS2 and not FPU fld/fst which may trigger unexpected exceptions (Agner)
|
||
|
- use dedicated branchless sub-functions for small buffers of 0..32 bytes
|
||
|
- use simple SSE2 loop for 33..255 bytes
|
||
|
- assume ERBMS is available (cpuid flag may not be propagated within VMs)
|
||
|
- use "rep movsb" for 256..512K input (should work on all CPUs, even older with
|
||
|
no ERBMS), or as fallback if no SSE2 CPU is used
|
||
|
- use non volatile SSE2 loop when >= 512KB (to not pollute the CPU cache)
|
||
|
- don't use backward "std rep movsb" since it is not involved by ERMBS (slow)
|
||
|
- note: Delphi Win32 x87 RTL code by John O'Harrow seems deprecated
|
||
|
}
|
||
|
|
||
|
{$define WITH_ERMS} // we need it as fallback on old CPU without SSE2
|
||
|
|
||
|
{$ifdef WITH_ERMS}
|
||
|
var
|
||
|
// "rep stosb/movsb" enabled for len >= 4096 on ERMSB CPUs
|
||
|
// it has been reported that "on 32-bit strings have to be at least 4KB"
|
||
|
// see https://stackoverflow.com/a/43837564/458259 for explanations and timing
|
||
|
ERMSB_MIN_SIZE_FWD: integer = maxInt; // maxInt = disabled by default
|
||
|
{$ifndef FPC_X86}
|
||
|
ERMSB_MIN_SIZE_BWD: integer = maxInt; // used for no-SSE2, not for ERMSB
|
||
|
{$endif FPC_X86}
|
||
|
{$define WITH_ERMSASM} // include "rep movsb" asm blocks
|
||
|
{$endif WITH_ERMS}
|
||
|
|
||
|
{$ifdef HASNOSSE2}
|
||
|
{$define WITH_ERMSASM} // "rep movsd" is used as fallback on oldest CPU
|
||
|
{$endif HASNOSSE2}
|
||
|
|
||
|
const
|
||
|
// non-temporal writes should bypass the cache when the size is bigger than
|
||
|
// half the size of the largest level cache - we assume low 1MB cache here
|
||
|
NONTEMPORALSIZE = 1 shl 20;
|
||
|
|
||
|
// fast SSE2 version - force define HASNOSSE2 when run any very old CPU
|
||
|
procedure FillcharFast(var dst; cnt: PtrInt; value: byte);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=Dest edx=Count cl=Value
|
||
|
movzx ecx, cl
|
||
|
imul ecx, ecx, $01010101
|
||
|
cmp edx, 32
|
||
|
jg @32up
|
||
|
test edx, edx
|
||
|
jle @00
|
||
|
mov [eax + edx - 1], cl // fill last byte
|
||
|
shr edx, 1 // how many words to fill
|
||
|
jmp dword ptr [edx * 4 + @jmptab]
|
||
|
@00: ret
|
||
|
@32up:
|
||
|
{$ifndef HASNOSSE2}
|
||
|
{$ifdef WITH_ERMS}
|
||
|
cmp edx, NONTEMPORALSIZE
|
||
|
jae @noerms // movntdq was reported to be faster than ERMSB
|
||
|
cmp edx, ERMSB_MIN_SIZE_FWD
|
||
|
ja @ermsb
|
||
|
@noerms:{$endif WITH_ERMS}
|
||
|
sub edx, 16
|
||
|
movd xmm0, ecx
|
||
|
mov ecx, eax
|
||
|
pshufd xmm0, xmm0, 0
|
||
|
and ecx, 15 // 16-byte align writes
|
||
|
movups dqword ptr [eax], xmm0 // fill first 16 bytes
|
||
|
movups dqword ptr [eax + edx], xmm0 // fill last 16 bytes
|
||
|
sub ecx, 16
|
||
|
sub eax, ecx
|
||
|
add edx, ecx
|
||
|
add eax, edx
|
||
|
neg edx
|
||
|
cmp edx, -NONTEMPORALSIZE // assume > 512KB bypass the cache
|
||
|
jl @nv
|
||
|
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
|
||
|
@loop: movaps dqword ptr [eax + edx], xmm0 // fill 16 bytes per loop
|
||
|
add edx, 16
|
||
|
jl @loop
|
||
|
ret
|
||
|
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
|
||
|
@nv: movntdq dqword ptr [eax + edx], xmm0 // non-temporal fill 16 bytes
|
||
|
add edx, 16
|
||
|
jl @nv
|
||
|
ret
|
||
|
{$endif HASNOSSE2}
|
||
|
{$ifdef WITH_ERMSASM}
|
||
|
@ermsb: push edi
|
||
|
cld
|
||
|
{$ifdef HASNOSSE2}
|
||
|
@ermsa: test al, 3 // aligned stosd is better on old CPUs
|
||
|
jz @erms3
|
||
|
mov byte ptr [eax], cl
|
||
|
inc eax
|
||
|
dec edx
|
||
|
jmp @ermsa
|
||
|
@erms3: mov edi, eax
|
||
|
mov eax, ecx
|
||
|
mov ecx, edx
|
||
|
shr ecx, 2
|
||
|
jz @erms2
|
||
|
rep stosd // no SSE2 version
|
||
|
@erms2: test dl, 2
|
||
|
jz @erms1
|
||
|
mov word ptr [edi], ax
|
||
|
add edi, 2
|
||
|
@erms1: test dl, 1
|
||
|
jz @erms0
|
||
|
mov byte ptr [edi], al
|
||
|
{$else} // ERMSB favors stosb and will properly align writes
|
||
|
mov edi, eax
|
||
|
mov eax, ecx
|
||
|
mov ecx, edx
|
||
|
rep stosb
|
||
|
{$endif HASNOSSE2}
|
||
|
@erms0: pop edi
|
||
|
ret
|
||
|
{$endif WITH_ERMSASM}
|
||
|
{$ifdef FPC} align 4 {$else} nop {$ifdef HASNOSSE2} nop; nop {$endif} {$endif}
|
||
|
@jmptab:dd @00, @02, @04, @06, @08, @10, @12, @14, @16
|
||
|
dd @18, @20, @22, @24, @26, @28, @30, @32
|
||
|
// Delphi RTL uses 16-bit "mov [eax + ...], cx" which are slower
|
||
|
@32: mov [eax + 28], ecx
|
||
|
@28: mov [eax + 24], ecx
|
||
|
@24: mov [eax + 20], ecx
|
||
|
@20: mov [eax + 16], ecx
|
||
|
@16: mov [eax + 12], ecx
|
||
|
@12: mov [eax + 8], ecx
|
||
|
@08: mov [eax + 4], ecx
|
||
|
@04: mov [eax], ecx
|
||
|
ret
|
||
|
@30: mov [eax + 26], ecx
|
||
|
@26: mov [eax + 22], ecx
|
||
|
@22: mov [eax + 18], ecx
|
||
|
@18: mov [eax + 14], ecx
|
||
|
@14: mov [eax + 10], ecx
|
||
|
@10: mov [eax + 6], ecx
|
||
|
@06: mov [eax + 2], ecx
|
||
|
@02: mov word ptr [eax], cx
|
||
|
end;
|
||
|
|
||
|
{$ifndef FPC_X86} // FPC RTL has fastmove.inc -> our SSE2/ERMS asm is slower
|
||
|
|
||
|
// fast SSE2 version - force define HASNOSSE2 when run any very old CPU
|
||
|
procedure MoveFast(const src; var dst; cnt: PtrInt);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=source edx=dest ecx=count
|
||
|
cmp ecx, 32
|
||
|
ja @lrg // count > 32 or count < 0
|
||
|
sub ecx, 8
|
||
|
jg @sml // 9..32 byte move
|
||
|
jmp dword ptr [@table + 32 + ecx * 4] // 0..8 byte move
|
||
|
{$ifdef HASNOSSE2}
|
||
|
@sml: fild qword ptr [eax + ecx] // last 8
|
||
|
fild qword ptr [eax] // first 8
|
||
|
cmp ecx, 8
|
||
|
jle @sml16
|
||
|
fild qword ptr [eax + 8] // second 8
|
||
|
cmp ecx, 16
|
||
|
jle @sml24
|
||
|
fild qword ptr [eax + 16] // third 8
|
||
|
fistp qword ptr [edx + 16] // third 8
|
||
|
@sml24: fistp qword ptr [edx + 8] // second 8
|
||
|
@sml16: fistp qword ptr [edx] // first 8
|
||
|
fistp qword ptr [edx + ecx] // last 8
|
||
|
{$else}
|
||
|
@sml: movq xmm0, qword ptr [eax + ecx] // last 8
|
||
|
movq xmm1, qword ptr [eax] // first 8
|
||
|
cmp ecx, 8
|
||
|
jle @sml16
|
||
|
movq xmm2, qword ptr [eax + 8] // second 8
|
||
|
cmp ecx, 16
|
||
|
jle @sml24
|
||
|
movq xmm3, qword ptr [eax + 16] // third 8
|
||
|
movq qword ptr [edx + 16], xmm3 // third 8
|
||
|
@sml24: movq qword ptr [edx + 8], xmm2 // second 8
|
||
|
@sml16: movq qword ptr [edx], xmm1 // first 8
|
||
|
movq qword ptr [edx + ecx], xmm0 // last 8
|
||
|
ret
|
||
|
{$endif HASNOSSE2}
|
||
|
@exit: rep ret
|
||
|
{$ifdef FPC} align 4 {$else} {$ifdef HASALIGN} .align 4 {$endif}{$endif}
|
||
|
@table: dd @exit, @m01, @m02, @m03, @m04, @m05, @m06, @m07, @m08
|
||
|
{$ifdef WITH_ERMSASM}
|
||
|
@ermsf: push esi
|
||
|
push edi
|
||
|
mov esi, eax
|
||
|
mov edi, edx
|
||
|
cld
|
||
|
rep movsb // ERMSB forward move
|
||
|
pop edi
|
||
|
pop esi
|
||
|
ret
|
||
|
@ermsb: push esi
|
||
|
push edi
|
||
|
lea esi, [eax + ecx - 1]
|
||
|
lea edi, [edx + ecx - 1]
|
||
|
std
|
||
|
rep movsb // backward move is slow even if ERMSB is set
|
||
|
pop edi
|
||
|
pop esi
|
||
|
cld // FPC requires this
|
||
|
ret
|
||
|
{$endif WITH_ERMSASM}
|
||
|
{$ifdef HASNOSSE2}
|
||
|
@lrg: jng @exit // count < 0
|
||
|
sub edx, eax
|
||
|
jz @exit
|
||
|
cmp edx, ecx
|
||
|
lea edx, [edx + eax]
|
||
|
jb @ermsb // move backwards if unsigned(dest-source) < count
|
||
|
jmp @ermsf
|
||
|
{$else}
|
||
|
@lrgfwd:// large forward move
|
||
|
{$ifdef WITH_ERMS}
|
||
|
cmp ecx, NONTEMPORALSIZE
|
||
|
jae @noermf // movntdq was reported to be faster than ERMSB
|
||
|
cmp ecx, ERMSB_MIN_SIZE_FWD
|
||
|
ja @ermsf
|
||
|
@noermf:{$endif WITH_ERMS}
|
||
|
push edx
|
||
|
movups xmm2, dqword ptr [eax] // first 16
|
||
|
lea eax, [eax + ecx - 16]
|
||
|
lea ecx, [ecx + edx - 16]
|
||
|
movups xmm1, dqword ptr [eax] // last 16
|
||
|
push ecx
|
||
|
neg ecx
|
||
|
and edx, -16 // 16-byte align writes
|
||
|
lea ecx, [ecx + edx + 16]
|
||
|
pop edx
|
||
|
cmp ecx, -NONTEMPORALSIZE // assume > 512KB bypass the cache
|
||
|
jl @fwnv
|
||
|
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
|
||
|
@fwd: movups xmm0, dqword ptr [eax + ecx] // move by 16 bytes
|
||
|
movaps dqword ptr [edx + ecx], xmm0
|
||
|
add ecx, 16
|
||
|
jl @fwd
|
||
|
@fwde: movups dqword ptr [edx], xmm1 // last 16
|
||
|
pop edx
|
||
|
movups dqword ptr [edx], xmm2 // first 16
|
||
|
ret
|
||
|
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
|
||
|
@fwnv: movups xmm0, dqword ptr [eax + ecx]
|
||
|
movntdq dqword ptr [edx + ecx], xmm0 // non-temporal move by 16 bytes
|
||
|
add ecx, 16
|
||
|
jl @fwnv
|
||
|
sfence
|
||
|
jmp @fwde
|
||
|
@lrg: jng @exit // count < 0
|
||
|
sub edx, eax
|
||
|
jz @exit // source=dest
|
||
|
cmp edx, ecx
|
||
|
lea edx, [edx + eax]
|
||
|
jae @lrgfwd // move backwards if unsigned(dest-source) < count
|
||
|
// large backward/overlapping move
|
||
|
@lrgbwd:{$ifdef WITH_ERMS}
|
||
|
cmp ecx, ERMSB_MIN_SIZE_BWD // enabled for no-SSE2, not for ERMSB
|
||
|
ja @ermsb
|
||
|
{$endif WITH_ERMS}
|
||
|
sub ecx, 16
|
||
|
push ecx
|
||
|
movups xmm2, dqword ptr [eax + ecx] // last 16
|
||
|
movups xmm1, dqword ptr [eax] // first 16
|
||
|
add ecx, edx
|
||
|
and ecx, -16 // 16-byte align writes
|
||
|
sub ecx, edx
|
||
|
// non-volatile backward is not efficient since we overlap data
|
||
|
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
|
||
|
@bwd: movups xmm0, dqword ptr [eax + ecx]
|
||
|
movaps dqword ptr [edx + ecx], xmm0
|
||
|
sub ecx, 16
|
||
|
jg @bwd
|
||
|
pop ecx
|
||
|
movups dqword ptr [edx], xmm1 // first 16
|
||
|
movups dqword ptr [edx + ecx], xmm2 // last 16
|
||
|
ret
|
||
|
{$endif HASNOSSE2}
|
||
|
@m01: mov al, [eax]
|
||
|
mov [edx], al
|
||
|
ret
|
||
|
@m02: movzx eax, word ptr [eax]
|
||
|
mov [edx], ax
|
||
|
ret
|
||
|
@m03: movzx ecx, word ptr [eax]
|
||
|
mov al, [eax + 2]
|
||
|
mov [edx], cx
|
||
|
mov [edx + 2], al
|
||
|
ret
|
||
|
@m04: mov ecx, [eax]
|
||
|
mov [edx], ecx
|
||
|
ret
|
||
|
@m05: mov ecx, [eax]
|
||
|
mov al, [eax + 4]
|
||
|
mov [edx], ecx
|
||
|
mov [edx + 4], al
|
||
|
ret
|
||
|
@m06: mov ecx, [eax]
|
||
|
movzx eax, word ptr [eax + 4]
|
||
|
mov [edx], ecx
|
||
|
mov [edx + 4], ax
|
||
|
ret
|
||
|
@m07: mov ecx, [eax]
|
||
|
mov eax, [eax + 3]
|
||
|
mov [edx], ecx
|
||
|
mov [edx + 3], eax
|
||
|
ret
|
||
|
@m08: mov ecx, [eax]
|
||
|
mov eax, [eax + 4]
|
||
|
mov [edx], ecx
|
||
|
mov [edx + 4], eax
|
||
|
end;
|
||
|
|
||
|
{$endif FPC_X86}
|
||
|
|
||
|
function CompareMem(P1, P2: Pointer; Length: PtrInt): boolean;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=P1 edx=P2 ecx=Length
|
||
|
cmp eax, edx
|
||
|
je @0 // P1=P2
|
||
|
sub ecx, 8
|
||
|
jl @small
|
||
|
push ebx
|
||
|
mov ebx, [eax] // Compare First 4 Bytes
|
||
|
cmp ebx, [edx]
|
||
|
jne @setbig
|
||
|
lea ebx, [eax + ecx] // Compare Last 8 Bytes
|
||
|
add edx, ecx
|
||
|
mov eax, [ebx]
|
||
|
cmp eax, [edx]
|
||
|
jne @setbig
|
||
|
mov eax, [ebx + 4]
|
||
|
cmp eax, [edx + 4]
|
||
|
jne @setbig
|
||
|
sub ecx, 4
|
||
|
jle @true // All Bytes already Compared
|
||
|
neg ecx // ecx=-(Length-12)
|
||
|
add ecx, ebx // DWORD Align Reads
|
||
|
and ecx, -4
|
||
|
sub ecx, ebx
|
||
|
@loop: mov eax, [ebx + ecx] // Compare 8 Bytes per Loop
|
||
|
cmp eax, [edx + ecx]
|
||
|
jne @setbig
|
||
|
mov eax, [ebx + ecx + 4]
|
||
|
cmp eax, [edx + ecx + 4]
|
||
|
jne @setbig
|
||
|
add ecx, 8
|
||
|
jl @loop
|
||
|
@true: pop ebx
|
||
|
@0: mov al, 1
|
||
|
ret
|
||
|
@setbig:pop ebx
|
||
|
setz al
|
||
|
ret
|
||
|
@small: add ecx, 8 // ecx=0..7
|
||
|
jle @0 // Length <= 0
|
||
|
neg ecx // ecx=-1..-7
|
||
|
lea ecx, [@1 + ecx * 8 + 8] // each @#: block below = 8 bytes
|
||
|
jmp ecx
|
||
|
@7: mov cl, [eax + 6]
|
||
|
cmp cl, [edx + 6]
|
||
|
jne @setsml
|
||
|
@6: mov ch, [eax + 5]
|
||
|
cmp ch, [edx + 5]
|
||
|
jne @setsml
|
||
|
@5: mov cl, [eax + 4]
|
||
|
cmp cl, [edx + 4]
|
||
|
jne @setsml
|
||
|
@4: mov ch, [eax + 3]
|
||
|
cmp ch, [edx + 3]
|
||
|
jne @setsml
|
||
|
@3: mov cl, [eax + 2]
|
||
|
cmp cl, [edx + 2]
|
||
|
jne @setsml
|
||
|
@2: mov ch, [eax + 1]
|
||
|
cmp ch, [edx + 1]
|
||
|
jne @setsml
|
||
|
@1: mov al, [eax]
|
||
|
cmp al, [edx]
|
||
|
@setsml:setz al
|
||
|
end;
|
||
|
|
||
|
function crc32fasttab(crc: cardinal; buf: PAnsiChar; len: cardinal;
|
||
|
tab: PCrc32tab): cardinal;
|
||
|
asm
|
||
|
// adapted from Aleksandr Sharahov code and Maxim Masiutin remarks
|
||
|
push ebx
|
||
|
push ebp
|
||
|
mov ebp, tab
|
||
|
not eax
|
||
|
neg ecx // eax=crc edx=buf ecx=-len ebp=tab
|
||
|
jz @z
|
||
|
test edx, edx
|
||
|
jz @z
|
||
|
@head: test dl, 3
|
||
|
jz @align
|
||
|
movzx ebx, byte ptr [edx]
|
||
|
inc edx
|
||
|
xor bl, al
|
||
|
shr eax, 8
|
||
|
xor eax, dword ptr [ebx * 4 + ebp]
|
||
|
inc ecx
|
||
|
jnz @head
|
||
|
@z: jmp @e
|
||
|
@align: sub edx, ecx
|
||
|
add ecx, 8
|
||
|
jg @done
|
||
|
push esi
|
||
|
push edi
|
||
|
mov edi, edx
|
||
|
{$ifdef FPC} align 8 {$endif FPC}
|
||
|
@by8: mov edx, eax
|
||
|
mov ebx, [edi + ecx - 4]
|
||
|
xor edx, [edi + ecx - 8]
|
||
|
movzx esi, bl
|
||
|
mov eax, dword ptr [esi * 4 + ebp + 1024 * 3]
|
||
|
movzx esi, bh
|
||
|
xor eax, dword ptr [esi * 4 + ebp + 1024 * 2]
|
||
|
shr ebx, 16
|
||
|
movzx esi, bl
|
||
|
xor eax, dword ptr [esi * 4 + ebp + 1024 * 1]
|
||
|
movzx esi, bh
|
||
|
xor eax, dword ptr [esi * 4 + ebp + 1024 * 0]
|
||
|
movzx esi, dl
|
||
|
xor eax, dword ptr [esi * 4 + ebp + 1024 * 7]
|
||
|
movzx esi, dh
|
||
|
xor eax, dword ptr [esi * 4 + ebp + 1024 * 6]
|
||
|
shr edx, 16
|
||
|
movzx esi, dl
|
||
|
xor eax, dword ptr [esi * 4 + ebp + 1024 * 5]
|
||
|
movzx esi, dh
|
||
|
xor eax, dword ptr [esi * 4 + ebp + 1024 * 4]
|
||
|
add ecx, 8
|
||
|
jle @by8
|
||
|
mov edx, edi
|
||
|
pop edi
|
||
|
pop esi
|
||
|
@done: sub ecx, 8
|
||
|
jge @e
|
||
|
@tail: movzx ebx, byte[edx + ecx]
|
||
|
xor bl, al
|
||
|
shr eax, 8
|
||
|
xor eax, dword ptr [ebx * 4 + ebp]
|
||
|
inc ecx
|
||
|
jnz @tail
|
||
|
@e: pop ebp
|
||
|
pop ebx
|
||
|
not eax
|
||
|
end;
|
||
|
|
||
|
procedure crcblockfast(crc128, data128: PBlock128);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// Delphi is not efficient about corresponding pascal code
|
||
|
push ebp
|
||
|
push edi
|
||
|
push esi
|
||
|
mov ebp, eax // ebp=crc128 edi=data128
|
||
|
mov edi, edx
|
||
|
mov edx, dword ptr [eax]
|
||
|
mov ecx, dword ptr [eax + 4]
|
||
|
xor edx, dword ptr [edi]
|
||
|
xor ecx, dword ptr [edi + 4]
|
||
|
movzx esi, dl
|
||
|
// note: since we have "+ 1024 * n" offsets, crc32ctab is left immediate
|
||
|
mov eax, dword ptr [esi * 4 + crc32ctab + 1024 * 3]
|
||
|
movzx esi, dh
|
||
|
shr edx, 16
|
||
|
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 2]
|
||
|
movzx esi, dl
|
||
|
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 1]
|
||
|
movzx esi, dh
|
||
|
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 0]
|
||
|
mov edx, dword ptr [ebp + 8]
|
||
|
xor edx, dword ptr [edi + 8]
|
||
|
mov dword ptr [ebp], eax
|
||
|
movzx esi, cl
|
||
|
mov eax, dword ptr [esi * 4 + crc32ctab + 1024 * 3]
|
||
|
movzx esi, ch
|
||
|
shr ecx, 16
|
||
|
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 2]
|
||
|
movzx esi, cl
|
||
|
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 1]
|
||
|
movzx esi, ch
|
||
|
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 0]
|
||
|
mov dword ptr [ebp + 4], eax
|
||
|
mov ecx, dword ptr [ebp + 12]
|
||
|
xor ecx, dword ptr [edi + 12]
|
||
|
movzx esi, dl
|
||
|
mov eax, dword ptr [esi * 4 + crc32ctab + 1024 * 3]
|
||
|
movzx esi, dh
|
||
|
shr edx, 16
|
||
|
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 2]
|
||
|
movzx esi, dl
|
||
|
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 1]
|
||
|
movzx esi, dh
|
||
|
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 0]
|
||
|
mov dword ptr [ebp + 8], eax
|
||
|
movzx esi, cl
|
||
|
mov eax, dword ptr [esi * 4 + crc32ctab + 1024 * 3]
|
||
|
movzx esi, ch
|
||
|
shr ecx, 16
|
||
|
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 2]
|
||
|
movzx esi, cl
|
||
|
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 1]
|
||
|
movzx esi, ch
|
||
|
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 0]
|
||
|
mov dword ptr [ebp + 12], eax
|
||
|
pop esi
|
||
|
pop edi
|
||
|
pop ebp
|
||
|
end;
|
||
|
|
||
|
function fnv32(crc: cardinal; buf: PAnsiChar; len: PtrInt): cardinal;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=crc, edx=buf, ecx=len
|
||
|
push ebx
|
||
|
test edx, edx
|
||
|
jz @0
|
||
|
neg ecx
|
||
|
jz @0
|
||
|
sub edx, ecx
|
||
|
@1: movzx ebx, byte ptr [edx + ecx]
|
||
|
xor eax, ebx
|
||
|
imul eax, eax, 16777619
|
||
|
inc ecx
|
||
|
jnz @1
|
||
|
@0: pop ebx
|
||
|
end; // we tried an unrolled version, but it was slower on our Core i7!
|
||
|
|
||
|
function kr32(crc: cardinal; buf: PAnsiChar; len: PtrInt): cardinal;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=crc, edx=buf, ecx=len
|
||
|
test ecx, ecx
|
||
|
push edi
|
||
|
push esi
|
||
|
push ebx
|
||
|
push ebp
|
||
|
jz @z
|
||
|
cmp ecx, 4
|
||
|
jb @s
|
||
|
@8: mov ebx, [edx] // unrolled version reading per dword
|
||
|
add edx, 4
|
||
|
mov esi, eax
|
||
|
movzx edi, bl
|
||
|
movzx ebp, bh
|
||
|
shr ebx, 16
|
||
|
shl eax, 5
|
||
|
sub eax, esi
|
||
|
add eax, edi
|
||
|
mov esi, eax
|
||
|
shl eax, 5
|
||
|
sub eax, esi
|
||
|
lea esi, [eax + ebp]
|
||
|
add eax, ebp
|
||
|
movzx edi, bl
|
||
|
movzx ebx, bh
|
||
|
shl eax, 5
|
||
|
sub eax, esi
|
||
|
lea ebp, [eax + edi]
|
||
|
add eax, edi
|
||
|
shl eax, 5
|
||
|
sub eax, ebp
|
||
|
add eax, ebx
|
||
|
cmp ecx, 8
|
||
|
lea ecx, [ecx - 4]
|
||
|
jae @8
|
||
|
test ecx, ecx
|
||
|
jz @z
|
||
|
@s: mov esi, eax
|
||
|
@1: shl eax, 5
|
||
|
movzx ebx, byte ptr [edx]
|
||
|
inc edx
|
||
|
sub eax, esi
|
||
|
lea esi, [eax + ebx]
|
||
|
add eax, ebx
|
||
|
dec ecx
|
||
|
jnz @1
|
||
|
@z: pop ebp
|
||
|
pop ebx
|
||
|
pop esi
|
||
|
pop edi
|
||
|
end;
|
||
|
|
||
|
function StrLenSSE2(S: pointer): PtrInt;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// from GPL strlen32.asm by Agner Fog - www.agner.org/optimize
|
||
|
mov ecx, eax // copy pointer
|
||
|
test eax, eax
|
||
|
jz @null // returns 0 if S=nil
|
||
|
push eax // save start address
|
||
|
pxor xmm0, xmm0 // set to zero
|
||
|
and ecx, 15 // lower 4 bits indicate misalignment
|
||
|
and eax, -16 // align pointer by 16
|
||
|
// will never read outside a memory page boundary, so won't trigger GPF
|
||
|
movaps xmm1, [eax] // read from nearest preceding boundary
|
||
|
pcmpeqb xmm1, xmm0 // compare 16 bytes with zero
|
||
|
pmovmskb edx, xmm1 // get one bit for each byte result
|
||
|
shr edx, cl // shift out false bits
|
||
|
shl edx, cl // shift back again
|
||
|
bsf edx, edx // find first 1-bit
|
||
|
jnz @A200 // found
|
||
|
// Main loop, search 16 bytes at a time
|
||
|
@A100: add eax, 10H // increment pointer by 16
|
||
|
movaps xmm1, [eax] // read 16 bytes aligned
|
||
|
pcmpeqb xmm1, xmm0 // compare 16 bytes with zero
|
||
|
pmovmskb edx, xmm1 // get one bit for each byte result
|
||
|
bsf edx, edx // find first 1-bit
|
||
|
// (moving the bsf out of the loop and using test here would be faster
|
||
|
// for long strings on old processors, but we are assuming that most
|
||
|
// strings are short, and newer processors have higher priority)
|
||
|
jz @A100 // loop if not found
|
||
|
@A200: // Zero-byte found. Compute string length
|
||
|
pop ecx // restore start address
|
||
|
sub eax, ecx // subtract start address
|
||
|
add eax, edx // add byte index
|
||
|
@null:
|
||
|
end;
|
||
|
|
||
|
function StrInt32(P: PAnsiChar; val: PtrInt): PAnsiChar;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=P, edx=val
|
||
|
mov ecx, edx
|
||
|
sar ecx, 31 // 0 if val>=0 or -1 if val<0
|
||
|
push ecx
|
||
|
xor edx, ecx
|
||
|
sub edx, ecx // edx=abs(val)
|
||
|
cmp edx, 10
|
||
|
jb @3 // direct process of common val<10
|
||
|
push edi
|
||
|
mov edi, eax
|
||
|
mov eax, edx
|
||
|
@s: sub edi, 2
|
||
|
cmp eax, 100
|
||
|
jb @2
|
||
|
mov ecx, eax
|
||
|
mov edx, 1374389535 // use power of two reciprocal to avoid division
|
||
|
mul edx
|
||
|
shr edx, 5 // now edx=eax div 100
|
||
|
mov eax, edx
|
||
|
imul edx, -200
|
||
|
movzx edx, word ptr [TwoDigitLookup + ecx * 2 + edx]
|
||
|
mov [edi], dx
|
||
|
cmp eax, 10
|
||
|
jae @s
|
||
|
@1: dec edi
|
||
|
or al, '0'
|
||
|
mov byte ptr [edi - 1], '-'
|
||
|
mov [edi], al
|
||
|
mov eax, edi
|
||
|
pop edi
|
||
|
pop ecx
|
||
|
add eax, ecx // includes '-' if val<0
|
||
|
ret
|
||
|
@2: movzx eax, word ptr [TwoDigitLookup + eax * 2]
|
||
|
mov byte ptr [edi - 1], '-'
|
||
|
mov [edi], ax
|
||
|
mov eax, edi
|
||
|
pop edi
|
||
|
pop ecx
|
||
|
add eax, ecx // includes '-' if val<0
|
||
|
ret
|
||
|
@3: dec eax
|
||
|
pop ecx
|
||
|
or dl, '0'
|
||
|
mov byte ptr [eax - 1], '-'
|
||
|
mov [eax], dl
|
||
|
add eax, ecx // includes '-' if val<0
|
||
|
end;
|
||
|
|
||
|
function StrUInt32(P: PAnsiChar; val: PtrUInt): PAnsiChar;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=P, edx=val
|
||
|
cmp edx, 10
|
||
|
jb @3 // direct process of common val=0 (or val<10)
|
||
|
push edi
|
||
|
mov edi, eax
|
||
|
mov eax, edx
|
||
|
@s: sub edi, 2
|
||
|
cmp eax, 100
|
||
|
jb @2
|
||
|
mov ecx, eax
|
||
|
mov edx, 1374389535 // use power of two reciprocal to avoid division
|
||
|
mul edx
|
||
|
shr edx, 5 // now edx=eax div 100
|
||
|
mov eax, edx
|
||
|
imul edx, -200
|
||
|
movzx edx, word ptr [TwoDigitLookup + ecx * 2 + edx]
|
||
|
mov [edi], dx
|
||
|
cmp eax, 10
|
||
|
jae @s
|
||
|
@1: dec edi
|
||
|
or al, '0'
|
||
|
mov [edi], al
|
||
|
mov eax, edi
|
||
|
pop edi
|
||
|
ret
|
||
|
@2: movzx eax, word ptr [TwoDigitLookup + eax * 2]
|
||
|
mov [edi], ax
|
||
|
mov eax, edi
|
||
|
pop edi
|
||
|
ret
|
||
|
@3: dec eax
|
||
|
or dl, '0'
|
||
|
mov [eax], dl
|
||
|
end;
|
||
|
|
||
|
procedure YearToPChar(Y: PtrUInt; P: PUtf8Char);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=Y, edx=P
|
||
|
push edx
|
||
|
mov ecx, eax
|
||
|
mov edx, 1374389535 // use power of two reciprocal to avoid division
|
||
|
mul edx
|
||
|
shr edx, 5 // now edx=Y div 100
|
||
|
movzx eax, word ptr [TwoDigitLookup + edx * 2]
|
||
|
imul edx, -200
|
||
|
movzx edx, word ptr [TwoDigitLookup + ecx * 2 + edx]
|
||
|
pop ecx
|
||
|
shl edx, 16
|
||
|
or eax, edx
|
||
|
mov [ecx], eax
|
||
|
end;
|
||
|
|
||
|
{$endif ASMX86}
|
||
|
|
||
|
// functions below are always available, even on DARWIN
|
||
|
|
||
|
function Hash32(Data: PCardinalArray; Len: integer): cardinal;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=Data edx=Len
|
||
|
push esi
|
||
|
push edi
|
||
|
mov cl, dl
|
||
|
mov ch, dl
|
||
|
xor esi, esi
|
||
|
xor edi, edi
|
||
|
test eax, eax
|
||
|
jz @z
|
||
|
shr edx, 4
|
||
|
jz @by4
|
||
|
nop
|
||
|
@by16: add esi, dword ptr [eax]
|
||
|
add edi, esi
|
||
|
add esi, dword ptr [eax+4]
|
||
|
add edi, esi
|
||
|
add esi, dword ptr [eax+8]
|
||
|
add edi, esi
|
||
|
add esi, dword ptr [eax+12]
|
||
|
add edi, esi
|
||
|
add eax, 16
|
||
|
dec edx
|
||
|
jnz @by16
|
||
|
@by4: and cl, 15
|
||
|
jz @0
|
||
|
shr cl, 2
|
||
|
jz @rem
|
||
|
@4: add esi, dword ptr [eax]
|
||
|
add edi, esi
|
||
|
add eax, 4
|
||
|
dec cl
|
||
|
jnz @4
|
||
|
@rem: and ch, 3
|
||
|
jz @0
|
||
|
dec ch
|
||
|
jz @1
|
||
|
dec ch
|
||
|
jz @2
|
||
|
mov eax, dword ptr [eax]
|
||
|
and eax, $ffffff
|
||
|
jmp @e
|
||
|
@2: movzx eax, word ptr [eax]
|
||
|
jmp @e
|
||
|
@1: movzx eax, byte ptr [eax]
|
||
|
@e: add esi, eax
|
||
|
@0: add edi, esi
|
||
|
mov eax, esi
|
||
|
shl edi, 16
|
||
|
xor eax, edi
|
||
|
@z: pop edi
|
||
|
pop esi
|
||
|
end;
|
||
|
|
||
|
function xxHash32(crc: cardinal; P: PAnsiChar; len: cardinal): cardinal;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
xchg edx, ecx
|
||
|
push ebp
|
||
|
push edi
|
||
|
lea ebp, [ecx+edx]
|
||
|
push esi
|
||
|
push ebx
|
||
|
sub esp, 8
|
||
|
mov ebx, eax
|
||
|
mov dword ptr [esp], edx
|
||
|
lea eax, [ebx+165667B1H]
|
||
|
cmp edx, 15
|
||
|
jbe @2
|
||
|
lea eax, [ebp-10H]
|
||
|
lea edi, [ebx+24234428H]
|
||
|
lea esi, [ebx-7A143589H]
|
||
|
mov dword ptr [esp+4H], ebp
|
||
|
mov edx, eax
|
||
|
lea eax, [ebx+61C8864FH]
|
||
|
mov ebp, edx
|
||
|
@1: mov edx, dword ptr [ecx]
|
||
|
imul edx, -2048144777
|
||
|
add edi, edx
|
||
|
rol edi, 13
|
||
|
imul edi, -1640531535
|
||
|
mov edx, dword ptr [ecx+4]
|
||
|
imul edx, -2048144777
|
||
|
add esi, edx
|
||
|
rol esi, 13
|
||
|
imul esi, -1640531535
|
||
|
mov edx, dword ptr [ecx+8]
|
||
|
imul edx, -2048144777
|
||
|
add ebx, edx
|
||
|
rol ebx, 13
|
||
|
imul ebx, -1640531535
|
||
|
mov edx, dword ptr [ecx+12]
|
||
|
lea ecx, [ecx+16]
|
||
|
imul edx, -2048144777
|
||
|
add eax, edx
|
||
|
rol eax, 13
|
||
|
imul eax, -1640531535
|
||
|
cmp ebp, ecx
|
||
|
jnc @1
|
||
|
rol edi, 1
|
||
|
rol esi, 7
|
||
|
rol ebx, 12
|
||
|
add esi, edi
|
||
|
mov ebp, dword ptr [esp+4H]
|
||
|
ror eax, 14
|
||
|
add ebx, esi
|
||
|
add eax, ebx
|
||
|
@2: lea esi, [ecx+4H]
|
||
|
add eax, dword ptr [esp]
|
||
|
cmp ebp, esi
|
||
|
jc @4
|
||
|
mov ebx, esi
|
||
|
nop
|
||
|
@3: imul edx, dword ptr [ebx-4H], -1028477379
|
||
|
add ebx, 4
|
||
|
add eax, edx
|
||
|
ror eax, 15
|
||
|
imul eax, 668265263
|
||
|
cmp ebp, ebx
|
||
|
jnc @3
|
||
|
lea edx, [ebp-4H]
|
||
|
sub edx, ecx
|
||
|
mov ecx, edx
|
||
|
and ecx, 0FFFFFFFCH
|
||
|
add ecx, esi
|
||
|
@4: cmp ebp, ecx
|
||
|
jbe @6
|
||
|
@5: movzx edx, byte ptr [ecx]
|
||
|
add ecx, 1
|
||
|
imul edx, 374761393
|
||
|
add eax, edx
|
||
|
rol eax, 11
|
||
|
imul eax, -1640531535
|
||
|
cmp ebp, ecx
|
||
|
jnz @5
|
||
|
nop
|
||
|
@6: mov edx, eax
|
||
|
add esp, 8
|
||
|
shr edx, 15
|
||
|
xor eax, edx
|
||
|
imul eax, -2048144777
|
||
|
pop ebx
|
||
|
pop esi
|
||
|
mov edx, eax
|
||
|
shr edx, 13
|
||
|
xor eax, edx
|
||
|
imul eax, -1028477379
|
||
|
pop edi
|
||
|
pop ebp
|
||
|
mov edx, eax
|
||
|
shr edx, 16
|
||
|
xor eax, edx
|
||
|
end;
|
||
|
|
||
|
function GetBitsCountPas(value: PtrInt): PtrInt;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// branchless Wilkes-Wheeler-Gill i386 asm implementation
|
||
|
mov edx, eax
|
||
|
shr eax, 1
|
||
|
and eax, $55555555
|
||
|
sub edx, eax
|
||
|
mov eax, edx
|
||
|
shr edx, 2
|
||
|
and eax, $33333333
|
||
|
and edx, $33333333
|
||
|
add eax, edx
|
||
|
mov edx, eax
|
||
|
shr eax, 4
|
||
|
add eax, edx
|
||
|
and eax, $0f0f0f0f
|
||
|
mov edx, eax
|
||
|
shr edx, 8
|
||
|
add eax, edx
|
||
|
mov edx, eax
|
||
|
shr edx, 16
|
||
|
add eax, edx
|
||
|
and eax, $3f
|
||
|
end;
|
||
|
|
||
|
{$ifdef HASNOSSE2} // fallback to simple pascal code if no SSE2 available
|
||
|
|
||
|
function ByteScanIndex(P: PByteArray; Count: PtrInt; Value: byte): PtrInt;
|
||
|
begin
|
||
|
result := 0;
|
||
|
if P <> nil then
|
||
|
repeat
|
||
|
if result >= Count then
|
||
|
break;
|
||
|
if P^[result] = Value then
|
||
|
exit;
|
||
|
inc(result);
|
||
|
until false;
|
||
|
result := -1;
|
||
|
end;
|
||
|
|
||
|
function WordScanIndex(P: PWordArray; Count: PtrInt; Value: word): PtrInt;
|
||
|
begin
|
||
|
result := 0;
|
||
|
if P <> nil then
|
||
|
repeat
|
||
|
if result >= Count then
|
||
|
break;
|
||
|
if P^[result] = Value then
|
||
|
exit;
|
||
|
inc(result);
|
||
|
until false;
|
||
|
result := -1;
|
||
|
end;
|
||
|
|
||
|
function IntegerScanIndex(P: PCardinalArray; Count: PtrInt; Value: cardinal): PtrInt;
|
||
|
begin
|
||
|
result := PtrUInt(IntegerScan(P, Count, Value));
|
||
|
if result = 0 then
|
||
|
dec(result)
|
||
|
else
|
||
|
begin
|
||
|
dec(result, PtrUInt(P));
|
||
|
result := result shr 2;
|
||
|
end;
|
||
|
end;
|
||
|
|
||
|
function IntegerScan(P: PCardinalArray; Count: PtrInt; Value: cardinal): PCardinal;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=P, edx=Count, Value=ecx
|
||
|
test eax, eax
|
||
|
jz @ok0 // avoid GPF
|
||
|
cmp edx, 8
|
||
|
jb @s2
|
||
|
nop
|
||
|
nop
|
||
|
nop // @s1 loop align
|
||
|
@s1: sub edx, 8
|
||
|
cmp [eax], ecx
|
||
|
je @ok0
|
||
|
cmp [eax + 4], ecx
|
||
|
je @ok4
|
||
|
cmp [eax + 8], ecx
|
||
|
je @ok8
|
||
|
cmp [eax + 12], ecx
|
||
|
je @ok12
|
||
|
cmp [eax + 16], ecx
|
||
|
je @ok16
|
||
|
cmp [eax + 20], ecx
|
||
|
je @ok20
|
||
|
cmp [eax + 24], ecx
|
||
|
je @ok24
|
||
|
cmp [eax + 28], ecx
|
||
|
je @ok28
|
||
|
add eax, 32
|
||
|
cmp edx, 8
|
||
|
jae @s1
|
||
|
@s2: test edx, edx
|
||
|
jz @z
|
||
|
cmp [eax], ecx
|
||
|
je @ok0
|
||
|
dec edx
|
||
|
jz @z
|
||
|
cmp [eax + 4], ecx
|
||
|
je @ok4
|
||
|
dec edx
|
||
|
jz @z
|
||
|
cmp [eax + 8], ecx
|
||
|
je @ok8
|
||
|
dec edx
|
||
|
jz @z
|
||
|
cmp [eax + 12], ecx
|
||
|
je @ok12
|
||
|
dec edx
|
||
|
jz @z
|
||
|
cmp [eax + 16], ecx
|
||
|
je @ok16
|
||
|
dec edx
|
||
|
jz @z
|
||
|
cmp [eax + 20], ecx
|
||
|
je @ok20
|
||
|
dec edx
|
||
|
jz @z
|
||
|
cmp [eax + 24], ecx
|
||
|
je @ok24
|
||
|
@z: xor eax, eax // return nil if not found
|
||
|
ret
|
||
|
@ok0: rep ret
|
||
|
@ok28: add eax, 28
|
||
|
ret
|
||
|
@ok24: add eax, 24
|
||
|
ret
|
||
|
@ok20: add eax, 20
|
||
|
ret
|
||
|
@ok16: add eax, 16
|
||
|
ret
|
||
|
@ok12: add eax, 12
|
||
|
ret
|
||
|
@ok8: add eax, 8
|
||
|
ret
|
||
|
@ok4: add eax, 4
|
||
|
end;
|
||
|
|
||
|
function IntegerScanExists(P: PCardinalArray; Count: PtrInt; Value: cardinal): boolean;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=P, edx=Count, Value=ecx
|
||
|
test eax, eax
|
||
|
jz @z // avoid GPF
|
||
|
cmp edx, 8
|
||
|
jae @s1
|
||
|
jmp dword ptr [edx * 4 + @Table]
|
||
|
{$ifdef FPC} align 4 {$endif FPC}
|
||
|
@Table: dd @z, @1, @2, @3, @4, @5, @6, @7
|
||
|
@s1: // fast search by 8 integers (pipelined instructions)
|
||
|
sub edx, 8
|
||
|
cmp [eax], ecx
|
||
|
je @ok
|
||
|
cmp [eax + 4], ecx
|
||
|
je @ok
|
||
|
cmp [eax + 8], ecx
|
||
|
je @ok
|
||
|
cmp [eax + 12], ecx
|
||
|
je @ok
|
||
|
cmp [eax + 16], ecx
|
||
|
je @ok
|
||
|
cmp [eax + 20], ecx
|
||
|
je @ok
|
||
|
cmp [eax + 24], ecx
|
||
|
je @ok
|
||
|
cmp [eax + 28], ecx
|
||
|
je @ok
|
||
|
add eax, 32
|
||
|
cmp edx, 8
|
||
|
jae @s1
|
||
|
jmp dword ptr [edx * 4 + @Table]
|
||
|
@7: cmp [eax + 24], ecx
|
||
|
je @ok
|
||
|
@6: cmp [eax + 20], ecx
|
||
|
je @ok
|
||
|
@5: cmp [eax + 16], ecx
|
||
|
je @ok
|
||
|
@4: cmp [eax + 12], ecx
|
||
|
je @ok
|
||
|
@3: cmp [eax + 8], ecx
|
||
|
je @ok
|
||
|
@2: cmp [eax + 4], ecx
|
||
|
je @ok
|
||
|
@1: cmp [eax], ecx
|
||
|
je @ok
|
||
|
@z: xor eax, eax
|
||
|
ret
|
||
|
@ok: mov al, 1
|
||
|
end;
|
||
|
|
||
|
{$else}
|
||
|
|
||
|
function ByteScanIndex(P: PByteArray; Count: PtrInt; Value: byte): PtrInt;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=P, edx=Count, cl=Value
|
||
|
push ebx
|
||
|
mov ebx, eax
|
||
|
movzx eax, cl
|
||
|
mov ecx, ebx
|
||
|
test edx, edx
|
||
|
jbe @no
|
||
|
// eax=Value, ecx=P edx=Count
|
||
|
imul eax, $01010101
|
||
|
and ecx, -16
|
||
|
movd xmm1, eax
|
||
|
movaps xmm0, [ecx]
|
||
|
add ecx, 16
|
||
|
pshufd xmm1, xmm1, 0
|
||
|
sub ecx, ebx
|
||
|
pcmpeqb xmm0, xmm1
|
||
|
pmovmskb eax, xmm0
|
||
|
shl eax, cl
|
||
|
and eax, $FFFF0000
|
||
|
shr eax, cl
|
||
|
jnz @fnd
|
||
|
cmp edx, ecx
|
||
|
jbe @no
|
||
|
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
|
||
|
@by16: movaps xmm0, [ebx + ecx]
|
||
|
add ecx, 16
|
||
|
pcmpeqb xmm0, xmm1
|
||
|
pmovmskb eax, xmm0
|
||
|
bsf eax, eax
|
||
|
jnz @fnd2
|
||
|
cmp edx, ecx
|
||
|
ja @by16
|
||
|
@no: mov eax, -1
|
||
|
pop ebx
|
||
|
ret
|
||
|
@fnd: bsf eax, eax
|
||
|
@fnd2: lea eax, [ecx + eax - 16]
|
||
|
cmp edx, eax
|
||
|
jbe @no
|
||
|
pop ebx
|
||
|
end;
|
||
|
|
||
|
function WordScanIndex(P: PWordArray; Count: PtrInt; Value: word): PtrInt;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=P, edx=Count, cx=Value
|
||
|
push ebx
|
||
|
mov ebx, eax
|
||
|
movzx eax, cx
|
||
|
mov ecx, ebx
|
||
|
test edx, edx
|
||
|
jbe @no
|
||
|
test cl, 1
|
||
|
jnz @unal
|
||
|
// eax=Value, ecx=P edx=Count
|
||
|
movd xmm1, eax
|
||
|
and ecx, -16
|
||
|
punpcklwd xmm1, xmm1
|
||
|
movaps xmm0, [ecx]
|
||
|
add ecx, 16
|
||
|
pshufd xmm1, xmm1, 0
|
||
|
sub ecx, ebx
|
||
|
pcmpeqw xmm0, xmm1
|
||
|
pmovmskb eax, xmm0
|
||
|
shl eax, cl
|
||
|
and eax, $FFFF0000
|
||
|
shr eax, cl
|
||
|
shr ecx, 1
|
||
|
test eax, eax
|
||
|
jz @nxt
|
||
|
bsf eax, eax
|
||
|
@fnd: shr eax, 1
|
||
|
lea eax, [ecx + eax - 8]
|
||
|
cmp edx, eax
|
||
|
jbe @no
|
||
|
pop ebx
|
||
|
ret
|
||
|
nop // manual loop alignment
|
||
|
@by16: movaps xmm0, [ebx + ecx * 2]
|
||
|
add ecx, 8
|
||
|
pcmpeqw xmm0, xmm1
|
||
|
pmovmskb eax, xmm0
|
||
|
bsf eax, eax
|
||
|
jnz @fnd
|
||
|
@nxt: cmp edx, ecx
|
||
|
ja @by16
|
||
|
@no: mov eax, -1
|
||
|
pop ebx
|
||
|
ret
|
||
|
// 16bit-unaligned loop (seldom called)
|
||
|
@unal: lea ecx, [ecx + edx * 2]
|
||
|
neg edx
|
||
|
@unals: cmp word ptr [ecx + edx * 2], ax
|
||
|
jz @unale
|
||
|
inc edx
|
||
|
jnz @unals
|
||
|
jmp @no
|
||
|
@unale: lea eax, [ecx + edx * 2]
|
||
|
sub eax, ebx
|
||
|
shr eax, 1
|
||
|
pop ebx
|
||
|
end;
|
||
|
|
||
|
function IntegerScanIndex(P: PCardinalArray; Count: PtrInt; Value: cardinal): PtrInt;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=P, edx=Count, ecx=Value
|
||
|
push ebx
|
||
|
movd xmm1, ecx
|
||
|
mov ebx, eax
|
||
|
mov eax, ecx
|
||
|
mov ecx, ebx
|
||
|
test edx, edx
|
||
|
jbe @no
|
||
|
test cl, 3
|
||
|
jnz @unal
|
||
|
// eax=Value, ecx=P edx=Count
|
||
|
and ecx, -16
|
||
|
movaps xmm0, [ecx]
|
||
|
add ecx, 16
|
||
|
pshufd xmm1, xmm1, 0
|
||
|
sub ecx, ebx
|
||
|
pcmpeqd xmm0, xmm1
|
||
|
pmovmskb eax, xmm0
|
||
|
shl eax, cl
|
||
|
and eax, $FFFF0000
|
||
|
shr eax, cl
|
||
|
shr ecx, 2
|
||
|
bsf eax, eax
|
||
|
jz @nxt
|
||
|
@fnd: shr eax, 2
|
||
|
lea eax, [ecx + eax - 4]
|
||
|
cmp edx, eax
|
||
|
jbe @no
|
||
|
pop ebx
|
||
|
ret
|
||
|
{$ifdef FPC} align 16 {$else} nop;nop;nop;nop;nop;nop {$endif FPC}
|
||
|
@by16: movaps xmm0, [ebx + ecx * 4]
|
||
|
add ecx, 4
|
||
|
pcmpeqd xmm0, xmm1
|
||
|
pmovmskb eax, xmm0
|
||
|
bsf eax, eax
|
||
|
jnz @fnd
|
||
|
@nxt: cmp edx, ecx
|
||
|
ja @by16
|
||
|
@no: mov eax, -1
|
||
|
pop ebx
|
||
|
ret
|
||
|
// 32bit-unaligned loop (seldom called)
|
||
|
@unal: lea ecx, [ecx + edx * 4]
|
||
|
neg edx
|
||
|
@unals: cmp dword ptr [ecx + edx * 4], eax
|
||
|
jz @unale
|
||
|
inc edx
|
||
|
jnz @unals
|
||
|
jmp @no
|
||
|
@unale: lea eax, [ecx + edx * 4]
|
||
|
sub eax, ebx
|
||
|
shr eax, 2
|
||
|
pop ebx
|
||
|
end;
|
||
|
|
||
|
{$endif HASNOSSE2}
|
||
|
|
||
|
procedure mul64x64(const left, right: QWord; out product: THash128Rec);
|
||
|
{$ifdef FPC}nostackframe; assembler; {$endif}
|
||
|
asm
|
||
|
// adapted from FPC compiler output, which is much better than Delphi's here
|
||
|
{$ifdef FPC}
|
||
|
push ebp
|
||
|
mov ebp, esp
|
||
|
{$endif FPC}
|
||
|
mov ecx, eax
|
||
|
mov eax, dword ptr [ebp+8H]
|
||
|
mul dword ptr [ebp+10H]
|
||
|
mov dword ptr [ecx], eax
|
||
|
mov dword ptr [ebp-4H], edx
|
||
|
mov eax, dword ptr [ebp+8H]
|
||
|
mul dword ptr [ebp+14H]
|
||
|
add eax, dword ptr [ebp-4H]
|
||
|
adc edx, 0
|
||
|
mov dword ptr [ebp-10H], eax
|
||
|
mov dword ptr [ebp-0CH], edx
|
||
|
mov eax, dword ptr [ebp+0CH]
|
||
|
mul dword ptr [ebp+10H]
|
||
|
add eax, dword ptr [ebp-10H]
|
||
|
adc edx, 0
|
||
|
mov dword ptr [ecx+4H], eax
|
||
|
mov dword ptr [ebp-14H], edx
|
||
|
mov eax, dword ptr [ebp+0CH]
|
||
|
mul dword ptr [ebp+14H]
|
||
|
add eax, dword ptr [ebp-0CH]
|
||
|
adc edx, 0
|
||
|
add eax, dword ptr [ebp-14H]
|
||
|
adc edx, 0
|
||
|
mov dword ptr [ecx+8H], eax
|
||
|
mov dword ptr [ecx+0CH], edx
|
||
|
{$ifdef FPC}
|
||
|
pop ebp
|
||
|
{$endif FPC}
|
||
|
end;
|
||
|
|
||
|
function bswap32(a: cardinal): cardinal;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
bswap eax
|
||
|
end;
|
||
|
|
||
|
function bswap64({$ifdef FPC}constref{$else}const{$endif} a: QWord): QWord;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
{$ifdef FPC}
|
||
|
mov edx, dword ptr [eax]
|
||
|
mov eax, dword ptr [eax + 4]
|
||
|
{$else}
|
||
|
mov edx, a.TQWordRec.L
|
||
|
mov eax, a.TQWordRec.H
|
||
|
{$endif FPC}
|
||
|
bswap edx
|
||
|
bswap eax
|
||
|
end;
|
||
|
|
||
|
procedure bswap64array(A, B: PQWordArray; n: PtrInt);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
push ebx
|
||
|
push esi
|
||
|
@1: mov ebx, dword ptr [eax]
|
||
|
mov esi, dword ptr [eax + 4]
|
||
|
bswap ebx
|
||
|
bswap esi
|
||
|
mov dword ptr [edx + 4], ebx
|
||
|
mov dword ptr [edx], esi
|
||
|
add eax, 8
|
||
|
add edx, 8
|
||
|
dec ecx
|
||
|
jnz @1
|
||
|
pop esi
|
||
|
pop ebx
|
||
|
end;
|
||
|
|
||
|
procedure LockedInc32(int32: PInteger);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
lock inc dword ptr [int32]
|
||
|
end;
|
||
|
|
||
|
procedure LockedDec32(int32: PInteger);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
lock dec dword ptr [int32]
|
||
|
end;
|
||
|
|
||
|
procedure LockedInc64(int64: PInt64);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
lock inc dword ptr [int64]
|
||
|
jz @h
|
||
|
ret
|
||
|
@h:lock inc dword ptr [int64 + 4] // collision is very unlikely
|
||
|
end;
|
||
|
|
||
|
function InterlockedIncrement(var I: integer): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
mov edx, 1
|
||
|
xchg eax, edx
|
||
|
lock xadd [edx], eax
|
||
|
inc eax
|
||
|
end;
|
||
|
|
||
|
function InterlockedDecrement(var I: integer): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
mov edx, -1
|
||
|
xchg eax, edx
|
||
|
lock xadd [edx], eax
|
||
|
dec eax
|
||
|
end;
|
||
|
|
||
|
function StrLenSafe(S: pointer): PtrInt;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// slower than StrLenSSE2(), but won't read any byte beyond the page
|
||
|
test eax, eax
|
||
|
jz @z
|
||
|
cmp byte ptr [eax], 0
|
||
|
je @0
|
||
|
cmp byte ptr [eax + 1], 0
|
||
|
je @1
|
||
|
cmp byte ptr [eax + 2], 0
|
||
|
je @2
|
||
|
cmp byte ptr [eax + 3], 0
|
||
|
je @3
|
||
|
push eax
|
||
|
and eax, -4 // dword align reads
|
||
|
nop // @s loop code alignment
|
||
|
@s: add eax, 4
|
||
|
mov edx, dword ptr [eax] // 4 chars per loop
|
||
|
lea ecx, [edx - $01010101]
|
||
|
not edx
|
||
|
and edx, ecx
|
||
|
and edx, $80808080 // set byte to $80 at each #0 position
|
||
|
jz @s // loop until any #0 found
|
||
|
@set: pop ecx
|
||
|
bsf edx, edx // find first #0 position
|
||
|
shr edx, 3 // byte offset of first #0
|
||
|
add eax, edx // address of first #0
|
||
|
sub eax, ecx // length
|
||
|
@z: ret
|
||
|
@0: xor eax, eax
|
||
|
ret
|
||
|
@1: mov eax, 1
|
||
|
ret
|
||
|
@2: mov eax, 2
|
||
|
ret
|
||
|
@3: mov eax, 3
|
||
|
end;
|
||
|
|
||
|
function PosEx(const SubStr, S: RawUtf8; Offset: PtrUInt): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=SubStr, edx=S, ecx=Offset
|
||
|
push ebx
|
||
|
push esi
|
||
|
push edx
|
||
|
test eax, eax
|
||
|
jz @notfnd // exit if SubStr=''
|
||
|
test edx, edx
|
||
|
jz @notfnd // exit if S=''
|
||
|
mov esi, ecx
|
||
|
mov ecx, [edx - 4] // length(S)
|
||
|
mov ebx, [eax - 4] // length(SubStr)
|
||
|
add ecx, edx
|
||
|
sub ecx, ebx // ecx = max start pos for full match
|
||
|
lea edx, [edx + esi - 1] // edx = start position
|
||
|
cmp edx, ecx
|
||
|
jg @notfnd // startpos > max start pos
|
||
|
cmp ebx, 1
|
||
|
jle @onec // optimized loop for length(SubStr)<=1
|
||
|
push edi
|
||
|
push ebp
|
||
|
lea edi, [ebx - 2] // edi = length(SubStr)-2
|
||
|
mov esi, eax // esi = SubStr
|
||
|
movzx ebx, byte ptr [eax] // bl = search character
|
||
|
nop; nop
|
||
|
@l: cmp bl, [edx] // compare 2 characters per @l
|
||
|
je @c1fnd
|
||
|
@notc1: cmp bl, [edx + 1]
|
||
|
je @c2fnd
|
||
|
@notc2: add edx, 2
|
||
|
cmp edx, ecx // next start position <= max start position
|
||
|
jle @l
|
||
|
pop ebp
|
||
|
pop edi
|
||
|
@notfnd:xor eax, eax // returns 0 if not fnd
|
||
|
pop edx
|
||
|
pop esi
|
||
|
pop ebx
|
||
|
ret
|
||
|
@c1fnd: mov ebp, edi // ebp = length(SubStr)-2
|
||
|
@c1l: movzx eax, word ptr [esi + ebp]
|
||
|
cmp ax, [edx + ebp] // compare 2 chars per @c1l (may include #0)
|
||
|
jne @notc1
|
||
|
sub ebp, 2
|
||
|
jnc @c1l
|
||
|
pop ebp
|
||
|
pop edi
|
||
|
jmp @setres
|
||
|
@c2fnd: mov ebp, edi // ebp = length(SubStr)-2
|
||
|
@c2l: movzx eax, word ptr [esi + ebp]
|
||
|
cmp ax, [edx + ebp + 1] // compare 2 chars per @c2l (may include #0)
|
||
|
jne @notc2
|
||
|
sub ebp, 2
|
||
|
jnc @c2l
|
||
|
pop ebp
|
||
|
pop edi
|
||
|
jmp @chkres
|
||
|
@onec: jl @notfnd // needed for zero-length non-nil strings
|
||
|
movzx eax, byte ptr [eax] // search character
|
||
|
@charl: cmp al, [edx]
|
||
|
je @setres
|
||
|
cmp al, [edx + 1]
|
||
|
je @chkres
|
||
|
add edx, 2
|
||
|
cmp edx, ecx
|
||
|
jle @charl
|
||
|
jmp @notfnd
|
||
|
@chkres:cmp edx, ecx // check within ansistring
|
||
|
jge @notfnd
|
||
|
add edx, 1
|
||
|
@setres:pop ecx // ecx = S
|
||
|
pop esi
|
||
|
pop ebx
|
||
|
neg ecx
|
||
|
lea eax, [edx + ecx + 1]
|
||
|
end;
|
||
|
|
||
|
function StrComp(Str1, Str2: pointer): PtrInt;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// no branch taken in case of not equal first char
|
||
|
cmp eax, edx
|
||
|
je @zero // same string or both nil
|
||
|
test eax, edx
|
||
|
jz @maynil
|
||
|
@1: mov cl, byte ptr [eax]
|
||
|
mov ch, byte ptr [edx]
|
||
|
inc eax
|
||
|
inc edx
|
||
|
test cl, cl
|
||
|
jz @exit
|
||
|
cmp cl, ch
|
||
|
je @1
|
||
|
@exit: movzx eax, cl
|
||
|
movzx edx, ch
|
||
|
sub eax, edx
|
||
|
ret
|
||
|
@maynil:test eax, eax // Str1='' ?
|
||
|
jz @max
|
||
|
test edx, edx // Str2='' ?
|
||
|
jnz @1
|
||
|
mov eax, 1
|
||
|
ret
|
||
|
@max: dec eax
|
||
|
ret
|
||
|
@zero: xor eax, eax
|
||
|
end;
|
||
|
|
||
|
function SortDynArrayInteger(const A, B): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
mov ecx, dword ptr [eax]
|
||
|
mov edx, dword ptr[edx]
|
||
|
xor eax, eax
|
||
|
cmp ecx, edx
|
||
|
setl cl
|
||
|
setg al
|
||
|
movzx ecx, cl
|
||
|
sub eax, ecx
|
||
|
end;
|
||
|
|
||
|
function SortDynArrayCardinal(const A, B): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
mov ecx, [eax]
|
||
|
mov edx, [edx]
|
||
|
xor eax, eax
|
||
|
cmp ecx, edx
|
||
|
seta al
|
||
|
sbb eax,0
|
||
|
end;
|
||
|
|
||
|
function SortDynArrayPointer(const A, B): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
mov ecx, [eax]
|
||
|
mov edx, [edx]
|
||
|
xor eax, eax
|
||
|
cmp ecx, edx
|
||
|
seta al
|
||
|
sbb eax,0
|
||
|
end;
|
||
|
|
||
|
function SortDynArrayInt64(const A, B): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// Delphi x86 compiler is not efficient at compiling Int64 comparisons
|
||
|
mov ecx, [eax]
|
||
|
mov eax, [eax + 4]
|
||
|
cmp eax, [edx + 4]
|
||
|
jnz @nz
|
||
|
cmp ecx, [edx]
|
||
|
jz @0
|
||
|
jnb @p
|
||
|
@n: mov eax, -1
|
||
|
ret
|
||
|
@0: xor eax, eax
|
||
|
ret
|
||
|
@nz: jl @n
|
||
|
@p: mov eax, 1
|
||
|
end;
|
||
|
|
||
|
function SortDynArrayQWord(const A, B): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// Delphi x86 compiler is not efficient, and oldest even incorrect
|
||
|
mov ecx, [eax]
|
||
|
mov eax, [eax + 4]
|
||
|
cmp eax, [edx + 4]
|
||
|
jnz @nz
|
||
|
cmp ecx, [edx]
|
||
|
jz @0
|
||
|
@nz: jnb @p
|
||
|
mov eax, -1
|
||
|
ret
|
||
|
@0: xor eax, eax
|
||
|
ret
|
||
|
@p: mov eax, 1
|
||
|
end;
|
||
|
|
||
|
function SortDynArrayAnsiString(const A, B): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// x86 version optimized for AnsiString/RawUtf8/RawByteString types
|
||
|
mov eax, dword ptr [eax]
|
||
|
mov edx, dword ptr [edx]
|
||
|
cmp eax, edx
|
||
|
je @0
|
||
|
test eax, edx
|
||
|
jz @maynil
|
||
|
@f: mov cl, byte ptr [eax] // first char comparison (quicksort speedup)
|
||
|
mov ch, byte ptr [edx]
|
||
|
cmp cl, ch
|
||
|
je @s
|
||
|
movzx eax, cl
|
||
|
movzx edx, ch
|
||
|
sub eax, edx // branchless execution on Quicksort/ReHash
|
||
|
ret
|
||
|
@0: xor eax, eax
|
||
|
ret
|
||
|
@maynil:test edx, edx // A or B may be ''
|
||
|
jz @1
|
||
|
test eax, eax
|
||
|
jnz @f
|
||
|
dec eax
|
||
|
ret
|
||
|
@s: push ebx
|
||
|
xor ecx, ecx
|
||
|
mov ebx, dword ptr [eax - 4]
|
||
|
sub ebx, dword ptr [edx - 4] // ebx = length(A)-length(B)
|
||
|
push ebx
|
||
|
adc ecx, -1
|
||
|
and ecx, ebx
|
||
|
sub ecx, dword ptr [eax - 4] // ecx = -min(length(A),length(B))
|
||
|
sub eax, ecx
|
||
|
sub edx, ecx
|
||
|
{$ifdef FPC} align 16 {$endif} // is naturally aligned anyway
|
||
|
@by4: mov ebx, dword ptr [eax + ecx] // compare 4 bytes per iteration
|
||
|
xor ebx, dword ptr [edx + ecx]
|
||
|
jnz @d
|
||
|
add ecx, 4
|
||
|
js @by4
|
||
|
@eq: pop eax // all chars equal -> returns length(A)-length(B)
|
||
|
pop ebx
|
||
|
ret
|
||
|
@d: bsf ebx, ebx // char differs -> returns pbyte(A)^-pbyte(B)^
|
||
|
shr ebx, 3
|
||
|
add ecx, ebx
|
||
|
jns @eq
|
||
|
movzx eax, byte ptr [eax + ecx]
|
||
|
movzx edx, byte ptr [edx + ecx]
|
||
|
pop ebx
|
||
|
pop ebx
|
||
|
sub eax, edx
|
||
|
ret
|
||
|
@1: mov eax, 1
|
||
|
end;
|
||
|
|
||
|
function SortDynArrayDouble(const A, B): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
fld qword ptr [eax]
|
||
|
fcomp qword ptr [edx]
|
||
|
fstsw ax
|
||
|
sahf
|
||
|
jz @0
|
||
|
@nz: jnb @p
|
||
|
mov eax, -1
|
||
|
ret
|
||
|
@0: xor eax, eax
|
||
|
ret
|
||
|
@p: mov eax, 1
|
||
|
end;
|
||
|
|
||
|
function SortDynArraySingle(const A, B): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
fld dword ptr [eax]
|
||
|
fcomp dword ptr [edx]
|
||
|
fstsw ax
|
||
|
sahf
|
||
|
jz @0
|
||
|
@nz: jnb @p
|
||
|
mov eax, -1
|
||
|
ret
|
||
|
@0: xor eax, eax
|
||
|
ret
|
||
|
@p: mov eax, 1
|
||
|
end;
|
||
|
|
||
|
procedure DynArrayHashTableAdjust(P: PIntegerArray; deleted: integer; count: PtrInt);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=P edx=deleted ecx=count
|
||
|
push ebx
|
||
|
mov ebx, eax
|
||
|
xor eax, eax
|
||
|
@by8: sub ecx, 8
|
||
|
cmp edx, dword ptr [ebx]
|
||
|
setl al
|
||
|
sub dword ptr [ebx], eax
|
||
|
cmp edx, dword ptr [ebx + 4H]
|
||
|
setl al
|
||
|
sub dword ptr [ebx + 4H], eax
|
||
|
cmp edx, dword ptr [ebx + 8H]
|
||
|
setl al
|
||
|
sub dword ptr [ebx + 8H], eax
|
||
|
cmp edx, dword ptr [ebx + 0CH]
|
||
|
setl al
|
||
|
sub dword ptr [ebx + 0CH], eax
|
||
|
cmp edx, dword ptr [ebx + 10H]
|
||
|
setl al
|
||
|
sub dword ptr [ebx + 10H], eax
|
||
|
cmp edx, dword ptr [ebx + 14H]
|
||
|
setl al
|
||
|
sub dword ptr [ebx + 14H], eax
|
||
|
cmp edx, dword ptr [ebx + 18H]
|
||
|
setl al
|
||
|
sub dword ptr [ebx + 18H], eax
|
||
|
cmp edx, dword ptr [ebx + 1CH]
|
||
|
setl al
|
||
|
sub dword ptr [ebx + 1CH], eax
|
||
|
add ebx, 32
|
||
|
cmp ecx, 8
|
||
|
jge @by8
|
||
|
test ecx, ecx
|
||
|
jz @ok
|
||
|
// trailing indexes (never appearing within DYNARRAYHASH_PO2 range)
|
||
|
@by1: cmp edx, dword ptr [ebx]
|
||
|
setl al
|
||
|
sub dword ptr [ebx], eax
|
||
|
add ebx, 4
|
||
|
dec ecx
|
||
|
jnz @by1
|
||
|
@ok: pop ebx
|
||
|
end;
|
||
|
|
||
|
procedure DynArrayHashTableAdjust16(P: PWordArray; deleted: cardinal; count: PtrInt);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=P dx=deleted ecx=count
|
||
|
push ebx
|
||
|
mov ebx, eax
|
||
|
xor eax, eax
|
||
|
@by8: sub ecx, 8
|
||
|
cmp dx, word ptr [ebx]
|
||
|
setl al
|
||
|
sub word ptr [ebx], ax
|
||
|
cmp dx, word ptr [ebx + 2]
|
||
|
setl al
|
||
|
sub word ptr [ebx + 2], ax
|
||
|
cmp dx, word ptr [ebx + 4]
|
||
|
setl al
|
||
|
sub word ptr [ebx + 4], ax
|
||
|
cmp dx, word ptr [ebx + 6]
|
||
|
setl al
|
||
|
sub word ptr [ebx + 6], ax
|
||
|
cmp dx, word ptr [ebx + 8]
|
||
|
setl al
|
||
|
sub word ptr [ebx + 8], ax
|
||
|
cmp dx, word ptr [ebx + 10]
|
||
|
setl al
|
||
|
sub word ptr [ebx + 10], ax
|
||
|
cmp dx, word ptr [ebx + 12]
|
||
|
setl al
|
||
|
sub word ptr [ebx + 12], ax
|
||
|
cmp dx, word ptr [ebx + 14]
|
||
|
setl al
|
||
|
sub word ptr [ebx + 14], ax
|
||
|
add ebx, 16
|
||
|
cmp ecx, 8
|
||
|
jge @by8
|
||
|
test ecx, ecx
|
||
|
jz @ok
|
||
|
// trailing indexes (never appearing within DYNARRAYHASH_PO2 range)
|
||
|
@by1: cmp dx, word ptr [ebx]
|
||
|
setl al
|
||
|
sub word ptr [ebx], ax
|
||
|
add ebx, 2
|
||
|
dec ecx
|
||
|
jnz @by1
|
||
|
@ok: pop ebx
|
||
|
end;
|
||
|
|
||
|
function GetBitsCountSse42(value: PtrInt): PtrInt;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=value
|
||
|
{$ifdef HASAESNI}
|
||
|
popcnt eax, eax
|
||
|
{$else} // oldest Delphi don't support this opcode
|
||
|
db $f3,$0f,$B8,$c0
|
||
|
{$endif HASAESNI}
|
||
|
end;
|
||
|
|
||
|
function crc32cby4sse42(crc, value: cardinal): cardinal;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=crc, edx=value
|
||
|
{$ifdef HASAESNI}
|
||
|
crc32 eax, edx
|
||
|
{$else} // oldest Delphi don't support this opcode
|
||
|
db $F2, $0F, $38, $F1, $C2
|
||
|
{$endif HASAESNI}
|
||
|
end;
|
||
|
|
||
|
procedure crcblocksse42(crc128, data128: PBlock128);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=crc128, edx=data128
|
||
|
mov ecx, eax
|
||
|
{$ifdef HASAESNI}
|
||
|
mov eax, dword ptr [ecx]
|
||
|
crc32 eax, dword ptr [edx]
|
||
|
mov dword ptr [ecx], eax
|
||
|
mov eax, dword ptr [ecx + 4]
|
||
|
crc32 eax, dword ptr [edx + 4]
|
||
|
mov dword ptr [ecx + 4], eax
|
||
|
mov eax, dword ptr [ecx + 8]
|
||
|
crc32 eax, dword ptr [edx + 8]
|
||
|
mov dword ptr [ecx + 8], eax
|
||
|
mov eax, dword ptr [ecx + 12]
|
||
|
crc32 eax, dword ptr [edx + 12]
|
||
|
mov dword ptr [ecx + 12], eax
|
||
|
{$else} // oldest Delphi don't support these opcodes
|
||
|
mov eax, dword ptr [ecx]
|
||
|
db $F2, $0F, $38, $F1, $02
|
||
|
mov dword ptr [ecx], eax
|
||
|
mov eax, dword ptr [ecx + 4]
|
||
|
db $F2, $0F, $38, $F1, $42, $04
|
||
|
mov dword ptr [ecx + 4], eax
|
||
|
mov eax, dword ptr [ecx + 8]
|
||
|
db $F2, $0F, $38, $F1, $42, $08
|
||
|
mov dword ptr [ecx + 8], eax
|
||
|
mov eax, dword ptr [ecx + 12]
|
||
|
db $F2, $0F, $38, $F1, $42, $0C
|
||
|
mov dword ptr [ecx + 12], eax
|
||
|
{$endif HASAESNI}
|
||
|
end;
|
||
|
|
||
|
procedure crcblockssse42(crc128, data128: PBlock128; count: integer);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=crc128 edx=data128 ecx=count
|
||
|
push ebx
|
||
|
push esi
|
||
|
push edi
|
||
|
push ebp
|
||
|
test count, count
|
||
|
jle @z
|
||
|
mov ebp, count
|
||
|
mov esi, crc128
|
||
|
mov edi, data128
|
||
|
mov eax, dword ptr [esi]
|
||
|
mov ebx, dword ptr [esi + 4]
|
||
|
mov ecx, dword ptr [esi + 8]
|
||
|
mov edx, dword ptr [esi + 12]
|
||
|
{$ifdef HASAESNI}
|
||
|
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
|
||
|
@s: crc32 eax, dword ptr [edi]
|
||
|
crc32 ebx, dword ptr [edi + 4]
|
||
|
crc32 ecx, dword ptr [edi + 8]
|
||
|
crc32 edx, dword ptr [edi + 12]
|
||
|
{$else} // oldest Delphi don't support these opcodes
|
||
|
@s: db $F2, $0F, $38, $F1, $07
|
||
|
db $F2, $0F, $38, $F1, $5F, $04
|
||
|
db $F2, $0F, $38, $F1, $4F, $08
|
||
|
db $F2, $0F, $38, $F1, $57, $0C
|
||
|
{$endif HASAESNI}
|
||
|
add edi, 16
|
||
|
dec ebp
|
||
|
jnz @s
|
||
|
mov dword ptr [esi], eax
|
||
|
mov dword ptr [esi + 4], ebx
|
||
|
mov dword ptr [esi + 8], ecx
|
||
|
mov dword ptr [esi + 12], edx
|
||
|
@z: pop ebp
|
||
|
pop edi
|
||
|
pop esi
|
||
|
pop ebx
|
||
|
end;
|
||
|
|
||
|
function crc32csse42(crc: cardinal; buf: PAnsiChar; len: cardinal): cardinal;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// eax=crc, edx=buf, ecx=len
|
||
|
not eax
|
||
|
test ecx, ecx
|
||
|
jz @0
|
||
|
test edx, edx
|
||
|
jz @0
|
||
|
jmp @align
|
||
|
@a: {$ifdef HASAESNI}
|
||
|
crc32 eax, byte ptr [edx]
|
||
|
{$else} // oldest Delphi don't support these opcodes
|
||
|
db $F2, $0F, $38, $F0, $02
|
||
|
{$endif HASAESNI}
|
||
|
inc edx
|
||
|
dec ecx
|
||
|
jz @0
|
||
|
@align: test dl, 3
|
||
|
jnz @a
|
||
|
push ecx
|
||
|
shr ecx, 3
|
||
|
jnz @by8
|
||
|
@rem: pop ecx
|
||
|
test cl, 4
|
||
|
jz @4
|
||
|
{$ifdef HASAESNI}
|
||
|
crc32 eax, dword ptr [edx]
|
||
|
{$else}
|
||
|
db $F2, $0F, $38, $F1, $02
|
||
|
{$endif HASAESNI}
|
||
|
add edx, 4
|
||
|
@4: test cl, 2
|
||
|
jz @2
|
||
|
{$ifdef HASAESNI}
|
||
|
crc32 eax, word ptr [edx]
|
||
|
{$else}
|
||
|
db $66, $F2, $0F, $38, $F1, $02
|
||
|
{$endif HASAESNI}
|
||
|
add edx, 2
|
||
|
@2: test cl, 1
|
||
|
jz @0
|
||
|
{$ifdef HASAESNI}
|
||
|
crc32 eax, byte ptr [edx]
|
||
|
{$else}
|
||
|
db $F2, $0F, $38, $F0, $02
|
||
|
{$endif HASAESNI}
|
||
|
@0: not eax
|
||
|
ret
|
||
|
{$ifdef HASAESNI}
|
||
|
@by8: crc32 eax, dword ptr [edx]
|
||
|
crc32 eax, dword ptr [edx + 4]
|
||
|
{$else}
|
||
|
@by8: db $F2, $0F, $38, $F1, $02
|
||
|
db $F2, $0F, $38, $F1, $42, $04
|
||
|
{$endif HASAESNI}
|
||
|
add edx, 8
|
||
|
dec ecx
|
||
|
jnz @by8
|
||
|
jmp @rem
|
||
|
end;
|
||
|
|
||
|
function SynLZdecompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
push ebp
|
||
|
push ebx
|
||
|
push esi
|
||
|
push edi
|
||
|
push eax
|
||
|
add esp, -4092
|
||
|
push eax
|
||
|
add esp, -4092
|
||
|
push eax
|
||
|
add esp, -4092
|
||
|
push eax
|
||
|
add esp, -4092
|
||
|
push eax
|
||
|
add esp, -24
|
||
|
mov esi, ecx
|
||
|
mov ebx, eax
|
||
|
add edx, eax
|
||
|
mov [esp+8H], esi
|
||
|
mov [esp+10H], edx
|
||
|
movzx eax, word ptr [ebx]
|
||
|
mov [esp], eax
|
||
|
or eax,eax
|
||
|
je @@0917
|
||
|
add ebx, 2
|
||
|
mov eax, [esp]
|
||
|
test ah, 80H
|
||
|
jz @@0907
|
||
|
and eax, 7FFFH
|
||
|
movzx edx, word ptr [ebx]
|
||
|
shl edx, 15
|
||
|
or eax, edx
|
||
|
mov [esp], eax
|
||
|
add ebx, 2
|
||
|
@@0907: lea ebp, [esi - 1]
|
||
|
@@0908: mov ecx, [ebx]
|
||
|
add ebx, 4
|
||
|
mov [esp+14H], ecx
|
||
|
mov edi, 1 // edi=CWbit
|
||
|
cmp ebx, [esp+10H]
|
||
|
jnc @@0917
|
||
|
@@0909: mov ecx, [esp+14H]
|
||
|
@@090A: test ecx, edi
|
||
|
jnz @@0911
|
||
|
mov al, [ebx]
|
||
|
inc ebx
|
||
|
mov [esi], al
|
||
|
inc esi
|
||
|
cmp ebx, [esp+10H]
|
||
|
lea eax, [esi-3]
|
||
|
jnc @@0917
|
||
|
cmp eax, ebp
|
||
|
jbe @@0910
|
||
|
inc ebp
|
||
|
mov eax, [ebp]
|
||
|
mov edx, eax
|
||
|
shr eax, 12
|
||
|
xor eax, edx
|
||
|
and eax, 0FFFH
|
||
|
mov [esp+1CH + eax * 4], ebp
|
||
|
@@0910: add edi, edi
|
||
|
jnz @@090A
|
||
|
jmp @@0908
|
||
|
@@0911: movzx edx, word ptr [ebx]
|
||
|
add ebx, 2
|
||
|
mov eax, edx
|
||
|
and edx, 0FH
|
||
|
add edx, 2
|
||
|
shr eax, 4
|
||
|
cmp edx,2
|
||
|
jnz @@0912
|
||
|
movzx edx, byte ptr [ebx]
|
||
|
inc ebx
|
||
|
add edx, 18
|
||
|
@@0912: mov eax, [esp+1CH + eax * 4]
|
||
|
mov ecx, esi
|
||
|
mov [esp+18H], edx
|
||
|
sub ecx, eax
|
||
|
// inlined optimized move()
|
||
|
cmp ecx, edx
|
||
|
jl @@ovlap // overlapping content requires per-byte copy
|
||
|
cmp edx, 32
|
||
|
ja @large
|
||
|
sub edx, 8
|
||
|
jg @9_32
|
||
|
{$ifdef HASNOSSE2}
|
||
|
// slowest x87 FPU code on very old CPU with no SSE2 support
|
||
|
mov ecx, [eax]
|
||
|
mov eax, [eax + 4] // always copy 8 bytes for 0..8
|
||
|
mov [esi], ecx // safe since src_endmatch := src_end-(6+5)
|
||
|
mov [esi + 4], eax
|
||
|
jmp @movend
|
||
|
@9_32: fild qword ptr [eax + edx]
|
||
|
fild qword ptr [eax]
|
||
|
cmp edx, 8
|
||
|
jle @16
|
||
|
fild qword ptr [eax + 8]
|
||
|
cmp edx, 16
|
||
|
jle @24
|
||
|
fild qword ptr [eax + 16]
|
||
|
fistp qword ptr [esi + 16]
|
||
|
@24: fistp qword ptr [esi + 8]
|
||
|
@16: fistp qword ptr [esi]
|
||
|
fistp qword ptr [esi + edx]
|
||
|
jmp @movend
|
||
|
nop
|
||
|
@large: push esi
|
||
|
fild qword ptr [eax]
|
||
|
lea eax, [eax + edx - 8]
|
||
|
lea edx, [esi + edx - 8]
|
||
|
fild qword ptr [eax]
|
||
|
push edx
|
||
|
neg edx
|
||
|
and esi, -8
|
||
|
lea edx, [edx + esi + 8]
|
||
|
pop esi
|
||
|
@lrgnxt:fild qword ptr [eax + edx]
|
||
|
fistp qword ptr [esi + edx]
|
||
|
add edx, 8
|
||
|
jl @lrgnxt
|
||
|
fistp qword ptr [esi]
|
||
|
pop esi
|
||
|
fistp qword ptr [esi]
|
||
|
{$else} // inlined SSE2 move
|
||
|
movq xmm0, qword ptr [eax]
|
||
|
movq qword ptr [esi], xmm0
|
||
|
jmp @movend
|
||
|
@9_32: movq xmm0, qword ptr [eax + edx]
|
||
|
movq xmm1, qword ptr [eax]
|
||
|
cmp edx, 8
|
||
|
jle @16
|
||
|
movq xmm2, qword ptr [eax + 8]
|
||
|
cmp edx, 16
|
||
|
jle @24
|
||
|
movq xmm3, qword ptr [eax + 16]
|
||
|
movq qword ptr [esi + 16], xmm3
|
||
|
@24: movq qword ptr [esi + 8], xmm2
|
||
|
@16: movq qword ptr [esi], xmm1
|
||
|
movq qword ptr [esi + edx], xmm0
|
||
|
jmp @movend
|
||
|
@large: push esi
|
||
|
movups xmm2, dqword ptr [eax]
|
||
|
lea eax, [eax + edx - 16]
|
||
|
lea edx, [esi + edx - 16]
|
||
|
movups xmm1, dqword ptr [eax]
|
||
|
push edx
|
||
|
neg edx
|
||
|
and esi, -16
|
||
|
lea edx, [edx + esi + 16]
|
||
|
pop esi
|
||
|
@lrgnxt:movups xmm0, dqword ptr [eax + edx]
|
||
|
movaps dqword ptr [esi + edx], xmm0
|
||
|
add edx, 16
|
||
|
jl @lrgnxt
|
||
|
movups dqword ptr [esi], xmm1
|
||
|
pop esi
|
||
|
movups dqword ptr [esi], xmm2
|
||
|
{$endif HASNOSSE2}
|
||
|
@movend:cmp esi, ebp
|
||
|
jbe @@0916
|
||
|
@@0915: inc ebp
|
||
|
mov edx, [ebp]
|
||
|
mov eax, edx
|
||
|
shr edx, 12
|
||
|
xor eax, edx
|
||
|
and eax, 0FFFH
|
||
|
mov [esp+1CH + eax * 4], ebp
|
||
|
cmp esi, ebp
|
||
|
ja @@0915
|
||
|
@@0916: add esi, [esp+18H]
|
||
|
cmp ebx, [esp+10H]
|
||
|
jnc @@0917
|
||
|
add edi, edi
|
||
|
lea ebp, [esi - 1]
|
||
|
jz @@0908
|
||
|
jmp @@0909
|
||
|
@@ovlap:push ebx
|
||
|
push esi
|
||
|
lea ebx, [eax + edx]
|
||
|
add esi, edx
|
||
|
neg edx
|
||
|
@s: mov al, [ebx + edx]
|
||
|
mov [esi + edx], al
|
||
|
inc edx
|
||
|
jnz @s
|
||
|
pop esi
|
||
|
pop ebx
|
||
|
jmp @movend
|
||
|
@@0917: mov eax, [esp]
|
||
|
add esp, 16412
|
||
|
pop edi
|
||
|
pop esi
|
||
|
pop ebx
|
||
|
pop ebp
|
||
|
end;
|
||
|
|
||
|
function SynLZcompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
push ebp
|
||
|
push ebx
|
||
|
push esi
|
||
|
push edi
|
||
|
push eax
|
||
|
add esp, -4092
|
||
|
push eax
|
||
|
add esp, -4092
|
||
|
push eax
|
||
|
add esp, -4092
|
||
|
push eax
|
||
|
add esp, -4092
|
||
|
push eax
|
||
|
add esp, -4092
|
||
|
push eax
|
||
|
add esp, -4092
|
||
|
push eax
|
||
|
add esp, -4092
|
||
|
push eax
|
||
|
add esp, -4092
|
||
|
push eax
|
||
|
add esp, -32
|
||
|
mov esi, eax // esi=src
|
||
|
mov edi, ecx // edi=dst
|
||
|
mov [esp+08H], ecx
|
||
|
mov eax, edx
|
||
|
cmp eax, 32768
|
||
|
jl @@0889
|
||
|
or ax, 8000H
|
||
|
mov [edi], eax
|
||
|
mov eax, edx
|
||
|
shr eax, 15
|
||
|
mov [edi + 2], eax
|
||
|
add edi, 4
|
||
|
jmp @@0891
|
||
|
@@0890: mov eax, 2
|
||
|
jmp @@0904
|
||
|
@@0889: mov [edi], eax
|
||
|
test eax, eax
|
||
|
jz @@0890
|
||
|
add edi, 2
|
||
|
@@0891: lea eax, [edx + esi]
|
||
|
mov [esp+18H], edi
|
||
|
mov [esp+0CH], eax
|
||
|
sub eax, 11
|
||
|
mov [esp+4], eax
|
||
|
xor eax, eax
|
||
|
lea ebx, [esp+24H] // reset offsets lookup table
|
||
|
{$ifdef HASNOSSE2}
|
||
|
mov ecx, 1024
|
||
|
@@089I: mov [ebx], eax
|
||
|
mov [ebx + 4], eax
|
||
|
mov [ebx + 8], eax
|
||
|
mov [ebx + 12], eax
|
||
|
add ebx, 16
|
||
|
{$else}
|
||
|
pxor xmm0, xmm0
|
||
|
mov ecx, 256
|
||
|
@@089I: movups dqword ptr [ebx], xmm0
|
||
|
movups dqword ptr [ebx + 16], xmm0
|
||
|
movups dqword ptr [ebx + 32], xmm0
|
||
|
movups dqword ptr [ebx + 48], xmm0
|
||
|
add ebx, 64
|
||
|
{$endif HASNOSSE2}
|
||
|
dec ecx
|
||
|
jnz @@089I
|
||
|
mov [edi], eax
|
||
|
add edi, 4
|
||
|
mov ebx, 1 // ebx=1 shl CWbit
|
||
|
// main loop:
|
||
|
cmp esi, [esp+4]
|
||
|
ja @@0900
|
||
|
@@0892: mov edx, [esi]
|
||
|
mov eax, edx
|
||
|
shr edx, 12
|
||
|
xor edx, eax
|
||
|
and edx, 0FFFH
|
||
|
mov ebp, [esp+24H + edx * 4]
|
||
|
mov ecx, [esp+4024H + edx * 4]
|
||
|
mov [esp+24H + edx * 4], esi
|
||
|
xor ecx, eax
|
||
|
test ecx, 0FFFFFFH
|
||
|
mov [esp+4024H + edx * 4], eax
|
||
|
jnz @@0897
|
||
|
mov eax, esi
|
||
|
or ebp, ebp
|
||
|
jz @@0897
|
||
|
sub eax, ebp
|
||
|
mov ecx, [esp+18H]
|
||
|
cmp eax, 2
|
||
|
jle @@0897
|
||
|
add esi, 2
|
||
|
or dword ptr [ecx], ebx
|
||
|
mov ecx, [esp+0CH]
|
||
|
add ebp, 2
|
||
|
mov eax, 1
|
||
|
sub ecx, esi
|
||
|
dec ecx
|
||
|
mov [esp], ecx
|
||
|
cmp ecx, 271
|
||
|
jl @@0894
|
||
|
mov dword ptr [esp], 271
|
||
|
jmp @@0894
|
||
|
@@0893: inc eax
|
||
|
@@0894: mov ecx, [ebp + eax]
|
||
|
cmp cl, [esi + eax]
|
||
|
jnz @@0895
|
||
|
cmp eax, [esp]
|
||
|
jge @@0895
|
||
|
inc eax
|
||
|
cmp ch, [esi + eax]
|
||
|
jnz @@0895
|
||
|
shr ecx, 16
|
||
|
cmp eax, [esp]
|
||
|
jge @@0895
|
||
|
inc eax
|
||
|
cmp cl, [esi + eax]
|
||
|
jnz @@0895
|
||
|
cmp eax, [esp]
|
||
|
jge @@0895
|
||
|
inc eax
|
||
|
cmp ch, [esi + eax]
|
||
|
jnz @@0895
|
||
|
cmp eax, [esp]
|
||
|
jl @@0893
|
||
|
@@0895: add esi, eax
|
||
|
shl edx, 4
|
||
|
cmp eax, 15
|
||
|
jg @@0896
|
||
|
or eax, edx
|
||
|
mov word ptr [edi], ax
|
||
|
add edi, 2
|
||
|
jmp @@0898
|
||
|
@@0896: sub eax, 16
|
||
|
mov [edi], dx
|
||
|
mov [edi + 2H], al
|
||
|
add edi, 3
|
||
|
jmp @@0898
|
||
|
@@0897: mov al, [esi] // movsb is actually slower!
|
||
|
mov [edi], al
|
||
|
inc esi
|
||
|
inc edi
|
||
|
@@0898: add ebx, ebx
|
||
|
jz @@0899
|
||
|
cmp esi, [esp+4]
|
||
|
jbe @@0892
|
||
|
jmp @@0900
|
||
|
@@0899: mov [esp+18H], edi
|
||
|
mov [edi], ebx
|
||
|
inc ebx
|
||
|
add edi, 4
|
||
|
cmp esi, [esp+4]
|
||
|
jbe @@0892
|
||
|
@@0900: cmp esi, [esp+0CH]
|
||
|
jnc @@0903
|
||
|
@@0901: mov al, [esi]
|
||
|
mov [edi], al
|
||
|
inc esi
|
||
|
inc edi
|
||
|
add ebx, ebx
|
||
|
jz @@0902
|
||
|
cmp esi, [esp+0CH]
|
||
|
jc @@0901
|
||
|
jmp @@0903
|
||
|
@@0902: mov [edi], ebx
|
||
|
inc ebx
|
||
|
add edi, 4
|
||
|
cmp esi, [esp+0CH]
|
||
|
jc @@0901
|
||
|
@@0903: mov eax, edi
|
||
|
sub eax, [esp+08H]
|
||
|
@@0904: add esp, 32804
|
||
|
pop edi
|
||
|
pop esi
|
||
|
pop ebx
|
||
|
pop ebp
|
||
|
end;
|
||
|
|
||
|
function RdRand32: cardinal;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// rdrand eax: same opcodes for x86 and x64
|
||
|
db $0f, $c7, $f0
|
||
|
// returns in eax, ignore carry flag (eax=0 won't hurt)
|
||
|
end;
|
||
|
|
||
|
function Rdtsc: Int64;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
// returns the TSC in EDX:EAX
|
||
|
rdtsc
|
||
|
end;
|
||
|
|
||
|
function StrCntDecFree(var refcnt: TStrCnt): boolean;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
lock dec dword ptr [refcnt] // =longint on Delphi and FPC 32-bit
|
||
|
setbe al
|
||
|
end; // don't check for ismultithread global since lock is cheaper on new CPUs
|
||
|
|
||
|
function DACntDecFree(var refcnt: TDACnt): boolean;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
lock dec dword ptr [refcnt] // =longint on Delphi and FPC 32-bit
|
||
|
setbe al
|
||
|
end; // don't check for ismultithread global since lock is cheaper on new CPUs
|
||
|
|
||
|
function LockedExc(var Target: PtrUInt; NewValue, Comperand: PtrUInt): boolean;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
xchg eax, ecx
|
||
|
lock cmpxchg dword ptr [ecx], edx
|
||
|
setz al
|
||
|
end;
|
||
|
|
||
|
procedure LockedAdd(var Target: PtrUInt; Increment: PtrUInt);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
lock add dword ptr [Target], Increment
|
||
|
end;
|
||
|
|
||
|
procedure LockedAdd32(var Target: cardinal; Increment: cardinal);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
lock add dword ptr [Target], Increment
|
||
|
end;
|
||
|
|
||
|
procedure LockedDec(var Target: PtrUInt; Decrement: PtrUInt);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
lock sub dword ptr [Target], Decrement
|
||
|
end;
|
||
|
|
||
|
function IsXmmYmmOSEnabled: boolean;
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm // see https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
|
||
|
xor ecx, ecx // get control register XCR0 = XFEATURE_ENABLED_MASK
|
||
|
db $0f, $01, $d0 // XGETBV reads XCR0 into EDX:EAX
|
||
|
and eax, 6 // check OS enabled both XMM (bit 1) and YMM (bit 2)
|
||
|
cmp al, 6
|
||
|
sete al
|
||
|
end;
|
||
|
|
||
|
procedure GetCpuid(cpueax, cpuecx: cardinal; var Registers: TIntelRegisters);
|
||
|
{$ifdef FPC}nostackframe; assembler;{$endif}
|
||
|
asm
|
||
|
push esi
|
||
|
push edi
|
||
|
push ebx
|
||
|
mov edi, cpueax
|
||
|
mov ebx, cpuecx
|
||
|
mov esi, Registers
|
||
|
pushfd
|
||
|
pop eax
|
||
|
mov edx, eax
|
||
|
xor eax, $200000
|
||
|
push eax
|
||
|
popfd
|
||
|
pushfd
|
||
|
pop eax
|
||
|
xor eax, edx // does this CPU support the cpuid opcode?
|
||
|
jz @nocpu
|
||
|
mov eax, edi
|
||
|
mov ecx, ebx
|
||
|
cpuid
|
||
|
mov TIntelRegisters(esi).&eax, eax
|
||
|
mov TIntelRegisters(esi).&ebx, ebx
|
||
|
mov TIntelRegisters(esi).&ecx, ecx
|
||
|
mov TIntelRegisters(esi).&edx, edx
|
||
|
@nocpu: pop ebx
|
||
|
pop edi
|
||
|
pop esi
|
||
|
end;
|
||
|
|
||
|
{$ifdef CPU32DELPHI}
|
||
|
|
||
|
// inspired by ValExt_JOH_PAS_8_a by John O'Harrow - calls Delphi System.@Pow10
|
||
|
function GetExtended(P: PUtf8Char; out err: integer): TSynExtended;
|
||
|
const
|
||
|
Ten: double = 10.0;
|
||
|
asm
|
||
|
// in: eax=text, edx=@err out: st(0)=result
|
||
|
push ebx // save used registers
|
||
|
push esi
|
||
|
push edi
|
||
|
mov esi, eax // string pointer
|
||
|
push eax // save for error condition
|
||
|
xor ebx, ebx
|
||
|
push eax // allocate local storage for loading fpu
|
||
|
test esi, esi
|
||
|
jz @nil // nil string
|
||
|
@trim: movzx ebx, byte ptr [esi] // strip leading spaces
|
||
|
inc esi
|
||
|
cmp bl, ' '
|
||
|
je @trim
|
||
|
xor ecx, ecx // clear sign flag
|
||
|
fld qword[Ten] // load 10 into fpu
|
||
|
xor eax, eax // zero number of decimal places
|
||
|
fldz // zero result in fpu
|
||
|
cmp bl, '0'
|
||
|
jl @chksig // check for sign character
|
||
|
@dig1: xor edi, edi // zero exponent value
|
||
|
@digl: sub bl, '0'
|
||
|
cmp bl, 9
|
||
|
ja @frac // non-digit
|
||
|
mov cl, 1 // set digit found flag
|
||
|
mov [esp], ebx // store for fpu use
|
||
|
fmul st(0), st(1) // multply by 10
|
||
|
fiadd dword ptr [esp] // add next digit
|
||
|
movzx ebx, byte ptr [esi] // get next char
|
||
|
inc esi
|
||
|
test bl, bl // end reached?
|
||
|
jnz @digl // no,get next digit
|
||
|
jmp @finish // yes,finished
|
||
|
@chksig:cmp bl, '-'
|
||
|
je @minus
|
||
|
cmp bl, '+'
|
||
|
je @sigset
|
||
|
@gdig1: test bl, bl
|
||
|
jz @error // no digits found
|
||
|
jmp @dig1
|
||
|
@minus: mov ch, 1 // set sign flag
|
||
|
@sigset:movzx ebx, byte ptr [esi] // get next char
|
||
|
inc esi
|
||
|
jmp @gdig1
|
||
|
@frac: cmp bl, '.' - '0'
|
||
|
jne @exp // no decimal point
|
||
|
movzx ebx, byte ptr [esi] // get next char
|
||
|
test bl, bl
|
||
|
jz @dotend // string ends with '.'
|
||
|
inc esi
|
||
|
@fracl: sub bl, '0'
|
||
|
cmp bl, 9
|
||
|
ja @exp // non-digit
|
||
|
mov [esp], ebx
|
||
|
dec eax // -(number of decimal places)
|
||
|
fmul st(0), st(1) // multply by 10
|
||
|
fiadd dword ptr [esp] // add next digit
|
||
|
movzx ebx, byte ptr [esi] // get next char
|
||
|
inc esi
|
||
|
test bl, bl // end reached?
|
||
|
jnz @fracl // no, get next digit
|
||
|
jmp @finish // yes, finished (no exponent)
|
||
|
@dotend:test cl, cl // any digits found before '.'?
|
||
|
jnz @finish // yes, valid
|
||
|
jmp @error // no,invalid
|
||
|
@exp: or bl, $20
|
||
|
cmp bl, 'e' - '0'
|
||
|
jne @error // not 'e' or 'e'
|
||
|
movzx ebx, byte ptr [esi] // get next char
|
||
|
inc esi
|
||
|
mov cl, 0 // clear exponent sign flag
|
||
|
cmp bl, '-'
|
||
|
je @minexp
|
||
|
cmp bl, '+'
|
||
|
je @expset
|
||
|
jmp @expl
|
||
|
@minexp:mov cl, 1 // set exponent sign flag
|
||
|
@expset:movzx ebx, byte ptr [esi] // get next char
|
||
|
inc esi
|
||
|
@expl: sub bl, '0'
|
||
|
cmp bl, 9
|
||
|
ja @error // non-digit
|
||
|
lea edi, [edi + edi * 4]// multiply by 10
|
||
|
add edi, edi
|
||
|
add edi, ebx // add next digit
|
||
|
movzx ebx, byte ptr [esi] // get next char
|
||
|
inc esi
|
||
|
test bl, bl // end reached?
|
||
|
jnz @expl // no, get next digit
|
||
|
@endexp:test cl, cl // positive exponent?
|
||
|
jz @finish // yes, keep exponent value
|
||
|
neg edi // no, negate exponent value
|
||
|
@finish:add eax, edi // exponent value - number of decimal places
|
||
|
mov [edx], ebx // result code = 0
|
||
|
jz @pow // no call to _pow10 needed
|
||
|
cmp eax, 308
|
||
|
jge @oor // limit to < 1.7 x 10^308 double range
|
||
|
cmp eax, -324
|
||
|
jle @oor // limit to > 5.0 x 10^-324 range
|
||
|
@oors: mov edi, ecx // save decimal sign flag
|
||
|
call System.@Pow10 // raise to power of 10
|
||
|
mov ecx, edi // restore decimal sign flag
|
||
|
@pow: test ch, ch // decimal sign flag set?
|
||
|
jnz @negate // yes, negate value
|
||
|
@ok: add esp, 8 // dump local storage and string pointer
|
||
|
@exit: ffree st(1) // remove ten value from fpu
|
||
|
pop edi // restore used registers
|
||
|
pop esi
|
||
|
pop ebx
|
||
|
ret // finished
|
||
|
@negate:fchs // negate result in fpu
|
||
|
jmp @ok
|
||
|
@oor: inc esi // force result code = 1
|
||
|
mov [edx], esi // set result code
|
||
|
xor eax, eax // set exponent = 0 (as pure pascal version)
|
||
|
jmp @oors
|
||
|
@nil: inc esi // force result code = 1
|
||
|
fldz // result value = 0
|
||
|
@error: pop ebx // dump local storage
|
||
|
pop eax // string pointer
|
||
|
sub esi, eax // error offset
|
||
|
mov [edx], esi // set result code
|
||
|
test ch, ch // decimal sign flag set?
|
||
|
jz @exit // no,exit
|
||
|
fchs // yes. negate result in fpu
|
||
|
jmp @exit // exit setting result code
|
||
|
end;
|
||
|
|
||
|
// FPC will properly inline multiplication by reciprocal
|
||
|
procedure Div100(Y: cardinal; var res: TDiv100Rec);
|
||
|
asm
|
||
|
mov dword ptr [edx].TDiv100Rec.M, eax
|
||
|
mov ecx, edx
|
||
|
mov edx, eax
|
||
|
mov eax, 1374389535
|
||
|
mul edx
|
||
|
shr edx, 5
|
||
|
mov dword ptr [ecx].TDiv100Rec.D, edx
|
||
|
imul eax, edx, 100
|
||
|
sub dword ptr [ecx].TDiv100Rec.M, eax
|
||
|
end;
|
||
|
|
||
|
// those functions are intrinsics with FPC :)
|
||
|
|
||
|
function BSRdword(c: cardinal): cardinal;
|
||
|
asm
|
||
|
bsr eax, eax
|
||
|
jnz @nz
|
||
|
mov eax, 255
|
||
|
@nz:
|
||
|
end;
|
||
|
|
||
|
function BSRqword(const q: qword): cardinal;
|
||
|
asm
|
||
|
bsr eax, [esp + 8]
|
||
|
jz @1
|
||
|
add eax, 32
|
||
|
ret
|
||
|
@1: bsr eax, [esp + 4]
|
||
|
jnz @2
|
||
|
mov eax, 255
|
||
|
@2:
|
||
|
end;
|
||
|
|
||
|
{$endif CPU32DELPHI}
|
||
|
|
||
|
|