delphimvcframework/lib/dmustache/mormot.core.base.asmx86.inc
2024-04-29 15:40:45 +02:00

2603 lines
79 KiB
PHP

{
This file is a part of the Open Source Synopse mORMot framework 2,
licensed under a MPL/GPL/LGPL three license - see LICENSE.md
x86 32-bit assembly used by mormot.core.base.pas
}
{$ifdef FPC}
// disabled some FPC paranoid warnings
{$WARN 7102 off : Use of +offset(%ebp) for parameters invalid here }
{$WARN 7104 off : Use of -offset(%ebp) is not recommended for local variable access }
{$WARN 7121 off : Check size of XMM memory operand }
{$WARN 7122 off : Check size of XMM memory operand }
{$endif FPC}
{$ifdef ASMX86} // i386 asm with global - disabled on PIC targets
{
FillCharFast/MoveFast implementation notes:
- use SS2 and not FPU fld/fst which may trigger unexpected exceptions (Agner)
- use dedicated branchless sub-functions for small buffers of 0..32 bytes
- use simple SSE2 loop for 33..255 bytes
- assume ERBMS is available (cpuid flag may not be propagated within VMs)
- use "rep movsb" for 256..512K input (should work on all CPUs, even older with
no ERBMS), or as fallback if no SSE2 CPU is used
- use non volatile SSE2 loop when >= 512KB (to not pollute the CPU cache)
- don't use backward "std rep movsb" since it is not involved by ERMBS (slow)
- note: Delphi Win32 x87 RTL code by John O'Harrow seems deprecated
}
{$define WITH_ERMS} // we need it as fallback on old CPU without SSE2
{$ifdef WITH_ERMS}
var
// "rep stosb/movsb" enabled for len >= 4096 on ERMSB CPUs
// it has been reported that "on 32-bit strings have to be at least 4KB"
// see https://stackoverflow.com/a/43837564/458259 for explanations and timing
ERMSB_MIN_SIZE_FWD: integer = maxInt; // maxInt = disabled by default
{$ifndef FPC_X86}
ERMSB_MIN_SIZE_BWD: integer = maxInt; // used for no-SSE2, not for ERMSB
{$endif FPC_X86}
{$define WITH_ERMSASM} // include "rep movsb" asm blocks
{$endif WITH_ERMS}
{$ifdef HASNOSSE2}
{$define WITH_ERMSASM} // "rep movsd" is used as fallback on oldest CPU
{$endif HASNOSSE2}
const
// non-temporal writes should bypass the cache when the size is bigger than
// half the size of the largest level cache - we assume low 1MB cache here
NONTEMPORALSIZE = 1 shl 20;
// fast SSE2 version - force define HASNOSSE2 when run any very old CPU
procedure FillcharFast(var dst; cnt: PtrInt; value: byte);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=Dest edx=Count cl=Value
movzx ecx, cl
imul ecx, ecx, $01010101
cmp edx, 32
jg @32up
test edx, edx
jle @00
mov [eax + edx - 1], cl // fill last byte
shr edx, 1 // how many words to fill
jmp dword ptr [edx * 4 + @jmptab]
@00: ret
@32up:
{$ifndef HASNOSSE2}
{$ifdef WITH_ERMS}
cmp edx, NONTEMPORALSIZE
jae @noerms // movntdq was reported to be faster than ERMSB
cmp edx, ERMSB_MIN_SIZE_FWD
ja @ermsb
@noerms:{$endif WITH_ERMS}
sub edx, 16
movd xmm0, ecx
mov ecx, eax
pshufd xmm0, xmm0, 0
and ecx, 15 // 16-byte align writes
movups dqword ptr [eax], xmm0 // fill first 16 bytes
movups dqword ptr [eax + edx], xmm0 // fill last 16 bytes
sub ecx, 16
sub eax, ecx
add edx, ecx
add eax, edx
neg edx
cmp edx, -NONTEMPORALSIZE // assume > 512KB bypass the cache
jl @nv
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
@loop: movaps dqword ptr [eax + edx], xmm0 // fill 16 bytes per loop
add edx, 16
jl @loop
ret
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
@nv: movntdq dqword ptr [eax + edx], xmm0 // non-temporal fill 16 bytes
add edx, 16
jl @nv
ret
{$endif HASNOSSE2}
{$ifdef WITH_ERMSASM}
@ermsb: push edi
cld
{$ifdef HASNOSSE2}
@ermsa: test al, 3 // aligned stosd is better on old CPUs
jz @erms3
mov byte ptr [eax], cl
inc eax
dec edx
jmp @ermsa
@erms3: mov edi, eax
mov eax, ecx
mov ecx, edx
shr ecx, 2
jz @erms2
rep stosd // no SSE2 version
@erms2: test dl, 2
jz @erms1
mov word ptr [edi], ax
add edi, 2
@erms1: test dl, 1
jz @erms0
mov byte ptr [edi], al
{$else} // ERMSB favors stosb and will properly align writes
mov edi, eax
mov eax, ecx
mov ecx, edx
rep stosb
{$endif HASNOSSE2}
@erms0: pop edi
ret
{$endif WITH_ERMSASM}
{$ifdef FPC} align 4 {$else} nop {$ifdef HASNOSSE2} nop; nop {$endif} {$endif}
@jmptab:dd @00, @02, @04, @06, @08, @10, @12, @14, @16
dd @18, @20, @22, @24, @26, @28, @30, @32
// Delphi RTL uses 16-bit "mov [eax + ...], cx" which are slower
@32: mov [eax + 28], ecx
@28: mov [eax + 24], ecx
@24: mov [eax + 20], ecx
@20: mov [eax + 16], ecx
@16: mov [eax + 12], ecx
@12: mov [eax + 8], ecx
@08: mov [eax + 4], ecx
@04: mov [eax], ecx
ret
@30: mov [eax + 26], ecx
@26: mov [eax + 22], ecx
@22: mov [eax + 18], ecx
@18: mov [eax + 14], ecx
@14: mov [eax + 10], ecx
@10: mov [eax + 6], ecx
@06: mov [eax + 2], ecx
@02: mov word ptr [eax], cx
end;
{$ifndef FPC_X86} // FPC RTL has fastmove.inc -> our SSE2/ERMS asm is slower
// fast SSE2 version - force define HASNOSSE2 when run any very old CPU
procedure MoveFast(const src; var dst; cnt: PtrInt);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=source edx=dest ecx=count
cmp ecx, 32
ja @lrg // count > 32 or count < 0
sub ecx, 8
jg @sml // 9..32 byte move
jmp dword ptr [@table + 32 + ecx * 4] // 0..8 byte move
{$ifdef HASNOSSE2}
@sml: fild qword ptr [eax + ecx] // last 8
fild qword ptr [eax] // first 8
cmp ecx, 8
jle @sml16
fild qword ptr [eax + 8] // second 8
cmp ecx, 16
jle @sml24
fild qword ptr [eax + 16] // third 8
fistp qword ptr [edx + 16] // third 8
@sml24: fistp qword ptr [edx + 8] // second 8
@sml16: fistp qword ptr [edx] // first 8
fistp qword ptr [edx + ecx] // last 8
{$else}
@sml: movq xmm0, qword ptr [eax + ecx] // last 8
movq xmm1, qword ptr [eax] // first 8
cmp ecx, 8
jle @sml16
movq xmm2, qword ptr [eax + 8] // second 8
cmp ecx, 16
jle @sml24
movq xmm3, qword ptr [eax + 16] // third 8
movq qword ptr [edx + 16], xmm3 // third 8
@sml24: movq qword ptr [edx + 8], xmm2 // second 8
@sml16: movq qword ptr [edx], xmm1 // first 8
movq qword ptr [edx + ecx], xmm0 // last 8
ret
{$endif HASNOSSE2}
@exit: rep ret
{$ifdef FPC} align 4 {$else} {$ifdef HASALIGN} .align 4 {$endif}{$endif}
@table: dd @exit, @m01, @m02, @m03, @m04, @m05, @m06, @m07, @m08
{$ifdef WITH_ERMSASM}
@ermsf: push esi
push edi
mov esi, eax
mov edi, edx
cld
rep movsb // ERMSB forward move
pop edi
pop esi
ret
@ermsb: push esi
push edi
lea esi, [eax + ecx - 1]
lea edi, [edx + ecx - 1]
std
rep movsb // backward move is slow even if ERMSB is set
pop edi
pop esi
cld // FPC requires this
ret
{$endif WITH_ERMSASM}
{$ifdef HASNOSSE2}
@lrg: jng @exit // count < 0
sub edx, eax
jz @exit
cmp edx, ecx
lea edx, [edx + eax]
jb @ermsb // move backwards if unsigned(dest-source) < count
jmp @ermsf
{$else}
@lrgfwd:// large forward move
{$ifdef WITH_ERMS}
cmp ecx, NONTEMPORALSIZE
jae @noermf // movntdq was reported to be faster than ERMSB
cmp ecx, ERMSB_MIN_SIZE_FWD
ja @ermsf
@noermf:{$endif WITH_ERMS}
push edx
movups xmm2, dqword ptr [eax] // first 16
lea eax, [eax + ecx - 16]
lea ecx, [ecx + edx - 16]
movups xmm1, dqword ptr [eax] // last 16
push ecx
neg ecx
and edx, -16 // 16-byte align writes
lea ecx, [ecx + edx + 16]
pop edx
cmp ecx, -NONTEMPORALSIZE // assume > 512KB bypass the cache
jl @fwnv
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
@fwd: movups xmm0, dqword ptr [eax + ecx] // move by 16 bytes
movaps dqword ptr [edx + ecx], xmm0
add ecx, 16
jl @fwd
@fwde: movups dqword ptr [edx], xmm1 // last 16
pop edx
movups dqword ptr [edx], xmm2 // first 16
ret
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
@fwnv: movups xmm0, dqword ptr [eax + ecx]
movntdq dqword ptr [edx + ecx], xmm0 // non-temporal move by 16 bytes
add ecx, 16
jl @fwnv
sfence
jmp @fwde
@lrg: jng @exit // count < 0
sub edx, eax
jz @exit // source=dest
cmp edx, ecx
lea edx, [edx + eax]
jae @lrgfwd // move backwards if unsigned(dest-source) < count
// large backward/overlapping move
@lrgbwd:{$ifdef WITH_ERMS}
cmp ecx, ERMSB_MIN_SIZE_BWD // enabled for no-SSE2, not for ERMSB
ja @ermsb
{$endif WITH_ERMS}
sub ecx, 16
push ecx
movups xmm2, dqword ptr [eax + ecx] // last 16
movups xmm1, dqword ptr [eax] // first 16
add ecx, edx
and ecx, -16 // 16-byte align writes
sub ecx, edx
// non-volatile backward is not efficient since we overlap data
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
@bwd: movups xmm0, dqword ptr [eax + ecx]
movaps dqword ptr [edx + ecx], xmm0
sub ecx, 16
jg @bwd
pop ecx
movups dqword ptr [edx], xmm1 // first 16
movups dqword ptr [edx + ecx], xmm2 // last 16
ret
{$endif HASNOSSE2}
@m01: mov al, [eax]
mov [edx], al
ret
@m02: movzx eax, word ptr [eax]
mov [edx], ax
ret
@m03: movzx ecx, word ptr [eax]
mov al, [eax + 2]
mov [edx], cx
mov [edx + 2], al
ret
@m04: mov ecx, [eax]
mov [edx], ecx
ret
@m05: mov ecx, [eax]
mov al, [eax + 4]
mov [edx], ecx
mov [edx + 4], al
ret
@m06: mov ecx, [eax]
movzx eax, word ptr [eax + 4]
mov [edx], ecx
mov [edx + 4], ax
ret
@m07: mov ecx, [eax]
mov eax, [eax + 3]
mov [edx], ecx
mov [edx + 3], eax
ret
@m08: mov ecx, [eax]
mov eax, [eax + 4]
mov [edx], ecx
mov [edx + 4], eax
end;
{$endif FPC_X86}
function CompareMem(P1, P2: Pointer; Length: PtrInt): boolean;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=P1 edx=P2 ecx=Length
cmp eax, edx
je @0 // P1=P2
sub ecx, 8
jl @small
push ebx
mov ebx, [eax] // Compare First 4 Bytes
cmp ebx, [edx]
jne @setbig
lea ebx, [eax + ecx] // Compare Last 8 Bytes
add edx, ecx
mov eax, [ebx]
cmp eax, [edx]
jne @setbig
mov eax, [ebx + 4]
cmp eax, [edx + 4]
jne @setbig
sub ecx, 4
jle @true // All Bytes already Compared
neg ecx // ecx=-(Length-12)
add ecx, ebx // DWORD Align Reads
and ecx, -4
sub ecx, ebx
@loop: mov eax, [ebx + ecx] // Compare 8 Bytes per Loop
cmp eax, [edx + ecx]
jne @setbig
mov eax, [ebx + ecx + 4]
cmp eax, [edx + ecx + 4]
jne @setbig
add ecx, 8
jl @loop
@true: pop ebx
@0: mov al, 1
ret
@setbig:pop ebx
setz al
ret
@small: add ecx, 8 // ecx=0..7
jle @0 // Length <= 0
neg ecx // ecx=-1..-7
lea ecx, [@1 + ecx * 8 + 8] // each @#: block below = 8 bytes
jmp ecx
@7: mov cl, [eax + 6]
cmp cl, [edx + 6]
jne @setsml
@6: mov ch, [eax + 5]
cmp ch, [edx + 5]
jne @setsml
@5: mov cl, [eax + 4]
cmp cl, [edx + 4]
jne @setsml
@4: mov ch, [eax + 3]
cmp ch, [edx + 3]
jne @setsml
@3: mov cl, [eax + 2]
cmp cl, [edx + 2]
jne @setsml
@2: mov ch, [eax + 1]
cmp ch, [edx + 1]
jne @setsml
@1: mov al, [eax]
cmp al, [edx]
@setsml:setz al
end;
function crc32fasttab(crc: cardinal; buf: PAnsiChar; len: cardinal;
tab: PCrc32tab): cardinal;
asm
// adapted from Aleksandr Sharahov code and Maxim Masiutin remarks
push ebx
push ebp
mov ebp, tab
not eax
neg ecx // eax=crc edx=buf ecx=-len ebp=tab
jz @z
test edx, edx
jz @z
@head: test dl, 3
jz @align
movzx ebx, byte ptr [edx]
inc edx
xor bl, al
shr eax, 8
xor eax, dword ptr [ebx * 4 + ebp]
inc ecx
jnz @head
@z: jmp @e
@align: sub edx, ecx
add ecx, 8
jg @done
push esi
push edi
mov edi, edx
{$ifdef FPC} align 8 {$endif FPC}
@by8: mov edx, eax
mov ebx, [edi + ecx - 4]
xor edx, [edi + ecx - 8]
movzx esi, bl
mov eax, dword ptr [esi * 4 + ebp + 1024 * 3]
movzx esi, bh
xor eax, dword ptr [esi * 4 + ebp + 1024 * 2]
shr ebx, 16
movzx esi, bl
xor eax, dword ptr [esi * 4 + ebp + 1024 * 1]
movzx esi, bh
xor eax, dword ptr [esi * 4 + ebp + 1024 * 0]
movzx esi, dl
xor eax, dword ptr [esi * 4 + ebp + 1024 * 7]
movzx esi, dh
xor eax, dword ptr [esi * 4 + ebp + 1024 * 6]
shr edx, 16
movzx esi, dl
xor eax, dword ptr [esi * 4 + ebp + 1024 * 5]
movzx esi, dh
xor eax, dword ptr [esi * 4 + ebp + 1024 * 4]
add ecx, 8
jle @by8
mov edx, edi
pop edi
pop esi
@done: sub ecx, 8
jge @e
@tail: movzx ebx, byte[edx + ecx]
xor bl, al
shr eax, 8
xor eax, dword ptr [ebx * 4 + ebp]
inc ecx
jnz @tail
@e: pop ebp
pop ebx
not eax
end;
procedure crcblockfast(crc128, data128: PBlock128);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// Delphi is not efficient about corresponding pascal code
push ebp
push edi
push esi
mov ebp, eax // ebp=crc128 edi=data128
mov edi, edx
mov edx, dword ptr [eax]
mov ecx, dword ptr [eax + 4]
xor edx, dword ptr [edi]
xor ecx, dword ptr [edi + 4]
movzx esi, dl
// note: since we have "+ 1024 * n" offsets, crc32ctab is left immediate
mov eax, dword ptr [esi * 4 + crc32ctab + 1024 * 3]
movzx esi, dh
shr edx, 16
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 2]
movzx esi, dl
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 1]
movzx esi, dh
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 0]
mov edx, dword ptr [ebp + 8]
xor edx, dword ptr [edi + 8]
mov dword ptr [ebp], eax
movzx esi, cl
mov eax, dword ptr [esi * 4 + crc32ctab + 1024 * 3]
movzx esi, ch
shr ecx, 16
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 2]
movzx esi, cl
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 1]
movzx esi, ch
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 0]
mov dword ptr [ebp + 4], eax
mov ecx, dword ptr [ebp + 12]
xor ecx, dword ptr [edi + 12]
movzx esi, dl
mov eax, dword ptr [esi * 4 + crc32ctab + 1024 * 3]
movzx esi, dh
shr edx, 16
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 2]
movzx esi, dl
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 1]
movzx esi, dh
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 0]
mov dword ptr [ebp + 8], eax
movzx esi, cl
mov eax, dword ptr [esi * 4 + crc32ctab + 1024 * 3]
movzx esi, ch
shr ecx, 16
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 2]
movzx esi, cl
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 1]
movzx esi, ch
xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 0]
mov dword ptr [ebp + 12], eax
pop esi
pop edi
pop ebp
end;
function fnv32(crc: cardinal; buf: PAnsiChar; len: PtrInt): cardinal;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=crc, edx=buf, ecx=len
push ebx
test edx, edx
jz @0
neg ecx
jz @0
sub edx, ecx
@1: movzx ebx, byte ptr [edx + ecx]
xor eax, ebx
imul eax, eax, 16777619
inc ecx
jnz @1
@0: pop ebx
end; // we tried an unrolled version, but it was slower on our Core i7!
function kr32(crc: cardinal; buf: PAnsiChar; len: PtrInt): cardinal;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=crc, edx=buf, ecx=len
test ecx, ecx
push edi
push esi
push ebx
push ebp
jz @z
cmp ecx, 4
jb @s
@8: mov ebx, [edx] // unrolled version reading per dword
add edx, 4
mov esi, eax
movzx edi, bl
movzx ebp, bh
shr ebx, 16
shl eax, 5
sub eax, esi
add eax, edi
mov esi, eax
shl eax, 5
sub eax, esi
lea esi, [eax + ebp]
add eax, ebp
movzx edi, bl
movzx ebx, bh
shl eax, 5
sub eax, esi
lea ebp, [eax + edi]
add eax, edi
shl eax, 5
sub eax, ebp
add eax, ebx
cmp ecx, 8
lea ecx, [ecx - 4]
jae @8
test ecx, ecx
jz @z
@s: mov esi, eax
@1: shl eax, 5
movzx ebx, byte ptr [edx]
inc edx
sub eax, esi
lea esi, [eax + ebx]
add eax, ebx
dec ecx
jnz @1
@z: pop ebp
pop ebx
pop esi
pop edi
end;
function StrLenSSE2(S: pointer): PtrInt;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// from GPL strlen32.asm by Agner Fog - www.agner.org/optimize
mov ecx, eax // copy pointer
test eax, eax
jz @null // returns 0 if S=nil
push eax // save start address
pxor xmm0, xmm0 // set to zero
and ecx, 15 // lower 4 bits indicate misalignment
and eax, -16 // align pointer by 16
// will never read outside a memory page boundary, so won't trigger GPF
movaps xmm1, [eax] // read from nearest preceding boundary
pcmpeqb xmm1, xmm0 // compare 16 bytes with zero
pmovmskb edx, xmm1 // get one bit for each byte result
shr edx, cl // shift out false bits
shl edx, cl // shift back again
bsf edx, edx // find first 1-bit
jnz @A200 // found
// Main loop, search 16 bytes at a time
@A100: add eax, 10H // increment pointer by 16
movaps xmm1, [eax] // read 16 bytes aligned
pcmpeqb xmm1, xmm0 // compare 16 bytes with zero
pmovmskb edx, xmm1 // get one bit for each byte result
bsf edx, edx // find first 1-bit
// (moving the bsf out of the loop and using test here would be faster
// for long strings on old processors, but we are assuming that most
// strings are short, and newer processors have higher priority)
jz @A100 // loop if not found
@A200: // Zero-byte found. Compute string length
pop ecx // restore start address
sub eax, ecx // subtract start address
add eax, edx // add byte index
@null:
end;
function StrInt32(P: PAnsiChar; val: PtrInt): PAnsiChar;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=P, edx=val
mov ecx, edx
sar ecx, 31 // 0 if val>=0 or -1 if val<0
push ecx
xor edx, ecx
sub edx, ecx // edx=abs(val)
cmp edx, 10
jb @3 // direct process of common val<10
push edi
mov edi, eax
mov eax, edx
@s: sub edi, 2
cmp eax, 100
jb @2
mov ecx, eax
mov edx, 1374389535 // use power of two reciprocal to avoid division
mul edx
shr edx, 5 // now edx=eax div 100
mov eax, edx
imul edx, -200
movzx edx, word ptr [TwoDigitLookup + ecx * 2 + edx]
mov [edi], dx
cmp eax, 10
jae @s
@1: dec edi
or al, '0'
mov byte ptr [edi - 1], '-'
mov [edi], al
mov eax, edi
pop edi
pop ecx
add eax, ecx // includes '-' if val<0
ret
@2: movzx eax, word ptr [TwoDigitLookup + eax * 2]
mov byte ptr [edi - 1], '-'
mov [edi], ax
mov eax, edi
pop edi
pop ecx
add eax, ecx // includes '-' if val<0
ret
@3: dec eax
pop ecx
or dl, '0'
mov byte ptr [eax - 1], '-'
mov [eax], dl
add eax, ecx // includes '-' if val<0
end;
function StrUInt32(P: PAnsiChar; val: PtrUInt): PAnsiChar;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=P, edx=val
cmp edx, 10
jb @3 // direct process of common val=0 (or val<10)
push edi
mov edi, eax
mov eax, edx
@s: sub edi, 2
cmp eax, 100
jb @2
mov ecx, eax
mov edx, 1374389535 // use power of two reciprocal to avoid division
mul edx
shr edx, 5 // now edx=eax div 100
mov eax, edx
imul edx, -200
movzx edx, word ptr [TwoDigitLookup + ecx * 2 + edx]
mov [edi], dx
cmp eax, 10
jae @s
@1: dec edi
or al, '0'
mov [edi], al
mov eax, edi
pop edi
ret
@2: movzx eax, word ptr [TwoDigitLookup + eax * 2]
mov [edi], ax
mov eax, edi
pop edi
ret
@3: dec eax
or dl, '0'
mov [eax], dl
end;
procedure YearToPChar(Y: PtrUInt; P: PUtf8Char);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=Y, edx=P
push edx
mov ecx, eax
mov edx, 1374389535 // use power of two reciprocal to avoid division
mul edx
shr edx, 5 // now edx=Y div 100
movzx eax, word ptr [TwoDigitLookup + edx * 2]
imul edx, -200
movzx edx, word ptr [TwoDigitLookup + ecx * 2 + edx]
pop ecx
shl edx, 16
or eax, edx
mov [ecx], eax
end;
{$endif ASMX86}
// functions below are always available, even on DARWIN
function Hash32(Data: PCardinalArray; Len: integer): cardinal;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=Data edx=Len
push esi
push edi
mov cl, dl
mov ch, dl
xor esi, esi
xor edi, edi
test eax, eax
jz @z
shr edx, 4
jz @by4
nop
@by16: add esi, dword ptr [eax]
add edi, esi
add esi, dword ptr [eax+4]
add edi, esi
add esi, dword ptr [eax+8]
add edi, esi
add esi, dword ptr [eax+12]
add edi, esi
add eax, 16
dec edx
jnz @by16
@by4: and cl, 15
jz @0
shr cl, 2
jz @rem
@4: add esi, dword ptr [eax]
add edi, esi
add eax, 4
dec cl
jnz @4
@rem: and ch, 3
jz @0
dec ch
jz @1
dec ch
jz @2
mov eax, dword ptr [eax]
and eax, $ffffff
jmp @e
@2: movzx eax, word ptr [eax]
jmp @e
@1: movzx eax, byte ptr [eax]
@e: add esi, eax
@0: add edi, esi
mov eax, esi
shl edi, 16
xor eax, edi
@z: pop edi
pop esi
end;
function xxHash32(crc: cardinal; P: PAnsiChar; len: cardinal): cardinal;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
xchg edx, ecx
push ebp
push edi
lea ebp, [ecx+edx]
push esi
push ebx
sub esp, 8
mov ebx, eax
mov dword ptr [esp], edx
lea eax, [ebx+165667B1H]
cmp edx, 15
jbe @2
lea eax, [ebp-10H]
lea edi, [ebx+24234428H]
lea esi, [ebx-7A143589H]
mov dword ptr [esp+4H], ebp
mov edx, eax
lea eax, [ebx+61C8864FH]
mov ebp, edx
@1: mov edx, dword ptr [ecx]
imul edx, -2048144777
add edi, edx
rol edi, 13
imul edi, -1640531535
mov edx, dword ptr [ecx+4]
imul edx, -2048144777
add esi, edx
rol esi, 13
imul esi, -1640531535
mov edx, dword ptr [ecx+8]
imul edx, -2048144777
add ebx, edx
rol ebx, 13
imul ebx, -1640531535
mov edx, dword ptr [ecx+12]
lea ecx, [ecx+16]
imul edx, -2048144777
add eax, edx
rol eax, 13
imul eax, -1640531535
cmp ebp, ecx
jnc @1
rol edi, 1
rol esi, 7
rol ebx, 12
add esi, edi
mov ebp, dword ptr [esp+4H]
ror eax, 14
add ebx, esi
add eax, ebx
@2: lea esi, [ecx+4H]
add eax, dword ptr [esp]
cmp ebp, esi
jc @4
mov ebx, esi
nop
@3: imul edx, dword ptr [ebx-4H], -1028477379
add ebx, 4
add eax, edx
ror eax, 15
imul eax, 668265263
cmp ebp, ebx
jnc @3
lea edx, [ebp-4H]
sub edx, ecx
mov ecx, edx
and ecx, 0FFFFFFFCH
add ecx, esi
@4: cmp ebp, ecx
jbe @6
@5: movzx edx, byte ptr [ecx]
add ecx, 1
imul edx, 374761393
add eax, edx
rol eax, 11
imul eax, -1640531535
cmp ebp, ecx
jnz @5
nop
@6: mov edx, eax
add esp, 8
shr edx, 15
xor eax, edx
imul eax, -2048144777
pop ebx
pop esi
mov edx, eax
shr edx, 13
xor eax, edx
imul eax, -1028477379
pop edi
pop ebp
mov edx, eax
shr edx, 16
xor eax, edx
end;
function GetBitsCountPas(value: PtrInt): PtrInt;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// branchless Wilkes-Wheeler-Gill i386 asm implementation
mov edx, eax
shr eax, 1
and eax, $55555555
sub edx, eax
mov eax, edx
shr edx, 2
and eax, $33333333
and edx, $33333333
add eax, edx
mov edx, eax
shr eax, 4
add eax, edx
and eax, $0f0f0f0f
mov edx, eax
shr edx, 8
add eax, edx
mov edx, eax
shr edx, 16
add eax, edx
and eax, $3f
end;
{$ifdef HASNOSSE2} // fallback to simple pascal code if no SSE2 available
function ByteScanIndex(P: PByteArray; Count: PtrInt; Value: byte): PtrInt;
begin
result := 0;
if P <> nil then
repeat
if result >= Count then
break;
if P^[result] = Value then
exit;
inc(result);
until false;
result := -1;
end;
function WordScanIndex(P: PWordArray; Count: PtrInt; Value: word): PtrInt;
begin
result := 0;
if P <> nil then
repeat
if result >= Count then
break;
if P^[result] = Value then
exit;
inc(result);
until false;
result := -1;
end;
function IntegerScanIndex(P: PCardinalArray; Count: PtrInt; Value: cardinal): PtrInt;
begin
result := PtrUInt(IntegerScan(P, Count, Value));
if result = 0 then
dec(result)
else
begin
dec(result, PtrUInt(P));
result := result shr 2;
end;
end;
function IntegerScan(P: PCardinalArray; Count: PtrInt; Value: cardinal): PCardinal;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=P, edx=Count, Value=ecx
test eax, eax
jz @ok0 // avoid GPF
cmp edx, 8
jb @s2
nop
nop
nop // @s1 loop align
@s1: sub edx, 8
cmp [eax], ecx
je @ok0
cmp [eax + 4], ecx
je @ok4
cmp [eax + 8], ecx
je @ok8
cmp [eax + 12], ecx
je @ok12
cmp [eax + 16], ecx
je @ok16
cmp [eax + 20], ecx
je @ok20
cmp [eax + 24], ecx
je @ok24
cmp [eax + 28], ecx
je @ok28
add eax, 32
cmp edx, 8
jae @s1
@s2: test edx, edx
jz @z
cmp [eax], ecx
je @ok0
dec edx
jz @z
cmp [eax + 4], ecx
je @ok4
dec edx
jz @z
cmp [eax + 8], ecx
je @ok8
dec edx
jz @z
cmp [eax + 12], ecx
je @ok12
dec edx
jz @z
cmp [eax + 16], ecx
je @ok16
dec edx
jz @z
cmp [eax + 20], ecx
je @ok20
dec edx
jz @z
cmp [eax + 24], ecx
je @ok24
@z: xor eax, eax // return nil if not found
ret
@ok0: rep ret
@ok28: add eax, 28
ret
@ok24: add eax, 24
ret
@ok20: add eax, 20
ret
@ok16: add eax, 16
ret
@ok12: add eax, 12
ret
@ok8: add eax, 8
ret
@ok4: add eax, 4
end;
function IntegerScanExists(P: PCardinalArray; Count: PtrInt; Value: cardinal): boolean;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=P, edx=Count, Value=ecx
test eax, eax
jz @z // avoid GPF
cmp edx, 8
jae @s1
jmp dword ptr [edx * 4 + @Table]
{$ifdef FPC} align 4 {$endif FPC}
@Table: dd @z, @1, @2, @3, @4, @5, @6, @7
@s1: // fast search by 8 integers (pipelined instructions)
sub edx, 8
cmp [eax], ecx
je @ok
cmp [eax + 4], ecx
je @ok
cmp [eax + 8], ecx
je @ok
cmp [eax + 12], ecx
je @ok
cmp [eax + 16], ecx
je @ok
cmp [eax + 20], ecx
je @ok
cmp [eax + 24], ecx
je @ok
cmp [eax + 28], ecx
je @ok
add eax, 32
cmp edx, 8
jae @s1
jmp dword ptr [edx * 4 + @Table]
@7: cmp [eax + 24], ecx
je @ok
@6: cmp [eax + 20], ecx
je @ok
@5: cmp [eax + 16], ecx
je @ok
@4: cmp [eax + 12], ecx
je @ok
@3: cmp [eax + 8], ecx
je @ok
@2: cmp [eax + 4], ecx
je @ok
@1: cmp [eax], ecx
je @ok
@z: xor eax, eax
ret
@ok: mov al, 1
end;
{$else}
function ByteScanIndex(P: PByteArray; Count: PtrInt; Value: byte): PtrInt;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=P, edx=Count, cl=Value
push ebx
mov ebx, eax
movzx eax, cl
mov ecx, ebx
test edx, edx
jbe @no
// eax=Value, ecx=P edx=Count
imul eax, $01010101
and ecx, -16
movd xmm1, eax
movaps xmm0, [ecx]
add ecx, 16
pshufd xmm1, xmm1, 0
sub ecx, ebx
pcmpeqb xmm0, xmm1
pmovmskb eax, xmm0
shl eax, cl
and eax, $FFFF0000
shr eax, cl
jnz @fnd
cmp edx, ecx
jbe @no
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
@by16: movaps xmm0, [ebx + ecx]
add ecx, 16
pcmpeqb xmm0, xmm1
pmovmskb eax, xmm0
bsf eax, eax
jnz @fnd2
cmp edx, ecx
ja @by16
@no: mov eax, -1
pop ebx
ret
@fnd: bsf eax, eax
@fnd2: lea eax, [ecx + eax - 16]
cmp edx, eax
jbe @no
pop ebx
end;
function WordScanIndex(P: PWordArray; Count: PtrInt; Value: word): PtrInt;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=P, edx=Count, cx=Value
push ebx
mov ebx, eax
movzx eax, cx
mov ecx, ebx
test edx, edx
jbe @no
test cl, 1
jnz @unal
// eax=Value, ecx=P edx=Count
movd xmm1, eax
and ecx, -16
punpcklwd xmm1, xmm1
movaps xmm0, [ecx]
add ecx, 16
pshufd xmm1, xmm1, 0
sub ecx, ebx
pcmpeqw xmm0, xmm1
pmovmskb eax, xmm0
shl eax, cl
and eax, $FFFF0000
shr eax, cl
shr ecx, 1
test eax, eax
jz @nxt
bsf eax, eax
@fnd: shr eax, 1
lea eax, [ecx + eax - 8]
cmp edx, eax
jbe @no
pop ebx
ret
nop // manual loop alignment
@by16: movaps xmm0, [ebx + ecx * 2]
add ecx, 8
pcmpeqw xmm0, xmm1
pmovmskb eax, xmm0
bsf eax, eax
jnz @fnd
@nxt: cmp edx, ecx
ja @by16
@no: mov eax, -1
pop ebx
ret
// 16bit-unaligned loop (seldom called)
@unal: lea ecx, [ecx + edx * 2]
neg edx
@unals: cmp word ptr [ecx + edx * 2], ax
jz @unale
inc edx
jnz @unals
jmp @no
@unale: lea eax, [ecx + edx * 2]
sub eax, ebx
shr eax, 1
pop ebx
end;
function IntegerScanIndex(P: PCardinalArray; Count: PtrInt; Value: cardinal): PtrInt;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=P, edx=Count, ecx=Value
push ebx
movd xmm1, ecx
mov ebx, eax
mov eax, ecx
mov ecx, ebx
test edx, edx
jbe @no
test cl, 3
jnz @unal
// eax=Value, ecx=P edx=Count
and ecx, -16
movaps xmm0, [ecx]
add ecx, 16
pshufd xmm1, xmm1, 0
sub ecx, ebx
pcmpeqd xmm0, xmm1
pmovmskb eax, xmm0
shl eax, cl
and eax, $FFFF0000
shr eax, cl
shr ecx, 2
bsf eax, eax
jz @nxt
@fnd: shr eax, 2
lea eax, [ecx + eax - 4]
cmp edx, eax
jbe @no
pop ebx
ret
{$ifdef FPC} align 16 {$else} nop;nop;nop;nop;nop;nop {$endif FPC}
@by16: movaps xmm0, [ebx + ecx * 4]
add ecx, 4
pcmpeqd xmm0, xmm1
pmovmskb eax, xmm0
bsf eax, eax
jnz @fnd
@nxt: cmp edx, ecx
ja @by16
@no: mov eax, -1
pop ebx
ret
// 32bit-unaligned loop (seldom called)
@unal: lea ecx, [ecx + edx * 4]
neg edx
@unals: cmp dword ptr [ecx + edx * 4], eax
jz @unale
inc edx
jnz @unals
jmp @no
@unale: lea eax, [ecx + edx * 4]
sub eax, ebx
shr eax, 2
pop ebx
end;
{$endif HASNOSSE2}
procedure mul64x64(const left, right: QWord; out product: THash128Rec);
{$ifdef FPC}nostackframe; assembler; {$endif}
asm
// adapted from FPC compiler output, which is much better than Delphi's here
{$ifdef FPC}
push ebp
mov ebp, esp
{$endif FPC}
mov ecx, eax
mov eax, dword ptr [ebp+8H]
mul dword ptr [ebp+10H]
mov dword ptr [ecx], eax
mov dword ptr [ebp-4H], edx
mov eax, dword ptr [ebp+8H]
mul dword ptr [ebp+14H]
add eax, dword ptr [ebp-4H]
adc edx, 0
mov dword ptr [ebp-10H], eax
mov dword ptr [ebp-0CH], edx
mov eax, dword ptr [ebp+0CH]
mul dword ptr [ebp+10H]
add eax, dword ptr [ebp-10H]
adc edx, 0
mov dword ptr [ecx+4H], eax
mov dword ptr [ebp-14H], edx
mov eax, dword ptr [ebp+0CH]
mul dword ptr [ebp+14H]
add eax, dword ptr [ebp-0CH]
adc edx, 0
add eax, dword ptr [ebp-14H]
adc edx, 0
mov dword ptr [ecx+8H], eax
mov dword ptr [ecx+0CH], edx
{$ifdef FPC}
pop ebp
{$endif FPC}
end;
function bswap32(a: cardinal): cardinal;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
bswap eax
end;
function bswap64({$ifdef FPC}constref{$else}const{$endif} a: QWord): QWord;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
{$ifdef FPC}
mov edx, dword ptr [eax]
mov eax, dword ptr [eax + 4]
{$else}
mov edx, a.TQWordRec.L
mov eax, a.TQWordRec.H
{$endif FPC}
bswap edx
bswap eax
end;
procedure bswap64array(A, B: PQWordArray; n: PtrInt);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
push ebx
push esi
@1: mov ebx, dword ptr [eax]
mov esi, dword ptr [eax + 4]
bswap ebx
bswap esi
mov dword ptr [edx + 4], ebx
mov dword ptr [edx], esi
add eax, 8
add edx, 8
dec ecx
jnz @1
pop esi
pop ebx
end;
procedure LockedInc32(int32: PInteger);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
lock inc dword ptr [int32]
end;
procedure LockedDec32(int32: PInteger);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
lock dec dword ptr [int32]
end;
procedure LockedInc64(int64: PInt64);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
lock inc dword ptr [int64]
jz @h
ret
@h:lock inc dword ptr [int64 + 4] // collision is very unlikely
end;
function InterlockedIncrement(var I: integer): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
mov edx, 1
xchg eax, edx
lock xadd [edx], eax
inc eax
end;
function InterlockedDecrement(var I: integer): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
mov edx, -1
xchg eax, edx
lock xadd [edx], eax
dec eax
end;
function StrLenSafe(S: pointer): PtrInt;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// slower than StrLenSSE2(), but won't read any byte beyond the page
test eax, eax
jz @z
cmp byte ptr [eax], 0
je @0
cmp byte ptr [eax + 1], 0
je @1
cmp byte ptr [eax + 2], 0
je @2
cmp byte ptr [eax + 3], 0
je @3
push eax
and eax, -4 // dword align reads
nop // @s loop code alignment
@s: add eax, 4
mov edx, dword ptr [eax] // 4 chars per loop
lea ecx, [edx - $01010101]
not edx
and edx, ecx
and edx, $80808080 // set byte to $80 at each #0 position
jz @s // loop until any #0 found
@set: pop ecx
bsf edx, edx // find first #0 position
shr edx, 3 // byte offset of first #0
add eax, edx // address of first #0
sub eax, ecx // length
@z: ret
@0: xor eax, eax
ret
@1: mov eax, 1
ret
@2: mov eax, 2
ret
@3: mov eax, 3
end;
function PosEx(const SubStr, S: RawUtf8; Offset: PtrUInt): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=SubStr, edx=S, ecx=Offset
push ebx
push esi
push edx
test eax, eax
jz @notfnd // exit if SubStr=''
test edx, edx
jz @notfnd // exit if S=''
mov esi, ecx
mov ecx, [edx - 4] // length(S)
mov ebx, [eax - 4] // length(SubStr)
add ecx, edx
sub ecx, ebx // ecx = max start pos for full match
lea edx, [edx + esi - 1] // edx = start position
cmp edx, ecx
jg @notfnd // startpos > max start pos
cmp ebx, 1
jle @onec // optimized loop for length(SubStr)<=1
push edi
push ebp
lea edi, [ebx - 2] // edi = length(SubStr)-2
mov esi, eax // esi = SubStr
movzx ebx, byte ptr [eax] // bl = search character
nop; nop
@l: cmp bl, [edx] // compare 2 characters per @l
je @c1fnd
@notc1: cmp bl, [edx + 1]
je @c2fnd
@notc2: add edx, 2
cmp edx, ecx // next start position <= max start position
jle @l
pop ebp
pop edi
@notfnd:xor eax, eax // returns 0 if not fnd
pop edx
pop esi
pop ebx
ret
@c1fnd: mov ebp, edi // ebp = length(SubStr)-2
@c1l: movzx eax, word ptr [esi + ebp]
cmp ax, [edx + ebp] // compare 2 chars per @c1l (may include #0)
jne @notc1
sub ebp, 2
jnc @c1l
pop ebp
pop edi
jmp @setres
@c2fnd: mov ebp, edi // ebp = length(SubStr)-2
@c2l: movzx eax, word ptr [esi + ebp]
cmp ax, [edx + ebp + 1] // compare 2 chars per @c2l (may include #0)
jne @notc2
sub ebp, 2
jnc @c2l
pop ebp
pop edi
jmp @chkres
@onec: jl @notfnd // needed for zero-length non-nil strings
movzx eax, byte ptr [eax] // search character
@charl: cmp al, [edx]
je @setres
cmp al, [edx + 1]
je @chkres
add edx, 2
cmp edx, ecx
jle @charl
jmp @notfnd
@chkres:cmp edx, ecx // check within ansistring
jge @notfnd
add edx, 1
@setres:pop ecx // ecx = S
pop esi
pop ebx
neg ecx
lea eax, [edx + ecx + 1]
end;
function StrComp(Str1, Str2: pointer): PtrInt;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// no branch taken in case of not equal first char
cmp eax, edx
je @zero // same string or both nil
test eax, edx
jz @maynil
@1: mov cl, byte ptr [eax]
mov ch, byte ptr [edx]
inc eax
inc edx
test cl, cl
jz @exit
cmp cl, ch
je @1
@exit: movzx eax, cl
movzx edx, ch
sub eax, edx
ret
@maynil:test eax, eax // Str1='' ?
jz @max
test edx, edx // Str2='' ?
jnz @1
mov eax, 1
ret
@max: dec eax
ret
@zero: xor eax, eax
end;
function SortDynArrayInteger(const A, B): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
mov ecx, dword ptr [eax]
mov edx, dword ptr[edx]
xor eax, eax
cmp ecx, edx
setl cl
setg al
movzx ecx, cl
sub eax, ecx
end;
function SortDynArrayCardinal(const A, B): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
mov ecx, [eax]
mov edx, [edx]
xor eax, eax
cmp ecx, edx
seta al
sbb eax,0
end;
function SortDynArrayPointer(const A, B): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
mov ecx, [eax]
mov edx, [edx]
xor eax, eax
cmp ecx, edx
seta al
sbb eax,0
end;
function SortDynArrayInt64(const A, B): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// Delphi x86 compiler is not efficient at compiling Int64 comparisons
mov ecx, [eax]
mov eax, [eax + 4]
cmp eax, [edx + 4]
jnz @nz
cmp ecx, [edx]
jz @0
jnb @p
@n: mov eax, -1
ret
@0: xor eax, eax
ret
@nz: jl @n
@p: mov eax, 1
end;
function SortDynArrayQWord(const A, B): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// Delphi x86 compiler is not efficient, and oldest even incorrect
mov ecx, [eax]
mov eax, [eax + 4]
cmp eax, [edx + 4]
jnz @nz
cmp ecx, [edx]
jz @0
@nz: jnb @p
mov eax, -1
ret
@0: xor eax, eax
ret
@p: mov eax, 1
end;
function SortDynArrayAnsiString(const A, B): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// x86 version optimized for AnsiString/RawUtf8/RawByteString types
mov eax, dword ptr [eax]
mov edx, dword ptr [edx]
cmp eax, edx
je @0
test eax, edx
jz @maynil
@f: mov cl, byte ptr [eax] // first char comparison (quicksort speedup)
mov ch, byte ptr [edx]
cmp cl, ch
je @s
movzx eax, cl
movzx edx, ch
sub eax, edx // branchless execution on Quicksort/ReHash
ret
@0: xor eax, eax
ret
@maynil:test edx, edx // A or B may be ''
jz @1
test eax, eax
jnz @f
dec eax
ret
@s: push ebx
xor ecx, ecx
mov ebx, dword ptr [eax - 4]
sub ebx, dword ptr [edx - 4] // ebx = length(A)-length(B)
push ebx
adc ecx, -1
and ecx, ebx
sub ecx, dword ptr [eax - 4] // ecx = -min(length(A),length(B))
sub eax, ecx
sub edx, ecx
{$ifdef FPC} align 16 {$endif} // is naturally aligned anyway
@by4: mov ebx, dword ptr [eax + ecx] // compare 4 bytes per iteration
xor ebx, dword ptr [edx + ecx]
jnz @d
add ecx, 4
js @by4
@eq: pop eax // all chars equal -> returns length(A)-length(B)
pop ebx
ret
@d: bsf ebx, ebx // char differs -> returns pbyte(A)^-pbyte(B)^
shr ebx, 3
add ecx, ebx
jns @eq
movzx eax, byte ptr [eax + ecx]
movzx edx, byte ptr [edx + ecx]
pop ebx
pop ebx
sub eax, edx
ret
@1: mov eax, 1
end;
function SortDynArrayDouble(const A, B): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
fld qword ptr [eax]
fcomp qword ptr [edx]
fstsw ax
sahf
jz @0
@nz: jnb @p
mov eax, -1
ret
@0: xor eax, eax
ret
@p: mov eax, 1
end;
function SortDynArraySingle(const A, B): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
fld dword ptr [eax]
fcomp dword ptr [edx]
fstsw ax
sahf
jz @0
@nz: jnb @p
mov eax, -1
ret
@0: xor eax, eax
ret
@p: mov eax, 1
end;
procedure DynArrayHashTableAdjust(P: PIntegerArray; deleted: integer; count: PtrInt);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=P edx=deleted ecx=count
push ebx
mov ebx, eax
xor eax, eax
@by8: sub ecx, 8
cmp edx, dword ptr [ebx]
setl al
sub dword ptr [ebx], eax
cmp edx, dword ptr [ebx + 4H]
setl al
sub dword ptr [ebx + 4H], eax
cmp edx, dword ptr [ebx + 8H]
setl al
sub dword ptr [ebx + 8H], eax
cmp edx, dword ptr [ebx + 0CH]
setl al
sub dword ptr [ebx + 0CH], eax
cmp edx, dword ptr [ebx + 10H]
setl al
sub dword ptr [ebx + 10H], eax
cmp edx, dword ptr [ebx + 14H]
setl al
sub dword ptr [ebx + 14H], eax
cmp edx, dword ptr [ebx + 18H]
setl al
sub dword ptr [ebx + 18H], eax
cmp edx, dword ptr [ebx + 1CH]
setl al
sub dword ptr [ebx + 1CH], eax
add ebx, 32
cmp ecx, 8
jge @by8
test ecx, ecx
jz @ok
// trailing indexes (never appearing within DYNARRAYHASH_PO2 range)
@by1: cmp edx, dword ptr [ebx]
setl al
sub dword ptr [ebx], eax
add ebx, 4
dec ecx
jnz @by1
@ok: pop ebx
end;
procedure DynArrayHashTableAdjust16(P: PWordArray; deleted: cardinal; count: PtrInt);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=P dx=deleted ecx=count
push ebx
mov ebx, eax
xor eax, eax
@by8: sub ecx, 8
cmp dx, word ptr [ebx]
setl al
sub word ptr [ebx], ax
cmp dx, word ptr [ebx + 2]
setl al
sub word ptr [ebx + 2], ax
cmp dx, word ptr [ebx + 4]
setl al
sub word ptr [ebx + 4], ax
cmp dx, word ptr [ebx + 6]
setl al
sub word ptr [ebx + 6], ax
cmp dx, word ptr [ebx + 8]
setl al
sub word ptr [ebx + 8], ax
cmp dx, word ptr [ebx + 10]
setl al
sub word ptr [ebx + 10], ax
cmp dx, word ptr [ebx + 12]
setl al
sub word ptr [ebx + 12], ax
cmp dx, word ptr [ebx + 14]
setl al
sub word ptr [ebx + 14], ax
add ebx, 16
cmp ecx, 8
jge @by8
test ecx, ecx
jz @ok
// trailing indexes (never appearing within DYNARRAYHASH_PO2 range)
@by1: cmp dx, word ptr [ebx]
setl al
sub word ptr [ebx], ax
add ebx, 2
dec ecx
jnz @by1
@ok: pop ebx
end;
function GetBitsCountSse42(value: PtrInt): PtrInt;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=value
{$ifdef HASAESNI}
popcnt eax, eax
{$else} // oldest Delphi don't support this opcode
db $f3,$0f,$B8,$c0
{$endif HASAESNI}
end;
function crc32cby4sse42(crc, value: cardinal): cardinal;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=crc, edx=value
{$ifdef HASAESNI}
crc32 eax, edx
{$else} // oldest Delphi don't support this opcode
db $F2, $0F, $38, $F1, $C2
{$endif HASAESNI}
end;
procedure crcblocksse42(crc128, data128: PBlock128);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=crc128, edx=data128
mov ecx, eax
{$ifdef HASAESNI}
mov eax, dword ptr [ecx]
crc32 eax, dword ptr [edx]
mov dword ptr [ecx], eax
mov eax, dword ptr [ecx + 4]
crc32 eax, dword ptr [edx + 4]
mov dword ptr [ecx + 4], eax
mov eax, dword ptr [ecx + 8]
crc32 eax, dword ptr [edx + 8]
mov dword ptr [ecx + 8], eax
mov eax, dword ptr [ecx + 12]
crc32 eax, dword ptr [edx + 12]
mov dword ptr [ecx + 12], eax
{$else} // oldest Delphi don't support these opcodes
mov eax, dword ptr [ecx]
db $F2, $0F, $38, $F1, $02
mov dword ptr [ecx], eax
mov eax, dword ptr [ecx + 4]
db $F2, $0F, $38, $F1, $42, $04
mov dword ptr [ecx + 4], eax
mov eax, dword ptr [ecx + 8]
db $F2, $0F, $38, $F1, $42, $08
mov dword ptr [ecx + 8], eax
mov eax, dword ptr [ecx + 12]
db $F2, $0F, $38, $F1, $42, $0C
mov dword ptr [ecx + 12], eax
{$endif HASAESNI}
end;
procedure crcblockssse42(crc128, data128: PBlock128; count: integer);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=crc128 edx=data128 ecx=count
push ebx
push esi
push edi
push ebp
test count, count
jle @z
mov ebp, count
mov esi, crc128
mov edi, data128
mov eax, dword ptr [esi]
mov ebx, dword ptr [esi + 4]
mov ecx, dword ptr [esi + 8]
mov edx, dword ptr [esi + 12]
{$ifdef HASAESNI}
{$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif}
@s: crc32 eax, dword ptr [edi]
crc32 ebx, dword ptr [edi + 4]
crc32 ecx, dword ptr [edi + 8]
crc32 edx, dword ptr [edi + 12]
{$else} // oldest Delphi don't support these opcodes
@s: db $F2, $0F, $38, $F1, $07
db $F2, $0F, $38, $F1, $5F, $04
db $F2, $0F, $38, $F1, $4F, $08
db $F2, $0F, $38, $F1, $57, $0C
{$endif HASAESNI}
add edi, 16
dec ebp
jnz @s
mov dword ptr [esi], eax
mov dword ptr [esi + 4], ebx
mov dword ptr [esi + 8], ecx
mov dword ptr [esi + 12], edx
@z: pop ebp
pop edi
pop esi
pop ebx
end;
function crc32csse42(crc: cardinal; buf: PAnsiChar; len: cardinal): cardinal;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// eax=crc, edx=buf, ecx=len
not eax
test ecx, ecx
jz @0
test edx, edx
jz @0
jmp @align
@a: {$ifdef HASAESNI}
crc32 eax, byte ptr [edx]
{$else} // oldest Delphi don't support these opcodes
db $F2, $0F, $38, $F0, $02
{$endif HASAESNI}
inc edx
dec ecx
jz @0
@align: test dl, 3
jnz @a
push ecx
shr ecx, 3
jnz @by8
@rem: pop ecx
test cl, 4
jz @4
{$ifdef HASAESNI}
crc32 eax, dword ptr [edx]
{$else}
db $F2, $0F, $38, $F1, $02
{$endif HASAESNI}
add edx, 4
@4: test cl, 2
jz @2
{$ifdef HASAESNI}
crc32 eax, word ptr [edx]
{$else}
db $66, $F2, $0F, $38, $F1, $02
{$endif HASAESNI}
add edx, 2
@2: test cl, 1
jz @0
{$ifdef HASAESNI}
crc32 eax, byte ptr [edx]
{$else}
db $F2, $0F, $38, $F0, $02
{$endif HASAESNI}
@0: not eax
ret
{$ifdef HASAESNI}
@by8: crc32 eax, dword ptr [edx]
crc32 eax, dword ptr [edx + 4]
{$else}
@by8: db $F2, $0F, $38, $F1, $02
db $F2, $0F, $38, $F1, $42, $04
{$endif HASAESNI}
add edx, 8
dec ecx
jnz @by8
jmp @rem
end;
function SynLZdecompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
push ebp
push ebx
push esi
push edi
push eax
add esp, -4092
push eax
add esp, -4092
push eax
add esp, -4092
push eax
add esp, -4092
push eax
add esp, -24
mov esi, ecx
mov ebx, eax
add edx, eax
mov [esp+8H], esi
mov [esp+10H], edx
movzx eax, word ptr [ebx]
mov [esp], eax
or eax,eax
je @@0917
add ebx, 2
mov eax, [esp]
test ah, 80H
jz @@0907
and eax, 7FFFH
movzx edx, word ptr [ebx]
shl edx, 15
or eax, edx
mov [esp], eax
add ebx, 2
@@0907: lea ebp, [esi - 1]
@@0908: mov ecx, [ebx]
add ebx, 4
mov [esp+14H], ecx
mov edi, 1 // edi=CWbit
cmp ebx, [esp+10H]
jnc @@0917
@@0909: mov ecx, [esp+14H]
@@090A: test ecx, edi
jnz @@0911
mov al, [ebx]
inc ebx
mov [esi], al
inc esi
cmp ebx, [esp+10H]
lea eax, [esi-3]
jnc @@0917
cmp eax, ebp
jbe @@0910
inc ebp
mov eax, [ebp]
mov edx, eax
shr eax, 12
xor eax, edx
and eax, 0FFFH
mov [esp+1CH + eax * 4], ebp
@@0910: add edi, edi
jnz @@090A
jmp @@0908
@@0911: movzx edx, word ptr [ebx]
add ebx, 2
mov eax, edx
and edx, 0FH
add edx, 2
shr eax, 4
cmp edx,2
jnz @@0912
movzx edx, byte ptr [ebx]
inc ebx
add edx, 18
@@0912: mov eax, [esp+1CH + eax * 4]
mov ecx, esi
mov [esp+18H], edx
sub ecx, eax
// inlined optimized move()
cmp ecx, edx
jl @@ovlap // overlapping content requires per-byte copy
cmp edx, 32
ja @large
sub edx, 8
jg @9_32
{$ifdef HASNOSSE2}
// slowest x87 FPU code on very old CPU with no SSE2 support
mov ecx, [eax]
mov eax, [eax + 4] // always copy 8 bytes for 0..8
mov [esi], ecx // safe since src_endmatch := src_end-(6+5)
mov [esi + 4], eax
jmp @movend
@9_32: fild qword ptr [eax + edx]
fild qword ptr [eax]
cmp edx, 8
jle @16
fild qword ptr [eax + 8]
cmp edx, 16
jle @24
fild qword ptr [eax + 16]
fistp qword ptr [esi + 16]
@24: fistp qword ptr [esi + 8]
@16: fistp qword ptr [esi]
fistp qword ptr [esi + edx]
jmp @movend
nop
@large: push esi
fild qword ptr [eax]
lea eax, [eax + edx - 8]
lea edx, [esi + edx - 8]
fild qword ptr [eax]
push edx
neg edx
and esi, -8
lea edx, [edx + esi + 8]
pop esi
@lrgnxt:fild qword ptr [eax + edx]
fistp qword ptr [esi + edx]
add edx, 8
jl @lrgnxt
fistp qword ptr [esi]
pop esi
fistp qword ptr [esi]
{$else} // inlined SSE2 move
movq xmm0, qword ptr [eax]
movq qword ptr [esi], xmm0
jmp @movend
@9_32: movq xmm0, qword ptr [eax + edx]
movq xmm1, qword ptr [eax]
cmp edx, 8
jle @16
movq xmm2, qword ptr [eax + 8]
cmp edx, 16
jle @24
movq xmm3, qword ptr [eax + 16]
movq qword ptr [esi + 16], xmm3
@24: movq qword ptr [esi + 8], xmm2
@16: movq qword ptr [esi], xmm1
movq qword ptr [esi + edx], xmm0
jmp @movend
@large: push esi
movups xmm2, dqword ptr [eax]
lea eax, [eax + edx - 16]
lea edx, [esi + edx - 16]
movups xmm1, dqword ptr [eax]
push edx
neg edx
and esi, -16
lea edx, [edx + esi + 16]
pop esi
@lrgnxt:movups xmm0, dqword ptr [eax + edx]
movaps dqword ptr [esi + edx], xmm0
add edx, 16
jl @lrgnxt
movups dqword ptr [esi], xmm1
pop esi
movups dqword ptr [esi], xmm2
{$endif HASNOSSE2}
@movend:cmp esi, ebp
jbe @@0916
@@0915: inc ebp
mov edx, [ebp]
mov eax, edx
shr edx, 12
xor eax, edx
and eax, 0FFFH
mov [esp+1CH + eax * 4], ebp
cmp esi, ebp
ja @@0915
@@0916: add esi, [esp+18H]
cmp ebx, [esp+10H]
jnc @@0917
add edi, edi
lea ebp, [esi - 1]
jz @@0908
jmp @@0909
@@ovlap:push ebx
push esi
lea ebx, [eax + edx]
add esi, edx
neg edx
@s: mov al, [ebx + edx]
mov [esi + edx], al
inc edx
jnz @s
pop esi
pop ebx
jmp @movend
@@0917: mov eax, [esp]
add esp, 16412
pop edi
pop esi
pop ebx
pop ebp
end;
function SynLZcompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
push ebp
push ebx
push esi
push edi
push eax
add esp, -4092
push eax
add esp, -4092
push eax
add esp, -4092
push eax
add esp, -4092
push eax
add esp, -4092
push eax
add esp, -4092
push eax
add esp, -4092
push eax
add esp, -4092
push eax
add esp, -32
mov esi, eax // esi=src
mov edi, ecx // edi=dst
mov [esp+08H], ecx
mov eax, edx
cmp eax, 32768
jl @@0889
or ax, 8000H
mov [edi], eax
mov eax, edx
shr eax, 15
mov [edi + 2], eax
add edi, 4
jmp @@0891
@@0890: mov eax, 2
jmp @@0904
@@0889: mov [edi], eax
test eax, eax
jz @@0890
add edi, 2
@@0891: lea eax, [edx + esi]
mov [esp+18H], edi
mov [esp+0CH], eax
sub eax, 11
mov [esp+4], eax
xor eax, eax
lea ebx, [esp+24H] // reset offsets lookup table
{$ifdef HASNOSSE2}
mov ecx, 1024
@@089I: mov [ebx], eax
mov [ebx + 4], eax
mov [ebx + 8], eax
mov [ebx + 12], eax
add ebx, 16
{$else}
pxor xmm0, xmm0
mov ecx, 256
@@089I: movups dqword ptr [ebx], xmm0
movups dqword ptr [ebx + 16], xmm0
movups dqword ptr [ebx + 32], xmm0
movups dqword ptr [ebx + 48], xmm0
add ebx, 64
{$endif HASNOSSE2}
dec ecx
jnz @@089I
mov [edi], eax
add edi, 4
mov ebx, 1 // ebx=1 shl CWbit
// main loop:
cmp esi, [esp+4]
ja @@0900
@@0892: mov edx, [esi]
mov eax, edx
shr edx, 12
xor edx, eax
and edx, 0FFFH
mov ebp, [esp+24H + edx * 4]
mov ecx, [esp+4024H + edx * 4]
mov [esp+24H + edx * 4], esi
xor ecx, eax
test ecx, 0FFFFFFH
mov [esp+4024H + edx * 4], eax
jnz @@0897
mov eax, esi
or ebp, ebp
jz @@0897
sub eax, ebp
mov ecx, [esp+18H]
cmp eax, 2
jle @@0897
add esi, 2
or dword ptr [ecx], ebx
mov ecx, [esp+0CH]
add ebp, 2
mov eax, 1
sub ecx, esi
dec ecx
mov [esp], ecx
cmp ecx, 271
jl @@0894
mov dword ptr [esp], 271
jmp @@0894
@@0893: inc eax
@@0894: mov ecx, [ebp + eax]
cmp cl, [esi + eax]
jnz @@0895
cmp eax, [esp]
jge @@0895
inc eax
cmp ch, [esi + eax]
jnz @@0895
shr ecx, 16
cmp eax, [esp]
jge @@0895
inc eax
cmp cl, [esi + eax]
jnz @@0895
cmp eax, [esp]
jge @@0895
inc eax
cmp ch, [esi + eax]
jnz @@0895
cmp eax, [esp]
jl @@0893
@@0895: add esi, eax
shl edx, 4
cmp eax, 15
jg @@0896
or eax, edx
mov word ptr [edi], ax
add edi, 2
jmp @@0898
@@0896: sub eax, 16
mov [edi], dx
mov [edi + 2H], al
add edi, 3
jmp @@0898
@@0897: mov al, [esi] // movsb is actually slower!
mov [edi], al
inc esi
inc edi
@@0898: add ebx, ebx
jz @@0899
cmp esi, [esp+4]
jbe @@0892
jmp @@0900
@@0899: mov [esp+18H], edi
mov [edi], ebx
inc ebx
add edi, 4
cmp esi, [esp+4]
jbe @@0892
@@0900: cmp esi, [esp+0CH]
jnc @@0903
@@0901: mov al, [esi]
mov [edi], al
inc esi
inc edi
add ebx, ebx
jz @@0902
cmp esi, [esp+0CH]
jc @@0901
jmp @@0903
@@0902: mov [edi], ebx
inc ebx
add edi, 4
cmp esi, [esp+0CH]
jc @@0901
@@0903: mov eax, edi
sub eax, [esp+08H]
@@0904: add esp, 32804
pop edi
pop esi
pop ebx
pop ebp
end;
function RdRand32: cardinal;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// rdrand eax: same opcodes for x86 and x64
db $0f, $c7, $f0
// returns in eax, ignore carry flag (eax=0 won't hurt)
end;
function Rdtsc: Int64;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
// returns the TSC in EDX:EAX
rdtsc
end;
function StrCntDecFree(var refcnt: TStrCnt): boolean;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
lock dec dword ptr [refcnt] // =longint on Delphi and FPC 32-bit
setbe al
end; // don't check for ismultithread global since lock is cheaper on new CPUs
function DACntDecFree(var refcnt: TDACnt): boolean;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
lock dec dword ptr [refcnt] // =longint on Delphi and FPC 32-bit
setbe al
end; // don't check for ismultithread global since lock is cheaper on new CPUs
function LockedExc(var Target: PtrUInt; NewValue, Comperand: PtrUInt): boolean;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
xchg eax, ecx
lock cmpxchg dword ptr [ecx], edx
setz al
end;
procedure LockedAdd(var Target: PtrUInt; Increment: PtrUInt);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
lock add dword ptr [Target], Increment
end;
procedure LockedAdd32(var Target: cardinal; Increment: cardinal);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
lock add dword ptr [Target], Increment
end;
procedure LockedDec(var Target: PtrUInt; Decrement: PtrUInt);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
lock sub dword ptr [Target], Decrement
end;
function IsXmmYmmOSEnabled: boolean;
{$ifdef FPC}nostackframe; assembler;{$endif}
asm // see https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled
xor ecx, ecx // get control register XCR0 = XFEATURE_ENABLED_MASK
db $0f, $01, $d0 // XGETBV reads XCR0 into EDX:EAX
and eax, 6 // check OS enabled both XMM (bit 1) and YMM (bit 2)
cmp al, 6
sete al
end;
procedure GetCpuid(cpueax, cpuecx: cardinal; var Registers: TIntelRegisters);
{$ifdef FPC}nostackframe; assembler;{$endif}
asm
push esi
push edi
push ebx
mov edi, cpueax
mov ebx, cpuecx
mov esi, Registers
pushfd
pop eax
mov edx, eax
xor eax, $200000
push eax
popfd
pushfd
pop eax
xor eax, edx // does this CPU support the cpuid opcode?
jz @nocpu
mov eax, edi
mov ecx, ebx
cpuid
mov TIntelRegisters(esi).&eax, eax
mov TIntelRegisters(esi).&ebx, ebx
mov TIntelRegisters(esi).&ecx, ecx
mov TIntelRegisters(esi).&edx, edx
@nocpu: pop ebx
pop edi
pop esi
end;
{$ifdef CPU32DELPHI}
// inspired by ValExt_JOH_PAS_8_a by John O'Harrow - calls Delphi System.@Pow10
function GetExtended(P: PUtf8Char; out err: integer): TSynExtended;
const
Ten: double = 10.0;
asm
// in: eax=text, edx=@err out: st(0)=result
push ebx // save used registers
push esi
push edi
mov esi, eax // string pointer
push eax // save for error condition
xor ebx, ebx
push eax // allocate local storage for loading fpu
test esi, esi
jz @nil // nil string
@trim: movzx ebx, byte ptr [esi] // strip leading spaces
inc esi
cmp bl, ' '
je @trim
xor ecx, ecx // clear sign flag
fld qword[Ten] // load 10 into fpu
xor eax, eax // zero number of decimal places
fldz // zero result in fpu
cmp bl, '0'
jl @chksig // check for sign character
@dig1: xor edi, edi // zero exponent value
@digl: sub bl, '0'
cmp bl, 9
ja @frac // non-digit
mov cl, 1 // set digit found flag
mov [esp], ebx // store for fpu use
fmul st(0), st(1) // multply by 10
fiadd dword ptr [esp] // add next digit
movzx ebx, byte ptr [esi] // get next char
inc esi
test bl, bl // end reached?
jnz @digl // no,get next digit
jmp @finish // yes,finished
@chksig:cmp bl, '-'
je @minus
cmp bl, '+'
je @sigset
@gdig1: test bl, bl
jz @error // no digits found
jmp @dig1
@minus: mov ch, 1 // set sign flag
@sigset:movzx ebx, byte ptr [esi] // get next char
inc esi
jmp @gdig1
@frac: cmp bl, '.' - '0'
jne @exp // no decimal point
movzx ebx, byte ptr [esi] // get next char
test bl, bl
jz @dotend // string ends with '.'
inc esi
@fracl: sub bl, '0'
cmp bl, 9
ja @exp // non-digit
mov [esp], ebx
dec eax // -(number of decimal places)
fmul st(0), st(1) // multply by 10
fiadd dword ptr [esp] // add next digit
movzx ebx, byte ptr [esi] // get next char
inc esi
test bl, bl // end reached?
jnz @fracl // no, get next digit
jmp @finish // yes, finished (no exponent)
@dotend:test cl, cl // any digits found before '.'?
jnz @finish // yes, valid
jmp @error // no,invalid
@exp: or bl, $20
cmp bl, 'e' - '0'
jne @error // not 'e' or 'e'
movzx ebx, byte ptr [esi] // get next char
inc esi
mov cl, 0 // clear exponent sign flag
cmp bl, '-'
je @minexp
cmp bl, '+'
je @expset
jmp @expl
@minexp:mov cl, 1 // set exponent sign flag
@expset:movzx ebx, byte ptr [esi] // get next char
inc esi
@expl: sub bl, '0'
cmp bl, 9
ja @error // non-digit
lea edi, [edi + edi * 4]// multiply by 10
add edi, edi
add edi, ebx // add next digit
movzx ebx, byte ptr [esi] // get next char
inc esi
test bl, bl // end reached?
jnz @expl // no, get next digit
@endexp:test cl, cl // positive exponent?
jz @finish // yes, keep exponent value
neg edi // no, negate exponent value
@finish:add eax, edi // exponent value - number of decimal places
mov [edx], ebx // result code = 0
jz @pow // no call to _pow10 needed
cmp eax, 308
jge @oor // limit to < 1.7 x 10^308 double range
cmp eax, -324
jle @oor // limit to > 5.0 x 10^-324 range
@oors: mov edi, ecx // save decimal sign flag
call System.@Pow10 // raise to power of 10
mov ecx, edi // restore decimal sign flag
@pow: test ch, ch // decimal sign flag set?
jnz @negate // yes, negate value
@ok: add esp, 8 // dump local storage and string pointer
@exit: ffree st(1) // remove ten value from fpu
pop edi // restore used registers
pop esi
pop ebx
ret // finished
@negate:fchs // negate result in fpu
jmp @ok
@oor: inc esi // force result code = 1
mov [edx], esi // set result code
xor eax, eax // set exponent = 0 (as pure pascal version)
jmp @oors
@nil: inc esi // force result code = 1
fldz // result value = 0
@error: pop ebx // dump local storage
pop eax // string pointer
sub esi, eax // error offset
mov [edx], esi // set result code
test ch, ch // decimal sign flag set?
jz @exit // no,exit
fchs // yes. negate result in fpu
jmp @exit // exit setting result code
end;
// FPC will properly inline multiplication by reciprocal
procedure Div100(Y: cardinal; var res: TDiv100Rec);
asm
mov dword ptr [edx].TDiv100Rec.M, eax
mov ecx, edx
mov edx, eax
mov eax, 1374389535
mul edx
shr edx, 5
mov dword ptr [ecx].TDiv100Rec.D, edx
imul eax, edx, 100
sub dword ptr [ecx].TDiv100Rec.M, eax
end;
// those functions are intrinsics with FPC :)
function BSRdword(c: cardinal): cardinal;
asm
bsr eax, eax
jnz @nz
mov eax, 255
@nz:
end;
function BSRqword(const q: qword): cardinal;
asm
bsr eax, [esp + 8]
jz @1
add eax, 32
ret
@1: bsr eax, [esp + 4]
jnz @2
mov eax, 255
@2:
end;
{$endif CPU32DELPHI}