{ This file is a part of the Open Source Synopse mORMot framework 2, licensed under a MPL/GPL/LGPL three license - see LICENSE.md x86 32-bit assembly used by mormot.core.base.pas } {$ifdef FPC} // disabled some FPC paranoid warnings {$WARN 7102 off : Use of +offset(%ebp) for parameters invalid here } {$WARN 7104 off : Use of -offset(%ebp) is not recommended for local variable access } {$WARN 7121 off : Check size of XMM memory operand } {$WARN 7122 off : Check size of XMM memory operand } {$endif FPC} {$ifdef ASMX86} // i386 asm with global - disabled on PIC targets { FillCharFast/MoveFast implementation notes: - use SS2 and not FPU fld/fst which may trigger unexpected exceptions (Agner) - use dedicated branchless sub-functions for small buffers of 0..32 bytes - use simple SSE2 loop for 33..255 bytes - assume ERBMS is available (cpuid flag may not be propagated within VMs) - use "rep movsb" for 256..512K input (should work on all CPUs, even older with no ERBMS), or as fallback if no SSE2 CPU is used - use non volatile SSE2 loop when >= 512KB (to not pollute the CPU cache) - don't use backward "std rep movsb" since it is not involved by ERMBS (slow) - note: Delphi Win32 x87 RTL code by John O'Harrow seems deprecated } {$define WITH_ERMS} // we need it as fallback on old CPU without SSE2 {$ifdef WITH_ERMS} var // "rep stosb/movsb" enabled for len >= 4096 on ERMSB CPUs // it has been reported that "on 32-bit strings have to be at least 4KB" // see https://stackoverflow.com/a/43837564/458259 for explanations and timing ERMSB_MIN_SIZE_FWD: integer = maxInt; // maxInt = disabled by default {$ifndef FPC_X86} ERMSB_MIN_SIZE_BWD: integer = maxInt; // used for no-SSE2, not for ERMSB {$endif FPC_X86} {$define WITH_ERMSASM} // include "rep movsb" asm blocks {$endif WITH_ERMS} {$ifdef HASNOSSE2} {$define WITH_ERMSASM} // "rep movsd" is used as fallback on oldest CPU {$endif HASNOSSE2} const // non-temporal writes should bypass the cache when the size is bigger than // half the size of the largest level cache - we assume low 1MB cache here NONTEMPORALSIZE = 1 shl 20; // fast SSE2 version - force define HASNOSSE2 when run any very old CPU procedure FillcharFast(var dst; cnt: PtrInt; value: byte); {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=Dest edx=Count cl=Value movzx ecx, cl imul ecx, ecx, $01010101 cmp edx, 32 jg @32up test edx, edx jle @00 mov [eax + edx - 1], cl // fill last byte shr edx, 1 // how many words to fill jmp dword ptr [edx * 4 + @jmptab] @00: ret @32up: {$ifndef HASNOSSE2} {$ifdef WITH_ERMS} cmp edx, NONTEMPORALSIZE jae @noerms // movntdq was reported to be faster than ERMSB cmp edx, ERMSB_MIN_SIZE_FWD ja @ermsb @noerms:{$endif WITH_ERMS} sub edx, 16 movd xmm0, ecx mov ecx, eax pshufd xmm0, xmm0, 0 and ecx, 15 // 16-byte align writes movups dqword ptr [eax], xmm0 // fill first 16 bytes movups dqword ptr [eax + edx], xmm0 // fill last 16 bytes sub ecx, 16 sub eax, ecx add edx, ecx add eax, edx neg edx cmp edx, -NONTEMPORALSIZE // assume > 512KB bypass the cache jl @nv {$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif} @loop: movaps dqword ptr [eax + edx], xmm0 // fill 16 bytes per loop add edx, 16 jl @loop ret {$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif} @nv: movntdq dqword ptr [eax + edx], xmm0 // non-temporal fill 16 bytes add edx, 16 jl @nv ret {$endif HASNOSSE2} {$ifdef WITH_ERMSASM} @ermsb: push edi cld {$ifdef HASNOSSE2} @ermsa: test al, 3 // aligned stosd is better on old CPUs jz @erms3 mov byte ptr [eax], cl inc eax dec edx jmp @ermsa @erms3: mov edi, eax mov eax, ecx mov ecx, edx shr ecx, 2 jz @erms2 rep stosd // no SSE2 version @erms2: test dl, 2 jz @erms1 mov word ptr [edi], ax add edi, 2 @erms1: test dl, 1 jz @erms0 mov byte ptr [edi], al {$else} // ERMSB favors stosb and will properly align writes mov edi, eax mov eax, ecx mov ecx, edx rep stosb {$endif HASNOSSE2} @erms0: pop edi ret {$endif WITH_ERMSASM} {$ifdef FPC} align 4 {$else} nop {$ifdef HASNOSSE2} nop; nop {$endif} {$endif} @jmptab:dd @00, @02, @04, @06, @08, @10, @12, @14, @16 dd @18, @20, @22, @24, @26, @28, @30, @32 // Delphi RTL uses 16-bit "mov [eax + ...], cx" which are slower @32: mov [eax + 28], ecx @28: mov [eax + 24], ecx @24: mov [eax + 20], ecx @20: mov [eax + 16], ecx @16: mov [eax + 12], ecx @12: mov [eax + 8], ecx @08: mov [eax + 4], ecx @04: mov [eax], ecx ret @30: mov [eax + 26], ecx @26: mov [eax + 22], ecx @22: mov [eax + 18], ecx @18: mov [eax + 14], ecx @14: mov [eax + 10], ecx @10: mov [eax + 6], ecx @06: mov [eax + 2], ecx @02: mov word ptr [eax], cx end; {$ifndef FPC_X86} // FPC RTL has fastmove.inc -> our SSE2/ERMS asm is slower // fast SSE2 version - force define HASNOSSE2 when run any very old CPU procedure MoveFast(const src; var dst; cnt: PtrInt); {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=source edx=dest ecx=count cmp ecx, 32 ja @lrg // count > 32 or count < 0 sub ecx, 8 jg @sml // 9..32 byte move jmp dword ptr [@table + 32 + ecx * 4] // 0..8 byte move {$ifdef HASNOSSE2} @sml: fild qword ptr [eax + ecx] // last 8 fild qword ptr [eax] // first 8 cmp ecx, 8 jle @sml16 fild qword ptr [eax + 8] // second 8 cmp ecx, 16 jle @sml24 fild qword ptr [eax + 16] // third 8 fistp qword ptr [edx + 16] // third 8 @sml24: fistp qword ptr [edx + 8] // second 8 @sml16: fistp qword ptr [edx] // first 8 fistp qword ptr [edx + ecx] // last 8 {$else} @sml: movq xmm0, qword ptr [eax + ecx] // last 8 movq xmm1, qword ptr [eax] // first 8 cmp ecx, 8 jle @sml16 movq xmm2, qword ptr [eax + 8] // second 8 cmp ecx, 16 jle @sml24 movq xmm3, qword ptr [eax + 16] // third 8 movq qword ptr [edx + 16], xmm3 // third 8 @sml24: movq qword ptr [edx + 8], xmm2 // second 8 @sml16: movq qword ptr [edx], xmm1 // first 8 movq qword ptr [edx + ecx], xmm0 // last 8 ret {$endif HASNOSSE2} @exit: rep ret {$ifdef FPC} align 4 {$else} {$ifdef HASALIGN} .align 4 {$endif}{$endif} @table: dd @exit, @m01, @m02, @m03, @m04, @m05, @m06, @m07, @m08 {$ifdef WITH_ERMSASM} @ermsf: push esi push edi mov esi, eax mov edi, edx cld rep movsb // ERMSB forward move pop edi pop esi ret @ermsb: push esi push edi lea esi, [eax + ecx - 1] lea edi, [edx + ecx - 1] std rep movsb // backward move is slow even if ERMSB is set pop edi pop esi cld // FPC requires this ret {$endif WITH_ERMSASM} {$ifdef HASNOSSE2} @lrg: jng @exit // count < 0 sub edx, eax jz @exit cmp edx, ecx lea edx, [edx + eax] jb @ermsb // move backwards if unsigned(dest-source) < count jmp @ermsf {$else} @lrgfwd:// large forward move {$ifdef WITH_ERMS} cmp ecx, NONTEMPORALSIZE jae @noermf // movntdq was reported to be faster than ERMSB cmp ecx, ERMSB_MIN_SIZE_FWD ja @ermsf @noermf:{$endif WITH_ERMS} push edx movups xmm2, dqword ptr [eax] // first 16 lea eax, [eax + ecx - 16] lea ecx, [ecx + edx - 16] movups xmm1, dqword ptr [eax] // last 16 push ecx neg ecx and edx, -16 // 16-byte align writes lea ecx, [ecx + edx + 16] pop edx cmp ecx, -NONTEMPORALSIZE // assume > 512KB bypass the cache jl @fwnv {$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif} @fwd: movups xmm0, dqword ptr [eax + ecx] // move by 16 bytes movaps dqword ptr [edx + ecx], xmm0 add ecx, 16 jl @fwd @fwde: movups dqword ptr [edx], xmm1 // last 16 pop edx movups dqword ptr [edx], xmm2 // first 16 ret {$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif} @fwnv: movups xmm0, dqword ptr [eax + ecx] movntdq dqword ptr [edx + ecx], xmm0 // non-temporal move by 16 bytes add ecx, 16 jl @fwnv sfence jmp @fwde @lrg: jng @exit // count < 0 sub edx, eax jz @exit // source=dest cmp edx, ecx lea edx, [edx + eax] jae @lrgfwd // move backwards if unsigned(dest-source) < count // large backward/overlapping move @lrgbwd:{$ifdef WITH_ERMS} cmp ecx, ERMSB_MIN_SIZE_BWD // enabled for no-SSE2, not for ERMSB ja @ermsb {$endif WITH_ERMS} sub ecx, 16 push ecx movups xmm2, dqword ptr [eax + ecx] // last 16 movups xmm1, dqword ptr [eax] // first 16 add ecx, edx and ecx, -16 // 16-byte align writes sub ecx, edx // non-volatile backward is not efficient since we overlap data {$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif} @bwd: movups xmm0, dqword ptr [eax + ecx] movaps dqword ptr [edx + ecx], xmm0 sub ecx, 16 jg @bwd pop ecx movups dqword ptr [edx], xmm1 // first 16 movups dqword ptr [edx + ecx], xmm2 // last 16 ret {$endif HASNOSSE2} @m01: mov al, [eax] mov [edx], al ret @m02: movzx eax, word ptr [eax] mov [edx], ax ret @m03: movzx ecx, word ptr [eax] mov al, [eax + 2] mov [edx], cx mov [edx + 2], al ret @m04: mov ecx, [eax] mov [edx], ecx ret @m05: mov ecx, [eax] mov al, [eax + 4] mov [edx], ecx mov [edx + 4], al ret @m06: mov ecx, [eax] movzx eax, word ptr [eax + 4] mov [edx], ecx mov [edx + 4], ax ret @m07: mov ecx, [eax] mov eax, [eax + 3] mov [edx], ecx mov [edx + 3], eax ret @m08: mov ecx, [eax] mov eax, [eax + 4] mov [edx], ecx mov [edx + 4], eax end; {$endif FPC_X86} function CompareMem(P1, P2: Pointer; Length: PtrInt): boolean; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=P1 edx=P2 ecx=Length cmp eax, edx je @0 // P1=P2 sub ecx, 8 jl @small push ebx mov ebx, [eax] // Compare First 4 Bytes cmp ebx, [edx] jne @setbig lea ebx, [eax + ecx] // Compare Last 8 Bytes add edx, ecx mov eax, [ebx] cmp eax, [edx] jne @setbig mov eax, [ebx + 4] cmp eax, [edx + 4] jne @setbig sub ecx, 4 jle @true // All Bytes already Compared neg ecx // ecx=-(Length-12) add ecx, ebx // DWORD Align Reads and ecx, -4 sub ecx, ebx @loop: mov eax, [ebx + ecx] // Compare 8 Bytes per Loop cmp eax, [edx + ecx] jne @setbig mov eax, [ebx + ecx + 4] cmp eax, [edx + ecx + 4] jne @setbig add ecx, 8 jl @loop @true: pop ebx @0: mov al, 1 ret @setbig:pop ebx setz al ret @small: add ecx, 8 // ecx=0..7 jle @0 // Length <= 0 neg ecx // ecx=-1..-7 lea ecx, [@1 + ecx * 8 + 8] // each @#: block below = 8 bytes jmp ecx @7: mov cl, [eax + 6] cmp cl, [edx + 6] jne @setsml @6: mov ch, [eax + 5] cmp ch, [edx + 5] jne @setsml @5: mov cl, [eax + 4] cmp cl, [edx + 4] jne @setsml @4: mov ch, [eax + 3] cmp ch, [edx + 3] jne @setsml @3: mov cl, [eax + 2] cmp cl, [edx + 2] jne @setsml @2: mov ch, [eax + 1] cmp ch, [edx + 1] jne @setsml @1: mov al, [eax] cmp al, [edx] @setsml:setz al end; function crc32fasttab(crc: cardinal; buf: PAnsiChar; len: cardinal; tab: PCrc32tab): cardinal; asm // adapted from Aleksandr Sharahov code and Maxim Masiutin remarks push ebx push ebp mov ebp, tab not eax neg ecx // eax=crc edx=buf ecx=-len ebp=tab jz @z test edx, edx jz @z @head: test dl, 3 jz @align movzx ebx, byte ptr [edx] inc edx xor bl, al shr eax, 8 xor eax, dword ptr [ebx * 4 + ebp] inc ecx jnz @head @z: jmp @e @align: sub edx, ecx add ecx, 8 jg @done push esi push edi mov edi, edx {$ifdef FPC} align 8 {$endif FPC} @by8: mov edx, eax mov ebx, [edi + ecx - 4] xor edx, [edi + ecx - 8] movzx esi, bl mov eax, dword ptr [esi * 4 + ebp + 1024 * 3] movzx esi, bh xor eax, dword ptr [esi * 4 + ebp + 1024 * 2] shr ebx, 16 movzx esi, bl xor eax, dword ptr [esi * 4 + ebp + 1024 * 1] movzx esi, bh xor eax, dword ptr [esi * 4 + ebp + 1024 * 0] movzx esi, dl xor eax, dword ptr [esi * 4 + ebp + 1024 * 7] movzx esi, dh xor eax, dword ptr [esi * 4 + ebp + 1024 * 6] shr edx, 16 movzx esi, dl xor eax, dword ptr [esi * 4 + ebp + 1024 * 5] movzx esi, dh xor eax, dword ptr [esi * 4 + ebp + 1024 * 4] add ecx, 8 jle @by8 mov edx, edi pop edi pop esi @done: sub ecx, 8 jge @e @tail: movzx ebx, byte[edx + ecx] xor bl, al shr eax, 8 xor eax, dword ptr [ebx * 4 + ebp] inc ecx jnz @tail @e: pop ebp pop ebx not eax end; procedure crcblockfast(crc128, data128: PBlock128); {$ifdef FPC}nostackframe; assembler;{$endif} asm // Delphi is not efficient about corresponding pascal code push ebp push edi push esi mov ebp, eax // ebp=crc128 edi=data128 mov edi, edx mov edx, dword ptr [eax] mov ecx, dword ptr [eax + 4] xor edx, dword ptr [edi] xor ecx, dword ptr [edi + 4] movzx esi, dl // note: since we have "+ 1024 * n" offsets, crc32ctab is left immediate mov eax, dword ptr [esi * 4 + crc32ctab + 1024 * 3] movzx esi, dh shr edx, 16 xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 2] movzx esi, dl xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 1] movzx esi, dh xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 0] mov edx, dword ptr [ebp + 8] xor edx, dword ptr [edi + 8] mov dword ptr [ebp], eax movzx esi, cl mov eax, dword ptr [esi * 4 + crc32ctab + 1024 * 3] movzx esi, ch shr ecx, 16 xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 2] movzx esi, cl xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 1] movzx esi, ch xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 0] mov dword ptr [ebp + 4], eax mov ecx, dword ptr [ebp + 12] xor ecx, dword ptr [edi + 12] movzx esi, dl mov eax, dword ptr [esi * 4 + crc32ctab + 1024 * 3] movzx esi, dh shr edx, 16 xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 2] movzx esi, dl xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 1] movzx esi, dh xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 0] mov dword ptr [ebp + 8], eax movzx esi, cl mov eax, dword ptr [esi * 4 + crc32ctab + 1024 * 3] movzx esi, ch shr ecx, 16 xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 2] movzx esi, cl xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 1] movzx esi, ch xor eax, dword ptr [esi * 4 + crc32ctab + 1024 * 0] mov dword ptr [ebp + 12], eax pop esi pop edi pop ebp end; function fnv32(crc: cardinal; buf: PAnsiChar; len: PtrInt): cardinal; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=crc, edx=buf, ecx=len push ebx test edx, edx jz @0 neg ecx jz @0 sub edx, ecx @1: movzx ebx, byte ptr [edx + ecx] xor eax, ebx imul eax, eax, 16777619 inc ecx jnz @1 @0: pop ebx end; // we tried an unrolled version, but it was slower on our Core i7! function kr32(crc: cardinal; buf: PAnsiChar; len: PtrInt): cardinal; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=crc, edx=buf, ecx=len test ecx, ecx push edi push esi push ebx push ebp jz @z cmp ecx, 4 jb @s @8: mov ebx, [edx] // unrolled version reading per dword add edx, 4 mov esi, eax movzx edi, bl movzx ebp, bh shr ebx, 16 shl eax, 5 sub eax, esi add eax, edi mov esi, eax shl eax, 5 sub eax, esi lea esi, [eax + ebp] add eax, ebp movzx edi, bl movzx ebx, bh shl eax, 5 sub eax, esi lea ebp, [eax + edi] add eax, edi shl eax, 5 sub eax, ebp add eax, ebx cmp ecx, 8 lea ecx, [ecx - 4] jae @8 test ecx, ecx jz @z @s: mov esi, eax @1: shl eax, 5 movzx ebx, byte ptr [edx] inc edx sub eax, esi lea esi, [eax + ebx] add eax, ebx dec ecx jnz @1 @z: pop ebp pop ebx pop esi pop edi end; function StrLenSSE2(S: pointer): PtrInt; {$ifdef FPC}nostackframe; assembler;{$endif} asm // from GPL strlen32.asm by Agner Fog - www.agner.org/optimize mov ecx, eax // copy pointer test eax, eax jz @null // returns 0 if S=nil push eax // save start address pxor xmm0, xmm0 // set to zero and ecx, 15 // lower 4 bits indicate misalignment and eax, -16 // align pointer by 16 // will never read outside a memory page boundary, so won't trigger GPF movaps xmm1, [eax] // read from nearest preceding boundary pcmpeqb xmm1, xmm0 // compare 16 bytes with zero pmovmskb edx, xmm1 // get one bit for each byte result shr edx, cl // shift out false bits shl edx, cl // shift back again bsf edx, edx // find first 1-bit jnz @A200 // found // Main loop, search 16 bytes at a time @A100: add eax, 10H // increment pointer by 16 movaps xmm1, [eax] // read 16 bytes aligned pcmpeqb xmm1, xmm0 // compare 16 bytes with zero pmovmskb edx, xmm1 // get one bit for each byte result bsf edx, edx // find first 1-bit // (moving the bsf out of the loop and using test here would be faster // for long strings on old processors, but we are assuming that most // strings are short, and newer processors have higher priority) jz @A100 // loop if not found @A200: // Zero-byte found. Compute string length pop ecx // restore start address sub eax, ecx // subtract start address add eax, edx // add byte index @null: end; function StrInt32(P: PAnsiChar; val: PtrInt): PAnsiChar; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=P, edx=val mov ecx, edx sar ecx, 31 // 0 if val>=0 or -1 if val<0 push ecx xor edx, ecx sub edx, ecx // edx=abs(val) cmp edx, 10 jb @3 // direct process of common val<10 push edi mov edi, eax mov eax, edx @s: sub edi, 2 cmp eax, 100 jb @2 mov ecx, eax mov edx, 1374389535 // use power of two reciprocal to avoid division mul edx shr edx, 5 // now edx=eax div 100 mov eax, edx imul edx, -200 movzx edx, word ptr [TwoDigitLookup + ecx * 2 + edx] mov [edi], dx cmp eax, 10 jae @s @1: dec edi or al, '0' mov byte ptr [edi - 1], '-' mov [edi], al mov eax, edi pop edi pop ecx add eax, ecx // includes '-' if val<0 ret @2: movzx eax, word ptr [TwoDigitLookup + eax * 2] mov byte ptr [edi - 1], '-' mov [edi], ax mov eax, edi pop edi pop ecx add eax, ecx // includes '-' if val<0 ret @3: dec eax pop ecx or dl, '0' mov byte ptr [eax - 1], '-' mov [eax], dl add eax, ecx // includes '-' if val<0 end; function StrUInt32(P: PAnsiChar; val: PtrUInt): PAnsiChar; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=P, edx=val cmp edx, 10 jb @3 // direct process of common val=0 (or val<10) push edi mov edi, eax mov eax, edx @s: sub edi, 2 cmp eax, 100 jb @2 mov ecx, eax mov edx, 1374389535 // use power of two reciprocal to avoid division mul edx shr edx, 5 // now edx=eax div 100 mov eax, edx imul edx, -200 movzx edx, word ptr [TwoDigitLookup + ecx * 2 + edx] mov [edi], dx cmp eax, 10 jae @s @1: dec edi or al, '0' mov [edi], al mov eax, edi pop edi ret @2: movzx eax, word ptr [TwoDigitLookup + eax * 2] mov [edi], ax mov eax, edi pop edi ret @3: dec eax or dl, '0' mov [eax], dl end; procedure YearToPChar(Y: PtrUInt; P: PUtf8Char); {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=Y, edx=P push edx mov ecx, eax mov edx, 1374389535 // use power of two reciprocal to avoid division mul edx shr edx, 5 // now edx=Y div 100 movzx eax, word ptr [TwoDigitLookup + edx * 2] imul edx, -200 movzx edx, word ptr [TwoDigitLookup + ecx * 2 + edx] pop ecx shl edx, 16 or eax, edx mov [ecx], eax end; {$endif ASMX86} // functions below are always available, even on DARWIN function Hash32(Data: PCardinalArray; Len: integer): cardinal; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=Data edx=Len push esi push edi mov cl, dl mov ch, dl xor esi, esi xor edi, edi test eax, eax jz @z shr edx, 4 jz @by4 nop @by16: add esi, dword ptr [eax] add edi, esi add esi, dword ptr [eax+4] add edi, esi add esi, dword ptr [eax+8] add edi, esi add esi, dword ptr [eax+12] add edi, esi add eax, 16 dec edx jnz @by16 @by4: and cl, 15 jz @0 shr cl, 2 jz @rem @4: add esi, dword ptr [eax] add edi, esi add eax, 4 dec cl jnz @4 @rem: and ch, 3 jz @0 dec ch jz @1 dec ch jz @2 mov eax, dword ptr [eax] and eax, $ffffff jmp @e @2: movzx eax, word ptr [eax] jmp @e @1: movzx eax, byte ptr [eax] @e: add esi, eax @0: add edi, esi mov eax, esi shl edi, 16 xor eax, edi @z: pop edi pop esi end; function xxHash32(crc: cardinal; P: PAnsiChar; len: cardinal): cardinal; {$ifdef FPC}nostackframe; assembler;{$endif} asm xchg edx, ecx push ebp push edi lea ebp, [ecx+edx] push esi push ebx sub esp, 8 mov ebx, eax mov dword ptr [esp], edx lea eax, [ebx+165667B1H] cmp edx, 15 jbe @2 lea eax, [ebp-10H] lea edi, [ebx+24234428H] lea esi, [ebx-7A143589H] mov dword ptr [esp+4H], ebp mov edx, eax lea eax, [ebx+61C8864FH] mov ebp, edx @1: mov edx, dword ptr [ecx] imul edx, -2048144777 add edi, edx rol edi, 13 imul edi, -1640531535 mov edx, dword ptr [ecx+4] imul edx, -2048144777 add esi, edx rol esi, 13 imul esi, -1640531535 mov edx, dword ptr [ecx+8] imul edx, -2048144777 add ebx, edx rol ebx, 13 imul ebx, -1640531535 mov edx, dword ptr [ecx+12] lea ecx, [ecx+16] imul edx, -2048144777 add eax, edx rol eax, 13 imul eax, -1640531535 cmp ebp, ecx jnc @1 rol edi, 1 rol esi, 7 rol ebx, 12 add esi, edi mov ebp, dword ptr [esp+4H] ror eax, 14 add ebx, esi add eax, ebx @2: lea esi, [ecx+4H] add eax, dword ptr [esp] cmp ebp, esi jc @4 mov ebx, esi nop @3: imul edx, dword ptr [ebx-4H], -1028477379 add ebx, 4 add eax, edx ror eax, 15 imul eax, 668265263 cmp ebp, ebx jnc @3 lea edx, [ebp-4H] sub edx, ecx mov ecx, edx and ecx, 0FFFFFFFCH add ecx, esi @4: cmp ebp, ecx jbe @6 @5: movzx edx, byte ptr [ecx] add ecx, 1 imul edx, 374761393 add eax, edx rol eax, 11 imul eax, -1640531535 cmp ebp, ecx jnz @5 nop @6: mov edx, eax add esp, 8 shr edx, 15 xor eax, edx imul eax, -2048144777 pop ebx pop esi mov edx, eax shr edx, 13 xor eax, edx imul eax, -1028477379 pop edi pop ebp mov edx, eax shr edx, 16 xor eax, edx end; function GetBitsCountPas(value: PtrInt): PtrInt; {$ifdef FPC}nostackframe; assembler;{$endif} asm // branchless Wilkes-Wheeler-Gill i386 asm implementation mov edx, eax shr eax, 1 and eax, $55555555 sub edx, eax mov eax, edx shr edx, 2 and eax, $33333333 and edx, $33333333 add eax, edx mov edx, eax shr eax, 4 add eax, edx and eax, $0f0f0f0f mov edx, eax shr edx, 8 add eax, edx mov edx, eax shr edx, 16 add eax, edx and eax, $3f end; {$ifdef HASNOSSE2} // fallback to simple pascal code if no SSE2 available function ByteScanIndex(P: PByteArray; Count: PtrInt; Value: byte): PtrInt; begin result := 0; if P <> nil then repeat if result >= Count then break; if P^[result] = Value then exit; inc(result); until false; result := -1; end; function WordScanIndex(P: PWordArray; Count: PtrInt; Value: word): PtrInt; begin result := 0; if P <> nil then repeat if result >= Count then break; if P^[result] = Value then exit; inc(result); until false; result := -1; end; function IntegerScanIndex(P: PCardinalArray; Count: PtrInt; Value: cardinal): PtrInt; begin result := PtrUInt(IntegerScan(P, Count, Value)); if result = 0 then dec(result) else begin dec(result, PtrUInt(P)); result := result shr 2; end; end; function IntegerScan(P: PCardinalArray; Count: PtrInt; Value: cardinal): PCardinal; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=P, edx=Count, Value=ecx test eax, eax jz @ok0 // avoid GPF cmp edx, 8 jb @s2 nop nop nop // @s1 loop align @s1: sub edx, 8 cmp [eax], ecx je @ok0 cmp [eax + 4], ecx je @ok4 cmp [eax + 8], ecx je @ok8 cmp [eax + 12], ecx je @ok12 cmp [eax + 16], ecx je @ok16 cmp [eax + 20], ecx je @ok20 cmp [eax + 24], ecx je @ok24 cmp [eax + 28], ecx je @ok28 add eax, 32 cmp edx, 8 jae @s1 @s2: test edx, edx jz @z cmp [eax], ecx je @ok0 dec edx jz @z cmp [eax + 4], ecx je @ok4 dec edx jz @z cmp [eax + 8], ecx je @ok8 dec edx jz @z cmp [eax + 12], ecx je @ok12 dec edx jz @z cmp [eax + 16], ecx je @ok16 dec edx jz @z cmp [eax + 20], ecx je @ok20 dec edx jz @z cmp [eax + 24], ecx je @ok24 @z: xor eax, eax // return nil if not found ret @ok0: rep ret @ok28: add eax, 28 ret @ok24: add eax, 24 ret @ok20: add eax, 20 ret @ok16: add eax, 16 ret @ok12: add eax, 12 ret @ok8: add eax, 8 ret @ok4: add eax, 4 end; function IntegerScanExists(P: PCardinalArray; Count: PtrInt; Value: cardinal): boolean; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=P, edx=Count, Value=ecx test eax, eax jz @z // avoid GPF cmp edx, 8 jae @s1 jmp dword ptr [edx * 4 + @Table] {$ifdef FPC} align 4 {$endif FPC} @Table: dd @z, @1, @2, @3, @4, @5, @6, @7 @s1: // fast search by 8 integers (pipelined instructions) sub edx, 8 cmp [eax], ecx je @ok cmp [eax + 4], ecx je @ok cmp [eax + 8], ecx je @ok cmp [eax + 12], ecx je @ok cmp [eax + 16], ecx je @ok cmp [eax + 20], ecx je @ok cmp [eax + 24], ecx je @ok cmp [eax + 28], ecx je @ok add eax, 32 cmp edx, 8 jae @s1 jmp dword ptr [edx * 4 + @Table] @7: cmp [eax + 24], ecx je @ok @6: cmp [eax + 20], ecx je @ok @5: cmp [eax + 16], ecx je @ok @4: cmp [eax + 12], ecx je @ok @3: cmp [eax + 8], ecx je @ok @2: cmp [eax + 4], ecx je @ok @1: cmp [eax], ecx je @ok @z: xor eax, eax ret @ok: mov al, 1 end; {$else} function ByteScanIndex(P: PByteArray; Count: PtrInt; Value: byte): PtrInt; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=P, edx=Count, cl=Value push ebx mov ebx, eax movzx eax, cl mov ecx, ebx test edx, edx jbe @no // eax=Value, ecx=P edx=Count imul eax, $01010101 and ecx, -16 movd xmm1, eax movaps xmm0, [ecx] add ecx, 16 pshufd xmm1, xmm1, 0 sub ecx, ebx pcmpeqb xmm0, xmm1 pmovmskb eax, xmm0 shl eax, cl and eax, $FFFF0000 shr eax, cl jnz @fnd cmp edx, ecx jbe @no {$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif} @by16: movaps xmm0, [ebx + ecx] add ecx, 16 pcmpeqb xmm0, xmm1 pmovmskb eax, xmm0 bsf eax, eax jnz @fnd2 cmp edx, ecx ja @by16 @no: mov eax, -1 pop ebx ret @fnd: bsf eax, eax @fnd2: lea eax, [ecx + eax - 16] cmp edx, eax jbe @no pop ebx end; function WordScanIndex(P: PWordArray; Count: PtrInt; Value: word): PtrInt; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=P, edx=Count, cx=Value push ebx mov ebx, eax movzx eax, cx mov ecx, ebx test edx, edx jbe @no test cl, 1 jnz @unal // eax=Value, ecx=P edx=Count movd xmm1, eax and ecx, -16 punpcklwd xmm1, xmm1 movaps xmm0, [ecx] add ecx, 16 pshufd xmm1, xmm1, 0 sub ecx, ebx pcmpeqw xmm0, xmm1 pmovmskb eax, xmm0 shl eax, cl and eax, $FFFF0000 shr eax, cl shr ecx, 1 test eax, eax jz @nxt bsf eax, eax @fnd: shr eax, 1 lea eax, [ecx + eax - 8] cmp edx, eax jbe @no pop ebx ret nop // manual loop alignment @by16: movaps xmm0, [ebx + ecx * 2] add ecx, 8 pcmpeqw xmm0, xmm1 pmovmskb eax, xmm0 bsf eax, eax jnz @fnd @nxt: cmp edx, ecx ja @by16 @no: mov eax, -1 pop ebx ret // 16bit-unaligned loop (seldom called) @unal: lea ecx, [ecx + edx * 2] neg edx @unals: cmp word ptr [ecx + edx * 2], ax jz @unale inc edx jnz @unals jmp @no @unale: lea eax, [ecx + edx * 2] sub eax, ebx shr eax, 1 pop ebx end; function IntegerScanIndex(P: PCardinalArray; Count: PtrInt; Value: cardinal): PtrInt; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=P, edx=Count, ecx=Value push ebx movd xmm1, ecx mov ebx, eax mov eax, ecx mov ecx, ebx test edx, edx jbe @no test cl, 3 jnz @unal // eax=Value, ecx=P edx=Count and ecx, -16 movaps xmm0, [ecx] add ecx, 16 pshufd xmm1, xmm1, 0 sub ecx, ebx pcmpeqd xmm0, xmm1 pmovmskb eax, xmm0 shl eax, cl and eax, $FFFF0000 shr eax, cl shr ecx, 2 bsf eax, eax jz @nxt @fnd: shr eax, 2 lea eax, [ecx + eax - 4] cmp edx, eax jbe @no pop ebx ret {$ifdef FPC} align 16 {$else} nop;nop;nop;nop;nop;nop {$endif FPC} @by16: movaps xmm0, [ebx + ecx * 4] add ecx, 4 pcmpeqd xmm0, xmm1 pmovmskb eax, xmm0 bsf eax, eax jnz @fnd @nxt: cmp edx, ecx ja @by16 @no: mov eax, -1 pop ebx ret // 32bit-unaligned loop (seldom called) @unal: lea ecx, [ecx + edx * 4] neg edx @unals: cmp dword ptr [ecx + edx * 4], eax jz @unale inc edx jnz @unals jmp @no @unale: lea eax, [ecx + edx * 4] sub eax, ebx shr eax, 2 pop ebx end; {$endif HASNOSSE2} procedure mul64x64(const left, right: QWord; out product: THash128Rec); {$ifdef FPC}nostackframe; assembler; {$endif} asm // adapted from FPC compiler output, which is much better than Delphi's here {$ifdef FPC} push ebp mov ebp, esp {$endif FPC} mov ecx, eax mov eax, dword ptr [ebp+8H] mul dword ptr [ebp+10H] mov dword ptr [ecx], eax mov dword ptr [ebp-4H], edx mov eax, dword ptr [ebp+8H] mul dword ptr [ebp+14H] add eax, dword ptr [ebp-4H] adc edx, 0 mov dword ptr [ebp-10H], eax mov dword ptr [ebp-0CH], edx mov eax, dword ptr [ebp+0CH] mul dword ptr [ebp+10H] add eax, dword ptr [ebp-10H] adc edx, 0 mov dword ptr [ecx+4H], eax mov dword ptr [ebp-14H], edx mov eax, dword ptr [ebp+0CH] mul dword ptr [ebp+14H] add eax, dword ptr [ebp-0CH] adc edx, 0 add eax, dword ptr [ebp-14H] adc edx, 0 mov dword ptr [ecx+8H], eax mov dword ptr [ecx+0CH], edx {$ifdef FPC} pop ebp {$endif FPC} end; function bswap32(a: cardinal): cardinal; {$ifdef FPC}nostackframe; assembler;{$endif} asm bswap eax end; function bswap64({$ifdef FPC}constref{$else}const{$endif} a: QWord): QWord; {$ifdef FPC}nostackframe; assembler;{$endif} asm {$ifdef FPC} mov edx, dword ptr [eax] mov eax, dword ptr [eax + 4] {$else} mov edx, a.TQWordRec.L mov eax, a.TQWordRec.H {$endif FPC} bswap edx bswap eax end; procedure bswap64array(A, B: PQWordArray; n: PtrInt); {$ifdef FPC}nostackframe; assembler;{$endif} asm push ebx push esi @1: mov ebx, dword ptr [eax] mov esi, dword ptr [eax + 4] bswap ebx bswap esi mov dword ptr [edx + 4], ebx mov dword ptr [edx], esi add eax, 8 add edx, 8 dec ecx jnz @1 pop esi pop ebx end; procedure LockedInc32(int32: PInteger); {$ifdef FPC}nostackframe; assembler;{$endif} asm lock inc dword ptr [int32] end; procedure LockedDec32(int32: PInteger); {$ifdef FPC}nostackframe; assembler;{$endif} asm lock dec dword ptr [int32] end; procedure LockedInc64(int64: PInt64); {$ifdef FPC}nostackframe; assembler;{$endif} asm lock inc dword ptr [int64] jz @h ret @h:lock inc dword ptr [int64 + 4] // collision is very unlikely end; function InterlockedIncrement(var I: integer): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm mov edx, 1 xchg eax, edx lock xadd [edx], eax inc eax end; function InterlockedDecrement(var I: integer): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm mov edx, -1 xchg eax, edx lock xadd [edx], eax dec eax end; function StrLenSafe(S: pointer): PtrInt; {$ifdef FPC}nostackframe; assembler;{$endif} asm // slower than StrLenSSE2(), but won't read any byte beyond the page test eax, eax jz @z cmp byte ptr [eax], 0 je @0 cmp byte ptr [eax + 1], 0 je @1 cmp byte ptr [eax + 2], 0 je @2 cmp byte ptr [eax + 3], 0 je @3 push eax and eax, -4 // dword align reads nop // @s loop code alignment @s: add eax, 4 mov edx, dword ptr [eax] // 4 chars per loop lea ecx, [edx - $01010101] not edx and edx, ecx and edx, $80808080 // set byte to $80 at each #0 position jz @s // loop until any #0 found @set: pop ecx bsf edx, edx // find first #0 position shr edx, 3 // byte offset of first #0 add eax, edx // address of first #0 sub eax, ecx // length @z: ret @0: xor eax, eax ret @1: mov eax, 1 ret @2: mov eax, 2 ret @3: mov eax, 3 end; function PosEx(const SubStr, S: RawUtf8; Offset: PtrUInt): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=SubStr, edx=S, ecx=Offset push ebx push esi push edx test eax, eax jz @notfnd // exit if SubStr='' test edx, edx jz @notfnd // exit if S='' mov esi, ecx mov ecx, [edx - 4] // length(S) mov ebx, [eax - 4] // length(SubStr) add ecx, edx sub ecx, ebx // ecx = max start pos for full match lea edx, [edx + esi - 1] // edx = start position cmp edx, ecx jg @notfnd // startpos > max start pos cmp ebx, 1 jle @onec // optimized loop for length(SubStr)<=1 push edi push ebp lea edi, [ebx - 2] // edi = length(SubStr)-2 mov esi, eax // esi = SubStr movzx ebx, byte ptr [eax] // bl = search character nop; nop @l: cmp bl, [edx] // compare 2 characters per @l je @c1fnd @notc1: cmp bl, [edx + 1] je @c2fnd @notc2: add edx, 2 cmp edx, ecx // next start position <= max start position jle @l pop ebp pop edi @notfnd:xor eax, eax // returns 0 if not fnd pop edx pop esi pop ebx ret @c1fnd: mov ebp, edi // ebp = length(SubStr)-2 @c1l: movzx eax, word ptr [esi + ebp] cmp ax, [edx + ebp] // compare 2 chars per @c1l (may include #0) jne @notc1 sub ebp, 2 jnc @c1l pop ebp pop edi jmp @setres @c2fnd: mov ebp, edi // ebp = length(SubStr)-2 @c2l: movzx eax, word ptr [esi + ebp] cmp ax, [edx + ebp + 1] // compare 2 chars per @c2l (may include #0) jne @notc2 sub ebp, 2 jnc @c2l pop ebp pop edi jmp @chkres @onec: jl @notfnd // needed for zero-length non-nil strings movzx eax, byte ptr [eax] // search character @charl: cmp al, [edx] je @setres cmp al, [edx + 1] je @chkres add edx, 2 cmp edx, ecx jle @charl jmp @notfnd @chkres:cmp edx, ecx // check within ansistring jge @notfnd add edx, 1 @setres:pop ecx // ecx = S pop esi pop ebx neg ecx lea eax, [edx + ecx + 1] end; function StrComp(Str1, Str2: pointer): PtrInt; {$ifdef FPC}nostackframe; assembler;{$endif} asm // no branch taken in case of not equal first char cmp eax, edx je @zero // same string or both nil test eax, edx jz @maynil @1: mov cl, byte ptr [eax] mov ch, byte ptr [edx] inc eax inc edx test cl, cl jz @exit cmp cl, ch je @1 @exit: movzx eax, cl movzx edx, ch sub eax, edx ret @maynil:test eax, eax // Str1='' ? jz @max test edx, edx // Str2='' ? jnz @1 mov eax, 1 ret @max: dec eax ret @zero: xor eax, eax end; function SortDynArrayInteger(const A, B): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm mov ecx, dword ptr [eax] mov edx, dword ptr[edx] xor eax, eax cmp ecx, edx setl cl setg al movzx ecx, cl sub eax, ecx end; function SortDynArrayCardinal(const A, B): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm mov ecx, [eax] mov edx, [edx] xor eax, eax cmp ecx, edx seta al sbb eax,0 end; function SortDynArrayPointer(const A, B): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm mov ecx, [eax] mov edx, [edx] xor eax, eax cmp ecx, edx seta al sbb eax,0 end; function SortDynArrayInt64(const A, B): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm // Delphi x86 compiler is not efficient at compiling Int64 comparisons mov ecx, [eax] mov eax, [eax + 4] cmp eax, [edx + 4] jnz @nz cmp ecx, [edx] jz @0 jnb @p @n: mov eax, -1 ret @0: xor eax, eax ret @nz: jl @n @p: mov eax, 1 end; function SortDynArrayQWord(const A, B): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm // Delphi x86 compiler is not efficient, and oldest even incorrect mov ecx, [eax] mov eax, [eax + 4] cmp eax, [edx + 4] jnz @nz cmp ecx, [edx] jz @0 @nz: jnb @p mov eax, -1 ret @0: xor eax, eax ret @p: mov eax, 1 end; function SortDynArrayAnsiString(const A, B): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm // x86 version optimized for AnsiString/RawUtf8/RawByteString types mov eax, dword ptr [eax] mov edx, dword ptr [edx] cmp eax, edx je @0 test eax, edx jz @maynil @f: mov cl, byte ptr [eax] // first char comparison (quicksort speedup) mov ch, byte ptr [edx] cmp cl, ch je @s movzx eax, cl movzx edx, ch sub eax, edx // branchless execution on Quicksort/ReHash ret @0: xor eax, eax ret @maynil:test edx, edx // A or B may be '' jz @1 test eax, eax jnz @f dec eax ret @s: push ebx xor ecx, ecx mov ebx, dword ptr [eax - 4] sub ebx, dword ptr [edx - 4] // ebx = length(A)-length(B) push ebx adc ecx, -1 and ecx, ebx sub ecx, dword ptr [eax - 4] // ecx = -min(length(A),length(B)) sub eax, ecx sub edx, ecx {$ifdef FPC} align 16 {$endif} // is naturally aligned anyway @by4: mov ebx, dword ptr [eax + ecx] // compare 4 bytes per iteration xor ebx, dword ptr [edx + ecx] jnz @d add ecx, 4 js @by4 @eq: pop eax // all chars equal -> returns length(A)-length(B) pop ebx ret @d: bsf ebx, ebx // char differs -> returns pbyte(A)^-pbyte(B)^ shr ebx, 3 add ecx, ebx jns @eq movzx eax, byte ptr [eax + ecx] movzx edx, byte ptr [edx + ecx] pop ebx pop ebx sub eax, edx ret @1: mov eax, 1 end; function SortDynArrayDouble(const A, B): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm fld qword ptr [eax] fcomp qword ptr [edx] fstsw ax sahf jz @0 @nz: jnb @p mov eax, -1 ret @0: xor eax, eax ret @p: mov eax, 1 end; function SortDynArraySingle(const A, B): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm fld dword ptr [eax] fcomp dword ptr [edx] fstsw ax sahf jz @0 @nz: jnb @p mov eax, -1 ret @0: xor eax, eax ret @p: mov eax, 1 end; procedure DynArrayHashTableAdjust(P: PIntegerArray; deleted: integer; count: PtrInt); {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=P edx=deleted ecx=count push ebx mov ebx, eax xor eax, eax @by8: sub ecx, 8 cmp edx, dword ptr [ebx] setl al sub dword ptr [ebx], eax cmp edx, dword ptr [ebx + 4H] setl al sub dword ptr [ebx + 4H], eax cmp edx, dword ptr [ebx + 8H] setl al sub dword ptr [ebx + 8H], eax cmp edx, dword ptr [ebx + 0CH] setl al sub dword ptr [ebx + 0CH], eax cmp edx, dword ptr [ebx + 10H] setl al sub dword ptr [ebx + 10H], eax cmp edx, dword ptr [ebx + 14H] setl al sub dword ptr [ebx + 14H], eax cmp edx, dword ptr [ebx + 18H] setl al sub dword ptr [ebx + 18H], eax cmp edx, dword ptr [ebx + 1CH] setl al sub dword ptr [ebx + 1CH], eax add ebx, 32 cmp ecx, 8 jge @by8 test ecx, ecx jz @ok // trailing indexes (never appearing within DYNARRAYHASH_PO2 range) @by1: cmp edx, dword ptr [ebx] setl al sub dword ptr [ebx], eax add ebx, 4 dec ecx jnz @by1 @ok: pop ebx end; procedure DynArrayHashTableAdjust16(P: PWordArray; deleted: cardinal; count: PtrInt); {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=P dx=deleted ecx=count push ebx mov ebx, eax xor eax, eax @by8: sub ecx, 8 cmp dx, word ptr [ebx] setl al sub word ptr [ebx], ax cmp dx, word ptr [ebx + 2] setl al sub word ptr [ebx + 2], ax cmp dx, word ptr [ebx + 4] setl al sub word ptr [ebx + 4], ax cmp dx, word ptr [ebx + 6] setl al sub word ptr [ebx + 6], ax cmp dx, word ptr [ebx + 8] setl al sub word ptr [ebx + 8], ax cmp dx, word ptr [ebx + 10] setl al sub word ptr [ebx + 10], ax cmp dx, word ptr [ebx + 12] setl al sub word ptr [ebx + 12], ax cmp dx, word ptr [ebx + 14] setl al sub word ptr [ebx + 14], ax add ebx, 16 cmp ecx, 8 jge @by8 test ecx, ecx jz @ok // trailing indexes (never appearing within DYNARRAYHASH_PO2 range) @by1: cmp dx, word ptr [ebx] setl al sub word ptr [ebx], ax add ebx, 2 dec ecx jnz @by1 @ok: pop ebx end; function GetBitsCountSse42(value: PtrInt): PtrInt; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=value {$ifdef HASAESNI} popcnt eax, eax {$else} // oldest Delphi don't support this opcode db $f3,$0f,$B8,$c0 {$endif HASAESNI} end; function crc32cby4sse42(crc, value: cardinal): cardinal; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=crc, edx=value {$ifdef HASAESNI} crc32 eax, edx {$else} // oldest Delphi don't support this opcode db $F2, $0F, $38, $F1, $C2 {$endif HASAESNI} end; procedure crcblocksse42(crc128, data128: PBlock128); {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=crc128, edx=data128 mov ecx, eax {$ifdef HASAESNI} mov eax, dword ptr [ecx] crc32 eax, dword ptr [edx] mov dword ptr [ecx], eax mov eax, dword ptr [ecx + 4] crc32 eax, dword ptr [edx + 4] mov dword ptr [ecx + 4], eax mov eax, dword ptr [ecx + 8] crc32 eax, dword ptr [edx + 8] mov dword ptr [ecx + 8], eax mov eax, dword ptr [ecx + 12] crc32 eax, dword ptr [edx + 12] mov dword ptr [ecx + 12], eax {$else} // oldest Delphi don't support these opcodes mov eax, dword ptr [ecx] db $F2, $0F, $38, $F1, $02 mov dword ptr [ecx], eax mov eax, dword ptr [ecx + 4] db $F2, $0F, $38, $F1, $42, $04 mov dword ptr [ecx + 4], eax mov eax, dword ptr [ecx + 8] db $F2, $0F, $38, $F1, $42, $08 mov dword ptr [ecx + 8], eax mov eax, dword ptr [ecx + 12] db $F2, $0F, $38, $F1, $42, $0C mov dword ptr [ecx + 12], eax {$endif HASAESNI} end; procedure crcblockssse42(crc128, data128: PBlock128; count: integer); {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=crc128 edx=data128 ecx=count push ebx push esi push edi push ebp test count, count jle @z mov ebp, count mov esi, crc128 mov edi, data128 mov eax, dword ptr [esi] mov ebx, dword ptr [esi + 4] mov ecx, dword ptr [esi + 8] mov edx, dword ptr [esi + 12] {$ifdef HASAESNI} {$ifdef FPC} align 16 {$else} {$ifdef HASALIGN} .align 16 {$endif}{$endif} @s: crc32 eax, dword ptr [edi] crc32 ebx, dword ptr [edi + 4] crc32 ecx, dword ptr [edi + 8] crc32 edx, dword ptr [edi + 12] {$else} // oldest Delphi don't support these opcodes @s: db $F2, $0F, $38, $F1, $07 db $F2, $0F, $38, $F1, $5F, $04 db $F2, $0F, $38, $F1, $4F, $08 db $F2, $0F, $38, $F1, $57, $0C {$endif HASAESNI} add edi, 16 dec ebp jnz @s mov dword ptr [esi], eax mov dword ptr [esi + 4], ebx mov dword ptr [esi + 8], ecx mov dword ptr [esi + 12], edx @z: pop ebp pop edi pop esi pop ebx end; function crc32csse42(crc: cardinal; buf: PAnsiChar; len: cardinal): cardinal; {$ifdef FPC}nostackframe; assembler;{$endif} asm // eax=crc, edx=buf, ecx=len not eax test ecx, ecx jz @0 test edx, edx jz @0 jmp @align @a: {$ifdef HASAESNI} crc32 eax, byte ptr [edx] {$else} // oldest Delphi don't support these opcodes db $F2, $0F, $38, $F0, $02 {$endif HASAESNI} inc edx dec ecx jz @0 @align: test dl, 3 jnz @a push ecx shr ecx, 3 jnz @by8 @rem: pop ecx test cl, 4 jz @4 {$ifdef HASAESNI} crc32 eax, dword ptr [edx] {$else} db $F2, $0F, $38, $F1, $02 {$endif HASAESNI} add edx, 4 @4: test cl, 2 jz @2 {$ifdef HASAESNI} crc32 eax, word ptr [edx] {$else} db $66, $F2, $0F, $38, $F1, $02 {$endif HASAESNI} add edx, 2 @2: test cl, 1 jz @0 {$ifdef HASAESNI} crc32 eax, byte ptr [edx] {$else} db $F2, $0F, $38, $F0, $02 {$endif HASAESNI} @0: not eax ret {$ifdef HASAESNI} @by8: crc32 eax, dword ptr [edx] crc32 eax, dword ptr [edx + 4] {$else} @by8: db $F2, $0F, $38, $F1, $02 db $F2, $0F, $38, $F1, $42, $04 {$endif HASAESNI} add edx, 8 dec ecx jnz @by8 jmp @rem end; function SynLZdecompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm push ebp push ebx push esi push edi push eax add esp, -4092 push eax add esp, -4092 push eax add esp, -4092 push eax add esp, -4092 push eax add esp, -24 mov esi, ecx mov ebx, eax add edx, eax mov [esp+8H], esi mov [esp+10H], edx movzx eax, word ptr [ebx] mov [esp], eax or eax,eax je @@0917 add ebx, 2 mov eax, [esp] test ah, 80H jz @@0907 and eax, 7FFFH movzx edx, word ptr [ebx] shl edx, 15 or eax, edx mov [esp], eax add ebx, 2 @@0907: lea ebp, [esi - 1] @@0908: mov ecx, [ebx] add ebx, 4 mov [esp+14H], ecx mov edi, 1 // edi=CWbit cmp ebx, [esp+10H] jnc @@0917 @@0909: mov ecx, [esp+14H] @@090A: test ecx, edi jnz @@0911 mov al, [ebx] inc ebx mov [esi], al inc esi cmp ebx, [esp+10H] lea eax, [esi-3] jnc @@0917 cmp eax, ebp jbe @@0910 inc ebp mov eax, [ebp] mov edx, eax shr eax, 12 xor eax, edx and eax, 0FFFH mov [esp+1CH + eax * 4], ebp @@0910: add edi, edi jnz @@090A jmp @@0908 @@0911: movzx edx, word ptr [ebx] add ebx, 2 mov eax, edx and edx, 0FH add edx, 2 shr eax, 4 cmp edx,2 jnz @@0912 movzx edx, byte ptr [ebx] inc ebx add edx, 18 @@0912: mov eax, [esp+1CH + eax * 4] mov ecx, esi mov [esp+18H], edx sub ecx, eax // inlined optimized move() cmp ecx, edx jl @@ovlap // overlapping content requires per-byte copy cmp edx, 32 ja @large sub edx, 8 jg @9_32 {$ifdef HASNOSSE2} // slowest x87 FPU code on very old CPU with no SSE2 support mov ecx, [eax] mov eax, [eax + 4] // always copy 8 bytes for 0..8 mov [esi], ecx // safe since src_endmatch := src_end-(6+5) mov [esi + 4], eax jmp @movend @9_32: fild qword ptr [eax + edx] fild qword ptr [eax] cmp edx, 8 jle @16 fild qword ptr [eax + 8] cmp edx, 16 jle @24 fild qword ptr [eax + 16] fistp qword ptr [esi + 16] @24: fistp qword ptr [esi + 8] @16: fistp qword ptr [esi] fistp qword ptr [esi + edx] jmp @movend nop @large: push esi fild qword ptr [eax] lea eax, [eax + edx - 8] lea edx, [esi + edx - 8] fild qword ptr [eax] push edx neg edx and esi, -8 lea edx, [edx + esi + 8] pop esi @lrgnxt:fild qword ptr [eax + edx] fistp qword ptr [esi + edx] add edx, 8 jl @lrgnxt fistp qword ptr [esi] pop esi fistp qword ptr [esi] {$else} // inlined SSE2 move movq xmm0, qword ptr [eax] movq qword ptr [esi], xmm0 jmp @movend @9_32: movq xmm0, qword ptr [eax + edx] movq xmm1, qword ptr [eax] cmp edx, 8 jle @16 movq xmm2, qword ptr [eax + 8] cmp edx, 16 jle @24 movq xmm3, qword ptr [eax + 16] movq qword ptr [esi + 16], xmm3 @24: movq qword ptr [esi + 8], xmm2 @16: movq qword ptr [esi], xmm1 movq qword ptr [esi + edx], xmm0 jmp @movend @large: push esi movups xmm2, dqword ptr [eax] lea eax, [eax + edx - 16] lea edx, [esi + edx - 16] movups xmm1, dqword ptr [eax] push edx neg edx and esi, -16 lea edx, [edx + esi + 16] pop esi @lrgnxt:movups xmm0, dqword ptr [eax + edx] movaps dqword ptr [esi + edx], xmm0 add edx, 16 jl @lrgnxt movups dqword ptr [esi], xmm1 pop esi movups dqword ptr [esi], xmm2 {$endif HASNOSSE2} @movend:cmp esi, ebp jbe @@0916 @@0915: inc ebp mov edx, [ebp] mov eax, edx shr edx, 12 xor eax, edx and eax, 0FFFH mov [esp+1CH + eax * 4], ebp cmp esi, ebp ja @@0915 @@0916: add esi, [esp+18H] cmp ebx, [esp+10H] jnc @@0917 add edi, edi lea ebp, [esi - 1] jz @@0908 jmp @@0909 @@ovlap:push ebx push esi lea ebx, [eax + edx] add esi, edx neg edx @s: mov al, [ebx + edx] mov [esi + edx], al inc edx jnz @s pop esi pop ebx jmp @movend @@0917: mov eax, [esp] add esp, 16412 pop edi pop esi pop ebx pop ebp end; function SynLZcompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer; {$ifdef FPC}nostackframe; assembler;{$endif} asm push ebp push ebx push esi push edi push eax add esp, -4092 push eax add esp, -4092 push eax add esp, -4092 push eax add esp, -4092 push eax add esp, -4092 push eax add esp, -4092 push eax add esp, -4092 push eax add esp, -4092 push eax add esp, -32 mov esi, eax // esi=src mov edi, ecx // edi=dst mov [esp+08H], ecx mov eax, edx cmp eax, 32768 jl @@0889 or ax, 8000H mov [edi], eax mov eax, edx shr eax, 15 mov [edi + 2], eax add edi, 4 jmp @@0891 @@0890: mov eax, 2 jmp @@0904 @@0889: mov [edi], eax test eax, eax jz @@0890 add edi, 2 @@0891: lea eax, [edx + esi] mov [esp+18H], edi mov [esp+0CH], eax sub eax, 11 mov [esp+4], eax xor eax, eax lea ebx, [esp+24H] // reset offsets lookup table {$ifdef HASNOSSE2} mov ecx, 1024 @@089I: mov [ebx], eax mov [ebx + 4], eax mov [ebx + 8], eax mov [ebx + 12], eax add ebx, 16 {$else} pxor xmm0, xmm0 mov ecx, 256 @@089I: movups dqword ptr [ebx], xmm0 movups dqword ptr [ebx + 16], xmm0 movups dqword ptr [ebx + 32], xmm0 movups dqword ptr [ebx + 48], xmm0 add ebx, 64 {$endif HASNOSSE2} dec ecx jnz @@089I mov [edi], eax add edi, 4 mov ebx, 1 // ebx=1 shl CWbit // main loop: cmp esi, [esp+4] ja @@0900 @@0892: mov edx, [esi] mov eax, edx shr edx, 12 xor edx, eax and edx, 0FFFH mov ebp, [esp+24H + edx * 4] mov ecx, [esp+4024H + edx * 4] mov [esp+24H + edx * 4], esi xor ecx, eax test ecx, 0FFFFFFH mov [esp+4024H + edx * 4], eax jnz @@0897 mov eax, esi or ebp, ebp jz @@0897 sub eax, ebp mov ecx, [esp+18H] cmp eax, 2 jle @@0897 add esi, 2 or dword ptr [ecx], ebx mov ecx, [esp+0CH] add ebp, 2 mov eax, 1 sub ecx, esi dec ecx mov [esp], ecx cmp ecx, 271 jl @@0894 mov dword ptr [esp], 271 jmp @@0894 @@0893: inc eax @@0894: mov ecx, [ebp + eax] cmp cl, [esi + eax] jnz @@0895 cmp eax, [esp] jge @@0895 inc eax cmp ch, [esi + eax] jnz @@0895 shr ecx, 16 cmp eax, [esp] jge @@0895 inc eax cmp cl, [esi + eax] jnz @@0895 cmp eax, [esp] jge @@0895 inc eax cmp ch, [esi + eax] jnz @@0895 cmp eax, [esp] jl @@0893 @@0895: add esi, eax shl edx, 4 cmp eax, 15 jg @@0896 or eax, edx mov word ptr [edi], ax add edi, 2 jmp @@0898 @@0896: sub eax, 16 mov [edi], dx mov [edi + 2H], al add edi, 3 jmp @@0898 @@0897: mov al, [esi] // movsb is actually slower! mov [edi], al inc esi inc edi @@0898: add ebx, ebx jz @@0899 cmp esi, [esp+4] jbe @@0892 jmp @@0900 @@0899: mov [esp+18H], edi mov [edi], ebx inc ebx add edi, 4 cmp esi, [esp+4] jbe @@0892 @@0900: cmp esi, [esp+0CH] jnc @@0903 @@0901: mov al, [esi] mov [edi], al inc esi inc edi add ebx, ebx jz @@0902 cmp esi, [esp+0CH] jc @@0901 jmp @@0903 @@0902: mov [edi], ebx inc ebx add edi, 4 cmp esi, [esp+0CH] jc @@0901 @@0903: mov eax, edi sub eax, [esp+08H] @@0904: add esp, 32804 pop edi pop esi pop ebx pop ebp end; function RdRand32: cardinal; {$ifdef FPC}nostackframe; assembler;{$endif} asm // rdrand eax: same opcodes for x86 and x64 db $0f, $c7, $f0 // returns in eax, ignore carry flag (eax=0 won't hurt) end; function Rdtsc: Int64; {$ifdef FPC}nostackframe; assembler;{$endif} asm // returns the TSC in EDX:EAX rdtsc end; function StrCntDecFree(var refcnt: TStrCnt): boolean; {$ifdef FPC}nostackframe; assembler;{$endif} asm lock dec dword ptr [refcnt] // =longint on Delphi and FPC 32-bit setbe al end; // don't check for ismultithread global since lock is cheaper on new CPUs function DACntDecFree(var refcnt: TDACnt): boolean; {$ifdef FPC}nostackframe; assembler;{$endif} asm lock dec dword ptr [refcnt] // =longint on Delphi and FPC 32-bit setbe al end; // don't check for ismultithread global since lock is cheaper on new CPUs function LockedExc(var Target: PtrUInt; NewValue, Comperand: PtrUInt): boolean; {$ifdef FPC}nostackframe; assembler;{$endif} asm xchg eax, ecx lock cmpxchg dword ptr [ecx], edx setz al end; procedure LockedAdd(var Target: PtrUInt; Increment: PtrUInt); {$ifdef FPC}nostackframe; assembler;{$endif} asm lock add dword ptr [Target], Increment end; procedure LockedAdd32(var Target: cardinal; Increment: cardinal); {$ifdef FPC}nostackframe; assembler;{$endif} asm lock add dword ptr [Target], Increment end; procedure LockedDec(var Target: PtrUInt; Decrement: PtrUInt); {$ifdef FPC}nostackframe; assembler;{$endif} asm lock sub dword ptr [Target], Decrement end; function IsXmmYmmOSEnabled: boolean; {$ifdef FPC}nostackframe; assembler;{$endif} asm // see https://software.intel.com/en-us/blogs/2011/04/14/is-avx-enabled xor ecx, ecx // get control register XCR0 = XFEATURE_ENABLED_MASK db $0f, $01, $d0 // XGETBV reads XCR0 into EDX:EAX and eax, 6 // check OS enabled both XMM (bit 1) and YMM (bit 2) cmp al, 6 sete al end; procedure GetCpuid(cpueax, cpuecx: cardinal; var Registers: TIntelRegisters); {$ifdef FPC}nostackframe; assembler;{$endif} asm push esi push edi push ebx mov edi, cpueax mov ebx, cpuecx mov esi, Registers pushfd pop eax mov edx, eax xor eax, $200000 push eax popfd pushfd pop eax xor eax, edx // does this CPU support the cpuid opcode? jz @nocpu mov eax, edi mov ecx, ebx cpuid mov TIntelRegisters(esi).&eax, eax mov TIntelRegisters(esi).&ebx, ebx mov TIntelRegisters(esi).&ecx, ecx mov TIntelRegisters(esi).&edx, edx @nocpu: pop ebx pop edi pop esi end; {$ifdef CPU32DELPHI} // inspired by ValExt_JOH_PAS_8_a by John O'Harrow - calls Delphi System.@Pow10 function GetExtended(P: PUtf8Char; out err: integer): TSynExtended; const Ten: double = 10.0; asm // in: eax=text, edx=@err out: st(0)=result push ebx // save used registers push esi push edi mov esi, eax // string pointer push eax // save for error condition xor ebx, ebx push eax // allocate local storage for loading fpu test esi, esi jz @nil // nil string @trim: movzx ebx, byte ptr [esi] // strip leading spaces inc esi cmp bl, ' ' je @trim xor ecx, ecx // clear sign flag fld qword[Ten] // load 10 into fpu xor eax, eax // zero number of decimal places fldz // zero result in fpu cmp bl, '0' jl @chksig // check for sign character @dig1: xor edi, edi // zero exponent value @digl: sub bl, '0' cmp bl, 9 ja @frac // non-digit mov cl, 1 // set digit found flag mov [esp], ebx // store for fpu use fmul st(0), st(1) // multply by 10 fiadd dword ptr [esp] // add next digit movzx ebx, byte ptr [esi] // get next char inc esi test bl, bl // end reached? jnz @digl // no,get next digit jmp @finish // yes,finished @chksig:cmp bl, '-' je @minus cmp bl, '+' je @sigset @gdig1: test bl, bl jz @error // no digits found jmp @dig1 @minus: mov ch, 1 // set sign flag @sigset:movzx ebx, byte ptr [esi] // get next char inc esi jmp @gdig1 @frac: cmp bl, '.' - '0' jne @exp // no decimal point movzx ebx, byte ptr [esi] // get next char test bl, bl jz @dotend // string ends with '.' inc esi @fracl: sub bl, '0' cmp bl, 9 ja @exp // non-digit mov [esp], ebx dec eax // -(number of decimal places) fmul st(0), st(1) // multply by 10 fiadd dword ptr [esp] // add next digit movzx ebx, byte ptr [esi] // get next char inc esi test bl, bl // end reached? jnz @fracl // no, get next digit jmp @finish // yes, finished (no exponent) @dotend:test cl, cl // any digits found before '.'? jnz @finish // yes, valid jmp @error // no,invalid @exp: or bl, $20 cmp bl, 'e' - '0' jne @error // not 'e' or 'e' movzx ebx, byte ptr [esi] // get next char inc esi mov cl, 0 // clear exponent sign flag cmp bl, '-' je @minexp cmp bl, '+' je @expset jmp @expl @minexp:mov cl, 1 // set exponent sign flag @expset:movzx ebx, byte ptr [esi] // get next char inc esi @expl: sub bl, '0' cmp bl, 9 ja @error // non-digit lea edi, [edi + edi * 4]// multiply by 10 add edi, edi add edi, ebx // add next digit movzx ebx, byte ptr [esi] // get next char inc esi test bl, bl // end reached? jnz @expl // no, get next digit @endexp:test cl, cl // positive exponent? jz @finish // yes, keep exponent value neg edi // no, negate exponent value @finish:add eax, edi // exponent value - number of decimal places mov [edx], ebx // result code = 0 jz @pow // no call to _pow10 needed cmp eax, 308 jge @oor // limit to < 1.7 x 10^308 double range cmp eax, -324 jle @oor // limit to > 5.0 x 10^-324 range @oors: mov edi, ecx // save decimal sign flag call System.@Pow10 // raise to power of 10 mov ecx, edi // restore decimal sign flag @pow: test ch, ch // decimal sign flag set? jnz @negate // yes, negate value @ok: add esp, 8 // dump local storage and string pointer @exit: ffree st(1) // remove ten value from fpu pop edi // restore used registers pop esi pop ebx ret // finished @negate:fchs // negate result in fpu jmp @ok @oor: inc esi // force result code = 1 mov [edx], esi // set result code xor eax, eax // set exponent = 0 (as pure pascal version) jmp @oors @nil: inc esi // force result code = 1 fldz // result value = 0 @error: pop ebx // dump local storage pop eax // string pointer sub esi, eax // error offset mov [edx], esi // set result code test ch, ch // decimal sign flag set? jz @exit // no,exit fchs // yes. negate result in fpu jmp @exit // exit setting result code end; // FPC will properly inline multiplication by reciprocal procedure Div100(Y: cardinal; var res: TDiv100Rec); asm mov dword ptr [edx].TDiv100Rec.M, eax mov ecx, edx mov edx, eax mov eax, 1374389535 mul edx shr edx, 5 mov dword ptr [ecx].TDiv100Rec.D, edx imul eax, edx, 100 sub dword ptr [ecx].TDiv100Rec.M, eax end; // those functions are intrinsics with FPC :) function BSRdword(c: cardinal): cardinal; asm bsr eax, eax jnz @nz mov eax, 255 @nz: end; function BSRqword(const q: qword): cardinal; asm bsr eax, [esp + 8] jz @1 add eax, 32 ret @1: bsr eax, [esp + 4] jnz @2 mov eax, 255 @2: end; {$endif CPU32DELPHI}